Source code for jange.ops.text.clean

"""This module contains several text cleaning operations
"""

from typing import Dict, List, Optional, Tuple, Union

import more_itertools
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Doc

from jange import base, ops, stream


class EmptyTextError(Exception):
    pass
@base.accepts(str, strict=True)
@base.produces(str)
class CaseChangeOperation(ops.base.Operation):
    """Operation for changing case of the texts.

    Parameters
    ----------
    mode : str
        one of `lower`, `upper` or `capitalize`

    name : str
        name of this operation

    Example
    -------
    >>> ds = DataStream(["AAA", "Bbb"])
    >>> list(ds.apply(CaseChangeOperation(mode="lower")))
    ["aaa", "bbb"]

    Attributes
    ----------
    mode : str
        one of ['lower', 'capitalize', 'upper']

    name : str
        name of this operation
    """

    def __init__(self, mode: str = "lower", name: str = "case_change"):
        super().__init__(name=name)
        valid_modes = ["lower", "upper", "capitalize"]
        mode = mode.lower()
        if mode not in valid_modes:
            raise ValueError(
                f"Invalid value for mode passed."
                f" Expected one of {valid_modes} but received {mode}"
            )
        self.mode = mode

    def run(self, ds: stream.DataStream):
        if self.mode == "upper":
            fn = str.upper
        elif self.mode == "capitalize":
            fn = str.capitalize
        else:
            fn = str.lower
        items = map(fn, ds)
        return stream.DataStream(
            applied_ops=ds.applied_ops + [self], items=items, context=ds.context
        )

    def __repr__(self):
        return f"CaseChangeOperation(mode='{self.mode}')"
def lowercase(name="lowercase") -> CaseChangeOperation:
    """Helper function to create CaseChangeOperation with mode="lower" """
    return CaseChangeOperation(mode="lower", name=name)
def uppercase(name="uppercase") -> CaseChangeOperation:
    """Helper function to create CaseChangeOperation with mode="upper" """
    return CaseChangeOperation(mode="upper", name=name)
def _lemmatize(doc, ctx):
    lemma_tokens = [t.lemma_ for t in doc]
    return " ".join(lemma_tokens), ctx
def lemmatize(
    nlp: Optional[Language] = None, name="lemmatize"
) -> ops.base.SpacyBasedOperation:
    """Helper function to return SpacyBasedOperation for lemmatizing.
    This operation returns a stream.DataStream where each item is a string
    after being lemmatized.

    Parameters
    ----------
    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    out : SpacyBasedOperation
    """
    return ops.base.SpacyBasedOperation(nlp=nlp, process_doc_fn=_lemmatize, name=name)
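# Illustrative usage sketch (not part of the original module): lemmatizing a
# small stream of texts with the helper above. Assumes the `en_core_web_sm`
# model is installed; the exact lemmas depend on the spaCy model version.
#
#     from jange import stream, ops
#
#     ds = stream.DataStream(["oranges are good"])
#     print(list(ds.apply(ops.text.lemmatize())))  # roughly ["orange be good"]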
@base.accepts(str, Doc)
@base.produces(str)
class TokenFilterOperation(ops.base.SpacyBasedOperation):
    """Operation for filtering individual tokens.

    Spacy's token pattern matching is used for matching various tokens in the
    document. Any tokens matching the filter can either be discarded or kept
    while discarding the non-matching ones.

    Parameters
    ----------
    patterns : List[List[Dict]]
        a list of patterns where each pattern is a List[Dict]. The patterns
        are passed to spacy's Token Matcher.
        see https://spacy.io/usage/rule-based-matching for more details
        on how to define patterns.

    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    keep_matching_tokens : bool
        if true then any non-matching tokens are discarded from the document
        (e.g. extracting only nouns)
        if false then any matching tokens are discarded (e.g. stopword removal)

    name : Optional[str]
        name of this operation

    Example
    -------
    >>> nlp = spacy.load("en_core_web_sm")
    >>> # define patterns to match [a, an, the] tokens
    >>> patterns = [[{"LOWER": "a"}], [{"LOWER": "an"}], [{"LOWER": "the"}]]
    >>> # define the token filter operation to match the patterns and discard them
    >>> op = TokenFilterOperation(patterns=patterns, nlp=nlp, keep_matching_tokens=False)
    >>> ds = stream.DataStream(["that is an orange"])
    >>> print(list(ds.apply(op)))
    ["that is orange"]

    See https://spacy.io/usage/rule-based-matching#adding-patterns-attributes
    for more details on what token patterns can be used.

    Attributes
    ----------
    nlp : spacy.language.Language
        spacy's language model

    keep_matching_tokens : bool
        whether to discard the tokens matched by the filter from the document
        or to keep them

    patterns : List[List[Dict]]
        patterns to pass to spacy's Matcher

    name : str
        name of this operation
    """

    def __init__(
        self,
        patterns: List[List[Dict]],
        nlp: Optional[Language] = None,
        keep_matching_tokens=False,
        name: Optional[str] = "token_filter",
    ) -> None:
        super().__init__(nlp, name=name)
        self.keep_matching_tokens = keep_matching_tokens
        self.patterns = patterns
        self.matcher = self._get_matcher(self.nlp, self.patterns)

    def _get_matcher(self, nlp, patterns):
        matcher = Matcher(vocab=nlp.vocab, validate=True)
        for p in patterns:
            matcher.add("MATCHES", None, p)
        return matcher

    def _discard_tokens_from_doc(self, doc: Doc, token_ids: List[int]) -> Doc:
        """Returns a new document after discarding the tokens

        Parameters
        ----------
        doc : spacy.tokens.Doc
            original document

        token_ids : List[int]
            a list of index of tokens to discard

        Returns
        -------
        out : spacy.tokens.Doc
            a new document which does not contain the tokens specified
        """
        tokens = [t for t in doc if t.i not in token_ids]
        words = [t.text for t in tokens]
        spaces = [t.whitespace_ == " " for t in tokens]
        spaces[-1] = False
        return Doc(self.nlp.vocab, words=words, spaces=spaces)

    def _filter_tokens(self, matcher_output: Tuple[Doc, List[Tuple]]) -> Doc:
        ((doc, matches), context) = matcher_output
        matching_token_ids = []
        for _, start, end in matches:
            for token in doc[start:end]:
                matching_token_ids.append(token.i)

        tokens_to_discard = matching_token_ids
        if self.keep_matching_tokens:
            tokens_to_discard = [t.i for t in doc if t.i not in matching_token_ids]

        # if we have to discard all tokens in the document
        # then throw an exception
        if len(tokens_to_discard) == len(doc):
            raise EmptyTextError
        else:
            return self._discard_tokens_from_doc(doc, tokens_to_discard).text, context

    def run(self, ds: stream.DataStream) -> stream.DataStream:
        docs_ds = self.get_docs_stream(ds)
        docs = zip(docs_ds, docs_ds.context)
        # match results is a tuple ((doc, matches), context)
        match_results = self.matcher.pipe(docs, return_matches=True, as_tuples=True)
        new_docs_with_context = more_itertools.map_except(
            self._filter_tokens, match_results, EmptyTextError
        )
        new_docs, context = more_itertools.unzip(new_docs_with_context)
        return stream.DataStream(
            new_docs, applied_ops=ds.applied_ops + [self], context=context
        )

    def __getstate__(self):
        state = super().__getstate__()
        del state["matcher"]
        return state

    def __setstate__(self, state: dict):
        super().__setstate__(state)
        self.matcher = self._get_matcher(self.nlp, self.patterns)

    def __repr__(self) -> str:
        patterns = (
            self.patterns
            if len(self.patterns) < 10
            else f"{self.patterns[:10]}... and others"
        )
        return f"TokenFilterOperation(patterns={patterns}, keep_matching_tokens={self.keep_matching_tokens}, name={self.name})"
def token_filter(
    patterns: List[List[Dict]],
    keep_matching_tokens,
    nlp: Optional[Language] = None,
    name: Optional[str] = "token_filter",
) -> TokenFilterOperation:
    """Helper function to create TokenFilterOperation

    Parameters
    ----------
    patterns : List[List[Dict]]
        a list of patterns where each pattern is a List[Dict]. The patterns
        are passed to spacy's Token Matcher.
        see https://spacy.io/usage/rule-based-matching for more details
        on how to define patterns.

    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    keep_matching_tokens : bool
        if true then any non-matching tokens are discarded from the document
        (e.g. extracting only nouns)
        if false then any matching tokens are discarded (e.g. stopword removal)

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation
    """
    return TokenFilterOperation(
        patterns=patterns,
        nlp=nlp,
        keep_matching_tokens=keep_matching_tokens,
        name=name,
    )
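# Illustrative usage sketch (not part of the original module): keep only the
# tokens spaCy marks as alphabetic, discarding everything else. The pattern
# syntax follows spaCy's Matcher; assumes `en_core_web_sm` is installed.
#
#     from jange import stream, ops
#
#     ds = stream.DataStream(["price is $25 !!"])
#     op = ops.text.token_filter(
#         patterns=[[{"IS_ALPHA": True}]], keep_matching_tokens=True
#     )
#     print(list(ds.apply(op)))  # e.g. ["price is"]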
def pos_filter(
    pos_tags: Union[str, List[str]],
    keep_matching_tokens: bool = False,
    nlp: Optional[Language] = None,
    name: Optional[str] = "filter_pos",
) -> TokenFilterOperation:
    """TokenFilterOperation to filter tokens based on Part of Speech

    Parameters
    ----------
    pos_tags : Union[str, List[str]]
        a single POS tag or a list of POS tags to search for. See
        https://spacy.io/api/annotation#pos-tagging for more details on what
        tags can be used. These depend on the language model used.

    keep_matching_tokens : bool
        if true then tokens having the given part of speech are kept and
        others are discarded from the text. Otherwise, tokens not having the
        given part of speech tags are kept

    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation

    Example
    -------
    >>> ds = stream.DataStream(["Python is a programming language"])
    >>> print(list(ds.apply(ops.text.filter_pos("NOUN", keep_matching_tokens=True))))
    [programming language]
    """
    patterns = []
    if not isinstance(pos_tags, (list, tuple)):
        pos_tags = [pos_tags]
    for tag in pos_tags:
        patterns.append([{"POS": tag}])

    return TokenFilterOperation(
        patterns, nlp=nlp, keep_matching_tokens=keep_matching_tokens, name=name
    )
def remove_stopwords(
    words: List[str] = None,
    nlp: Optional[Language] = None,
    name: Optional[str] = "remove_stopwords",
) -> TokenFilterOperation:
    """TokenFilterOperation to remove stopwords

    Parameters
    ----------
    words : List[str]
        a list of words to remove from the text

    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation

    Example
    -------
    >>> ds = stream.DataStream(["Python is a programming language"])
    >>> print(list(ds.apply(ops.text.remove_stopwords())))
    [Python programming language]
    >>> print(list(ds.apply(ops.text.remove_stopwords(words=["programming"]))))
    [Python is a language]
    """
    patterns = []
    if words:
        for word in words:
            patterns.append([{"LOWER": word.lower()}])
    else:
        patterns.append([{"IS_STOP": True}])

    return TokenFilterOperation(
        patterns, nlp=nlp, keep_matching_tokens=False, name=name
    )
def remove_numbers(
    nlp: Optional[Language] = None, name: Optional[str] = "remove_numbers"
) -> TokenFilterOperation:
    """TokenFilterOperation to remove numbers

    Parameters
    ----------
    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation
    """
    patterns = [[{"IS_DIGIT": True}]]
    return TokenFilterOperation(
        patterns, nlp=nlp, keep_matching_tokens=False, name=name
    )
def remove_emails(
    nlp: Optional[Language] = None, name: Optional[str] = "remove_emails"
) -> TokenFilterOperation:
    """TokenFilterOperation to remove emails

    Parameters
    ----------
    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation
    """
    patterns = [[{"LIKE_EMAIL": True}]]
    return TokenFilterOperation(
        patterns, nlp=nlp, keep_matching_tokens=False, name=name
    )
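# Illustrative usage sketch (not part of the original module): applying the
# email filter and then the number filter to the same stream. Assumes
# `en_core_web_sm` is installed; whitespace in the output may vary slightly.
#
#     from jange import stream, ops
#
#     ds = stream.DataStream(["contact me at john@example.com or call 42"])
#     cleaned = ds.apply(ops.text.remove_emails()).apply(ops.text.remove_numbers())
#     print(list(cleaned))  # e.g. ["contact me at or call"]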
def remove_short_words(
    length: int,
    nlp: Optional[Language] = None,
    name: Optional[str] = "remove_short_words",
) -> TokenFilterOperation:
    """TokenFilterOperation to remove tokens that have fewer characters than specified

    Parameters
    ----------
    length : int
        at least this many characters should be in the token, otherwise it is
        discarded

    nlp : Optional[spacy.language.Language]
        spacy's language model or None. If None then by default
        `en_core_web_sm` spacy model is loaded

    name : Optional[str]
        name of this operation

    Returns
    -------
    TokenFilterOperation
    """
    patterns = [[{"LENGTH": {"<": length}}]]
    return TokenFilterOperation(
        patterns, nlp=nlp, keep_matching_tokens=False, name=name
    )
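# Illustrative usage sketch (not part of the original module): drop tokens with
# fewer than three characters. Assumes `en_core_web_sm` is installed.
#
#     from jange import stream, ops
#
#     ds = stream.DataStream(["it is a beautiful day"])
#     print(list(ds.apply(ops.text.remove_short_words(length=3))))
#     # e.g. ["beautiful day"]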