"""This module contains several text encoding algorithms including binary or one-hot encoding,
count based and tf-idf
"""
from typing import Optional, Tuple, Union
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from jange.ops.base import ScikitBasedOperation
def tfidf(
    max_features: Optional[int] = None,
    max_df: Union[int, float] = 1.0,
    min_df: Union[int, float] = 1,
    ngram_range: Tuple[int, int] = (1, 1),
    norm: str = "l2",
    use_idf: bool = True,
    name: str = "tfidf",
    **kwargs,
) -> ScikitBasedOperation:
    """Returns tfidf based feature vector extraction.

    Uses sklearn's TfidfVectorizer as underlying model.

    Parameters
    ----------
    max_features : Optional[int]
        If some value is provided then only top `max_features` words
        ordered by their count frequency are considered in the vocabulary
    max_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency higher
        than the given value. If the value is float, then it is considered as a ratio.
    min_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency less than
        the given value. If the value is float, then it is considered as a ratio.
    ngram_range : Tuple[int, int]
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
    norm : str
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
    use_idf : bool
        Enable inverse-document-frequency reweighting.
    name : str
        name of this operation
    **kwargs
        Keyword parameters that will be passed to the initializer of TfidfVectorizer

    Returns
    -------
    ScikitBasedOperation

    See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    for details on the parameters and more examples.
    """
    model = TfidfVectorizer(
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram_range,
        norm=norm,
        use_idf=use_idf,
        **kwargs,
    )
    # Prediction on new documents only transforms; fitting happens via the
    # operation's training path.
    return ScikitBasedOperation(model=model, predict_fn_name="transform", name=name)
def count(
    max_features: Optional[int] = None,
    max_df: Union[int, float] = 1.0,
    min_df: Union[int, float] = 1,
    ngram_range: Tuple[int, int] = (1, 1),
    name: str = "count",
    **kwargs,
) -> ScikitBasedOperation:
    """Returns count based feature vector extraction.

    Uses sklearn's CountVectorizer as underlying model.

    Parameters
    ----------
    max_features : Optional[int]
        If some value is provided then only top `max_features` words
        ordered by their count frequency are considered in the vocabulary
    max_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency higher
        than the given value. If the value is float, then it is considered as a ratio.
    min_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency less than
        the given value. If the value is float, then it is considered as a ratio.
    ngram_range : Tuple[int, int]
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
    name : str
        name of this operation
    **kwargs
        Keyword parameters that will be passed to the initializer of CountVectorizer

    Returns
    -------
    ScikitBasedOperation

    See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    for details on the parameters and more examples.
    """
    model = CountVectorizer(
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram_range,
        **kwargs,
    )
    # Prediction on new documents only transforms; fitting happens via the
    # operation's training path.
    return ScikitBasedOperation(model=model, predict_fn_name="transform", name=name)
def one_hot(
    max_features: Optional[int] = None,
    max_df: Union[int, float] = 1.0,
    min_df: Union[int, float] = 1,
    ngram_range: Tuple[int, int] = (1, 1),
    name: str = "one_hot",
    **kwargs,
) -> ScikitBasedOperation:
    """Returns operation for performing one hot encoding of texts.

    Uses sklearn.feature_extraction.text.CountVectorizer class with binary=True mode

    Parameters
    ----------
    max_features : Optional[int]
        If some value is provided then only top `max_features` words
        ordered by their count frequency are considered in the vocabulary
    max_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency higher
        than the given value. If the value is float, then it is considered as a ratio.
    min_df : Union[int, float]
        When building vocabulary, ignore terms that have document frequency less than
        the given value. If the value is float, then it is considered as a ratio.
    ngram_range : Tuple[int, int]
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
    name : str
        name of this operation
    **kwargs
        Keyword parameters that will be passed to the initializer of CountVectorizer

    Returns
    -------
    ScikitBasedOperation

    See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    for details on the parameters and more examples.
    """
    # binary=True makes CountVectorizer emit 0/1 presence indicators instead
    # of raw term counts, i.e. one-hot encoding per term.
    model = CountVectorizer(
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram_range,
        binary=True,
        **kwargs,
    )
    # Prediction on new documents only transforms; fitting happens via the
    # operation's training path.
    return ScikitBasedOperation(model=model, predict_fn_name="transform", name=name)