# Source code for jange.ops.text.embedding

"""This module contains operations for extracting word/document embeddings using a
language model.
"""
from typing import Optional

from spacy.language import Language

from jange.base import DataStream
from jange.ops.base import SpacyBasedOperation


class DocumentEmbeddingOperation(SpacyBasedOperation):
    """Operation to calculate a document's vector using word embeddings.

    The word embeddings of the tokens are collected and averaged to produce
    one vector per document.

    Parameters
    ----------
    nlp : Optional[Language]
        a spacy model

    name : str
        name of this operation

    Example
    -------
    >>> ds = DataStream(["this is text 1", "this is text 2"])
    >>> vector_ds = ds.apply(DocumentEmbeddingOperation())
    >>> print(vector_ds.items)

    Attributes
    ----------
    nlp : Language
        spacy model

    name : str
        name of this operation
    """

    def __init__(
        self, nlp: Optional[Language] = None, name: str = "doc_embedding"
    ) -> None:
        """Forward the spacy model and the operation name to the base class."""
        super().__init__(nlp=nlp, name=name)

    def run(self, ds: DataStream) -> DataStream:
        """Convert every item of the stream into its document vector.

        Parameters
        ----------
        ds : DataStream
            input stream; it is handed to ``get_docs_stream`` to obtain the
            parsed documents (presumably spacy ``Doc`` objects -- confirm
            against ``SpacyBasedOperation``)

        Returns
        -------
        DataStream
            a new stream whose items are the ``vector`` attribute of each
            document, with this operation appended to ``applied_ops`` and
            the context of the input stream carried over
        """
        docs_ds = self.get_docs_stream(ds)
        # A generator expression keeps the pipeline lazy: each vector is only
        # read when the resulting stream is consumed.
        vectors = (doc.vector for doc in docs_ds)
        return DataStream(
            items=vectors,
            applied_ops=ds.applied_ops + [self],
            context=ds.context,
        )


def doc_embedding(
    nlp: Optional[Language] = None, name: str = "doc_embedding"
) -> DocumentEmbeddingOperation:
    """Helper function to create a DocumentEmbeddingOperation.

    Both arguments are passed unchanged to the
    ``DocumentEmbeddingOperation`` constructor.

    Parameters
    ----------
    nlp : Optional[Language]
        a spacy model

    name : str
        name of this operation

    Returns
    -------
    DocumentEmbeddingOperation
    """
    return DocumentEmbeddingOperation(nlp=nlp, name=name)