Source code for zvec_db.embedders.sparse.tfidf

"""TF-IDF (Term Frequency-Inverse Document Frequency) sparse embedding.

This module implements TF-IDF embedding using scikit-learn's TfidfVectorizer.
TF-IDF is a statistical measure that evaluates how relevant a word is to a
document in a collection of documents, computed as the product of term
frequency and inverse document frequency.

Classes
-------
TfidfEmbedder
    Sparse TF-IDF embedder using scikit-learn's TfidfVectorizer.

Example Usage
-------------
::

    from zvec_db.embedders import TfidfEmbedder

    embedder = TfidfEmbedder(
        max_features=4096,
        sublinear_tf=True
    )
    embedder.fit(documents)
    vector = embedder.embed("search query")
"""

from typing import TYPE_CHECKING, Callable, Optional

from sklearn.feature_extraction.text import TfidfVectorizer

from ..base import BaseSparseEmbedder, ExtendedList

if TYPE_CHECKING:
    from zvec_db.preprocessing.config import NormalizationConfig

# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES = 8192
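
# For reference, scikit-learn's TfidfVectorizer (with its defaults
# ``smooth_idf=True`` and ``norm="l2"``) weights a term t in document d
# roughly as:
#
#     tfidf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
#
# where n is the number of documents and df(t) is how many documents contain
# t; each document vector is then L2-normalized. With ``sublinear_tf=True``
# (used in the examples in this module), tf is replaced by 1 + ln(tf).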


class TfidfEmbedder(BaseSparseEmbedder):
    """Sparse TF-IDF embedder using scikit-learn's ``TfidfVectorizer``.

    TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical
    measure that evaluates how relevant a word is to a document in a
    collection of documents. It is computed as the product of:

    * **Term Frequency (TF)**: How often a term appears in a document.
    * **Inverse Document Frequency (IDF)**: A penalty factor for terms
      that appear in many documents.

    This embedder supports custom tokenization and pre-tokenized inputs.
    All additional keyword arguments are passed through to the underlying
    ``TfidfVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``,
    ``sublinear_tf``).

    Args:
        tokenizer (Optional[Callable]): Custom tokenizer function. If
            provided, it will be called on each document before
            vectorization.
        is_pretokenized (bool): If True, input documents must already be
            lists of tokens. Mutually exclusive with ``tokenizer``.
        max_features (Optional[int]): Maximum number of features
            (vocabulary size) to retain. Defaults to 8192.
        preprocessing_config (Optional[NormalizationConfig]): Configuration
            for automatic text preprocessing (normalization, stemming,
            stopwords). If set, preprocessing is applied automatically
            during ``fit()`` and ``embed()``.
        **tfidf_params: Additional keyword arguments passed to
            ``TfidfVectorizer``.

    Example:
        >>> embedder = TfidfEmbedder(min_df=2, sublinear_tf=True)
        >>> embedder.fit(documents)
        >>> vectors = embedder.embed(["query text"])
    """

    def __init__(
        self,
        tokenizer: Optional[Callable] = None,
        is_pretokenized: bool = False,
        max_features: Optional[int] = DEFAULT_MAX_FEATURES,
        preprocessing_config: Optional["NormalizationConfig"] = None,
        **tfidf_params,
    ):
        super().__init__(
            tokenizer,
            is_pretokenized,
            max_features,
            preprocessing_config=preprocessing_config,
        )
        self.vectorizer_params = tfidf_params
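
    # Any extra keyword arguments captured above (e.g. ``min_df``,
    # ``ngram_range``) are stored in ``self.vectorizer_params`` and only
    # forwarded to ``TfidfVectorizer`` when ``fit`` is called below.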

    def fit(self, corpus: ExtendedList, y=None):
        """Fit the TF-IDF vectorizer on a corpus of documents.

        The corpus is pre-processed according to the embedder's
        configuration:

        * **Custom tokenizer**: Each document is tokenized before
          vectorization.
        * **Pre-tokenized mode**: Documents are expected to be lists of
          tokens.
        * **Default**: Raw strings are passed directly to
          ``TfidfVectorizer``.

        Args:
            corpus (ExtendedList): Training documents. Must be strings
                unless ``is_pretokenized=True`` or a custom ``tokenizer``
                is set.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self: The fitted embedder.

        Raises:
            ValueError: If the corpus format doesn't match the
                configuration.
        """
        processed = self._prepare_corpus(corpus)
        params = self._prepare_vectorizer_params(self.vectorizer_params)
        self.model = TfidfVectorizer(**params)
        self.model.fit(processed)
        self.is_fitted_ = True
        return self
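

# A minimal usage sketch (run via ``python -m zvec_db.embedders.sparse.tfidf``
# since this module uses relative imports). The toy corpus is hypothetical,
# and the ``embed`` call assumes the list-of-strings form shown in the class
# docstring; ``embed`` itself is inherited from ``BaseSparseEmbedder`` and is
# not defined in this module.
if __name__ == "__main__":
    corpus = [
        "the cat sat on the mat",
        "the dog chased the cat",
        "dogs and cats make good pets",
    ]
    embedder = TfidfEmbedder(max_features=64, sublinear_tf=True)
    embedder.fit(corpus)
    print(embedder.embed(["cat on a mat"]))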