Source code for zvec_db.embedders.sparse.tfidf

"""TF-IDF (Term Frequency-Inverse Document Frequency) sparse embedding.

This module implements TF-IDF embedding using scikit-learn's TfidfVectorizer.
TF-IDF is a statistical measure that evaluates how relevant a word is to a
document in a collection of documents, computed as the product of term
frequency and inverse document frequency.

Classes
-------
TfidfEmbedder
    Sparse TF-IDF embedder using scikit-learn's TfidfVectorizer.

Example Usage
-------------
::

    from zvec_db.embedders import TfidfEmbedder

    embedder = TfidfEmbedder(
        max_features=4096,
        sublinear_tf=True
    )
    embedder.fit(documents)
    vector = embedder.embed("search query")
"""

from typing import TYPE_CHECKING, Callable, Optional

from sklearn.feature_extraction.text import TfidfVectorizer

from ..base import BaseSparseEmbedder, ExtendedList

if TYPE_CHECKING:
    from zvec_db.preprocessing.config import NormalizationConfig

# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES = 8192
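
# For reference, scikit-learn's TfidfVectorizer (with its defaults
# ``smooth_idf=True`` and ``norm="l2"``) weights a term t in document d
# roughly as:
#
#     tfidf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
#
# where n is the number of documents and df(t) is how many documents contain
# t; each document vector is then L2-normalized. With ``sublinear_tf=True``
# (used in the examples in this module), tf is replaced by 1 + ln(tf).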


class TfidfEmbedder(BaseSparseEmbedder):
    """Sparse TF-IDF embedder using scikit-learn's ``TfidfVectorizer``.

    TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical
    measure that evaluates how relevant a word is to a document in a
    collection of documents. It is computed as the product of:

    * **Term Frequency (TF)**: How often a term appears in a document.
    * **Inverse Document Frequency (IDF)**: A penalty factor for terms
      that appear in many documents.

    This embedder supports custom tokenization and pre-tokenized inputs.
    All additional keyword arguments are passed through to the underlying
    ``TfidfVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``,
    ``sublinear_tf``).

    Args:
        tokenizer (Optional[Callable]): Custom tokenizer function. If
            provided, it will be called on each document before
            vectorization.
        is_pretokenized (bool): If True, input documents must already be
            lists of tokens. Mutually exclusive with ``tokenizer``.
        max_features (Optional[int]): Maximum number of features
            (vocabulary size) to retain. Defaults to 8192.
        preprocessing_config (Optional[NormalizationConfig]): Configuration
            for automatic text preprocessing (normalization, stemming,
            stopwords). If set, preprocessing is applied automatically
            during ``fit()`` and ``embed()``.
        **tfidf_params: Additional keyword arguments passed to
            ``TfidfVectorizer``.

    Example:
        >>> embedder = TfidfEmbedder(min_df=2, sublinear_tf=True)
        >>> embedder.fit(documents)
        >>> vectors = embedder.embed(["query text"])
    """

    def __init__(
        self,
        tokenizer: Optional[Callable] = None,
        is_pretokenized: bool = False,
        max_features: Optional[int] = DEFAULT_MAX_FEATURES,
        preprocessing_config: Optional["NormalizationConfig"] = None,
        **tfidf_params,
    ):
        super().__init__(
            tokenizer,
            is_pretokenized,
            max_features,
            preprocessing_config=preprocessing_config,
        )
        self.vectorizer_params = tfidf_params
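
    # Any extra keyword arguments captured above (e.g. ``min_df``,
    # ``ngram_range``) are stored in ``self.vectorizer_params`` and only
    # forwarded to ``TfidfVectorizer`` when ``fit`` is called below.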

    def fit(self, corpus: ExtendedList, y=None):
        """Fit the TF-IDF vectorizer on a corpus of documents.

        The corpus is pre-processed according to the embedder's
        configuration:

        * **Custom tokenizer**: Each document is tokenized before
          vectorization.
        * **Pre-tokenized mode**: Documents are expected to be lists of
          tokens.
        * **Default**: Raw strings are passed directly to
          ``TfidfVectorizer``.

        Args:
            corpus (ExtendedList): Training documents. Must be strings
                unless ``is_pretokenized=True`` or a custom ``tokenizer``
                is set.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self: The fitted embedder.

        Raises:
            ValueError: If the corpus format doesn't match the
                configuration.
        """
        processed = self._prepare_corpus(corpus)
        params = self._prepare_vectorizer_params(self.vectorizer_params)
        self.model = TfidfVectorizer(**params)
        self.model.fit(processed)
        self.is_fitted_ = True
        return self
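

# A minimal usage sketch (run via ``python -m zvec_db.embedders.sparse.tfidf``
# since this module uses relative imports). The toy corpus is hypothetical,
# and the ``embed`` call assumes the list-of-strings form shown in the class
# docstring; ``embed`` itself is inherited from ``BaseSparseEmbedder`` and is
# not defined in this module.
if __name__ == "__main__":
    corpus = [
        "the cat sat on the mat",
        "the dog chased the cat",
        "dogs and cats make good pets",
    ]
    embedder = TfidfEmbedder(max_features=64, sublinear_tf=True)
    embedder.fit(corpus)
    print(embedder.embed(["cat on a mat"]))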