"""TF-IDF (Term Frequency-Inverse Document Frequency) sparse embedding.
This module implements TF-IDF embedding using scikit-learn's TfidfVectorizer.
TF-IDF is a statistical measure that evaluates how relevant a word is to a
document in a collection of documents, computed as the product of term
frequency and inverse document frequency.
Classes
-------
TfidfEmbedder
Sparse TF-IDF embedder using scikit-learn's TfidfVectorizer.
Example Usage
-------------
::
from zvec_db.embedders import TfidfEmbedder
embedder = TfidfEmbedder(
max_features=4096,
sublinear_tf=True
)
embedder.fit(documents)
vector = embedder.embed("search query")
"""
from typing import TYPE_CHECKING, Callable, Optional

from sklearn.feature_extraction.text import TfidfVectorizer

from ..base import BaseSparseEmbedder, ExtendedList

if TYPE_CHECKING:
    from zvec_db.preprocessing.config import NormalizationConfig

# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES = 8192
class TfidfEmbedder(BaseSparseEmbedder):
"""Sparse TF-IDF embedder using scikit-learn's ``TfidfVectorizer``.
TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure
that evaluates how relevant a word is to a document in a collection of
documents. It is computed as the product of:
* **Term Frequency (TF)**: How often a term appears in a document.
* **Inverse Document Frequency (IDF)**: A penalty factor for terms that
appear in many documents.
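
    With scikit-learn's default ``smooth_idf=True`` (assumed here, since
    keyword arguments are passed through to ``TfidfVectorizer``), the weight
    of term :math:`t` in document :math:`d` over :math:`n` documents is,
    prior to the default L2 normalization of each document vector:

    .. math::

        w_{t,d} = \mathrm{tf}(t, d) \cdot
            \left( \ln \frac{1 + n}{1 + \mathrm{df}(t)} + 1 \right)

    where :math:`\mathrm{df}(t)` is the number of documents containing
    :math:`t`.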
    This embedder supports custom tokenization and pre-tokenized inputs. All
    additional keyword arguments are passed through to the underlying
    ``TfidfVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``,
    ``sublinear_tf``).

    Args:
        tokenizer (Optional[Callable]): Custom tokenizer function. If
            provided, it is called on each document before vectorization.
        is_pretokenized (bool): If True, input documents must already be
            lists of tokens. Mutually exclusive with ``tokenizer``.
        max_features (Optional[int]): Maximum vocabulary size: only the top
            ``max_features`` terms, ordered by frequency across the corpus,
            are retained. Defaults to 8192.
        preprocessing_config (Optional[NormalizationConfig]): Configuration
            for automatic text preprocessing (normalization, stemming,
            stopwords). If set, preprocessing is applied automatically
            during ``fit()`` and ``embed()``.
        **tfidf_params: Additional keyword arguments passed to
            ``TfidfVectorizer``.

    Example:
        >>> embedder = TfidfEmbedder(min_df=2, sublinear_tf=True)
        >>> embedder.fit(documents)
        >>> vectors = embedder.embed(["query text"])
    """

    def __init__(
        self,
        tokenizer: Optional[Callable] = None,
        is_pretokenized: bool = False,
        max_features: Optional[int] = DEFAULT_MAX_FEATURES,
        preprocessing_config: Optional["NormalizationConfig"] = None,
        **tfidf_params,
    ):
        super().__init__(
            tokenizer,
            is_pretokenized,
            max_features,
            preprocessing_config=preprocessing_config,
        )
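        # Extra TfidfVectorizer kwargs are stored verbatim and forwarded
        # when fit() constructs the underlying model.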
        self.vectorizer_params = tfidf_params

    def fit(self, corpus: ExtendedList, y=None):
"""Fit the TF-IDF vectorizer on a corpus of documents.
The corpus is pre-processed according to the embedder's configuration:
* **Custom tokenizer**: Each document is tokenized before vectorization.
* **Pre-tokenized mode**: Documents are expected to be lists of tokens.
* **Default**: Raw strings are passed directly to ``TfidfVectorizer``.
Args:
corpus (ExtendedList): Training documents. Must be strings unless
``is_pretokenized=True`` or a custom ``tokenizer`` is set.
y: Ignored; present for scikit-learn compatibility.
Returns:
self: The fitted embedder.
Raises:
ValueError: If corpus format doesn't match the configuration.
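
        Example:
            A minimal sketch of pre-tokenized fitting, using a hypothetical
            two-document token corpus:

            >>> embedder = TfidfEmbedder(is_pretokenized=True)
            >>> corpus = [["hello", "world"], ["hello", "again"]]
            >>> embedder.fit(corpus)  # doctest: +SKIP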
"""
        processed = self._prepare_corpus(corpus)
        params = self._prepare_vectorizer_params(self.vectorizer_params)
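        # Each call to fit() builds a fresh TfidfVectorizer, replacing any
        # previously fitted model.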
        self.model = TfidfVectorizer(**params)
        self.model.fit(processed)
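        # After fitting, the vectorizer exposes the learned vocabulary via
        # self.model.vocabulary_ and per-term IDF weights via self.model.idf_.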
        self.is_fitted_ = True
        return self