"""BM25L sparse embedding with linear length normalization.

This module implements BM25L, a variant of BM25 that uses linear length
normalization instead of the standard BM25 convex combination. BM25L is
particularly suitable for corpora with highly variable document lengths.

Classes
-------
BM25LTransformer
    Scikit-learn transformer implementing BM25L scoring.
BM25LEmbedder
    High-level embedder wrapping BM25LTransformer with zvec-db compatibility.

Example Usage
-------------
::

    from zvec_db.embedders import BM25LEmbedder

    embedder = BM25LEmbedder(
        k1=1.2,
        max_features=4096
    )
    embedder.fit(documents)
    vector = embedder.embed("search query")
"""
from typing import TYPE_CHECKING, Any, Callable, Optional
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from ..base import BaseSparseEmbedder, ExtendedList
from .base import BaseBM25Transformer
if TYPE_CHECKING:
from zvec_db.preprocessing.config import NormalizationConfig
# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES: int = 8192
# BM25L hyperparameters - these are standard values from the literature:
# k1=1.2: Standard value for term frequency saturation (typical range: 1.2-2.0)
# BM25L uses linear length normalization instead of BM25's convex combination
DEFAULT_K1: float = 1.2
class BM25LEmbedder(BaseSparseEmbedder):
"""Sparse embedder implementing the BM25L scoring formula.
BM25L is a variant of BM25 that uses linear length normalization, making it
more suitable for corpora with highly variable document lengths.
This class wires together a ``CountVectorizer`` with a ``BM25LTransformer``.
Tokenisation behaviour is controlled by the two parameters inherited from
:class:`BaseSparseEmbedder`:
* ``is_pretokenized`` tells the embedder to expect lists of tokens as input
and avoids any preprocessing altogether.
* ``tokenizer`` allows the client to supply a callable that will be
executed on every raw text document *before* vectorisation. When a
tokenizer is used the data passed to the scikit-learn pipeline consists
of token lists as well; the vectorizer is therefore configured to act as
an identity transformer.
The two options are mutually exclusive and validated by the base class.
Args:
tokenizer (Optional[Callable]): Custom tokenizer function. If provided,
it will be called on each document before vectorization.
is_pretokenized (bool): If True, input documents must already be lists
of tokens. Mutually exclusive with ``tokenizer``.
max_features (Optional[int]): Maximum number of features to retain per
document. Defaults to 8192.
k1 (float): Term frequency saturation parameter. Defaults to 1.2.
Typical range: 1.2-2.0. Higher values mean slower saturation.
preprocessing_config (Optional[NormalizationConfig]): Configuration for
automatic text preprocessing (normalization, stemming, stopwords).
If set, preprocessing is automatically applied during fit() and embed().
**count_params: Additional keyword arguments passed to
``CountVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``).
Example:
>>> embedder = BM25LEmbedder(k1=1.5, min_df=2)
>>> embedder.fit(documents)
>>> vectors = embedder.embed(["query text"])
"""
[docs]
def __init__(
self,
tokenizer: Optional[Callable] = None,
is_pretokenized: bool = False,
max_features: Optional[int] = DEFAULT_MAX_FEATURES,
k1: float = DEFAULT_K1,
preprocessing_config: Optional["NormalizationConfig"] = None,
**count_params,
):
super().__init__(
tokenizer,
is_pretokenized,
max_features,
preprocessing_config=preprocessing_config,
)
self.k1 = k1
self.vectorizer_params = count_params
[docs]
def fit(self, corpus: ExtendedList, y: Any = None) -> "BM25LEmbedder":
"""Train the BM25L pipeline on a corpus of documents.
This method builds a scikit-learn pipeline consisting of:
1. ``CountVectorizer``: Tokenizes documents and builds term counts.
2. ``BM25LTransformer``: Applies BM25L weighting to the count matrix.
The corpus is pre-processed according to the embedder's configuration
(custom tokenizer or pre-tokenized mode) before being passed to the
pipeline.
Args:
corpus (ExtendedList): Training documents. Must be strings unless
``is_pretokenized=True`` or a custom ``tokenizer`` is set.
y: Ignored; present for scikit-learn compatibility.
Returns:
self: The fitted embedder.
Raises:
ValueError: If corpus format doesn't match the configuration.
"""
processed = self._prepare_corpus(corpus)
params = self._prepare_vectorizer_params(self.vectorizer_params)
self.model = Pipeline(
[
("count", CountVectorizer(**params)),
("bm25l", BM25LTransformer(k1=self.k1)),
]
)
self.model.fit(processed)
self.is_fitted_ = True
return self