"""BM25L sparse embedding with linear length normalization.

This module implements BM25L, a variant of BM25 that uses linear length
normalization instead of the standard BM25 convex combination. BM25L is
particularly suitable for corpora with highly variable document lengths.

Classes
-------
BM25LTransformer
    Scikit-learn transformer implementing BM25L scoring.
BM25LEmbedder
    High-level embedder wrapping BM25LTransformer with zvec-db compatibility.

Example Usage
-------------
::

    from zvec_db.embedders import BM25LEmbedder

    embedder = BM25LEmbedder(
        k1=1.2,
        max_features=4096
    )
    embedder.fit(documents)
    vector = embedder.embed("search query")
"""
from typing import TYPE_CHECKING, Any, Callable, Optional
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from ..base import BaseSparseEmbedder, ExtendedList
from .base import BaseBM25Transformer
if TYPE_CHECKING:
from zvec_db.preprocessing.config import NormalizationConfig
# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES: int = 8192
# BM25L hyperparameters - these are standard values from the literature:
# k1=1.2: Standard value for term frequency saturation (typical range: 1.2-2.0)
# BM25L uses linear length normalization instead of BM25's convex combination
DEFAULT_K1: float = 1.2
class BM25LEmbedder(BaseSparseEmbedder):
"""Sparse embedder implementing the BM25L scoring formula.
BM25L is a variant of BM25 that uses linear length normalization, making it
more suitable for corpora with highly variable document lengths.
This class wires together a ``CountVectorizer`` with a ``BM25LTransformer``.
Tokenisation behaviour is controlled by the two parameters inherited from
:class:`BaseSparseEmbedder`:
* ``is_pretokenized`` tells the embedder to expect lists of tokens as input
and avoids any preprocessing altogether.
* ``tokenizer`` allows the client to supply a callable that will be
executed on every raw text document *before* vectorisation. When a
tokenizer is used the data passed to the scikit-learn pipeline consists
of token lists as well; the vectorizer is therefore configured to act as
an identity transformer.
The two options are mutually exclusive and validated by the base class.
Args:
tokenizer (Optional[Callable]): Custom tokenizer function. If provided,
it will be called on each document before vectorization.
is_pretokenized (bool): If True, input documents must already be lists
of tokens. Mutually exclusive with ``tokenizer``.
max_features (Optional[int]): Maximum number of features to retain per
document. Defaults to 8192.
k1 (float): Term frequency saturation parameter. Defaults to 1.2.
Typical range: 1.2-2.0. Higher values mean slower saturation.
preprocessing_config (Optional[NormalizationConfig]): Configuration for
automatic text preprocessing (normalization, stemming, stopwords).
If set, preprocessing is automatically applied during fit() and embed().
**count_params: Additional keyword arguments passed to
``CountVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``).
Example:
>>> embedder = BM25LEmbedder(k1=1.5, min_df=2)
>>> embedder.fit(documents)
>>> vectors = embedder.embed(["query text"])
"""
[docs]
def __init__(
self,
tokenizer: Optional[Callable] = None,
is_pretokenized: bool = False,
max_features: Optional[int] = DEFAULT_MAX_FEATURES,
k1: float = DEFAULT_K1,
preprocessing_config: Optional["NormalizationConfig"] = None,
**count_params,
):
super().__init__(
tokenizer,
is_pretokenized,
max_features,
preprocessing_config=preprocessing_config,
)
self.k1 = k1
self.vectorizer_params = count_params
[docs]
def fit(self, corpus: ExtendedList, y: Any = None) -> "BM25LEmbedder":
"""Train the BM25L pipeline on a corpus of documents.
This method builds a scikit-learn pipeline consisting of:
1. ``CountVectorizer``: Tokenizes documents and builds term counts.
2. ``BM25LTransformer``: Applies BM25L weighting to the count matrix.
The corpus is pre-processed according to the embedder's configuration
(custom tokenizer or pre-tokenized mode) before being passed to the
pipeline.
Args:
corpus (ExtendedList): Training documents. Must be strings unless
``is_pretokenized=True`` or a custom ``tokenizer`` is set.
y: Ignored; present for scikit-learn compatibility.
Returns:
self: The fitted embedder.
Raises:
ValueError: If corpus format doesn't match the configuration.
"""
processed = self._prepare_corpus(corpus)
params = self._prepare_vectorizer_params(self.vectorizer_params)
self.model = Pipeline(
[
("count", CountVectorizer(**params)),
("bm25l", BM25LTransformer(k1=self.k1)),
]
)
self.model.fit(processed)
self.is_fitted_ = True
return self