"""BM25 sparse embedding using scikit-learn pipelines.

This module implements the BM25 (Best Matching 25) scoring formula, a
probabilistic ranking function widely used in information retrieval.
BM25 improves upon simple term frequency by accounting for document
length normalization and term saturation.

Classes
-------
BM25Transformer
    Scikit-learn transformer implementing BM25 scoring.
BM25Embedder
    High-level embedder wrapping BM25Transformer with zvec-db compatibility.

Example Usage
-------------
::

    from zvec_db.embedders import BM25Embedder

    embedder = BM25Embedder(
        k1=1.2,
        b=0.75,
        max_features=4096
    )
    embedder.fit(documents)
    vector = embedder.embed("search query")
"""

from typing import TYPE_CHECKING, Any, Callable, Optional

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from ..base import BaseSparseEmbedder, ExtendedList
from .base import BaseBM25Transformer

if TYPE_CHECKING:
    from zvec_db.preprocessing.config import NormalizationConfig

# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES = 8192

# BM25 hyperparameters - these are standard values from the literature:
# k1=1.2: Standard value for term frequency saturation (typical range: 1.2-2.0)
# b=0.75: Standard value for length normalization (typical range: 0.5-1.0)
DEFAULT_K1 = 1.2
DEFAULT_B = 0.75
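
# Worked example (illustrative, using the defaults above): a term that
# appears 3 times in a document of exactly average length (|d| = avgdl)
# receives a term-frequency component of
#     3 * (1.2 + 1) / (3 + 1.2 * (1 - 0.75 + 0.75 * 1.0))
#     = 6.6 / 4.2 ≈ 1.571
# rather than the raw count of 3, illustrating the saturation that k1
# controls.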


class BM25Transformer(BaseBM25Transformer):
    """Transformer implementing the BM25 scoring formula.

    BM25 (Best Matching 25) is a probabilistic ranking function widely
    used in information retrieval. It improves upon simple term frequency
    by accounting for document length normalization and term saturation.

    The BM25 score for a term :math:`t` in document :math:`d` is
    computed as:

    .. math::

        \\text{BM25}(t, d) = \\text{IDF}(t) \\times
        \\frac{f(t, d) \\times (k_1 + 1)}
        {f(t, d) + k_1 \\times (1 - b + b \\times \\frac{|d|}{\\text{avgdl}})}

    where:

    - :math:`f(t, d)` is the term frequency of :math:`t` in document :math:`d`
    - :math:`|d|` is the document length
    - :math:`\\text{avgdl}` is the average document length in the corpus
    - :math:`\\text{IDF}(t)` is the inverse document frequency of term :math:`t`

    Args:
        k1 (float): Term frequency saturation parameter. Controls how
            quickly term frequency saturates. Higher values mean slower
            saturation. Typical range: 1.2 to 2.0. Defaults to 1.2.
        b (float): Length normalization parameter. Controls the influence
            of document length. ``b=1.0`` means full length normalization,
            ``b=0.0`` disables it. Defaults to 0.75.

    Example:
        >>> from sklearn.feature_extraction.text import CountVectorizer
        >>> from sklearn.pipeline import Pipeline
        >>> pipeline = Pipeline([
        ...     ("count", CountVectorizer()),
        ...     ("bm25", BM25Transformer(k1=1.5, b=0.8))
        ... ])
        >>> pipeline.fit(documents)
    """

    def __init__(self, k1: float = DEFAULT_K1, b: float = DEFAULT_B):
        """Initialize the BM25 transformer.

        Args:
            k1 (float): Term frequency saturation parameter. Defaults to 1.2.
                Typical range: 1.2-2.0. Higher values mean slower saturation.
            b (float): Length normalization parameter. Defaults to 0.75.
                Typical range: 0.5-1.0. b=1.0 means full length normalization.
        """
        super().__init__(k1)
        self.b = b

    def _compute_norm(self, doc_lengths: np.ndarray) -> np.ndarray:
        """Compute BM25 normalization: k1 × (1 - b + b × |d|/avgdl)."""
        return self.k1 * (1.0 - self.b + self.b * doc_lengths / self.avgdl_)

    def _compute_scores(
        self, data: np.ndarray, norm: np.ndarray, cols: np.ndarray
    ) -> np.ndarray:
        """Compute BM25: IDF × (TF × (k1 + 1)) / (TF + norm)."""
        denominator = data + norm
        return (
            self.idf_[cols]
            * data
            * (self.k1 + 1.0)
            / np.where(denominator != 0, denominator, 1e-10)
        )
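
# Illustrative sketch of the score math above (not part of the original
# module). It assumes ``idf_`` and ``avgdl_`` are the fitted attributes
# set by ``BaseBM25Transformer.fit``, as used in the methods above; the
# values here are hypothetical:
#
#     t = BM25Transformer(k1=1.2, b=0.75)
#     t.idf_ = np.array([1.0, 2.0])              # hypothetical IDF table
#     t.avgdl_ = 10.0                            # hypothetical avg doc length
#     norm = t._compute_norm(np.array([10.0]))   # 1.2 * (0.25 + 0.75) = 1.2
#     t._compute_scores(np.array([3.0]), norm, np.array([1]))
#     # -> 2.0 * 3.0 * 2.2 / (3.0 + 1.2) ≈ array([3.1429])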


class BM25Embedder(BaseSparseEmbedder):
    """Sparse embedder implementing the BM25 scoring formula.

    This class wires together a ``CountVectorizer`` with a lightweight
    ``BM25Transformer``. Tokenization behavior is controlled by the two
    parameters inherited from :class:`BaseSparseEmbedder`:

    * ``is_pretokenized`` tells the embedder to expect lists of tokens as
      input and skips preprocessing altogether.
    * ``tokenizer`` allows the client to supply a callable that is
      executed on every raw text document *before* vectorization. When a
      tokenizer is used, the data passed to the scikit-learn pipeline
      consists of token lists as well; the vectorizer is therefore
      configured to act as an identity transformer.

    The two options are mutually exclusive and validated by the base class.

    Args:
        tokenizer (Optional[Callable]): Custom tokenizer function.
        is_pretokenized (bool): If True, input documents must be lists of
            tokens.
        max_features (Optional[int]): Maximum number of features to retain.
        k1 (float): Term frequency saturation parameter. Defaults to 1.2.
        b (float): Length normalization parameter. Defaults to 0.75.
        preprocessing_config (Optional[NormalizationConfig]): Configuration
            for automatic text preprocessing (normalization, stemming,
            stopwords). If set, preprocessing is automatically applied
            during fit() and embed().
        **count_params: Additional parameters for CountVectorizer.
    """

    def __init__(
        self,
        tokenizer: Optional[Callable] = None,
        is_pretokenized: bool = False,
        max_features: Optional[int] = DEFAULT_MAX_FEATURES,
        k1: float = DEFAULT_K1,
        b: float = DEFAULT_B,
        preprocessing_config: Optional["NormalizationConfig"] = None,
        **count_params,
    ):
        super().__init__(
            tokenizer,
            is_pretokenized,
            max_features,
            preprocessing_config=preprocessing_config,
        )
        self.k1 = k1
        self.b = b
        self.vectorizer_params = count_params
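
    # Illustrative sketch of the two mutually exclusive tokenization modes
    # described in the class docstring (hypothetical toy inputs; the
    # mutual-exclusion check lives in BaseSparseEmbedder):
    #
    #     # Mode 1: raw strings plus a custom tokenizer callable
    #     BM25Embedder(tokenizer=str.split).fit(["a b c", "b c d"])
    #
    #     # Mode 2: pre-tokenized input, no preprocessing applied
    #     BM25Embedder(is_pretokenized=True).fit([["a", "b"], ["b", "c"]])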

    def fit(self, corpus: ExtendedList, y: Any = None) -> "BM25Embedder":
        """Train the BM25 pipeline on a corpus of documents.

        This method builds a scikit-learn pipeline consisting of:

        1. ``CountVectorizer``: Tokenizes documents and builds term counts.
        2. ``BM25Transformer``: Applies BM25 weighting to the count matrix.

        The corpus is pre-processed according to the embedder's
        configuration (custom tokenizer or pre-tokenized mode) before
        being passed to the pipeline.

        Args:
            corpus (ExtendedList): Training documents. Must be strings
                unless ``is_pretokenized=True`` or a custom ``tokenizer``
                is set.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self: The fitted embedder.

        Raises:
            ValueError: If corpus format doesn't match the configuration.
        """
        processed = self._prepare_corpus(corpus)
        params = self._prepare_vectorizer_params(self.vectorizer_params)
        self.model = Pipeline(
            [
                ("count", CountVectorizer(**params)),
                ("bm25", BM25Transformer(k1=self.k1, b=self.b)),
            ]
        )
        self.model.fit(processed)
        self.is_fitted_ = True
        return self
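

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative only). It assumes that a
    # plain Python list is accepted where ``ExtendedList`` is annotated,
    # and that ``embed`` is inherited from BaseSparseEmbedder and returns
    # a sparse query vector, as the module docstring describes. The toy
    # corpus below is hypothetical.
    documents = [
        "the quick brown fox jumps over the lazy dog",
        "a fast auburn fox leaped over the sleepy hound",
        "bm25 weighting for sparse information retrieval",
    ]
    embedder = BM25Embedder(k1=1.2, b=0.75, max_features=4096)
    embedder.fit(documents)
    print(embedder.embed("fox retrieval"))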