"""BM25+ sparse embedding with smoothing to prevent zero scores.
This module implements BM25+, an extension of BM25 that adds a smoothing
parameter (delta) to prevent documents with zero term frequency from having
a zero score. This is particularly useful for corpora with many rare terms
or when combining scores from multiple sources.
Classes
-------
BM25PlusTransformer
Scikit-learn transformer implementing BM25+ scoring.
BM25PlusEmbedder
High-level embedder wrapping BM25PlusTransformer with zvec-db compatibility.
Example Usage
-------------
::
from zvec_db.embedders import BM25PlusEmbedder
embedder = BM25PlusEmbedder(
k1=1.2,
b=0.75,
delta=0.5,
max_features=4096
)
embedder.fit(documents)
vector = embedder.embed("search query")
"""
from typing import TYPE_CHECKING, Any, Callable, Optional
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from ..base import BaseSparseEmbedder, ExtendedList
from .base import BaseBM25Transformer
if TYPE_CHECKING:
    from zvec_db.preprocessing.config import NormalizationConfig
# Default max_features: 8192 (2^13) provides good vocabulary coverage
# while maintaining memory efficiency. This matches the base class default.
DEFAULT_MAX_FEATURES = 8192
# BM25+ hyperparameters - these are standard values from the literature:
# k1=1.2: Standard value for term frequency saturation (typical range: 1.2-2.0)
# b=0.75: Standard value for length normalization (typical range: 0.5-1.0)
# delta=0.5: Smoothing parameter to prevent zero scores (typical range: 0.4-1.0)
DEFAULT_K1 = 1.2
DEFAULT_B = 0.75
DEFAULT_DELTA = 0.5
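#
# For reference, the standard BM25+ term weight from Lv & Zhai (2011) is
#
#     w(t, d) = IDF(t) * (delta + ((k1 + 1) * tf(t, d))
#                                 / (k1 * (1 - b + b * |d| / avgdl) + tf(t, d)))
#
# where |d| is the document length and avgdl is the average document length
# over the corpus. The added delta keeps the weight strictly positive even
# when tf(t, d) = 0, which is the smoothing behavior described above. This
# is a reference sketch only; the exact form applied here lives in
# BM25PlusTransformer.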
class BM25PlusEmbedder(BaseSparseEmbedder):
    """Sparse embedder implementing the BM25+ scoring formula.

    BM25+ extends BM25 by adding a smoothing parameter (delta) that prevents
    zero scores for terms with zero term frequency. This can improve retrieval
    performance, especially for corpora with many rare terms.

    This class wires together a ``CountVectorizer`` with a ``BM25PlusTransformer``.
    Tokenization behavior is controlled by the two parameters inherited from
    :class:`BaseSparseEmbedder`:

    * ``is_pretokenized`` tells the embedder to expect lists of tokens as
      input and skips preprocessing altogether.
    * ``tokenizer`` lets the client supply a callable that is executed on
      every raw text document *before* vectorization. When a tokenizer is
      used, the data passed to the scikit-learn pipeline consists of token
      lists as well; the vectorizer is therefore configured to act as an
      identity transformer.

    The two options are mutually exclusive and validated by the base class.

    Args:
        tokenizer (Optional[Callable]): Custom tokenizer function. If provided,
            it will be called on each document before vectorization.
        is_pretokenized (bool): If True, input documents must already be lists
            of tokens. Mutually exclusive with ``tokenizer``.
        max_features (Optional[int]): Maximum number of features to retain per
            document. Defaults to 8192.
        k1 (float): Term frequency saturation parameter. Defaults to 1.2.
            Typical range: 1.2-2.0. Higher values mean slower saturation.
        b (float): Length normalization parameter. Defaults to 0.75.
            Typical range: 0.5-1.0. b=1.0 means full length normalization.
        delta (float): Smoothing parameter. Defaults to 0.5.
            Typical range: 0.4-1.0. Higher values increase the baseline score.
        preprocessing_config (Optional[NormalizationConfig]): Configuration for
            automatic text preprocessing (normalization, stemming, stopwords).
            If set, preprocessing is automatically applied during fit() and
            embed().
        **count_params: Additional keyword arguments passed to
            ``CountVectorizer`` (e.g., ``min_df``, ``max_df``, ``ngram_range``).

    Example:
        >>> embedder = BM25PlusEmbedder(k1=1.5, b=0.8, delta=0.6, min_df=2)
        >>> embedder.fit(documents)
        >>> vectors = embedder.embed(["query text"])
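
        With a custom tokenizer (a sketch; ``str.split`` stands in for any
        real tokenizer callable):

        >>> embedder = BM25PlusEmbedder(tokenizer=str.split, delta=0.5)
        >>> embedder.fit(["first document", "second document"])
        >>> vector = embedder.embed(["first query"])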
"""
    def __init__(
        self,
        tokenizer: Optional[Callable] = None,
        is_pretokenized: bool = False,
        max_features: Optional[int] = DEFAULT_MAX_FEATURES,
        k1: float = DEFAULT_K1,
        b: float = DEFAULT_B,
        delta: float = DEFAULT_DELTA,
        preprocessing_config: Optional["NormalizationConfig"] = None,
        **count_params,
    ):
        super().__init__(
            tokenizer,
            is_pretokenized,
            max_features,
            preprocessing_config=preprocessing_config,
        )
        self.k1 = k1
        self.b = b
        self.delta = delta
        self.vectorizer_params = count_params
    def fit(self, corpus: ExtendedList, y: Any = None) -> "BM25PlusEmbedder":
        """Train the BM25+ pipeline on a corpus of documents.

        This method builds a scikit-learn pipeline consisting of:

        1. ``CountVectorizer``: Tokenizes documents and builds term counts.
        2. ``BM25PlusTransformer``: Applies BM25+ weighting to the count matrix.

        The corpus is pre-processed according to the embedder's configuration
        (custom tokenizer or pre-tokenized mode) before being passed to the
        pipeline.

        Args:
            corpus (ExtendedList): Training documents. Must be strings unless
                ``is_pretokenized=True`` or a custom ``tokenizer`` is set.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self: The fitted embedder.

        Raises:
            ValueError: If corpus format doesn't match the configuration.
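
        Example:
            A minimal sketch of the fit round trip (``docs`` is any list of
            raw strings):

            >>> docs = ["sparse retrieval with bm25", "dense retrieval"]
            >>> embedder = BM25PlusEmbedder().fit(docs)
            >>> embedder.is_fitted_
            True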
"""
        processed = self._prepare_corpus(corpus)
        params = self._prepare_vectorizer_params(self.vectorizer_params)
        self.model = Pipeline(
            [
                ("count", CountVectorizer(**params)),
                (
                    "bm25plus",
                    BM25PlusTransformer(k1=self.k1, b=self.b, delta=self.delta),
                ),
            ]
        )
        self.model.fit(processed)
        self.is_fitted_ = True
        return self