# Source code for zvec_db.rerankers.utils.normalize

"""Normalization utilities for post-processing raw document relevance scores.

This module defines a lightweight helper that is used by the retrieval layer
in the surrounding application. Scoring routines such as those implemented in
:mod:`zvec_db.rerankers` return raw floats that can vary wildly in magnitude
between queries and algorithms. Feeding these unbounded values directly into
other parts of the stack (reranking, ensembles, thresholding) results in
unexpected behaviour and makes tuning difficult.

The :class:`Normalize` class converts a list of ``(uid, score)``
tuples into the unit interval ``[0.0, 1.0]``. Multiple normalisation strategies
are supported:

* **standard** (default) - index-aware scaling that divides by an estimated
  maximum score and clips values to the unit interval.
* **Bayesian / BB25** - sigmoid calibration particularly useful when only the
  relative ordering of positive scores matters. Robust to outliers.
* **minmax** - simple (x - min) / (max - min) scaling. Preserves relative distances.
* **percentile** - rank-based normalization. Very robust to outliers.
* **cosine** - no-op (identity). COSINE scores are already in [0, 1] after conversion.
* **atan** - arctan-based normalization for unbounded scores.

Configuration may be supplied as a simple string (e.g. ``"bayes"``) or as a
more detailed dictionary containing ``method``, ``alpha`` and ``beta`` keys.
Default values are chosen to mirror those described in the reference
implementation.

Constants
---------
SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX : float
    Bounds for clipping logits before sigmoid computation. The value ±500
    prevents overflow in np.exp() while being large enough to not affect
    practical score ranges (exp(-500) ≈ 10^-217, effectively zero).
DEFAULT_ALPHA : float
    Default scale parameter for Bayesian normalization.
DEFAULT_BETA : None
    Default center parameter (None triggers median-based automatic selection).

Example usage::

    normaliser = Normalize({'method': 'bayes', 'alpha': 2.0})
    results = [("doc1", 3.2), ("doc2", 0.5), ("doc3", -1.0)]
    calibrated = normaliser(results)
    # calibrated -> [("doc1", 0.957...), ("doc2", 0.5), ("doc3", 0.151...)]

The module has no dependencies outside of NumPy, which is already required by
other parts of the project.
"""

from __future__ import annotations

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
from zvec.typing import MetricType

# Sigmoid clip bounds: prevents overflow in np.exp()
# exp(-500) ≈ 10^-217, effectively zero for float64
SIGMOID_CLIP_MIN = -500
SIGMOID_CLIP_MAX = 500

# Default Bayesian parameters
# NOTE(review): Normalize.__init__ currently inlines these defaults
# (1.0 / None) rather than referencing the constants — keep them in sync.
DEFAULT_ALPHA = 1.0
DEFAULT_BETA = None  # None triggers median-based selection

# A ranked result list: (uid, score) pairs as produced by the retrievers.
ScoreList = List[Tuple[Any, float]]


class Normalize:
    """Callable normaliser for lists of ``(uid, score)`` pairs.

    Instances behave like functions: call them with a score list and an
    optional ``avgscore``; the result is a new list with every score mapped
    into the closed unit interval ``[0.0, 1.0]``. The exact transformation
    is selected by the configuration supplied at construction time.

    Attributes
    ----------
    method : str
        Lowercase string naming the chosen normalisation algorithm.
    alpha : float
        Scale parameter used in Bayesian modes.
    beta : Optional[float]
        Centre parameter used in Bayesian modes; ``None`` triggers
        median-based automatic selection.
    """

    def __init__(self, config: Union[bool, str, Dict[str, Any], None] = None):
        """Initialise a :class:`Normalize` instance.

        Parameters
        ----------
        config : bool, str, dict or None, optional
            Selects the normalisation strategy:

            * ``None`` or ``False`` : standard index-aware scaling
              (``"default"``).
            * ``str`` : lower-cased and used as the ``method`` name.
              Supported: ``"bayes"`` / ``"bayesian"`` / ``"bayesian-bm25"``
              / ``"bb25"``, ``"minmax"``, ``"percentile"`` (alias
              ``"rank"``), ``"cosine"``, ``"atan"``, ``"default"``.
            * ``dict`` : may contain ``method``, ``alpha`` and ``beta``
              keys; missing keys fall back to defaults (``alpha`` -> 1.0,
              ``beta`` -> None).

        Notes
        -----
        A dict configuration is shallow-copied so later external mutation
        cannot affect this instance.
        """
        self.config: Dict[str, Any]
        if isinstance(config, dict):
            # shallow copy: isolate internal state from the caller's dict
            self.config = config.copy()
        elif config is None:
            self.config = {}
        else:
            # plain string or boolean selects the method directly
            self.config = {"method": config}
        method = self.config.get(
            "method", config if isinstance(config, str) else "default"
        )
        self.method: str = str(method).lower()

        # Bayesian calibration parameters (defaults mirror the module-level
        # DEFAULT_ALPHA / DEFAULT_BETA constants)
        self.alpha: float = float(self.config.get("alpha", 1.0))
        self.beta: Optional[float] = self.config.get("beta")
        if self.beta is not None:
            self.beta = float(self.beta)

    def __call__(self, scores: ScoreList, avgscore: float = 0.0) -> ScoreList:
        """Normalise a list of document scores into ``[0.0, 1.0]``.

        Parameters
        ----------
        scores : ScoreList
            ``(uid, score)`` pairs, assumed sorted in descending score
            order; the ``default`` strategy reads the first entry as the
            maximum.
        avgscore : float, optional
            Corpus-wide average score. Used only by the ``default``
            strategy; ignored by every other mode.

        Returns
        -------
        ScoreList
            New list with each score replaced by a value in ``[0.0, 1.0]``
            (``cosine`` returns the input list unchanged).

        Notes
        -----
        * **default** - scales by an estimated maximum and clips; keeps the
          relative ordering intact while bounding the range.
        * **bayesian** - sigmoid calibration centred on the median (or a
          fixed ``beta``) and scaled by the score spread; robust to
          outliers.
        * **minmax** - (x - min) / (max - min); preserves relative
          distances.
        * **percentile** - rank-based; very robust to outliers.
        * **cosine** - no-op; COSINE conversion ``(2-score)/2`` already
          produces scores in [0, 1].
        * **atan** - arctan mapping for unbounded scores.
        """
        # aliases recognised as Bayesian/BB25 modes
        if self.method in ("bayes", "bayesian", "bayesian-bm25", "bb25"):
            # sigmoid calibration over the score distribution
            return self._bayes(scores)
        if self.method == "minmax":
            return self._minmax(scores)
        # "rank" is an alias for "percentile"
        if self.method in ("percentile", "rank"):
            return self._percentile(scores)
        if self.method == "atan":
            # NOTE(review): no metric is forwarded here, so _atan takes its
            # default ``metric=None`` branch and returns the scores
            # unchanged — confirm whether a metric should be threaded
            # through from the caller.
            return self._atan(scores)
        if self.method == "cosine":
            # no-op: scores already lie in [0, 1] after COSINE conversion
            return scores
        # fallback: simple index-aware scaling
        return self._default(scores, avgscore)

    # ------------------------------------------------------------------
    # Implementation helpers
    # ------------------------------------------------------------------
    def _default(self, scores: ScoreList, avgscore: float) -> ScoreList:
        """Standard normalisation using corpus statistics.

        Estimates an upper bound as ``min(top_score + avgscore,
        6 * avgscore)`` so a single exceptionally high value cannot
        dominate, then divides every score by that bound and clips to the
        unit interval. When ``avgscore`` is unavailable (<= 0) the top
        score itself is the bound, so the best document normalises to
        exactly 1.0. A non-positive bound yields all zeros to avoid
        division by zero.
        """
        if not scores:
            return []
        top_score = scores[0][1]
        if avgscore <= 0:
            # no corpus statistic available: anchor on the best score
            maxscore = top_score
        else:
            maxscore = min(top_score + avgscore, 6 * avgscore)
        if maxscore <= 0:
            # avoid division by zero (or a sign flip) on non-positive bounds
            return [(uid, 0.0) for uid, _ in scores]
        return [(uid, min(score / maxscore, 1.0)) for uid, score in scores]

    def _bayes(self, scores: ScoreList) -> ScoreList:
        """Bayesian sigmoid calibration (BB25-style).

        Maps raw scores to probabilities in ``[0, 1]`` via a sigmoid whose
        centre is ``beta`` (the median of all scores when unset) and whose
        scale is ``alpha`` divided by the standard deviation of all scores.
        Because both centre and scale derive from the full score list, the
        transform is translation invariant: adding a constant to every
        score produces the same probabilities.
        """
        if not scores:
            return []
        values = np.array([score for _, score in scores], dtype=np.float64)
        # centre: fixed beta, or median of ALL scores (translation invariance)
        beta = self.beta if self.beta is not None else float(np.median(values))
        # scale: alpha normalised by the std of ALL scores; abs() guards a
        # negative user-supplied alpha from inverting the ordering
        std = float(np.std(values))
        alpha = abs(self.alpha / std if std > 0 else self.alpha)
        # clip logits so np.exp() cannot overflow — see SIGMOID_CLIP_MIN/MAX
        logits = np.clip(alpha * (values - beta), SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX)
        probs = 1.0 / (1.0 + np.exp(-logits))
        probs = np.clip(probs, 0.0, 1.0)
        return [(uid, float(probs[i])) for i, (uid, _) in enumerate(scores)]

    def _minmax(self, scores: ScoreList) -> ScoreList:
        """Min-max normalisation: (x - min) / (max - min).

        Maps the minimum score to 0.0 and the maximum to 1.0, preserving
        relative distances between scores. If every score is identical
        there is nothing to discriminate, so all outputs are 0.5.
        """
        if not scores:
            return []
        values = [score for _, score in scores]
        min_score = min(values)
        max_score = max(values)
        range_score = max_score - min_score
        if range_score == 0:
            # all scores equal - no discrimination possible
            return [(uid, 0.5) for uid, _ in scores]
        return [(uid, (score - min_score) / range_score) for uid, score in scores]

    def _percentile(self, scores: ScoreList) -> ScoreList:
        """Percentile ranking: replaces each score with its normalised rank.

        With ``n`` unique scores the possible outputs are ``1/n, 2/n, ...,
        n/n``; equal scores share a rank. Only the ordering survives, which
        makes this the most outlier-robust strategy.
        """
        if not scores:
            return []
        # assign each distinct score a rank in (0, 1], ties sharing a rank
        sorted_unique = sorted(set(score for _, score in scores))
        rank_map = {
            s: (i + 1) / len(sorted_unique) for i, s in enumerate(sorted_unique)
        }
        return [(uid, rank_map[score]) for uid, score in scores]

    @staticmethod
    def _atan(scores: ScoreList, metric: Optional[MetricType] = None) -> ScoreList:
        """Arctan normalisation for scores already oriented higher=better.

        Metric-specific formulas:

        * **L2** : ``1 + 2*atan(s)/pi`` — with ``s = -distance`` (per the
          upstream conversion) this maps distance 0 -> 1.0 and
          distance -> inf to 0.0.
        * **IP** : ``0.5 + atan(s)/pi`` — maps (-inf, inf) onto (0, 1).
        * any other metric (COSINE, ``None``) : scores pass through
          unchanged.
        """
        if not scores:
            return []
        result = []
        for uid, score in scores:
            if metric == MetricType.L2:
                # s = -distance, so 1 + 2*atan(s)/pi == 1 - 2*atan(distance)/pi
                normalized = 1 + 2 * math.atan(score) / math.pi
            elif metric == MetricType.IP:
                normalized = 0.5 + math.atan(score) / math.pi
            else:
                # COSINE (already in [0, 1] after conversion) and the
                # None/default case: identity
                normalized = score
            result.append((uid, normalized))
        return result