# Source code for zvec_db.rerankers.utils.normalize

"""Normalization utilities for post-processing raw document relevance scores.

This module defines a lightweight helper that is used by the retrieval layer
in the surrounding application. Scoring routines such as those implemented in
:mod:`zvec_db.rerankers` return raw floats that can vary wildly in magnitude
between queries and algorithms. Feeding these unbounded values directly into
other parts of the stack (reranking, ensembles, thresholding) results in
unexpected behaviour and makes tuning difficult.

The :class:`Normalize` class converts a list of ``(uid, score)``
tuples into the unit interval ``[0.0, 1.0]``. Multiple normalisation strategies
are supported:

* **standard** (default) - index-aware scaling that divides by an estimated
  maximum score and clips values to the unit interval.
* **Bayesian / BB25** - sigmoid calibration particularly useful when only the
  relative ordering of positive scores matters. Robust to outliers.
* **minmax** - simple (x - min) / (max - min) scaling. Preserves relative distances.
* **percentile** - rank-based normalization. Very robust to outliers.
* **cosine** - no-op (identity). COSINE scores are already in [0, 1] after conversion.
* **atan** - arctan-based normalization for unbounded scores.

Configuration may be supplied as a simple string (e.g. ``"bayes"``) or as a
more detailed dictionary containing ``method``, ``alpha`` and ``beta`` keys.
Default values are chosen to mirror those described in the reference
implementation.

Constants
---------
SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX : float
    Bounds for clipping logits before sigmoid computation. The value ±500
    prevents overflow in np.exp() while being large enough to not affect
    practical score ranges (exp(-500) ≈ 10^-217, effectively zero).
DEFAULT_ALPHA : float
    Default scale parameter for Bayesian normalization.
DEFAULT_BETA : None
    Default center parameter (None triggers median-based automatic selection).

Example usage::

    normaliser = Normalize({'method': 'bayes', 'alpha': 2.0})
    results = [("doc1", 3.2), ("doc2", 0.5), ("doc3", -1.0)]
    calibrated = normaliser(results)
    # calibrated -> [("doc1", 0.957...), ("doc2", 0.5), ("doc3", 0.151...)]

The module has no dependencies outside of NumPy, which is already required by
other parts of the project.
"""

from __future__ import annotations

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
from zvec.typing import MetricType

# Sigmoid clip bounds: prevents overflow in np.exp()
# exp(-500) ≈ 10^-217, effectively zero for float64
SIGMOID_CLIP_MIN = -500
SIGMOID_CLIP_MAX = 500

# Default Bayesian parameters
# NOTE(review): Normalize.__init__ currently inlines these defaults
# (1.0 / None) rather than referencing the constants — keep them in sync.
DEFAULT_ALPHA = 1.0
DEFAULT_BETA = None  # None triggers median-based selection

# A ranked result list: (uid, score) pairs as produced by the retrievers.
ScoreList = List[Tuple[Any, float]]


class Normalize:
    """Callable normaliser for lists of ``(uid, score)`` pairs.

    Instances behave like functions: call them with a score list and an
    optional ``avgscore``; the result is a new list with every score mapped
    into the closed unit interval ``[0.0, 1.0]``. The exact transformation
    is selected by the configuration supplied at construction time.

    Attributes
    ----------
    method : str
        Lowercase string naming the chosen normalisation algorithm.
    alpha : float
        Scale parameter used in Bayesian modes.
    beta : Optional[float]
        Centre parameter used in Bayesian modes; ``None`` triggers
        median-based automatic selection.
    """

    def __init__(self, config: Union[bool, str, Dict[str, Any], None] = None):
        """Initialise a :class:`Normalize` instance.

        Parameters
        ----------
        config : bool, str, dict or None, optional
            Selects the normalisation strategy:

            * ``None`` or ``False`` : standard index-aware scaling
              (``"default"``).
            * ``str`` : lower-cased and used as the ``method`` name.
              Supported: ``"bayes"`` / ``"bayesian"`` / ``"bayesian-bm25"``
              / ``"bb25"``, ``"minmax"``, ``"percentile"`` (alias
              ``"rank"``), ``"cosine"``, ``"atan"``, ``"default"``.
            * ``dict`` : may contain ``method``, ``alpha`` and ``beta``
              keys; missing keys fall back to defaults (``alpha`` -> 1.0,
              ``beta`` -> None).

        Notes
        -----
        A dict configuration is shallow-copied so later external mutation
        cannot affect this instance.
        """
        self.config: Dict[str, Any]
        if isinstance(config, dict):
            # shallow copy: isolate internal state from the caller's dict
            self.config = config.copy()
        elif config is None:
            self.config = {}
        else:
            # plain string or boolean selects the method directly
            self.config = {"method": config}
        method = self.config.get(
            "method", config if isinstance(config, str) else "default"
        )
        self.method: str = str(method).lower()

        # Bayesian calibration parameters (defaults mirror the module-level
        # DEFAULT_ALPHA / DEFAULT_BETA constants)
        self.alpha: float = float(self.config.get("alpha", 1.0))
        self.beta: Optional[float] = self.config.get("beta")
        if self.beta is not None:
            self.beta = float(self.beta)

    def __call__(self, scores: ScoreList, avgscore: float = 0.0) -> ScoreList:
        """Normalise a list of document scores into ``[0.0, 1.0]``.

        Parameters
        ----------
        scores : ScoreList
            ``(uid, score)`` pairs, assumed sorted in descending score
            order; the ``default`` strategy reads the first entry as the
            maximum.
        avgscore : float, optional
            Corpus-wide average score. Used only by the ``default``
            strategy; ignored by every other mode.

        Returns
        -------
        ScoreList
            New list with each score replaced by a value in ``[0.0, 1.0]``
            (``cosine`` returns the input list unchanged).

        Notes
        -----
        * **default** - scales by an estimated maximum and clips; keeps the
          relative ordering intact while bounding the range.
        * **bayesian** - sigmoid calibration centred on the median (or a
          fixed ``beta``) and scaled by the score spread; robust to
          outliers.
        * **minmax** - (x - min) / (max - min); preserves relative
          distances.
        * **percentile** - rank-based; very robust to outliers.
        * **cosine** - no-op; COSINE conversion ``(2-score)/2`` already
          produces scores in [0, 1].
        * **atan** - arctan mapping for unbounded scores.
        """
        # aliases recognised as Bayesian/BB25 modes
        if self.method in ("bayes", "bayesian", "bayesian-bm25", "bb25"):
            # sigmoid calibration over the score distribution
            return self._bayes(scores)
        if self.method == "minmax":
            return self._minmax(scores)
        # "rank" is an alias for "percentile"
        if self.method in ("percentile", "rank"):
            return self._percentile(scores)
        if self.method == "atan":
            # NOTE(review): no metric is forwarded here, so _atan takes its
            # default ``metric=None`` branch and returns the scores
            # unchanged — confirm whether a metric should be threaded
            # through from the caller.
            return self._atan(scores)
        if self.method == "cosine":
            # no-op: scores already lie in [0, 1] after COSINE conversion
            return scores
        # fallback: simple index-aware scaling
        return self._default(scores, avgscore)

    # ------------------------------------------------------------------
    # Implementation helpers
    # ------------------------------------------------------------------
    def _default(self, scores: ScoreList, avgscore: float) -> ScoreList:
        """Standard normalisation using corpus statistics.

        Estimates an upper bound as ``min(top_score + avgscore,
        6 * avgscore)`` so a single exceptionally high value cannot
        dominate, then divides every score by that bound and clips to the
        unit interval. When ``avgscore`` is unavailable (<= 0) the top
        score itself is the bound, so the best document normalises to
        exactly 1.0. A non-positive bound yields all zeros to avoid
        division by zero.
        """
        if not scores:
            return []
        top_score = scores[0][1]
        if avgscore <= 0:
            # no corpus statistic available: anchor on the best score
            maxscore = top_score
        else:
            maxscore = min(top_score + avgscore, 6 * avgscore)
        if maxscore <= 0:
            # avoid division by zero (or a sign flip) on non-positive bounds
            return [(uid, 0.0) for uid, _ in scores]
        return [(uid, min(score / maxscore, 1.0)) for uid, score in scores]

    def _bayes(self, scores: ScoreList) -> ScoreList:
        """Bayesian sigmoid calibration (BB25-style).

        Maps raw scores to probabilities in ``[0, 1]`` via a sigmoid whose
        centre is ``beta`` (the median of all scores when unset) and whose
        scale is ``alpha`` divided by the standard deviation of all scores.
        Because both centre and scale derive from the full score list, the
        transform is translation invariant: adding a constant to every
        score produces the same probabilities.
        """
        if not scores:
            return []
        values = np.array([score for _, score in scores], dtype=np.float64)
        # centre: fixed beta, or median of ALL scores (translation invariance)
        beta = self.beta if self.beta is not None else float(np.median(values))
        # scale: alpha normalised by the std of ALL scores; abs() guards a
        # negative user-supplied alpha from inverting the ordering
        std = float(np.std(values))
        alpha = abs(self.alpha / std if std > 0 else self.alpha)
        # clip logits so np.exp() cannot overflow — see SIGMOID_CLIP_MIN/MAX
        logits = np.clip(alpha * (values - beta), SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX)
        probs = 1.0 / (1.0 + np.exp(-logits))
        probs = np.clip(probs, 0.0, 1.0)
        return [(uid, float(probs[i])) for i, (uid, _) in enumerate(scores)]

    def _minmax(self, scores: ScoreList) -> ScoreList:
        """Min-max normalisation: (x - min) / (max - min).

        Maps the minimum score to 0.0 and the maximum to 1.0, preserving
        relative distances between scores. If every score is identical
        there is nothing to discriminate, so all outputs are 0.5.
        """
        if not scores:
            return []
        values = [score for _, score in scores]
        min_score = min(values)
        max_score = max(values)
        range_score = max_score - min_score
        if range_score == 0:
            # all scores equal - no discrimination possible
            return [(uid, 0.5) for uid, _ in scores]
        return [(uid, (score - min_score) / range_score) for uid, score in scores]

    def _percentile(self, scores: ScoreList) -> ScoreList:
        """Percentile ranking: replaces each score with its normalised rank.

        With ``n`` unique scores the possible outputs are ``1/n, 2/n, ...,
        n/n``; equal scores share a rank. Only the ordering survives, which
        makes this the most outlier-robust strategy.
        """
        if not scores:
            return []
        # assign each distinct score a rank in (0, 1], ties sharing a rank
        sorted_unique = sorted(set(score for _, score in scores))
        rank_map = {
            s: (i + 1) / len(sorted_unique) for i, s in enumerate(sorted_unique)
        }
        return [(uid, rank_map[score]) for uid, score in scores]

    @staticmethod
    def _atan(scores: ScoreList, metric: Optional[MetricType] = None) -> ScoreList:
        """Arctan normalisation for scores already oriented higher=better.

        Metric-specific formulas:

        * **L2** : ``1 + 2*atan(s)/pi`` — with ``s = -distance`` (per the
          upstream conversion) this maps distance 0 -> 1.0 and
          distance -> inf to 0.0.
        * **IP** : ``0.5 + atan(s)/pi`` — maps (-inf, inf) onto (0, 1).
        * any other metric (COSINE, ``None``) : scores pass through
          unchanged.
        """
        if not scores:
            return []
        result = []
        for uid, score in scores:
            if metric == MetricType.L2:
                # s = -distance, so 1 + 2*atan(s)/pi == 1 - 2*atan(distance)/pi
                normalized = 1 + 2 * math.atan(score) / math.pi
            elif metric == MetricType.IP:
                normalized = 0.5 + math.atan(score) / math.pi
            else:
                # COSINE (already in [0, 1] after conversion) and the
                # None/default case: identity
                normalized = score
            result.append((uid, normalized))
        return result