"""Normalization utilities for post-processing raw document relevance scores.
This module defines a lightweight helper that is used by the retrieval layer
in the surrounding application. Scoring routines such as those implemented in
:mod:`zvec_db.rerankers` return raw floats that can vary wildly in magnitude
between queries and algorithms. Feeding these unbounded values directly into
other parts of the stack (reranking, ensembles, thresholding) results in
unexpected behaviour and makes tuning difficult.
The :class:`Normalize` class converts a ranked list of ``(uid, score)``
tuples into the unit interval ``[0.0, 1.0]``. Multiple normalisation strategies
are supported:
* **standard** (default) - index-aware scaling that divides by an estimated
maximum score and clips values to the unit interval.
* **Bayesian / BB25** - sigmoid calibration particularly useful when only the
relative ordering of positive scores matters. Robust to outliers.
* **minmax** - simple (x - min) / (max - min) scaling. Preserves relative distances.
* **percentile** - rank-based normalization. Very robust to outliers.
* **cosine** - no-op (identity). COSINE scores are already in [0, 1] after conversion.
* **atan** - arctan-based normalization for unbounded scores.
Configuration may be supplied as a simple string (e.g. ``"bayes"``) or as a
more detailed dictionary containing ``method``, ``alpha`` and ``beta`` keys.
Default values are chosen to mirror those described in the reference
implementation.
Constants
---------
SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX : float
Bounds for clipping logits before sigmoid computation. The value ±500
prevents overflow in np.exp() while being large enough to not affect
practical score ranges (exp(-500) ≈ 10^-217, effectively zero).
DEFAULT_ALPHA : float
Default scale parameter for Bayesian normalization.
DEFAULT_BETA : None
Default center parameter (None triggers median-based automatic selection).
Example usage::
normaliser = Normalize({'method': 'bayes', 'alpha': 2.0})
results = [("doc1", 3.2), ("doc2", 0.5), ("doc3", -1.0)]
calibrated = normaliser(results)
    # calibrated -> [("doc1", 0.957...), ("doc2", 0.5), ("doc3", 0.151...)]
    # (beta defaults to the median score, so the median maps to exactly 0.5;
    # sigmoid outputs stay strictly inside (0, 1))
The module has no dependencies outside of NumPy, which is already required by
other parts of the project.
"""
from __future__ import annotations
import math
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
from zvec.typing import MetricType
# Sigmoid clip bounds: prevents overflow in np.exp() inside Normalize._bayes.
# exp(-500) ≈ 10^-217, effectively zero for float64, so clipping at ±500
# cannot change any practically distinguishable probability.
SIGMOID_CLIP_MIN = -500
SIGMOID_CLIP_MAX = 500
# Default Bayesian calibration parameters (see Normalize.__init__).
DEFAULT_ALPHA = 1.0
DEFAULT_BETA = None  # None triggers median-based automatic selection
# Type alias: a ranked result list of (uid, raw_score) pairs.
ScoreList = List[Tuple[Any, float]]
class Normalize:
    """Callable normaliser for lists of ``(uid, score)`` pairs.

    Instances behave like functions: call them with a score list and an
    optional ``avgscore`` and the result is a new list with all scores
    mapped into the closed unit interval. The precise transformation is
    determined by the configuration supplied at construction time.

    Attributes
    ----------
    method : str
        Lowercase string naming the chosen normalisation algorithm.
    alpha : float
        Scale parameter used in Bayesian modes.
    beta : Optional[float]
        Centre parameter used in Bayesian modes; ``None`` triggers
        median-based automatic selection.
    metric : Optional[MetricType]
        Optional metric hint (``metric`` key of a dict config) forwarded to
        the ``atan`` strategy. When ``None``, ``atan`` degrades to the
        identity transformation.
    """

    def __init__(self, config: Union[bool, str, Dict[str, Any], None] = None):
        """Initialise a :class:`Normalize` instance.

        Parameters
        ----------
        config : bool, str, dict or None, optional
            Configuration object that selects the normalisation strategy.
            The following forms are interpreted:

            * ``None`` or ``False`` : equivalent to ``"default"`` - standard
              index-aware scaling.
            * truthy non-dict : also selects the default behaviour.
            * ``str`` : the string value is converted to lower case and used
              as the ``method`` name. Supported methods:

              - ``"bayes"``, ``"bayesian"``, ``"bb25"`` : Bayesian sigmoid
                calibration
              - ``"minmax"`` : (x - min) / (max - min)
              - ``"percentile"`` (alias: ``"rank"``) : rank-based
                normalization
              - ``"atan"`` : arctan-based normalization (requires a
                ``metric`` hint, see dict form below)
              - ``"default"`` : standard index-aware scaling

            * ``dict`` : a copy of the dictionary is stored, and may contain
              the keys ``method`` (string), ``alpha`` (float), ``beta``
              (float or ``None``) and ``metric`` (:class:`MetricType` or
              ``None``). Missing keys are filled with defaults (``alpha``
              defaults to ``1.0``; ``beta`` and ``metric`` to ``None``).

        Notes
        -----
        The configuration is shallow-copied to prevent external modification
        from affecting the normaliser's internal state.
        """
        # normalisation settings
        self.config: Dict[str, Any]
        if isinstance(config, dict):
            self.config = config.copy()
        elif config is None:
            self.config = {}
        else:
            # string or boolean
            self.config = {"method": config}
        method = self.config.get(
            "method", config if isinstance(config, str) else "default"
        )
        self.method: str = str(method).lower()
        # bayesian settings (literal defaults mirror DEFAULT_ALPHA/DEFAULT_BETA)
        self.alpha: float = float(self.config.get("alpha", 1.0))
        self.beta: Optional[float] = self.config.get("beta")
        if self.beta is not None:
            self.beta = float(self.beta)
        # Metric hint for the "atan" strategy. Without it the atan branch has
        # no way to choose between the L2 and IP formulas and therefore leaves
        # scores untouched (identity).
        self.metric: Optional[MetricType] = self.config.get("metric")

    def __call__(self, scores: ScoreList, avgscore: float = 0.0) -> ScoreList:
        """Normalise a list of document scores.

        Parameters
        ----------
        scores : ScoreList
            Sequence of ``(uid, score)`` pairs, typically produced by a
            retrieval algorithm. It is assumed that the list is sorted in
            descending order of score; the method will use the first entry to
            compute the maximum when performing default scaling.
        avgscore : float, optional
            Average score computed over the entire corpus. This is only used
            by the ``default`` normalisation strategy. In all other modes the
            value is ignored entirely.

        Returns
        -------
        ScoreList
            New list where each score has been replaced with a value in
            ``[0.0, 1.0]`` according to the chosen transformation.

        Notes
        -----
        Multiple normalisation methods are supported:

        * **default** - scales scores relative to an estimated maximum and
          clips values to ``[0, 1]``. This keeps the relative ordering intact
          but bounds the range.
        * **bayesian** - applies a sigmoid centred on the median (unless
          ``beta`` is fixed) and scaled by the standard deviation of all
          scores. Translation invariant and robust to outliers; outputs lie
          strictly inside ``(0, 1)``.
        * **minmax** - (x - min) / (max - min). Preserves relative distances.
        * **percentile** - rank-based normalization. Very robust to outliers.
        * **cosine** - no-op (identity). COSINE conversion ``(2-score)/2``
          already produces scores in [0, 1], so no extra work is needed.
        * **atan** - arctan-based normalization: ``1 - 2*atan(s)/pi`` for L2,
          ``0.5 + atan(s)/pi`` for IP. Requires a ``metric`` hint in the
          config; without one the scores pass through unchanged.
        """
        # aliases recognised as Bayesian/BB25 modes
        bayesian = ("bayes", "bayesian", "bayesian-bm25", "bb25")
        if self.method in bayesian:
            # perform sigmoid calibration using median/std of all scores
            return self._bayes(scores)
        # MinMax normalization
        if self.method == "minmax":
            return self._minmax(scores)
        # Percentile/rank normalization (rank is an alias for percentile)
        if self.method in ("percentile", "rank"):
            return self._percentile(scores)
        # Arctan-based normalization; forward the configured metric hint so
        # the L2/IP formulas can actually be selected (without it _atan is
        # the identity).
        if self.method == "atan":
            return self._atan(scores, self.metric)
        # Cosine normalization (no-op: scores already in [0, 1] after conversion)
        if self.method == "cosine":
            return scores
        # fallback to simple index-aware scaling
        return self._default(scores, avgscore)

    # ------------------------------------------------------------------
    # Implementation helpers
    # ------------------------------------------------------------------
    def _default(self, scores: ScoreList, avgscore: float) -> ScoreList:
        """Perform standard normalisation using corpus statistics.

        The method takes the highest score in the provided list and combines
        it with ``avgscore`` to estimate a reasonable upper bound for scaling.
        The bound is computed as::

            maxscore = min(top_score + avgscore, 6 * avgscore)

        which prevents a single exceptionally high value from dominating the
        transformation. Each input score is then divided by ``maxscore`` and
        clipped to the unit interval (so negative raw scores come out as
        ``0.0``, not as negative values).

        When ``avgscore`` is zero or not available, the maximum score is
        simply the top score in the list, ensuring the highest score
        normalizes to 1.0. Zero or negative ``maxscore`` values produce a
        list of zeros to avoid division by zero.
        """
        if not scores:
            return []
        top_score = scores[0][1]
        # When avgscore is not available (0), use top_score as maxscore
        if avgscore <= 0:
            maxscore = top_score
        else:
            maxscore = min(top_score + avgscore, 6 * avgscore)
        if maxscore <= 0:
            # avoid division by zero
            return [(uid, 0.0) for uid, _ in scores]
        # clamp into [0, 1] on both sides, as promised above
        return [
            (uid, min(max(score / maxscore, 0.0), 1.0)) for uid, score in scores
        ]

    def _bayes(self, scores: ScoreList) -> ScoreList:
        """Bayesian sigmoid calibration (BB25-style).

        This algorithm mirrors the BB25 adjustment described in the retrieval
        literature. It computes a sigmoid curve that maps raw scores into
        probabilities in ``(0, 1)``.

        The calibration parameters ``alpha`` (scale) and ``beta`` (centre)
        can be specified by the user; if ``beta`` is left ``None`` the median
        of all scores is used. ``alpha`` is normalised by the standard
        deviation of all scores to make the transformation invariant to
        translation and insensitive to the absolute score magnitude.

        Key property: translation invariant. Adding a constant to all scores
        produces the same normalized probabilities, because both median
        (beta) and std (alpha) shift/scale accordingly.
        """
        if not scores:
            return []
        values = np.array([score for _, score in scores], dtype=np.float64)
        # determine beta (centre) if not fixed - uses ALL scores for
        # translation invariance
        beta = self.beta if self.beta is not None else float(np.median(values))
        # determine effective alpha (scale) by dividing by the standard
        # deviation of ALL scores; fall back to the raw alpha when std is 0
        std = float(np.std(values))
        alpha = abs(self.alpha / std if std > 0 else self.alpha)
        # Clip logits to prevent overflow in np.exp()
        # See SIGMOID_CLIP_MIN/MAX constants for bounds explanation
        logits = np.clip(alpha * (values - beta), SIGMOID_CLIP_MIN, SIGMOID_CLIP_MAX)
        probs = 1.0 / (1.0 + np.exp(-logits))
        probs = np.clip(probs, 0.0, 1.0)
        return [(uid, float(probs[i])) for i, (uid, _) in enumerate(scores)]

    def _minmax(self, scores: ScoreList) -> ScoreList:
        """Min-max normalization: (x - min) / (max - min).

        Maps scores to [0, 1] where min score becomes 0 and max becomes 1.
        Preserves relative distances between scores.

        Args:
            scores: List of (uid, score) tuples.

        Returns:
            Normalized scores in [0, 1].

        Note:
            If all scores are equal, returns 0.5 for all (no discrimination).
        """
        if not scores:
            return []
        values = [score for _, score in scores]
        min_score = min(values)
        max_score = max(values)
        range_score = max_score - min_score
        if range_score == 0:
            # All scores are equal - no discrimination possible
            return [(uid, 0.5) for uid, _ in scores]
        return [(uid, (score - min_score) / range_score) for uid, score in scores]

    def _percentile(self, scores: ScoreList) -> ScoreList:
        """Percentile ranking: replaces each score with its normalized rank.

        Maps scores to (0, 1] based on their relative rank.
        Very robust to outliers - preserves only the ordering.

        Args:
            scores: List of (uid, score) tuples.

        Returns:
            Percentile ranks in (0, 1].

        Note:
            Equal scores receive the same percentile rank.
            With n unique scores, possible values are 1/n, 2/n, ..., n/n.
        """
        if not scores:
            return []
        # Sort unique scores to assign ranks
        sorted_unique = sorted(set(score for _, score in scores))
        rank_map = {
            s: (i + 1) / len(sorted_unique) for i, s in enumerate(sorted_unique)
        }
        return [(uid, rank_map[score]) for uid, score in scores]

    @staticmethod
    def _atan(scores: ScoreList, metric: Optional[MetricType] = None) -> ScoreList:
        """Arctan-based normalization for scores already oriented "higher=better".

        Uses metric-specific formulas:

        - **L2**: ``1 + 2 * atan(s) / pi`` - maps (-inf, 0] to (0, 1]
          (after conversion s = -distance, so distance 0 -> 1.0,
          distance inf -> 0.0)
        - **IP**: ``0.5 + atan(s) / pi`` - maps (-inf, inf) to (0, 1)
        - any other / ``None``: identity (COSINE conversion already produces
          scores in [0, 1], and with no metric hint there is no formula to
          apply).

        Args:
            scores: List of (uid, score) tuples.
            metric: Metric type to select the appropriate formula.

        Returns:
            Normalized scores in [0, 1], higher=better.
        """
        if not scores:
            return []
        if metric is None:
            # No metric hint: nothing sensible to do, pass scores through.
            return [(uid, score) for uid, score in scores]
        result = []
        for uid, score in scores:
            if metric == MetricType.L2:
                # L2: score = -distance (from _convert_metric), so
                # 1 + 2*atan(score)/pi == 1 - 2*atan(distance)/pi:
                # distance 0 -> 1.0, distance inf -> 0.0
                normalized = 1 + 2 * math.atan(score) / math.pi
            elif metric == MetricType.IP:
                # IP: 0.5 + atan(s)/pi maps (-inf, inf) to (0, 1)
                normalized = 0.5 + math.atan(score) / math.pi
            else:
                # COSINE (or unknown metric): conversion already yields [0, 1]
                normalized = score
            result.append((uid, normalized))
        return result