# Source code for zvec_db.rerankers.utils.base_utils

"""Common utilities for rerankers in zvec-db.

This module provides shared functionality used across multiple reranker
implementations, including score extraction, document text extraction,
and other common operations.
"""

from __future__ import annotations

from typing import Optional

from zvec.model.doc import Doc

# Sentinel value to distinguish "not provided" from "explicitly None".
# It is a unique object, so test for it by identity (``value is _UNSET``);
# an identity check cannot collide with any caller-supplied value.
_UNSET = object()



[docs]
def extract_score(doc: Doc) -> float:
    """Extract score from a document, handling various numeric types.

    Any value that ``float()`` cannot convert is treated as invalid and
    mapped to 0.0. This includes non-numeric objects, unparsable strings,
    and integers too large to be represented as a float.

    Args:
        doc (Doc): Document with a score attribute.

    Returns:
        float: Score as a float, or 0.0 if score is None or invalid.

    Example:
        >>> doc = Doc(id="1", score=0.8)
        >>> extract_score(doc)
        0.8
        >>> doc_no_score = Doc(id="2", score=None)
        >>> extract_score(doc_no_score)
        0.0
    """
    score = doc.score
    if score is None:
        return 0.0
    try:
        return float(score)
    except (TypeError, ValueError, OverflowError):
        # TypeError: object has no float conversion.
        # ValueError: string that does not parse as a number.
        # OverflowError: int too large to convert to a float.
        return 0.0




[docs]
def extract_field_score(doc: Doc, field_name: str) -> float:
    """Extract score from a specific document field.

    The document's ``fields`` mapping is read defensively: a document with
    no ``fields`` attribute, or with an empty/None mapping, yields 0.0.
    A field value that ``float()`` cannot convert also yields 0.0. This
    includes non-numeric objects, unparsable strings, and integers too
    large to be represented as a float.

    Args:
        doc (Doc): Document with fields attribute.
        field_name (str): Name of the field to extract score from.

    Returns:
        float: Field score as a float, or 0.0 if field is missing or non-numeric.

    Example:
        >>> doc = Doc(id="1", fields={"title_score": 0.9, "content_score": 0.7})
        >>> extract_field_score(doc, "title_score")
        0.9
        >>> extract_field_score(doc, "missing_field")
        0.0
    """
    fields = getattr(doc, "fields", None)
    if not fields:
        return 0.0
    field_value = fields.get(field_name)
    if field_value is None:
        return 0.0
    try:
        return float(field_value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: object has no float conversion.
        # ValueError: string that does not parse as a number.
        # OverflowError: int too large to convert to a float.
        return 0.0




[docs]
def get_document_text(doc: Doc, rerank_field: Optional[str] = None) -> str:
    """Return the text of a document to be used for scoring or embedding.

    Field lookup proceeds in priority order and stops at the first field
    the document reports via ``has_field``:

    1. ``rerank_field``, when a non-empty name is given.
    2. The conventional names "content", "text", "body", "passage".

    If none of those fields exist, the string forms of all field values are
    joined with single spaces. A document with no fields at all falls back
    to its ID rendered as a string.

    Args:
        doc (Doc): Document to extract text from.
        rerank_field (Optional[str]): Preferred field name. When None (or
            empty), only the conventional names are tried. Defaults to None.

    Returns:
        str: Extracted document text.

    Example:
        >>> doc = Doc(id="1", fields={"content": "Hello world", "title": "Test"})
        >>> get_document_text(doc)
        'Hello world'
        >>> get_document_text(doc, rerank_field="title")
        'Test'
    """
    # The caller's preferred field (if any) is checked before the defaults.
    preferred = [rerank_field] if rerank_field else []
    for candidate in (*preferred, "content", "text", "body", "passage"):
        if doc.has_field(candidate):
            return str(doc.field(candidate))

    # No known text field: last resort is the ID, otherwise every value joined.
    all_fields = doc.fields
    if not all_fields:
        return str(doc.id)
    return " ".join(map(str, all_fields.values()))