User Guide

This guide presents complete examples of zvec-db usage for sparse vectorization and reranking.

Sparse Embedding Examples

BM25 Embedding (Standard FTS)

from zvec_db.embedders import BM25Embedder

# Training documents
documents = [
    "The black cat sleeps on the sofa",
    "A brown dog runs in the garden",
    "The cat and the dog are animals",
]

# Initialization and training
embedder = BM25Embedder(
    k1=1.2,
    b=0.75,
    max_features=4096,
    min_df=1
)
embedder.fit(documents)

# Query embedding
query_vector = embedder.embed("cat sleeps")
print(query_vector)  # {42: 0.523, 108: 0.312, ...}

# Batch embedding
query_vectors = embedder.embed(["cat sleeps", "dog runs"])
print(len(query_vectors))  # 2

BM25 with Trigrams (Character N-grams)

Use character n-grams for fuzzy matching and typo tolerance:

from zvec_db.embedders import BM25Embedder

# Trigram-based BM25 for fuzzy matching
embedder = BM25Embedder(
    k1=1.2,
    b=0.75,
    max_features=8192,
    analyzer="char_wb",    # Character n-grams at word boundaries
    ngram_range=(3, 3),    # Trigrams only
    min_df=2,              # Minimum document frequency
)
embedder.fit(documents)

# Handles typos and partial matches
query_vector = embedder.embed("slep")  # Matches "sleeps", "sleeping"

BM25 with Mixed N-grams

Combine unigrams and n-grams for better coverage:

from zvec_db.embedders import BM25Embedder

# Word unigrams + bigrams
embedder = BM25Embedder(
    max_features=16384,
    analyzer="word",
    ngram_range=(1, 2),  # Unigrams and bigrams
)
embedder.fit(documents)

# Or character n-grams for fuzzy matching
embedder_fuzzy = BM25Embedder(
    max_features=16384,
    analyzer="char_wb",
    ngram_range=(2, 4),  # Bi-grams, trigrams, 4-grams
)
embedder_fuzzy.fit(documents)

CountVectorizer Parameters Reference

All BM25Embedder parameters (except k1, b, preprocessing_config, and is_pretokenized) are passed directly to scikit-learn’s CountVectorizer:

embedder = BM25Embedder(
    k1=1.2,
    b=0.75,
    # Tokenization
    tokenizer=lambda x: x.split(),  # Custom tokenizer
    token_pattern=r"(?u)\b\w+\b",   # Regex pattern for tokens (ignored when tokenizer is set)
    # Vocabulary filtering
    min_df=2,            # Minimum document frequency
    max_df=0.8,          # Maximum document frequency
    max_features=10000,  # Max vocabulary size
    # N-grams
    ngram_range=(1, 2),  # (min_n, max_n) - unigrams + bigrams
    analyzer="word",     # "word", "char", or "char_wb"
    # Preprocessing
    lowercase=True,      # Convert to lowercase
    stop_words="english",  # Remove English stopwords
)

Analyzer options:

  • "word" - Word-level tokens (default)

  • "char" - Character-level n-grams

  • "char_wb" - Character n-grams at word boundaries (recommended for fuzzy matching)

BM25L for variable document lengths

from zvec_db.embedders import BM25LEmbedder

# Useful when documents have highly variable lengths
embedder = BM25LEmbedder(k1=1.2, max_features=4096)
embedder.fit(documents)

BM25+ to avoid zero scores

from zvec_db.embedders import BM25PlusEmbedder

# The delta parameter avoids strictly zero scores
embedder = BM25PlusEmbedder(
    k1=1.2,
    b=0.75,
    delta=0.5,  # Smoothing
    max_features=4096
)
embedder.fit(documents)

Custom tokenization

import re

def custom_tokenizer(text):
    return re.findall(r'\b[a-z]+\b', text.lower())

embedder = BM25Embedder(
    tokenizer=custom_tokenizer,
    max_features=4096
)
embedder.fit(documents)

Pre-tokenized mode

# Already tokenized documents
tokenized_docs = [
    ["cat", "black", "sleeps"],
    ["dog", "brown", "runs"],
]

embedder = BM25Embedder(is_pretokenized=True)
embedder.fit(tokenized_docs)

# Tokenized query
query = ["cat", "sleeps"]
result = embedder.embed(query)

Save and load

# Save
embedder.save("models/bm25_model.joblib")

# Load
new_embedder = BM25Embedder()
new_embedder.load("models/bm25_model.joblib")

Reranking Examples

Understanding Metrics

The metrics parameter controls distance→similarity conversion:

from zvec_db.rerankers import WeightedReranker
from zvec.typing import MetricType

# Case 1: Global metric (all sources use COSINE)
reranker = WeightedReranker(
    topn=10,
    metrics=MetricType.COSINE  # Applied to all sources
)

# Case 2: Per-source metrics
reranker = WeightedReranker(
    topn=10,
    metrics={
        "dense": MetricType.COSINE,  # Convert cosine distance [0,2] → similarity
        "bm25": None,                # No conversion (BM25 scores)
    }
)

# Case 3: Auto-detect from schema
import zvec
collection = zvec.open("./my_collection")
reranker = WeightedReranker(
    topn=10,
    metrics=None,  # Will infer from schema
    schema=collection.schema
)

RrfReranker (RRF)

from zvec_db.rerankers import RrfReranker
from zvec.model.doc import Doc

# Results from different sources
bm25_results = [
    Doc(id="doc1", score=15.5),
    Doc(id="doc2", score=12.3),
]

dense_results = [
    Doc(id="doc2", score=0.92),
    Doc(id="doc1", score=0.75),
]

reranker = RrfReranker(
    topn=10,
    rank_constant=60,
    # Note: normalize has no effect on RRF (uses ranks, not scores)
)

reranked = reranker.rerank({
    "bm25": bm25_results,
    "dense": dense_results
})

WeightedReranker (Weighted Fusion)

from zvec_db.rerankers import WeightedReranker
from zvec.typing import MetricType

# Hybrid search: BM25 + dense with COSINE distances
reranker = WeightedReranker(
    topn=10,
    metrics={
        "bm25": None,              # BM25 scores (not distances)
        "dense": MetricType.COSINE,  # Convert distance [0,2] → similarity
    },
    weights={"bm25": 0.4, "dense": 0.6},
    normalize={
        "bm25": "bayes",  # Robust to outliers
        "dense": True     # Standard normalization
    }
)

reranked = reranker.rerank({
    "bm25": bm25_results,
    "dense": dense_results
})

MultiFieldWeightedReranker

from zvec_db.rerankers import MultiFieldWeightedReranker

reranker = MultiFieldWeightedReranker(
    topn=10,
    source_weights={"bm25": 0.6, "dense": 0.4},
    field_weights={"title": 3.0, "content": 1.0, "tags": 0.5}
)

# Documents with fields
docs = [
    Doc(id="1", score=0.0, fields={"title": 0.9, "content": 0.5}),
    Doc(id="2", score=0.0, fields={"title": 0.8, "content": 0.6}),
]

reranked = reranker.rerank({"bm25": docs})

Complete Pipeline Example with PipelineReranker

This example shows how to use PipelineReranker to chain multiple rerankers: first RRF for fusion, then a cross-encoder for final scoring.

import zvec
from zvec import create_and_open, VectorQuery, CollectionSchema, FieldSchema, VectorSchema, DataType, FlatIndexParam
from zvec_db.embedders import BM25Embedder, SentenceTransformersEmbedder
from zvec_db.rerankers import PipelineReranker, RrfReranker, SentenceTransformerReranker

# 1. Initialize embedders
documents = [
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "NLP helps computers understand text",
    "Python is a popular programming language",
]

bm25 = BM25Embedder(max_features=4096)
bm25.fit(documents)

dense = SentenceTransformersEmbedder(model_name="all-MiniLM-L6-v2")

# 2. Create collection
schema = CollectionSchema(
    name="docs",
    fields=[FieldSchema("text", DataType.STRING)],
    vectors=[
        VectorSchema(name="sparse", data_type=DataType.SPARSE_VECTOR_FP32, dimension=4096),
        VectorSchema(name="dense", data_type=DataType.VECTOR_FP32, dimension=384,
                    index_param=FlatIndexParam(metric_type=zvec.MetricType.COSINE)),
    ]
)
collection = create_and_open("./my_db", schema)

# 3. Index documents
for i, doc in enumerate(documents):
    collection.insert(zvec.Doc(
        id=str(i),
        fields={"text": doc},
        vectors={
            "sparse": bm25.embed(doc),
            "dense": dense.embed(doc),
        }
    ))

# 4. Create pipeline: RRF (top 50) -> Cross-Encoder (top 10)
pipeline = PipelineReranker(
    rerankers=[
        RrfReranker(
            topn=50,
            rank_constant=60,
            schema=collection.schema  # Auto-detect metrics from schema
        ),
        SentenceTransformerReranker(
            topn=10,
            model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"
        )
    ],
    topn=10
)

# 5. Search with pipeline
query = "neural networks"
results = collection.query(
    vectors=[
        VectorQuery(field_name="sparse", vector=bm25.embed(query)),
        VectorQuery(field_name="dense", vector=dense.embed(query)),
    ],
    topk=10,
    reranker=pipeline
)

# 6. Display results
for i, doc in enumerate(results):
    print(f"{i+1}. {doc.fields['text']} (score: {doc.score:.4f})")

The pipeline applies rerankers sequentially:

  1. RRF combines sparse and dense results, returning up to 50 documents

  2. Cross-Encoder re-scores these 50 documents using query+document attention, returning final top 10

Choosing the Right Embedder

Embedder

When to use

CountEmbedder

Baseline, documents of similar length

BM25Embedder

General use, good IR performance

BM25LEmbedder

Documents with highly variable lengths

BM25PlusEmbedder

Many rare terms, need recall

DisMaxEmbedder

Multi-field, match any field

TfidfEmbedder

Relative term importance in corpus

Choosing the Right Reranker

Fusion Rerankers (combine multiple search results):

Reranker

When to use

RrfReranker

Rank fusion, no need for absolute scores

WeightedReranker

Precise control of weights per source

MultiFieldWeightedReranker

Structured documents (title, content, tags)

Cross-Encoder Rerankers (re-score with query + document):

Reranker

When to use

SentenceTransformerReranker

Local cross-encoder models (e.g., ms-marco)

ClassificationReranker

Multi-class classification reranking

OpenAIReranker

OpenAI API-based reranking