User Guide
This guide presents complete examples of using zvec-db for sparse embedding and reranking.
Sparse Embedding Examples
BM25 Embedding (Standard FTS)
from zvec_db.embedders import BM25Embedder
# Training documents
documents = [
"The black cat sleeps on the sofa",
"A brown dog runs in the garden",
"The cat and the dog are animals",
]
# Initialization and training
embedder = BM25Embedder(
k1=1.2,
b=0.75,
max_features=4096,
min_df=1
)
embedder.fit(documents)
# Query embedding
query_vector = embedder.embed("cat sleeps")
print(query_vector) # {42: 0.523, 108: 0.312, ...}
# Batch embedding
query_vectors = embedder.embed(["cat sleeps", "dog runs"])
print(len(query_vectors)) # 2
BM25 with Trigrams (Character N-grams)
Use character n-grams for fuzzy matching and typo tolerance:
from zvec_db.embedders import BM25Embedder
# Trigram-based BM25 for fuzzy matching
embedder = BM25Embedder(
k1=1.2,
b=0.75,
max_features=8192,
analyzer="char_wb", # Character n-grams at word boundaries
ngram_range=(3, 3), # Trigrams only
min_df=2, # Minimum document frequency
)
embedder.fit(documents)
# Handles typos and partial matches
query_vector = embedder.embed("slep") # Matches "sleeps", "sleeping"
BM25 with Mixed N-grams
Combine unigrams and n-grams for better coverage:
from zvec_db.embedders import BM25Embedder
# Word unigrams + bigrams
embedder = BM25Embedder(
max_features=16384,
analyzer="word",
ngram_range=(1, 2), # Unigrams and bigrams
)
embedder.fit(documents)
# Or character n-grams for fuzzy matching
embedder_fuzzy = BM25Embedder(
max_features=16384,
analyzer="char_wb",
ngram_range=(2, 4), # Bi-grams, trigrams, 4-grams
)
embedder_fuzzy.fit(documents)
CountVectorizer Parameters Reference
All BM25Embedder parameters (except k1, b, preprocessing_config)
are passed directly to scikit-learn’s CountVectorizer:
embedder = BM25Embedder(
    # BM25 parameters
    k1=1.2,
    b=0.75,
    # Tokenization
    tokenizer=lambda x: x.split(),  # Custom tokenizer (token_pattern is ignored when this is set)
    token_pattern=r"(?u)\b\w+\b",   # Regex pattern for tokens (used when no tokenizer is given)
    # Vocabulary filtering
    min_df=2,                       # Minimum document frequency
    max_df=0.8,                     # Maximum document frequency
    max_features=10000,             # Max vocabulary size
    # N-grams
    ngram_range=(1, 2),             # (min_n, max_n) - unigrams + bigrams
    analyzer="word",                # "word", "char", or "char_wb"
    # Preprocessing
    lowercase=True,                 # Convert to lowercase
    stop_words="english",           # Remove English stopwords
)
Analyzer options:
"word"- Word-level tokens (default)"char"- Character-level n-grams"char_wb"- Character n-grams at word boundaries (recommended for fuzzy matching)
BM25L for variable document lengths
from zvec_db.embedders import BM25LEmbedder
# Useful when document lengths vary widely
embedder = BM25LEmbedder(k1=1.2, max_features=4096)
embedder.fit(documents)
BM25+ to avoid zero scores
from zvec_db.embedders import BM25PlusEmbedder
# The delta parameter avoids strictly zero scores
embedder = BM25PlusEmbedder(
k1=1.2,
b=0.75,
delta=0.5, # Smoothing
max_features=4096
)
embedder.fit(documents)
DisMax for multi-field search
from zvec_db.embedders import DisMaxEmbedder
# Takes the maximum score among terms
embedder = DisMaxEmbedder(
k1=1.2,
b=0.75,
tie_breaker=0.1, # Adds 10% of other scores
max_features=4096
)
embedder.fit(documents)
Custom tokenization
import re
def custom_tokenizer(text):
    return re.findall(r'\b[a-z]+\b', text.lower())
embedder = BM25Embedder(
tokenizer=custom_tokenizer,
max_features=4096
)
embedder.fit(documents)
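To see what this tokenizer keeps: it lowercases the text and drops digits and punctuation.
print(custom_tokenizer("The cat sleeps on 2 sofas!"))  # ['the', 'cat', 'sleeps', 'on', 'sofas']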
Pre-tokenized mode
# Already tokenized documents
tokenized_docs = [
["cat", "black", "sleeps"],
["dog", "brown", "runs"],
]
embedder = BM25Embedder(is_pretokenized=True)
embedder.fit(tokenized_docs)
# Tokenized query
query = ["cat", "sleeps"]
result = embedder.embed(query)
Save and load
# Save
embedder.save("models/bm25_model.joblib")
# Load
new_embedder = BM25Embedder()
new_embedder.load("models/bm25_model.joblib")
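A quick sanity check for the round trip is to compare embeddings from the original and the reloaded model (a minimal sketch, assuming embed returns identical sparse mappings after reload):
original = embedder.embed("cat sleeps")
restored = new_embedder.embed("cat sleeps")
assert original == restored  # Vocabulary and weights survive the save/load cycle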
Reranking Examples
Understanding Metrics
The metrics parameter controls distance→similarity conversion:
from zvec_db.rerankers import WeightedReranker
from zvec.typing import MetricType
# Case 1: Global metric (all sources use COSINE)
reranker = WeightedReranker(
topn=10,
metrics=MetricType.COSINE # Applied to all sources
)
# Case 2: Per-source metrics
reranker = WeightedReranker(
topn=10,
metrics={
"dense": MetricType.COSINE, # Convert cosine distance [0,2] → similarity
"bm25": None, # No conversion (BM25 scores)
}
)
# Case 3: Auto-detect from schema
import zvec
collection = zvec.open("./my_collection")
reranker = WeightedReranker(
topn=10,
metrics=None, # Will infer from schema
schema=collection.schema
)
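To make the direction of the conversion concrete: cosine distance d = 1 - cos(θ) lies in [0, 2], so smaller means more similar, while BM25 scores grow with relevance. One common mapping back to a similarity is sketched below as an illustration only; the exact formula WeightedReranker applies internally may differ.
def cosine_distance_to_similarity(distance: float) -> float:
    # Illustrative mapping: distance in [0, 2] -> cosine similarity in [-1, 1]
    return 1.0 - distance
print(cosine_distance_to_similarity(0.08))  # 0.92 -> very close vectors
print(cosine_distance_to_similarity(1.50))  # -0.5 -> dissimilar vectors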
RrfReranker (RRF)
from zvec_db.rerankers import RrfReranker
from zvec.model.doc import Doc
# Results from different sources
bm25_results = [
Doc(id="doc1", score=15.5),
Doc(id="doc2", score=12.3),
]
dense_results = [
Doc(id="doc2", score=0.92),
Doc(id="doc1", score=0.75),
]
reranker = RrfReranker(
topn=10,
rank_constant=60,
# Note: normalize has no effect on RRF (uses ranks, not scores)
)
reranked = reranker.rerank({
"bm25": bm25_results,
"dense": dense_results
})
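Reciprocal Rank Fusion ignores the raw scores and combines documents by rank: each source contributes 1 / (rank_constant + rank). A small worked sketch of that formula on the two result lists above (illustrative only; RrfReranker's exact rank indexing and tie handling may differ):
rank_constant = 60
# 1-based ranks of each document in each source, taken from the lists above
ranks = {
    "doc1": {"bm25": 1, "dense": 2},
    "doc2": {"bm25": 2, "dense": 1},
}
for doc_id, per_source in ranks.items():
    rrf_score = sum(1.0 / (rank_constant + r) for r in per_source.values())
    print(doc_id, round(rrf_score, 5))  # both ~0.03252: each doc is 1st in one source, 2nd in the other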
WeightedReranker (Weighted Fusion)
from zvec_db.rerankers import WeightedReranker
from zvec.typing import MetricType
# Hybrid search: BM25 + dense with COSINE distances
reranker = WeightedReranker(
topn=10,
metrics={
"bm25": None, # BM25 scores (not distances)
"dense": MetricType.COSINE, # Convert distance [0,2] → similarity
},
weights={"bm25": 0.4, "dense": 0.6},
normalize={
"bm25": "bayes", # Robust to outliers
"dense": True # Standard normalization
}
)
reranked = reranker.rerank({
"bm25": bm25_results,
"dense": dense_results
})
MultiFieldWeightedReranker
from zvec_db.rerankers import MultiFieldWeightedReranker
reranker = MultiFieldWeightedReranker(
topn=10,
source_weights={"bm25": 0.6, "dense": 0.4},
field_weights={"title": 3.0, "content": 1.0, "tags": 0.5}
)
# Documents with fields
docs = [
Doc(id="1", score=0.0, fields={"title": 0.9, "content": 0.5}),
Doc(id="2", score=0.0, fields={"title": 0.8, "content": 0.6}),
]
reranked = reranker.rerank({"bm25": docs})
Complete Pipeline Example with PipelineReranker
This example shows how to use PipelineReranker to chain multiple rerankers:
first RRF for fusion, then a cross-encoder for final scoring.
import zvec
from zvec import create_and_open, VectorQuery, CollectionSchema, FieldSchema, VectorSchema, DataType, FlatIndexParam
from zvec_db.embedders import BM25Embedder, SentenceTransformersEmbedder
from zvec_db.rerankers import PipelineReranker, RrfReranker, SentenceTransformerReranker
# 1. Initialize embedders
documents = [
"Machine learning is a subset of AI",
"Deep learning uses neural networks",
"NLP helps computers understand text",
"Python is a popular programming language",
]
bm25 = BM25Embedder(max_features=4096)
bm25.fit(documents)
dense = SentenceTransformersEmbedder(model_name="all-MiniLM-L6-v2")
# 2. Create collection
schema = CollectionSchema(
name="docs",
fields=[FieldSchema("text", DataType.STRING)],
vectors=[
VectorSchema(name="sparse", data_type=DataType.SPARSE_VECTOR_FP32, dimension=4096),
VectorSchema(name="dense", data_type=DataType.VECTOR_FP32, dimension=384,
index_param=FlatIndexParam(metric_type=zvec.MetricType.COSINE)),
]
)
collection = create_and_open("./my_db", schema)
# 3. Index documents
for i, doc in enumerate(documents):
    collection.insert(zvec.Doc(
        id=str(i),
        fields={"text": doc},
        vectors={
            "sparse": bm25.embed(doc),
            "dense": dense.embed(doc),
        }
    ))
# 4. Create pipeline: RRF (top 50) -> Cross-Encoder (top 10)
pipeline = PipelineReranker(
rerankers=[
RrfReranker(
topn=50,
rank_constant=60,
schema=collection.schema # Auto-detect metrics from schema
),
SentenceTransformerReranker(
topn=10,
model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"
)
],
topn=10
)
# 5. Search with pipeline
query = "neural networks"
results = collection.query(
vectors=[
VectorQuery(field_name="sparse", vector=bm25.embed(query)),
VectorQuery(field_name="dense", vector=dense.embed(query)),
],
topk=10,
reranker=pipeline
)
# 6. Display results
for i, doc in enumerate(results):
    print(f"{i+1}. {doc.fields['text']} (score: {doc.score:.4f})")
The pipeline applies the rerankers sequentially:
1. RRF combines the sparse and dense results, returning up to 50 documents.
2. The cross-encoder re-scores these 50 documents using query + document attention and returns the final top 10.
Choosing the Right Embedder
| Embedder | When to use |
|---|---|
| CountEmbedder | Baseline, documents of similar length |
| BM25Embedder | General use, good IR performance |
| BM25LEmbedder | Documents with widely varying lengths |
| BM25PlusEmbedder | Many rare terms, recall matters |
| DisMaxEmbedder | Multi-field, match any field |
| TfidfEmbedder | Relative term importance in the corpus |
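CountEmbedder and TfidfEmbedder do not appear in the examples above; here is a minimal sketch, assuming they live in zvec_db.embedders and expose the same fit/embed interface as the other embedders in this guide:
from zvec_db.embedders import CountEmbedder, TfidfEmbedder  # import path assumed
count_embedder = CountEmbedder(max_features=4096)
count_embedder.fit(documents)
tfidf_embedder = TfidfEmbedder(max_features=4096)
tfidf_embedder.fit(documents)
# Both are assumed to return sparse {index: weight} vectors, like BM25Embedder
print(count_embedder.embed("cat sleeps"))
print(tfidf_embedder.embed("cat sleeps"))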
Choosing the Right Reranker
Fusion Rerankers (combine multiple search results):
| Reranker | When to use |
|---|---|
| RrfReranker | Rank fusion, no need for absolute scores |
| WeightedReranker | Precise control of weights per source |
| MultiFieldWeightedReranker | Structured documents (title, content, tags) |
Cross-Encoder Rerankers (re-score with query + document):
| Reranker | When to use |
|---|---|
| SentenceTransformerReranker | Local cross-encoder models (e.g., ms-marco) |
| ClassificationReranker | Multi-class classification reranking |
| OpenAIReranker | OpenAI API-based reranking |