Add BM25 keyword search alongside semantic search using Reciprocal Rank Fusion (RRF) to improve GL/CC matching accuracy from 92.6% to 94%+.
Current semantic-only search fails in cases where:
BM25 keyword matching complements semantic search by catching exact term overlaps that embeddings may dilute.
Query: "Bechtle | MacBook Pro 16"
│
├──→ Semantic Search (pgvector) ──→ Top 20 results
│
└──→ BM25 Search (tsvector FTS) ──→ Top 20 results
│
RRF Merge ◄─────┘
│
Top 10 hybrid results
│
Curation pipeline (cluster, dedup)
│
LLM GL/CC prediction
New column on line_item table:
ALTER TABLE line_item ADD COLUMN search_vector tsvector;
Population:
UPDATE line_item SET search_vector =
to_tsvector('simple', coalesce(supplier_name, '') || ' ' || coalesce(description, ''));
Using 'simple' dictionary (language-agnostic) because:
Index:
CREATE INDEX idx_line_item_search_vector ON line_item USING GIN(search_vector);
Note: Recreate DB with new schema rather than ALTER TABLE on existing data.
src/search/bm25_search.py)def search_bm25(conn, query_text: str, k: int = 20) -> list[dict]:
"""
Full-text search using PostgreSQL tsvector.
Returns more candidates than needed (k=20) for RRF merging.
"""
query = """
SELECT id, supplier_name, description, debit_account,
credit_account, cost_center, net_amount,
ts_rank(search_vector, plainto_tsquery('simple', %s)) AS rank
FROM line_item
WHERE search_vector @@ plainto_tsquery('simple', %s)
ORDER BY rank DESC
LIMIT %s
"""
src/search/hybrid_search.py)def reciprocal_rank_fusion(
semantic_results: list[dict],
bm25_results: list[dict],
k: int = 60 # Standard RRF constant
) -> list[dict]:
"""
Merge two ranked lists using RRF.
RRF_score = sum(1 / (k + rank_i)) for each list containing the item.
"""
scores = {}
for rank, item in enumerate(semantic_results, 1):
scores[item['id']] = scores.get(item['id'], 0) + 1/(k + rank)
for rank, item in enumerate(bm25_results, 1):
scores[item['id']] = scores.get(item['id'], 0) + 1/(k + rank)
# Merge item data, sort by score descending
def search_hybrid(
conn,
query_text: str,
model: str = 'google',
k: int = 10,
semantic_k: int = 20,
bm25_k: int = 20
) -> list[dict]:
"""
Run semantic + BM25 searches, merge with RRF.
"""
semantic_results = search_single_model(conn, query_text, model, k=semantic_k)
bm25_results = search_bm25(conn, query_text, k=bm25_k)
merged = reciprocal_rank_fusion(semantic_results, bm25_results)
return merged[:k]
Minimal change to src/search/curation.py:
def curate_bookings_for_invoice(
conn,
supplier_name: str,
line_items: list[str],
model: str = 'google',
k: int = 10,
threshold: float = 0.6,
use_hybrid: bool = True, # NEW PARAMETER
) -> dict:
"""
If use_hybrid=True, uses hybrid search instead of semantic-only.
"""
for item in line_items:
query = f"{supplier_name} | {item}"
if use_hybrid:
results = search_hybrid(conn, query, model, k=k)
else:
results = search_single_model(conn, query, model, k=k, threshold=threshold)
# ... rest unchanged (merge, cluster, dedup)
The use_hybrid flag enables A/B comparison between semantic-only and hybrid.
Script to compare retrieval for failure cases:
# scripts/compare_retrieval.py
def extract_failures_from_report(html_path: str) -> list[dict]:
"""
Parse evaluation_report_v4.html to find invoices where GL or CC didn't match.
"""
...
failures = extract_failures_from_report('evaluation_report_v4.html')
for case in failures:
# Run semantic-only search
# Run hybrid search
# Print side-by-side comparison
Modify src/evaluation/llm_eval.py to accept --hybrid flag:
# Semantic-only (current)
python -m src.evaluation.llm_eval
# Hybrid search
python -m src.evaluation.llm_eval --hybrid
src/search/
├── pgvector_search.py # existing - semantic search
├── bm25_search.py # NEW - PostgreSQL FTS search
├── hybrid_search.py # NEW - RRF merge + hybrid entry point
├── curation.py # MODIFY - add use_hybrid flag
└── ...
src/evaluation/
├── llm_eval.py # MODIFY - add --hybrid CLI flag
└── ...
scripts/
└── compare_retrieval.py # NEW - qualitative comparison tool
migrations/
└── 006_add_search_vector.sql # NEW - schema migration