For Claude: REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
Goal: Add BM25 keyword search alongside semantic search using RRF to improve GL/CC matching accuracy.
Architecture: Two parallel search paths (pgvector semantic + PostgreSQL FTS) merged using Reciprocal Rank Fusion. Results flow into existing curation pipeline unchanged.
Tech Stack: PostgreSQL tsvector/tsquery, psycopg, existing pgvector infrastructure
Files:
migrations/004_search_vector.sqlStep 1: Write the migration SQL
-- migrations/004_search_vector.sql
-- Add full-text search support for hybrid BM25 + semantic search
-- Add tsvector column for BM25 search
ALTER TABLE line_item ADD COLUMN IF NOT EXISTS search_vector tsvector;
-- Populate search_vector with supplier_name and description
-- Using 'simple' dictionary for language-agnostic tokenization
UPDATE line_item SET search_vector =
to_tsvector('simple', coalesce(supplier_name, '') || ' ' || coalesce(description, ''));
-- Create GIN index for fast full-text search
CREATE INDEX IF NOT EXISTS idx_line_item_search_vector
ON line_item USING GIN(search_vector);
-- Verify
SELECT COUNT(*) FROM line_item WHERE search_vector IS NOT NULL;
Step 2: Run the migration
psql -h localhost -p 5433 -U dev -d semantic_search -f migrations/004_search_vector.sql
Expected: Migration completes, shows count of ~6000+ rows with search_vector populated.
Step 3: Verify the index
psql -h localhost -p 5433 -U dev -d semantic_search -c "\di idx_line_item_search_vector"
Expected: Shows the GIN index on line_item.
Step 4: Test a simple FTS query
psql -h localhost -p 5433 -U dev -d semantic_search -c "
SELECT supplier_name, description, ts_rank(search_vector, plainto_tsquery('simple', 'Bechtle MacBook')) as rank
FROM line_item
WHERE search_vector @@ plainto_tsquery('simple', 'Bechtle MacBook')
ORDER BY rank DESC
LIMIT 5;
"
Expected: Returns Bechtle MacBook-related rows with rank scores.
Step 5: Commit
git add migrations/004_search_vector.sql
git commit -m "feat: add search_vector column for BM25 full-text search"
Files:
src/search/bm25_search.pyStep 1: Create the BM25 search module
# src/search/bm25_search.py
"""BM25 full-text search using PostgreSQL tsvector."""
def search_bm25(
conn,
query_text: str,
k: int = 20,
) -> list[dict]:
"""
Full-text search using PostgreSQL tsvector with BM25-style ranking.
Args:
conn: psycopg database connection
query_text: Search query (natural language)
k: Maximum number of results to return
Returns:
List of dicts with item fields and ts_rank scores, ordered by rank descending.
Each dict has 'rank' field (not 'similarity') to distinguish from semantic search.
"""
with conn.cursor() as cur:
query = """
SELECT
id,
supplier_name,
description,
debit_account,
credit_account,
cost_center,
net_amount,
ts_rank(search_vector, plainto_tsquery('simple', %s)) AS rank
FROM line_item
WHERE search_vector @@ plainto_tsquery('simple', %s)
ORDER BY rank DESC
LIMIT %s
"""
cur.execute(query, (query_text, query_text, k))
columns = [
'id', 'supplier_name', 'description', 'debit_account',
'credit_account', 'cost_center', 'net_amount', 'rank'
]
results = []
for row in cur.fetchall():
results.append(dict(zip(columns, row)))
return results
Step 2: Test the function manually
cd /home/volrath/code/worktrees-orcha-semantic-search/spikes/semantic-search
source .venv/bin/activate
python3 -c "
from src.db import get_connection
from src.search.bm25_search import search_bm25
conn = get_connection()
results = search_bm25(conn, 'Bechtle MacBook', k=5)
for r in results:
print(f\"{r['supplier_name'][:30]:<30} | {r['description'][:40]:<40} | rank={r['rank']:.4f}\")
conn.close()
"
Expected: Returns Bechtle MacBook-related items with rank scores.
Step 3: Commit
git add src/search/bm25_search.py
git commit -m "feat: add BM25 full-text search function"
Files:
src/search/hybrid_search.pyStep 1: Create the hybrid search module with RRF
# src/search/hybrid_search.py
"""Hybrid search combining semantic (pgvector) and BM25 (tsvector) with RRF."""
from typing import Optional
from .pgvector_search import search_single_model
from .bm25_search import search_bm25
def reciprocal_rank_fusion(
semantic_results: list[dict],
bm25_results: list[dict],
k: int = 60,
) -> list[dict]:
"""
Merge two ranked result lists using Reciprocal Rank Fusion.
RRF score = sum(1 / (k + rank)) for each list containing the item.
Higher k reduces the impact of high ranks (standard k=60).
Args:
semantic_results: Results from semantic search (has 'similarity' field)
bm25_results: Results from BM25 search (has 'rank' field)
k: RRF constant (default 60 from original RRF paper)
Returns:
Merged list sorted by RRF score descending.
Each item has 'rrf_score', 'semantic_rank', 'bm25_rank' fields added.
"""
# Build score accumulator and item data store
scores = {} # id -> rrf_score
items = {} # id -> item dict
semantic_ranks = {} # id -> rank (1-indexed)
bm25_ranks = {} # id -> rank (1-indexed)
# Process semantic results
for rank, item in enumerate(semantic_results, 1):
item_id = item['id']
scores[item_id] = scores.get(item_id, 0) + 1 / (k + rank)
items[item_id] = item.copy()
semantic_ranks[item_id] = rank
# Process BM25 results
for rank, item in enumerate(bm25_results, 1):
item_id = item['id']
scores[item_id] = scores.get(item_id, 0) + 1 / (k + rank)
if item_id not in items:
items[item_id] = item.copy()
bm25_ranks[item_id] = rank
# Build final results with RRF metadata
merged = []
for item_id, rrf_score in scores.items():
result = items[item_id]
result['rrf_score'] = rrf_score
result['semantic_rank'] = semantic_ranks.get(item_id)
result['bm25_rank'] = bm25_ranks.get(item_id)
# Normalize: use similarity if available, else derive from rank
if 'similarity' not in result and 'rank' in result:
result['similarity'] = result.pop('rank') # Use BM25 rank as similarity proxy
merged.append(result)
# Sort by RRF score descending
merged.sort(key=lambda x: x['rrf_score'], reverse=True)
return merged
def search_hybrid(
conn,
query_text: str,
model: str = 'google',
k: int = 10,
semantic_k: int = 20,
bm25_k: int = 20,
threshold: float = 0.0,
rrf_k: int = 60,
) -> list[dict]:
"""
Run hybrid search combining semantic and BM25, merged with RRF.
Args:
conn: psycopg database connection
query_text: Search query
model: Embedding model ('google' or 'minilm')
k: Final number of results to return
semantic_k: Number of results from semantic search (before merge)
bm25_k: Number of results from BM25 search (before merge)
threshold: Minimum similarity threshold for semantic search
rrf_k: RRF constant (default 60)
Returns:
Top-k results after RRF merge, with rrf_score and rank metadata.
"""
# Run both searches
semantic_results = search_single_model(conn, query_text, model, k=semantic_k, threshold=threshold)
bm25_results = search_bm25(conn, query_text, k=bm25_k)
# Merge with RRF
merged = reciprocal_rank_fusion(semantic_results, bm25_results, k=rrf_k)
return merged[:k]
Step 2: Test the hybrid search
source .venv/bin/activate
python3 -c "
from src.db import get_connection
from src.search.hybrid_search import search_hybrid
conn = get_connection()
results = search_hybrid(conn, 'Bechtle MacBook Pro', model='google', k=5)
for r in results:
sem_rank = r.get('semantic_rank', '-')
bm25_rank = r.get('bm25_rank', '-')
print(f\"RRF={r['rrf_score']:.4f} | sem={sem_rank} bm25={bm25_rank} | {r['description'][:50]}\")
conn.close()
"
Expected: Results show RRF scores with both semantic and BM25 rank info. Items appearing in both lists should have higher RRF scores.
Step 3: Commit
git add src/search/hybrid_search.py
git commit -m "feat: add hybrid search with reciprocal rank fusion"
Files:
src/search/curation.pyStep 1: Add import for hybrid search
At the top of src/search/curation.py, after line 9 (after existing imports), add:
from .hybrid_search import search_hybrid
Step 2: Add use_hybrid parameter to curate_bookings_for_invoice
Modify the function signature at line 21-28 to add use_hybrid parameter:
def curate_bookings_for_invoice(
conn,
supplier_name: str,
line_items: list[str],
model: str = 'minilm',
k: int = DEFAULT_K,
threshold: Optional[float] = None,
use_hybrid: bool = False, # NEW PARAMETER
) -> dict:
Step 3: Modify the search loop to use hybrid when requested
Replace the search loop (lines 67-77) with:
# Step 1: Per-line-item search (semantic or hybrid)
all_results = []
for line_item in line_items:
query = f"{supplier_name} | {line_item}"
if use_hybrid:
# Hybrid search: semantic + BM25 with RRF
results = search_hybrid(
conn, query, model=model, k=k,
semantic_k=k * 2, bm25_k=k * 2, threshold=threshold or 0.0
)
else:
# Original semantic-only search
query_embedding = embed_func(query)
results = search_pgvector(conn, query_embedding, embedding_column, k, threshold)
# Tag each result with source line item
for r in results:
r['_source_line_item'] = line_item
all_results.extend(results)
Step 4: Update metadata to include use_hybrid flag
Modify the metadata dict (around line 112-120) to include:
return {
'clusters': output_clusters,
'metadata': {
'model': model,
'k': k,
'threshold': threshold,
'use_hybrid': use_hybrid, # NEW FIELD
'line_items_count': len(line_items),
'total_raw_matches': len(all_results),
'unique_matches': len(merged),
'clusters_count': len(output_clusters),
},
}
Step 5: Test the integration
source .venv/bin/activate
python3 -c "
from src.db import get_connection
from src.search.curation import curate_bookings_for_invoice, curated_bookings_to_csv
conn = get_connection()
# Test with hybrid=True
result = curate_bookings_for_invoice(
conn,
supplier_name='Bechtle GmbH IT- Systemhaus',
line_items=['MacBook Pro 16'],
model='google',
use_hybrid=True
)
print('Hybrid search results:')
print(f\"Clusters: {result['metadata']['clusters_count']}, Unique matches: {result['metadata']['unique_matches']}\")
print(curated_bookings_to_csv(result)[:500])
conn.close()
"
Expected: Returns curated results with use_hybrid=True in metadata.
Step 6: Commit
git add src/search/curation.py
git commit -m "feat: add use_hybrid flag to curation pipeline"
Files:
scripts/compare_retrieval.pyStep 1: Create the comparison script
#!/usr/bin/env python3
"""Compare semantic-only vs hybrid retrieval for failure cases."""
import re
import sys
from pathlib import Path
from bs4 import BeautifulSoup
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.db import get_connection
from src.search.curation import curate_bookings_for_invoice
def extract_failures_from_report(html_path: str) -> list[dict]:
"""
Parse evaluation report HTML to find invoices where GL or CC didn't match.
Returns list of dicts with invoice_number, supplier, expected_gl, expected_cc.
"""
with open(html_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
failures = []
# Find all invoice sections - look for invoice headers
# The report structure has invoice cards with match/mismatch indicators
for card in soup.find_all('div', class_='invoice-card'):
# Check if this invoice has any failures
gl_badge = card.find('span', class_='badge-danger')
cc_badge = card.find('span', class_='badge-warning')
if gl_badge or cc_badge:
# Extract invoice info from the card header
header = card.find('h4') or card.find('h3')
if header:
invoice_text = header.get_text(strip=True)
# Parse invoice number and supplier
match = re.match(r'(.+?)\s*[|\-]\s*(.+)', invoice_text)
if match:
invoice_number = match.group(1).strip()
supplier = match.group(2).strip()
else:
invoice_number = invoice_text
supplier = ''
failures.append({
'invoice_number': invoice_number,
'supplier': supplier,
'has_gl_failure': gl_badge is not None,
'has_cc_failure': cc_badge is not None,
})
return failures
def compare_retrieval(invoice_number: str, supplier: str, line_items: list[str]):
"""Compare semantic-only vs hybrid retrieval for a single invoice."""
conn = get_connection()
print(f"\n{'='*70}")
print(f"Invoice: {invoice_number} | Supplier: {supplier}")
print(f"Line items: {line_items}")
print('='*70)
# Semantic-only search
print("\n--- SEMANTIC ONLY (use_hybrid=False) ---")
semantic_result = curate_bookings_for_invoice(
conn, supplier, line_items, model='google', use_hybrid=False
)
print(f"Clusters: {semantic_result['metadata']['clusters_count']}")
for cluster in semantic_result['clusters'][:3]:
print(f" GL:{cluster['debit_account']} CC:{cluster['cost_center']} ({cluster['total_matches']} matches)")
for item in cluster['items'][:2]:
print(f" - {item['description'][:50]}")
# Hybrid search
print("\n--- HYBRID (use_hybrid=True) ---")
hybrid_result = curate_bookings_for_invoice(
conn, supplier, line_items, model='google', use_hybrid=True
)
print(f"Clusters: {hybrid_result['metadata']['clusters_count']}")
for cluster in hybrid_result['clusters'][:3]:
print(f" GL:{cluster['debit_account']} CC:{cluster['cost_center']} ({cluster['total_matches']} matches)")
for item in cluster['items'][:2]:
print(f" - {item['description'][:50]}")
conn.close()
def main():
# Check for report file
report_path = Path('evaluation_report_v4.html')
if not report_path.exists():
print(f"Error: {report_path} not found")
print("Please provide the path to the evaluation report HTML file.")
sys.exit(1)
print("Extracting failure cases from evaluation report...")
failures = extract_failures_from_report(str(report_path))
print(f"Found {len(failures)} invoices with failures")
if not failures:
print("No failures found in report. Exiting.")
return
# For each failure, we need to get the line items
# This is a simplified version - in reality you'd fetch from Orcha DB
print("\nComparing retrieval for each failure case...")
print("(Using supplier name as single line item for demo)")
for case in failures[:5]: # Limit to first 5 for manual review
compare_retrieval(
case['invoice_number'],
case['supplier'],
[case['supplier']] # Simplified - use supplier as line item
)
if __name__ == '__main__':
main()
Step 2: Install beautifulsoup4 if not present
source .venv/bin/activate
pip install beautifulsoup4
Step 3: Run the comparison (if report exists)
python scripts/compare_retrieval.py
Expected: Shows side-by-side comparison of semantic vs hybrid results for failure cases.
Step 4: Commit
git add scripts/compare_retrieval.py
git commit -m "feat: add qualitative retrieval comparison script"
Files:
src/evaluation/llm_eval.pyStep 1: Add argparse import and CLI parsing
At the top of the file (after other imports around line 16), add:
import argparse
Step 2: Add argument parsing in main block
Find the if __name__ == '__main__': block at the end of the file and replace it with:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Evaluate LLM GL/CC predictions')
parser.add_argument('--hybrid', action='store_true',
help='Use hybrid BM25+semantic search instead of semantic-only')
parser.add_argument('--output', type=str, default='evaluation_report.html',
help='Output HTML report filename')
args = parser.parse_args()
# Pass hybrid flag to evaluation
run_evaluation(use_hybrid=args.hybrid, output_file=args.output)
Step 3: Modify run_evaluation function to accept use_hybrid
Find the run_evaluation function definition and add the use_hybrid parameter:
def run_evaluation(use_hybrid: bool = False, output_file: str = 'evaluation_report.html'):
Step 4: Pass use_hybrid to curate_bookings_for_invoice calls
Find where curate_bookings_for_invoice is called in the evaluation code and add the use_hybrid parameter:
curated = curate_bookings_for_invoice(
conn_semantic,
supplier_name=invoice['issuer_name'],
line_items=descriptions,
model='google',
use_hybrid=use_hybrid, # ADD THIS
)
Step 5: Test the CLI
source .venv/bin/activate
python -m src.evaluation.llm_eval --help
Expected: Shows help with --hybrid flag option.
Step 6: Commit
git add src/evaluation/llm_eval.py
git commit -m "feat: add --hybrid flag to LLM evaluation CLI"
Step 1: Run evaluation with hybrid search
source .venv/bin/activate
python -m src.evaluation.llm_eval --hybrid --output evaluation_report_hybrid.html
Expected: Generates new evaluation report using hybrid search.
Step 2: Compare results
Compare evaluation_report_hybrid.html with previous evaluation_report_v4.html:
Step 3: Document results
Update the design doc or create a results summary with:
Step 4: Final commit with results
git add evaluation_report_hybrid.html
git commit -m "feat: evaluation results with hybrid BM25+semantic search"