Purpose: INFRA-06 requires a local embedding model (no API dependency). EVAL-02 requires synthetic query variations for robust evaluation. HNSW indexes enable fast similarity search in Phase 3.
Output: MiniLM embeddings for all items, HNSW indexes on all 3 embedding columns, synthetic query variations stored for test set items.
<execution_context> @./.claude/get-shit-done/workflows/execute-plan.md @./.claude/get-shit-done/templates/summary.md </execution_context>
@.planning/phases/02-embedding-generation/02-01-SUMMARY.md
@src/db.py @src/embeddings/text_prep.py @src/embeddings/batch_processor.py
src/embeddings/minilm_embed.py:"""Local all-MiniLM-L6-v2 embeddings via sentence-transformers."""
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
# Model loaded lazily (22MB download on first use)
# Module-level cache: stays None until _get_model() assigns the loaded model.
_model = None
def _get_model():
    """Return the shared MiniLM model, creating it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of constructing a new SentenceTransformer.
    """
    global _model
    if _model is not None:
        return _model

    print("Loading MiniLM model (first time may download)...")
    _model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return _model
def embed_minilm(texts: list[str]) -> list[list[float]]:
    """
    Encode texts with the local MiniLM model and return plain Python lists.

    Args:
        texts: Texts to embed.

    Returns:
        One embedding vector per input text (384 dimensions for MiniLM),
        converted from the encoder output with ``.tolist()``.
    """
    encode_options = {
        "batch_size": 128,
        "show_progress_bar": False,
        # Normalized vectors are requested for cosine similarity.
        "normalize_embeddings": True,
    }
    vectors = _get_model().encode(texts, **encode_options)
    return vectors.tolist()
def embed_minilm_batch(
    conn,
    batch_size: int = 500,  # Larger batches OK for local model
) -> int:
    """
    Embed every line item that has no MiniLM vector yet and store the result.

    Rows with ``embedding_minilm IS NULL`` are read in id order, encoded in a
    single ``encode`` call, and written back in chunks of ``batch_size``.

    Args:
        conn: psycopg connection
        batch_size: Number of (id, embedding) pairs per database write

    Returns:
        Sum of the counts reported by ``update_embeddings_batch``
        (0 when nothing needed embedding).
    """
    from .text_prep import prepare_embedding_text
    from .batch_processor import update_embeddings_batch

    # Collect the rows that are still missing a MiniLM embedding.
    with conn.cursor() as cur:
        cur.execute("""
SELECT id, supplier_name_normalized, description_normalized
FROM line_item
WHERE embedding_minilm IS NULL
ORDER BY id
""")
        pending = cur.fetchall()

    if not pending:
        print("All items already have MiniLM embeddings")
        return 0

    # Split rows into parallel lists: primary keys and the text to embed.
    ids = []
    texts = []
    for row in pending:
        ids.append(row[0])
    for row in pending:
        texts.append(prepare_embedding_text(row[1], row[2]))

    # The local model is fast enough to encode everything in one call.
    print(f"Generating {len(texts)} MiniLM embeddings...")
    encoder = _get_model()
    vectors = encoder.encode(
        texts,
        batch_size=128,
        show_progress_bar=True,
        normalize_embeddings=True,
    ).tolist()

    # Persist in chunks so each write stays bounded in size.
    total_updated = 0
    for start in tqdm(range(0, len(ids), batch_size), desc="Writing to DB"):
        stop = start + batch_size
        pairs = list(zip(ids[start:stop], vectors[start:stop]))
        total_updated += update_embeddings_batch(conn, 'embedding_minilm', pairs)

    print(f"Generated {total_updated} MiniLM embeddings (384 dimensions)")
    return total_updated
if __name__ == "__main__":
    # Script entry point: embed all pending line items using a fresh connection.
    from src.db import get_connection

    conn = get_connection()
    try:
        embed_minilm_batch(conn)
    finally:
        # Always release the connection, even if embedding fails part-way.
        conn.close()
src/embeddings/__init__.py to include minilm:"""Embedding generation for semantic search comparison."""
from .text_prep import prepare_embedding_text
from .batch_processor import batch_embed_with_progress, update_embeddings_batch
from .minilm_embed import embed_minilm, embed_minilm_batch
# Public names re-exported by the package (text prep, batch helpers, MiniLM).
__all__ = [
"prepare_embedding_text",
"batch_embed_with_progress",
"update_embeddings_batch",
"embed_minilm",
"embed_minilm_batch",
]
uv run python -m src.embeddings.minilm_embed
Expected: ~6078 embeddings generated in ~30 seconds (local model, ~1000 texts/sec on CPU).
docker compose exec -T postgres psql -U dev -d dev -c "SELECT COUNT(*) FROM line_item WHERE embedding_minilm IS NOT NULL"
Expected: 6078 (all items have MiniLM embeddings)
migrations/003_hnsw_indexes.sql:-- HNSW indexes for similarity search
-- Created AFTER embedding population for efficiency
-- Parameters: m=16 (connections per node), ef_construction=64 (build quality)
-- IF NOT EXISTS lets this migration be re-run without failing on existing indexes.
-- vector_cosine_ops builds each index for cosine-distance search.
-- Google embeddings (768 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_google_hnsw
ON line_item
USING hnsw (embedding_google vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Jina embeddings (1024 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_jina_hnsw
ON line_item
USING hnsw (embedding_jina vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- MiniLM embeddings (384 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_minilm_hnsw
ON line_item
USING hnsw (embedding_minilm vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Verify indexes created
-- Lists every index on line_item whose name starts with idx_embedding.
SELECT indexname, indexdef
FROM pg_indexes
WHERE tablename = 'line_item' AND indexname LIKE 'idx_embedding%';
docker compose exec -T postgres psql -U dev -d dev < migrations/003_hnsw_indexes.sql
-- HNSW indexes created after data population
-- Uncomment these after initial data load in Phase 2
-- CREATE INDEX idx_embedding_google_hnsw ON line_item USING hnsw (embedding_google vector_cosine_ops) WITH (m = 16, ef_construction = 64);
-- CREATE INDEX idx_embedding_jina_hnsw ON line_item USING hnsw (embedding_jina vector_cosine_ops) WITH (m = 16, ef_construction = 64);
-- CREATE INDEX idx_embedding_minilm_hnsw ON line_item USING hnsw (embedding_minilm vector_cosine_ops) WITH (m = 16, ef_construction = 64);
Expected: Index creation takes ~10-30 seconds per index with 6K items.
docker compose exec -T postgres psql -U dev -d dev -c "SELECT indexname FROM pg_indexes WHERE tablename = 'line_item' AND indexname LIKE 'idx_embedding%'"
Expected: 3 indexes listed (idx_embedding_google_hnsw, idx_embedding_jina_hnsw, idx_embedding_minilm_hnsw)
migrations/002_test_query_variation.sql:-- Stores synthetic query variations for test set evaluation
-- Each row holds one synthetic variation of a line item's text.
CREATE TABLE IF NOT EXISTS test_query_variation (
id BIGSERIAL PRIMARY KEY,
-- Line item the variation was derived from.
line_item_id BIGINT NOT NULL REFERENCES line_item(id),
variation_type TEXT NOT NULL, -- 'typo', 'reorder', 'paraphrase'
-- Text before and after the variation was applied.
original_text TEXT NOT NULL,
varied_text TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Index for looking up variations by their source line item.
CREATE INDEX IF NOT EXISTS idx_test_query_variation_line_item
ON test_query_variation(line_item_id);
-- Index for filtering or grouping by variation type.
CREATE INDEX IF NOT EXISTS idx_test_query_variation_type
ON test_query_variation(variation_type);
docker compose exec -T postgres psql -U dev -d dev < migrations/002_test_query_variation.sql
uv add nlpaug
src/evaluation/query_variations.py:"""Generate synthetic query variations for test set evaluation."""
import random
import os
from typing import Optional
# Lazy imports for optional dependencies
# Both caches stay None until their _get_* accessor creates the object,
# so nlpaug / google-genai are only imported when actually used.
_typo_aug = None
_genai_client = None
def _get_typo_augmenter():
    """Return the shared keyboard-typo augmenter, building it on first use.

    The augmenter is configured with ``lang='de'`` (German QWERTZ layout)
    and cached in the module-level ``_typo_aug``.
    """
    global _typo_aug
    if _typo_aug is not None:
        return _typo_aug

    # Imported here so nlpaug is only needed when typo variations are requested.
    import nlpaug.augmenter.char as nac

    settings = dict(
        aug_char_p=0.1,
        aug_word_p=0.3,
        include_numeric=False,
        lang='de',  # German QWERTZ layout
    )
    _typo_aug = nac.KeyboardAug(**settings)
    return _typo_aug
def _get_genai_client():
    """Return the shared Gemini client used for paraphrasing, creating it once.

    ``GOOGLE_GENAI_USE_VERTEXAI`` is set to ``'True'`` only if the environment
    does not already define it; the client is then cached in ``_genai_client``.
    """
    global _genai_client
    if _genai_client is not None:
        return _genai_client

    # Imported here so google-genai is only needed when paraphrasing is used.
    from google import genai

    os.environ.setdefault('GOOGLE_GENAI_USE_VERTEXAI', 'True')
    _genai_client = genai.Client()
    return _genai_client
def generate_typo_variation(text: str) -> str:
    """Generate a keyboard-typo variation of ``text``.

    Args:
        text: Text to corrupt with keyboard typos.

    Returns:
        The augmented text. Blank input is returned unchanged, and ``text``
        itself is returned if the augmenter produces no output.
    """
    # Nothing to corrupt; also avoids loading the augmenter for blank input.
    if not text.strip():
        return text

    result = _get_typo_augmenter().augment(text)
    if isinstance(result, list):
        # The augmenter may return a list; guard against it being empty
        # instead of failing with IndexError.
        return result[0] if result else text
    return result
def _reorder_interior(words: list[str]) -> list[str]:
    """Shuffle all words except the first and last, avoiding a no-op result."""
    if len(words) <= 3:
        # Zero or one interior word: there is nothing that can move.
        return words

    interior = words[1:-1]
    shuffled = interior[:]
    random.shuffle(shuffled)
    if shuffled == interior and len(set(interior)) > 1:
        # The shuffle landed on the original order. Rotating by one always
        # differs when at least two interior words are distinct.
        shuffled = interior[1:] + interior[:1]
    return [words[0], *shuffled, words[-1]]


def generate_reorder_variation(text: str) -> str:
    """Reorder words in text (keep first and last for context).

    When the text contains ``|`` separators, each part is reordered on its
    own and the parts are re-joined with ``' | '``. The interior words are
    guaranteed to change order whenever at least two of them differ, so the
    result is a real variation rather than an accidental copy of the input.

    Args:
        text: Text whose interior words should be shuffled.

    Returns:
        The reordered text; texts of two words or fewer are returned as-is.
    """
    if len(text.split()) <= 2:
        return text

    if '|' in text:
        # Has separator - reorder within each part independently.
        return ' | '.join(
            ' '.join(_reorder_interior(part.split()))
            for part in text.split('|')
        )

    return ' '.join(_reorder_interior(text.split()))
def generate_paraphrase_variation(text: str) -> str:
    """Use Gemini to paraphrase German accounting text.

    Args:
        text: Original invoice description to paraphrase.

    Returns:
        The paraphrase with surrounding whitespace stripped.

    Raises:
        ValueError: If the response carries no text or only whitespace, so
            callers can fall back instead of storing an empty variation.
    """
    client = _get_genai_client()
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=f"""Paraphrase this German invoice description in a different way while keeping the same meaning.
Only output the paraphrase, nothing else. Keep it concise.
Original: {text}""",
    )
    # response.text can be None (e.g. no text parts in the response); treat
    # that like an empty answer rather than failing on None.strip().
    paraphrase = (response.text or "").strip()
    if not paraphrase:
        raise ValueError("Gemini returned an empty paraphrase")
    return paraphrase
def generate_variations(
    text: str,
    variation_types: Optional[list[str]] = None
) -> dict[str, str]:
    """
    Build the requested synthetic variations of one text.

    Args:
        text: Original text to vary
        variation_types: Types to generate; ``None`` means all of
            'typo', 'reorder' and 'paraphrase'. Unknown names are ignored.

    Returns:
        Dict mapping variation_type to varied_text, in the fixed order
        typo, reorder, paraphrase.
    """
    requested = variation_types
    if requested is None:
        requested = ['typo', 'reorder', 'paraphrase']

    def paraphrase_or_reorder() -> str:
        # Paraphrasing depends on an external API; on any failure the
        # reorder variation is used in its place.
        try:
            return generate_paraphrase_variation(text)
        except Exception as e:
            print(f"Paraphrase failed: {e}")
            return generate_reorder_variation(text)

    # Lambdas keep generator lookups lazy: only requested types are touched.
    producers = (
        ('typo', lambda: generate_typo_variation(text)),
        ('reorder', lambda: generate_reorder_variation(text)),
        ('paraphrase', paraphrase_or_reorder),
    )
    return {
        kind: produce()
        for kind, produce in producers
        if kind in requested
    }
def generate_all_test_variations(
    conn,
    variation_types: Optional[list[str]] = None,
    limit: Optional[int] = None
) -> int:
    """
    Generate and store query variations for test-set items that have none yet.

    Items are selected with ``is_test_set = TRUE`` and no existing row in
    ``test_query_variation``, so re-running the function only processes
    items that are still missing variations.

    Args:
        conn: psycopg connection
        variation_types: Types to generate (default: all)
        limit: Optional cap on the number of items processed (for testing);
            ``None`` or 0 means no cap

    Returns:
        Number of variations created
    """
    from tqdm import tqdm
    from src.embeddings.text_prep import prepare_embedding_text

    if variation_types is None:
        variation_types = ['typo', 'reorder', 'paraphrase']

    # Fetch test set items without variations
    query = """
SELECT l.id, l.supplier_name_normalized, l.description_normalized
FROM line_item l
WHERE l.is_test_set = TRUE
AND NOT EXISTS (
SELECT 1 FROM test_query_variation v WHERE v.line_item_id = l.id
)
ORDER BY l.id
"""
    params = None
    if limit:
        # Bind the limit as a query parameter instead of formatting it into
        # the SQL text, so a non-integer value cannot alter the statement.
        query += " LIMIT %s"
        params = (limit,)

    with conn.cursor() as cur:
        cur.execute(query, params)
        rows = cur.fetchall()

    if not rows:
        print("All test items already have variations")
        return 0

    print(f"Generating variations for {len(rows)} test items...")
    insert_sql = """
INSERT INTO test_query_variation
(line_item_id, variation_type, original_text, varied_text)
VALUES (%s, %s, %s, %s)
"""
    total_created = 0
    for row in tqdm(rows, desc="Generating variations"):
        item_id, supplier, description = row
        original_text = prepare_embedding_text(supplier, description)

        # Generate variations
        variations = generate_variations(original_text, variation_types)

        # Insert into database
        with conn.cursor() as cur:
            for var_type, varied_text in variations.items():
                cur.execute(
                    insert_sql,
                    (item_id, var_type, original_text, varied_text),
                )
                total_created += 1
        # Commit per item so finished work survives an interruption; the
        # NOT EXISTS filter above skips those items on the next run.
        conn.commit()

    print(f"Created {total_created} query variations")
    return total_created
if __name__ == "__main__":
    # Script entry point: generate variations for all pending test-set items.
    from src.db import get_connection

    conn = get_connection()
    try:
        generate_all_test_variations(conn)
    finally:
        # Always release the connection, even if generation fails part-way.
        conn.close()
src/evaluation/__init__.py:"""Evaluation utilities for semantic search comparison."""
from .train_test_split import create_train_test_split, get_split_stats
from .query_variations import generate_variations, generate_all_test_variations
# Public names re-exported by the package (split helpers, query variations).
__all__ = [
"create_train_test_split",
"get_split_stats",
"generate_variations",
"generate_all_test_variations",
]
uv run python -m src.evaluation.query_variations
Expected: ~1216 test items * 3 variations = ~3648 total variations. Takes ~5-10 minutes due to paraphrase API calls.
docker compose exec -T postgres psql -U dev -d dev -c "SELECT variation_type, COUNT(*) FROM test_query_variation GROUP BY variation_type"
Expected: 3 result rows (typo, reorder, paraphrase), each with a count of ~1216
<success_criteria>