Generate local MiniLM embeddings, create HNSW indexes, and generate synthetic query variations for the test set.

Purpose: INFRA-06 requires a local embedding model (no API dependency). EVAL-02 requires synthetic query variations for robust evaluation. HNSW indexes enable fast similarity search in Phase 3.

Output: MiniLM embeddings for all items, HNSW indexes on all 3 embedding columns, synthetic query variations stored for test set items.

<execution_context> @./.claude/get-shit-done/workflows/execute-plan.md @./.claude/get-shit-done/templates/summary.md </execution_context>

@.planning/PROJECT.md @.planning/ROADMAP.md @.planning/STATE.md @.planning/phases/02-embedding-generation/02-RESEARCH.md

Prior plan context

@.planning/phases/02-embedding-generation/02-01-SUMMARY.md

Existing source files

@src/db.py @src/embeddings/text_prep.py @src/embeddings/batch_processor.py

Task 1: Create MiniLM embedding module and generate embeddings src/embeddings/minilm_embed.py Create the local MiniLM embedding module and generate all embeddings:

Create src/embeddings/minilm_embed.py:

"""Local all-MiniLM-L6-v2 embeddings via sentence-transformers."""
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# Lazily-initialised shared model instance (22MB download on first use)
_model = None


def _get_model():
    """Return the shared MiniLM model, constructing it on the first call."""
    global _model
    # Fast path: the model was already built by an earlier call.
    if _model is not None:
        return _model
    print("Loading MiniLM model (first time may download)...")
    _model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return _model


def embed_minilm(texts: list[str]) -> list[list[float]]:
    """
    Embed texts using the local MiniLM model.

    Args:
        texts: List of texts to embed. May be empty.

    Returns:
        One embedding vector per input text, in input order
        (384-dimensional for this model). An empty input yields an
        empty list without loading the model.
    """
    if len(texts) == 0:
        # Nothing to encode: avoid loading (and possibly downloading) the
        # model just to produce an empty result.
        return []

    model = _get_model()
    embeddings = model.encode(
        texts,
        batch_size=128,
        show_progress_bar=False,
        normalize_embeddings=True,  # For cosine similarity
    )
    return embeddings.tolist()


def embed_minilm_batch(
    conn,
    batch_size: int = 500,  # Larger batches OK for local model
) -> int:
    """
    Fill in MiniLM embeddings for every line item that does not have one yet.

    All pending texts are encoded in a single pass, then the vectors are
    written back to the database in chunks of ``batch_size``.

    Args:
        conn: psycopg connection
        batch_size: Items per database write batch

    Returns:
        Sum of the counts returned by ``update_embeddings_batch``
        (presumably the number of rows updated), or 0 if nothing was pending
    """
    from .text_prep import prepare_embedding_text
    from .batch_processor import update_embeddings_batch

    # Only rows still missing an embedding are selected.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, supplier_name_normalized, description_normalized
            FROM line_item
            WHERE embedding_minilm IS NULL
            ORDER BY id
        """)
        pending = cur.fetchall()

    if not pending:
        print("All items already have MiniLM embeddings")
        return 0

    # Split each row into its id and the text that will be embedded.
    item_ids = []
    texts = []
    for record in pending:
        item_ids.append(record[0])
        texts.append(prepare_embedding_text(record[1], record[2]))

    # Encode everything in one call; the model batches internally.
    print(f"Generating {len(texts)} MiniLM embeddings...")
    vectors = _get_model().encode(
        texts,
        batch_size=128,
        show_progress_bar=True,
        normalize_embeddings=True,
    ).tolist()

    # Persist in fixed-size chunks.
    total_updated = 0
    chunk_starts = range(0, len(item_ids), batch_size)
    for start in tqdm(chunk_starts, desc="Writing to DB"):
        stop = start + batch_size
        chunk = list(zip(item_ids[start:stop], vectors[start:stop]))
        total_updated += update_embeddings_batch(conn, 'embedding_minilm', chunk)

    print(f"Generated {total_updated} MiniLM embeddings (384 dimensions)")
    return total_updated


if __name__ == "__main__":
    from src.db import get_connection

    conn = get_connection()
    try:
        embed_minilm_batch(conn)
    finally:
        # Release the connection even if encoding or a DB write fails.
        conn.close()

Update src/embeddings/__init__.py to include minilm:

"""Embedding generation for semantic search comparison."""
from .text_prep import prepare_embedding_text
from .batch_processor import batch_embed_with_progress, update_embeddings_batch
from .minilm_embed import embed_minilm, embed_minilm_batch

__all__ = [
    "prepare_embedding_text",
    "batch_embed_with_progress",
    "update_embeddings_batch",
    "embed_minilm",
    "embed_minilm_batch",
]

Run the embedding generation:

uv run python -m src.embeddings.minilm_embed

Expected: ~6078 embeddings generated in ~30 seconds (local model, ~1000 texts/sec on CPU). Run docker compose exec -T postgres psql -U dev -d dev -c "SELECT COUNT(*) FROM line_item WHERE embedding_minilm IS NOT NULL" Expected: 6078 (all items have MiniLM embeddings)

src/embeddings/minilm_embed.py exports embed_minilm and embed_minilm_batch
All 6078 line items have embedding_minilm populated
Embeddings are 384 dimensions, normalized for cosine similarity

Task 2: Create HNSW indexes for all embedding columns migrations/003_hnsw_indexes.sql Create HNSW indexes AFTER all embeddings are populated (this is critical - indexing empty/sparse data is inefficient).

Create migrations/003_hnsw_indexes.sql:

-- HNSW indexes for similarity search
-- Created AFTER embedding population for efficiency
-- Parameters: m=16 (connections per node), ef_construction=64 (build quality)
--
-- Each statement uses IF NOT EXISTS, so re-running this migration does not
-- fail on indexes that are already present.
-- vector_cosine_ops builds each index for cosine distance.
-- NOTE(review): per the pgvector docs, queries must order by the
-- cosine-distance operator (<=>) for these indexes to be used - confirm the
-- Phase 3 search queries do so.

-- Google embeddings (768 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_google_hnsw
ON line_item
USING hnsw (embedding_google vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Jina embeddings (1024 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_jina_hnsw
ON line_item
USING hnsw (embedding_jina vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- MiniLM embeddings (384 dimensions)
CREATE INDEX IF NOT EXISTS idx_embedding_minilm_hnsw
ON line_item
USING hnsw (embedding_minilm vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Verify indexes created
-- Lists every index on line_item whose name starts with 'idx_embedding'.
SELECT indexname, indexdef
FROM pg_indexes
WHERE tablename = 'line_item' AND indexname LIKE 'idx_embedding%';

Apply the migration:

docker compose exec -T postgres psql -U dev -d dev < migrations/003_hnsw_indexes.sql

Also record these indexes in init.sql, commented out, as a reminder for fresh database setups (they should only be created after the data load). Add at the end of init.sql:

-- HNSW indexes created after data population
-- Uncomment these after initial data load in Phase 2
-- CREATE INDEX idx_embedding_google_hnsw ON line_item USING hnsw (embedding_google vector_cosine_ops) WITH (m = 16, ef_construction = 64);
-- CREATE INDEX idx_embedding_jina_hnsw ON line_item USING hnsw (embedding_jina vector_cosine_ops) WITH (m = 16, ef_construction = 64);
-- CREATE INDEX idx_embedding_minilm_hnsw ON line_item USING hnsw (embedding_minilm vector_cosine_ops) WITH (m = 16, ef_construction = 64);

Expected: Index creation takes ~10-30 seconds per index with 6K items. Run docker compose exec -T postgres psql -U dev -d dev -c "SELECT indexname FROM pg_indexes WHERE tablename = 'line_item' AND indexname LIKE 'idx_embedding%'" Expected: 3 indexes listed (idx_embedding_google_hnsw, idx_embedding_jina_hnsw, idx_embedding_minilm_hnsw)

HNSW indexes exist for all 3 embedding columns
Indexes use cosine distance (vector_cosine_ops)
Index parameters: m=16, ef_construction=64

Task 3: Create query variation module and generate test set variations src/evaluation/query_variations.py, migrations/002_test_query_variation.sql Create the query variation system for test set evaluation:

Create migrations/002_test_query_variation.sql:

-- Stores synthetic query variations for test set evaluation
-- One row per (line item, variation type) pair as written by the generator:
--   line_item_id   - the line_item row the variation was derived from
--   original_text  - the text before variation
--   varied_text    - the text after variation
-- NOTE(review): there is no UNIQUE constraint on (line_item_id, variation_type),
-- so the schema itself does not prevent duplicate variations for an item.
CREATE TABLE IF NOT EXISTS test_query_variation (
    id BIGSERIAL PRIMARY KEY,
    line_item_id BIGINT NOT NULL REFERENCES line_item(id),
    variation_type TEXT NOT NULL,  -- 'typo', 'reorder', 'paraphrase'
    original_text TEXT NOT NULL,
    varied_text TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Lookup of all variations belonging to one line item
CREATE INDEX IF NOT EXISTS idx_test_query_variation_line_item
ON test_query_variation(line_item_id);

-- Filtering / grouping by variation type
CREATE INDEX IF NOT EXISTS idx_test_query_variation_type
ON test_query_variation(variation_type);

Apply migration:

docker compose exec -T postgres psql -U dev -d dev < migrations/002_test_query_variation.sql

Add nlpaug dependency:

uv add nlpaug

Create src/evaluation/query_variations.py:

"""Generate synthetic query variations for test set evaluation."""
import random
import os
from typing import Optional

# Lazy imports for optional dependencies
_typo_aug = None
_genai_client = None


def _get_typo_augmenter():
    """Return the shared keyboard-typo augmenter (German layout), built on first use."""
    global _typo_aug
    if _typo_aug is not None:
        return _typo_aug

    # Imported here so nlpaug is only required when typos are requested.
    import nlpaug.augmenter.char as nac

    augmenter_options = {
        'aug_char_p': 0.1,
        'aug_word_p': 0.3,
        'include_numeric': False,
        'lang': 'de',  # German QWERTZ layout
    }
    _typo_aug = nac.KeyboardAug(**augmenter_options)
    return _typo_aug


def _get_genai_client():
    """Return the shared Gemini client used for paraphrasing, creating it on first use."""
    global _genai_client
    if _genai_client is not None:
        return _genai_client

    # Imported here so google-genai is only required when paraphrasing.
    from google import genai

    # Select Vertex AI unless the caller's environment already set this flag.
    os.environ.setdefault('GOOGLE_GENAI_USE_VERTEXAI', 'True')
    _genai_client = genai.Client()
    return _genai_client


def generate_typo_variation(text: str) -> str:
    """
    Generate a keyboard-typo variation of ``text`` (German QWERTZ layout).

    Args:
        text: Original text to corrupt with typos

    Returns:
        The augmented text. If the input is empty, or the augmenter
        produces no output, ``text`` is returned unchanged.
    """
    if not text:
        # Nothing to augment; also avoids building the augmenter needlessly.
        return text

    result = _get_typo_augmenter().augment(text)
    if isinstance(result, list):
        # The augmenter may hand back a list of variants; an empty list
        # would make result[0] raise IndexError, so fall back to the input.
        return result[0] if result else text
    return result


def generate_reorder_variation(text: str) -> str:
    """
    Shuffle the interior words of ``text``, keeping the first and last in place.

    Texts of two words or fewer are returned untouched. When the text
    contains a ``|`` separator, each segment is shuffled independently and
    the segments are re-joined with ``' | '``.
    """

    def _shuffle_interior(tokens):
        # Segments with at most two words have no interior to shuffle.
        if len(tokens) <= 2:
            return tokens
        head, *inner, tail = tokens
        random.shuffle(inner)
        return [head, *inner, tail]

    if len(text.split()) <= 2:
        return text

    if '|' not in text:
        return ' '.join(_shuffle_interior(text.split()))

    # Separator present: treat every '|'-delimited segment on its own.
    reordered_segments = [
        ' '.join(_shuffle_interior(segment.strip().split()))
        for segment in text.split('|')
    ]
    return ' | '.join(reordered_segments)


def generate_paraphrase_variation(text: str) -> str:
    """
    Use Gemini to paraphrase German accounting text.

    Args:
        text: Original invoice text to paraphrase

    Returns:
        The paraphrase with surrounding whitespace stripped

    Raises:
        ValueError: If the model response contains no text, so callers get
            an explicit failure instead of an empty paraphrase or an
            AttributeError on ``None``.
    """
    client = _get_genai_client()
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=f"""Paraphrase this German invoice description in a different way while keeping the same meaning.
Only output the paraphrase, nothing else. Keep it concise.

Original: {text}""",
    )
    # response.text may be None when the response has no text parts.
    paraphrase = (response.text or "").strip()
    if not paraphrase:
        raise ValueError("Gemini returned no text for the paraphrase request")
    return paraphrase


def generate_variations(
    text: str,
    variation_types: Optional[list[str]] = None
) -> dict[str, str]:
    """
    Build the requested variations of ``text``.

    Args:
        text: Original text to vary
        variation_types: Types to produce; ``None`` means all of
            'typo', 'reorder' and 'paraphrase'

    Returns:
        Dict mapping variation_type to varied_text, holding only the
        recognised types that were requested

    NOTE(review): when paraphrasing fails, a reorder variation is stored
    under the 'paraphrase' key, so downstream consumers cannot tell the
    two apart.
    """
    requested = (
        ['typo', 'reorder', 'paraphrase']
        if variation_types is None
        else variation_types
    )
    result = {}

    if 'typo' in requested:
        result['typo'] = generate_typo_variation(text)

    if 'reorder' in requested:
        result['reorder'] = generate_reorder_variation(text)

    if 'paraphrase' not in requested:
        return result

    try:
        paraphrased = generate_paraphrase_variation(text)
    except Exception as err:
        print(f"Paraphrase failed: {err}")
        # Fall back to a plain reorder when the paraphrase cannot be produced
        paraphrased = generate_reorder_variation(text)
    result['paraphrase'] = paraphrased

    return result


def generate_all_test_variations(
    conn,
    variation_types: Optional[list[str]] = None,
    limit: Optional[int] = None
) -> int:
    """
    Generate query variations for all test set items.

    Items are processed one at a time and committed individually, so an
    interrupted run keeps the variations written so far.

    Args:
        conn: psycopg connection
        variation_types: Types to generate (default: all)
        limit: Optional maximum number of items to process (for testing).
            ``None`` means no limit; ``0`` processes nothing.

    Returns:
        Number of variations created
    """
    from tqdm import tqdm
    from src.embeddings.text_prep import prepare_embedding_text

    if variation_types is None:
        variation_types = ['typo', 'reorder', 'paraphrase']

    # Fetch test set items without variations.
    # NOTE(review): the NOT EXISTS check ignores variation_type, so an item
    # with any stored variation is skipped even if some requested types are
    # still missing for it.
    query = """
            SELECT l.id, l.supplier_name_normalized, l.description_normalized
            FROM line_item l
            WHERE l.is_test_set = TRUE
            AND NOT EXISTS (
                SELECT 1 FROM test_query_variation v WHERE v.line_item_id = l.id
            )
            ORDER BY l.id
        """
    params = None
    if limit is not None:
        # Bind the limit as a parameter rather than formatting it into the SQL.
        query += " LIMIT %s"
        params = (limit,)

    with conn.cursor() as cur:
        cur.execute(query, params)
        rows = cur.fetchall()

    if not rows:
        print("All test items already have variations")
        return 0

    insert_sql = """
                    INSERT INTO test_query_variation
                    (line_item_id, variation_type, original_text, varied_text)
                    VALUES (%s, %s, %s, %s)
                """

    print(f"Generating variations for {len(rows)} test items...")
    total_created = 0

    for item_id, supplier, description in tqdm(rows, desc="Generating variations"):
        original_text = prepare_embedding_text(supplier, description)

        # Generate variations
        variations = generate_variations(original_text, variation_types)
        if not variations:
            continue

        records = [
            (item_id, var_type, original_text, varied_text)
            for var_type, varied_text in variations.items()
        ]

        # Insert this item's variations and commit them as one unit.
        try:
            with conn.cursor() as cur:
                cur.executemany(insert_sql, records)
            conn.commit()
        except Exception:
            # Leave the connection usable for the caller, then surface the error.
            conn.rollback()
            raise
        total_created += len(records)

    print(f"Created {total_created} query variations")
    return total_created


if __name__ == "__main__":
    from src.db import get_connection

    conn = get_connection()
    try:
        generate_all_test_variations(conn)
    finally:
        # Release the connection even if generation or an insert fails.
        conn.close()

Update src/evaluation/__init__.py:

"""Evaluation utilities for semantic search comparison."""
from .train_test_split import create_train_test_split, get_split_stats
from .query_variations import generate_variations, generate_all_test_variations

__all__ = [
    "create_train_test_split",
    "get_split_stats",
    "generate_variations",
    "generate_all_test_variations",
]

Run the variation generation:

uv run python -m src.evaluation.query_variations

Expected: ~1216 test items * 3 variations = ~3648 total variations. Takes ~5-10 minutes due to paraphrase API calls. Run docker compose exec -T postgres psql -U dev -d dev -c "SELECT variation_type, COUNT(*) FROM test_query_variation GROUP BY variation_type" Expected: ~1216 rows each for typo, reorder, paraphrase

test_query_variation table exists with proper indexes
Query variations generated for all test set items
Each test item has 3 variations: typo, reorder, paraphrase
Variations use German QWERTZ layout for typos

After all tasks: 1. `SELECT COUNT(*) FROM line_item WHERE embedding_minilm IS NOT NULL` = 6078 2. `SELECT indexname FROM pg_indexes WHERE tablename = 'line_item' AND indexname LIKE 'idx_embedding%'` = 3 indexes 3. `SELECT COUNT(*) FROM test_query_variation` = ~3648 (3 per test item) 4. `SELECT variation_type, COUNT(*) FROM test_query_variation GROUP BY variation_type` shows balanced distribution

<success_criteria>

All 6078 line items have embedding_minilm (384 dimensions) populated
HNSW indexes exist for all 3 embedding columns
Test set items (~1216) each have 3 query variations
Variations include typos (German QWERTZ), reordering, and LLM paraphrases </success_criteria>

After completion, create `.planning/phases/02-embedding-generation/02-03-SUMMARY.md`