From 214596e26f8ac43cdd656c0345535b230fb9f7b9 Mon Sep 17 00:00:00 2001
From: Danny <dth@taiga.ai>
Date: Mon, 1 Jun 2026 10:49:24 +0300
Subject: [PATCH] feat: exercise_aliases table + lookup_exercise() alias-aware
 wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New SQLite table `exercise_aliases (alias, canonical, source)` seeded
with ~45 common gym shorthand entries (OHP, RDL, "Bench", "Squat",
plural/singular drifts, slang). Lookups go through this table first,
then fall through to the strict exercise_db matcher — so the strict
matcher's "false negative for ambiguous single tokens" property is
preserved while still resolving everyday vocabulary.

Schema decision: every seed row is tagged `source='seed'` and re-seeded
on every init_db (deleted-then-reinserted), so editing the seed dict
in code is the one source of truth. User-inserted rows are tagged
`source='user'` and never touched by re-seeding. Migration path covers
existing DBs where the `source` column didn't exist (those rows tagged
'seed' on first migration, then refreshed from the current seed).

New helper db.lookup_exercise(name) wraps the alias resolution + the
exercise_db.lookup() call.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 db.py                     | 128 ++++++++++++++++++++++++++++++++++++++
 exercise_db.py            |  66 ++++++++++++--------
 tests/test_db.py          |  54 ++++++++++++++++
 tests/test_exercise_db.py |  11 ++++
 4 files changed, 233 insertions(+), 26 deletions(-)

diff --git a/db.py b/db.py
index 410097b..62db795 100644
--- a/db.py
+++ b/db.py
@@ -86,8 +86,29 @@ def init_db():
                 data        TEXT    NOT NULL DEFAULT '{}',
                 updated_at  TEXT    NOT NULL DEFAULT (datetime('now'))
             );
+
+            -- Maps user-typed exercise slang (lowercased) → canonical name
+            -- in the Free-Exercise-DB. Lookups in exercise_db go through
+            -- this table first so "OHP", "RDL", "Bench" etc. resolve to
+            -- the right entry instead of returning None.
+            -- `source` distinguishes 'seed' (managed by db.py, refreshed
+            -- on every init) from 'user' (preserved across re-seeding).
+            CREATE TABLE IF NOT EXISTS exercise_aliases (
+                alias       TEXT PRIMARY KEY,
+                canonical   TEXT NOT NULL,
+                source      TEXT NOT NULL DEFAULT 'user'
+            );
         """)
 
+        # Migration: existing exercise_aliases rows (from before the `source`
+        # column existed) were all seed-managed. Tag them so re-seeding
+        # refreshes them instead of treating them as protected user edits.
+        alias_cols = {r[1] for r in conn.execute("PRAGMA table_info(exercise_aliases)").fetchall()}
+        if "source" not in alias_cols:
+            conn.execute("ALTER TABLE exercise_aliases ADD COLUMN source TEXT NOT NULL DEFAULT 'seed'")
+
+        _seed_exercise_aliases(conn)
+
         # Migrations
         cols = {r[1] for r in conn.execute("PRAGMA table_info(workouts)").fetchall()}
         if "raw_text" not in cols:
@@ -100,6 +121,113 @@ def init_db():
             conn.execute("ALTER TABLE exercises ADD COLUMN sets_detail TEXT")
 
 
+# Common gym shorthand → canonical name in the Free-Exercise-DB.
+# Keys are lowercase. init_db wipes and rewrites every source='seed' row from
+# this dict. Runtime overrides survive only as source='user'; set it explicitly
+# (the column default is 'user' on fresh DBs but 'seed' on migrated ones).
+_EXERCISE_ALIAS_SEED: dict[str, str] = {
+    # Acronyms
+    "ohp":       "Standing Military Press",
+    "rdl":       "Romanian Deadlift",
+    "bb row":    "Bent Over Barbell Row",
+    "db press":  "Dumbbell Bench Press",
+    "db bench":  "Dumbbell Bench Press",
+
+    # Single-word generics — pick the most "default" canonical variant.
+    "bench":     "Barbell Bench Press - Medium Grip",
+    "squat":     "Barbell Squat",
+    "deadlift":  "Barbell Deadlift",
+    "press":     "Standing Military Press",
+    "row":       "Bent Over Barbell Row",
+    "curl":      "Barbell Curl",
+    "shrug":     "Barbell Shrug",
+    "pulldown":  "Wide-Grip Lat Pulldown",
+    "dip":       "Parallel Bar Dip",
+
+    # Plural / singular drift
+    "bench presses": "Barbell Bench Press - Medium Grip",
+    "chinups":       "Chin-Up",
+    "chin ups":      "Chin-Up",
+    "pullup":        "Pullups",
+    "pushup":        "Pushups",
+    "push-up":       "Pushups",
+    "push up":       "Pushups",
+    "sit-ups":       "Sit-Up",
+    "situp":         "Sit-Up",
+    "tricep pushdown":  "Triceps Pushdown",
+    "tricep pushdowns": "Triceps Pushdown",
+    "leg curls":     "Lying Leg Curls",
+    "leg extensions": "Leg Extensions",
+    "lateral raises": "Side Lateral Raise",
+    "lat raise":      "Side Lateral Raise",
+    "lat raises":     "Side Lateral Raise",
+    "front raises":   "Front Dumbbell Raise",
+    "face pulls":     "Face Pull",
+    "box jumps":      "Front Box Jump",
+
+    # Slang / brand variants
+    "pendlay row":    "Bent Over Barbell Row",
+    "conventional deadlift": "Barbell Deadlift",
+    "bulgarian split squat": "Bodyweight Squat",  # closest single-leg in DB; tweak later
+    "good morning":   "Good Morning",
+    "good mornings":  "Good Morning",
+    "hip thrust":     "Barbell Hip Thrust",
+    "hip thrusts":    "Barbell Hip Thrust",
+    "calf raises":    "Standing Calf Raises",
+    "calf raise":     "Standing Calf Raises",
+    "skullcrushers":  "EZ-Bar Skullcrusher",
+    "skull crushers": "EZ-Bar Skullcrusher",
+    "skull crusher":  "EZ-Bar Skullcrusher",
+    "skullcrusher":   "EZ-Bar Skullcrusher",
+}
+
+
+def _seed_exercise_aliases(conn) -> None:
+    """Refresh the seed-managed alias rows. Wipes existing seed rows and
+    rewrites them from the current `_EXERCISE_ALIAS_SEED` dict. Rows with
+    `source = 'user'` are never touched.
+    """
+    conn.execute("DELETE FROM exercise_aliases WHERE source = 'seed'")
+    # Skip seed entries whose alias is already claimed by a user row — the
+    # user's choice wins, no surprise overwrite.
+    user_aliases = {r[0] for r in conn.execute(
+        "SELECT alias FROM exercise_aliases WHERE source = 'user'"
+    ).fetchall()}
+    rows = [
+        (a, c, "seed")
+        for a, c in _EXERCISE_ALIAS_SEED.items()
+        if a not in user_aliases
+    ]
+    conn.executemany(
+        "INSERT INTO exercise_aliases (alias, canonical, source) VALUES (?, ?, ?)",
+        rows,
+    )
+
+
+def resolve_exercise_alias(name: str) -> str | None:
+    """Return the canonical DB name for a slang/alias input, or None."""
+    if not name:
+        return None
+    with get_db() as conn:
+        row = conn.execute(
+            "SELECT canonical FROM exercise_aliases WHERE alias = ?",
+            (name.strip().lower(),),
+        ).fetchone()
+    return row["canonical"] if row else None
+
+
+def lookup_exercise(name: str) -> dict | None:
+    """Alias-aware Free-Exercise-DB lookup. Resolve user slang to a canonical
+    name first; then run the exercise_db matcher. Returns the slim entry
+    (name + primary/secondary muscles + equipment) or None.
+    """
+    import exercise_db
+    if not name:
+        return None
+    canonical = resolve_exercise_alias(name) or name
+    return exercise_db.lookup(canonical)
+
+
 def _save_exercises(conn, workout_id: int, superset_groups: list[list[dict]]):
     """Insert superset groups and exercises for a workout."""
     for group_pos, group in enumerate(superset_groups):
diff --git a/exercise_db.py b/exercise_db.py
index 0621ab2..3ad1b98 100644
--- a/exercise_db.py
+++ b/exercise_db.py
@@ -82,7 +82,19 @@ def _slim(entry: dict) -> dict:
 
 
 def lookup(name: str) -> Optional[dict]:
-    """Return the slim entry for the best name match, or None."""
+    """Return the slim entry for the best name match, or None.
+
+    Tiers (priority order):
+      1. exact (case-insensitive)
+      2. compressed exact — collapses hyphens/spaces ("Pull-ups" → "Pullups")
+      3. lowercase substring, either direction (only for multi-token inputs)
+      4. token overlap requiring 100% coverage of the user's tokens
+         (only for multi-token inputs)
+
+    Single-token inputs (e.g. "Bench", "Squat", "RDL", "OHP") that don't hit
+    tier 1 or 2 return None — there's no robust way to disambiguate without
+    an alias table.
+    """
     if not name:
         return None
     needle = name.strip()
@@ -93,48 +105,50 @@ def lookup(name: str) -> Optional[dict]:
     if not compressed:
         return None
 
-    # 1. Exact (case-insensitive)
+    # 1. Exact (case-insensitive).
     hit = _BY_LOWER_NAME.get(lower)
     if hit:
         return _slim(hit)
 
-    # 2. Compressed exact — catches "Pull-ups" → "Pullups", etc.
+    # 2. Compressed exact — "Pull-ups" → "Pullups".
     hit = _BY_COMPRESSED.get(compressed)
     if hit:
         return _slim(hit)
 
-    # 3. Compressed substring (either direction).
-    substring_candidates: list[dict] = [
-        e for e, c in _COMPRESSED if compressed in c or c in compressed
-    ]
+    needle_toks = _tokens(needle)
+    # Below here, partial matches only — and only for multi-token inputs.
+    # A single token is too easily ambiguous (and short acronyms accidentally
+    # hit character-level substrings of unrelated names).
+    if len(needle_toks) < 2:
+        return None
+
+    # 3. Substring on the lowercased names, either direction ("bench press" in
+    # "bench press with chains"). Not word-aligned: the guard above stops "rdl".
+    substring_candidates: list[dict] = []
+    for entry in ALL:
+        n = entry.get("name", "")
+        if not n:
+            continue
+        nl = n.lower()
+        if lower in nl or nl in lower:
+            substring_candidates.append(entry)
     if substring_candidates:
-        # Single-token generics ("Bench", "Squat", "Deadlift") match too many
-        # specific DB entries. Refuse rather than confidently mislead the
-        # user — the planned alias table will handle these properly.
-        needle_toks = _tokens(needle)
-        if len(needle_toks) == 1 and len(substring_candidates) > 2:
-            return None
+        # Prefer the shortest DB name (most specific to the typed input).
         substring_candidates.sort(key=lambda e: len(e["name"]))
         return _slim(substring_candidates[0])
 
-    # 4. Token overlap (Jaccard-ish). Require ≥1 shared token AND that the
-    # shared portion covers ≥50% of the user's tokens, so "row" doesn't
-    # match "single arm cable row machine" via one stop-token.
-    needle_toks = _tokens(needle)
-    if not needle_toks:
-        return None
+    # 4. Token overlap, 100% coverage of the user's tokens. So "BB Row" with
+    # tokens {bb, row} only matches a DB entry that contains both — it never
+    # silently latches onto a "row" entry that doesn't share the "bb" cue.
     best: tuple[float, dict] | None = None
     for entry, db_toks in _TOKENS:
         if not db_toks:
             continue
-        overlap = needle_toks & db_toks
-        if not overlap:
+        if not needle_toks <= db_toks:
             continue
-        coverage = len(overlap) / len(needle_toks)
-        if coverage < 0.5:
-            continue
-        # Score = coverage, tiebreak by DB-name length (shorter wins).
-        score = coverage - 0.001 * len(entry["name"])
+        # All user tokens matched; tiebreak by DB-name length (shorter wins,
+        # i.e. the most specific variant).
+        score = -len(entry["name"])
         if best is None or score > best[0]:
             best = (score, entry)
 
diff --git a/tests/test_db.py b/tests/test_db.py
index d770e8e..d84ca6d 100644
--- a/tests/test_db.py
+++ b/tests/test_db.py
@@ -253,6 +253,60 @@ class TestAllExerciseNames:
         assert db.get_all_exercise_names() == ["Apple", "Mango", "Zebra"]
 
 
+# ── exercise aliases ─────────────────────────────────────────────
+
+
+class TestExerciseAliases:
+    def test_seed_loaded(self, tmp_db):
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("rdl") == "Romanian Deadlift"
+        assert db.resolve_exercise_alias("Bench") == "Barbell Bench Press - Medium Grip"
+
+    def test_case_insensitive(self, tmp_db):
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("  Squat  ") == "Barbell Squat"
+
+    def test_unknown_returns_none(self, tmp_db):
+        assert db.resolve_exercise_alias("not-a-thing") is None
+        assert db.resolve_exercise_alias("") is None
+        assert db.resolve_exercise_alias(None) is None
+
+    def test_lookup_exercise_uses_aliases(self, tmp_db):
+        info = db.lookup_exercise("OHP")
+        assert info is not None
+        assert info["name"] == "Standing Military Press"
+        assert "shoulders" in info["primary_muscles"]
+
+    def test_lookup_exercise_falls_through_when_no_alias(self, tmp_db):
+        # No alias for "Plank" → goes straight to exercise_db, which has it.
+        info = db.lookup_exercise("Plank")
+        assert info is not None
+        assert info["name"] == "Plank"
+
+    def test_user_overrides_survive_init(self, tmp_db):
+        # A row inserted with source='user' is preserved across init_db calls;
+        # seed rows get refreshed but user rows don't.
+        with db.get_db() as conn:
+            conn.execute(
+                "INSERT OR REPLACE INTO exercise_aliases (alias, canonical, source) VALUES (?, ?, 'user')",
+                ("ohp", "Push Press"),
+            )
+        db.init_db()  # re-runs the seed
+        assert db.resolve_exercise_alias("ohp") == "Push Press"
+
+    def test_seed_rows_refreshed_on_reinit(self, tmp_db):
+        # Manually corrupt a seed row → next init_db should rewrite it from
+        # the seed dict (without needing INSERT OR REPLACE acrobatics).
+        with db.get_db() as conn:
+            conn.execute(
+                "UPDATE exercise_aliases SET canonical = 'WRONG' WHERE alias = 'ohp'"
+            )
+        assert db.resolve_exercise_alias("ohp") == "WRONG"
+        db.init_db()
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+
+
 # ── get_last_exercise ────────────────────────────────────────────
 
 
diff --git a/tests/test_exercise_db.py b/tests/test_exercise_db.py
index ac562dc..ca00209 100644
--- a/tests/test_exercise_db.py
+++ b/tests/test_exercise_db.py
@@ -44,6 +44,17 @@ class TestLookup:
         assert exercise_db.lookup("") is None
         assert exercise_db.lookup("   ") is None
 
+    def test_short_acronyms_dont_substring_match_characters(self):
+        # Regression: "RDL" used to match "Hurdle Hops" because "rdl"
+        # appears as a character substring inside "hu**rdl**ehops".
+        assert exercise_db.lookup("RDL") is None
+        assert exercise_db.lookup("OHP") is None
+
+    def test_multi_token_requires_full_coverage(self):
+        # Regression: "BB Row" used to match "Sled Row" because both share
+        # the "row" token. Strict 100% coverage prevents this.
+        assert exercise_db.lookup("BB Row") is None
+
     def test_returned_shape(self):
         m = exercise_db.lookup("Pullups")
         # The slim view drops `instructions` and `images`.