feat: exercise_aliases table + lookup_exercise() alias-aware wrapper

New SQLite table `exercise_aliases (alias, canonical, source)` seeded with ~40 common gym shorthand entries (OHP, RDL, "Bench", "Squat", plural/singular drifts, slang). Lookups via db.lookup_exercise() go through this table first, then fall through to the strict exercise_db matcher — so the strict matcher's "false negative for ambiguous single tokens" property is preserved while still resolving everyday vocabulary. Schema decision: every seed row is tagged `source='seed'` and re-seeded on every init_db (deleted-then-reinserted), so editing the seed dict in code is the single source of truth. User-inserted rows are tagged `source='user'` and never touched by re-seeding. Migration path covers existing DBs where the `source` column didn't exist (those rows are tagged 'seed' on first migration, then refreshed from the current seed). New helper db.lookup_exercise(name) wraps the alias resolution + the exercise_db.lookup() call. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-01 10:49:24 +03:00 · 2026-06-01 10:49:24 +03:00 · 214596e26f
commit 214596e26f
parent ebd0016a62
4 changed files with 233 additions and 26 deletions
--- a/db.py
+++ b/db.py
@ -86,8 +86,29 @@ def init_db():
                data        TEXT    NOT NULL DEFAULT '{}',
                updated_at  TEXT    NOT NULL DEFAULT (datetime('now'))
            );
+
+            -- Maps user-typed exercise slang (lowercased) → canonical name
+            -- in the Free-Exercise-DB. db.lookup_exercise() consults this
+            -- table before calling exercise_db.lookup(), so "OHP", "RDL",
+            -- "Bench" etc. resolve to the right entry instead of None.
+            -- `source` distinguishes 'seed' (managed by db.py, refreshed
+            -- on every init) from 'user' (preserved across re-seeding).
+            CREATE TABLE IF NOT EXISTS exercise_aliases (
+                alias       TEXT PRIMARY KEY,
+                canonical   TEXT NOT NULL,
+                source      TEXT NOT NULL DEFAULT 'user'
+            );
        """)

+        # Migration: existing exercise_aliases rows (from before the `source`
+        # column existed) were all seed-managed. Tag them so re-seeding
+        # refreshes them instead of treating them as protected user edits.
+        alias_cols = {r[1] for r in conn.execute("PRAGMA table_info(exercise_aliases)").fetchall()}
+        if "source" not in alias_cols:
+            conn.execute("ALTER TABLE exercise_aliases ADD COLUMN source TEXT NOT NULL DEFAULT 'seed'")
+
+        _seed_exercise_aliases(conn)
+
        # Migrations
        cols = {r[1] for r in conn.execute("PRAGMA table_info(workouts)").fetchall()}
        if "raw_text" not in cols:
@ -100,6 +121,113 @@ def init_db():
            conn.execute("ALTER TABLE exercises ADD COLUMN sets_detail TEXT")


+# Common gym shorthand → canonical name in the Free-Exercise-DB.
+# Keys are lowercase. Seed rows are deleted and re-inserted from this dict
+# on every init_db, so edits here take effect on the next init. Runtime
+# overrides must be inserted with source='user' to survive re-seeding.
+_EXERCISE_ALIAS_SEED: dict[str, str] = {
+    # Acronyms
+    "ohp":       "Standing Military Press",
+    "rdl":       "Romanian Deadlift",
+    "bb row":    "Bent Over Barbell Row",
+    "db press":  "Dumbbell Bench Press",
+    "db bench":  "Dumbbell Bench Press",
+
+    # Single-word generics — pick the most "default" canonical variant.
+    "bench":     "Barbell Bench Press - Medium Grip",
+    "squat":     "Barbell Squat",
+    "deadlift":  "Barbell Deadlift",
+    "press":     "Standing Military Press",
+    "row":       "Bent Over Barbell Row",
+    "curl":      "Barbell Curl",
+    "shrug":     "Barbell Shrug",
+    "pulldown":  "Wide-Grip Lat Pulldown",
+    "dip":       "Parallel Bar Dip",
+
+    # Plural / singular drift
+    "bench presses": "Barbell Bench Press - Medium Grip",
+    "chinups":       "Chin-Up",
+    "chin ups":      "Chin-Up",
+    "pullup":        "Pullups",
+    "pushup":        "Pushups",
+    "push-up":       "Pushups",
+    "push up":       "Pushups",
+    "sit-ups":       "Sit-Up",
+    "situp":         "Sit-Up",
+    "tricep pushdown":  "Triceps Pushdown",
+    "tricep pushdowns": "Triceps Pushdown",
+    "leg curls":     "Lying Leg Curls",
+    "leg extensions": "Leg Extensions",
+    "lateral raises": "Side Lateral Raise",
+    "lat raise":      "Side Lateral Raise",
+    "lat raises":     "Side Lateral Raise",
+    "front raises":   "Front Dumbbell Raise",
+    "face pulls":     "Face Pull",
+    "box jumps":      "Front Box Jump",
+
+    # Slang / brand variants
+    "pendlay row":    "Bent Over Barbell Row",
+    "conventional deadlift": "Barbell Deadlift",
+    "bulgarian split squat": "Bodyweight Squat",  # closest single-leg in DB; tweak later
+    "good morning":   "Good Morning",
+    "good mornings":  "Good Morning",
+    "hip thrust":     "Barbell Hip Thrust",
+    "hip thrusts":    "Barbell Hip Thrust",
+    "calf raises":    "Standing Calf Raises",
+    "calf raise":     "Standing Calf Raises",
+    "skullcrushers":  "EZ-Bar Skullcrusher",
+    "skull crushers": "EZ-Bar Skullcrusher",
+    "skull crusher":  "EZ-Bar Skullcrusher",
+    "skullcrusher":   "EZ-Bar Skullcrusher",
+}
+
+
+def _seed_exercise_aliases(conn) -> None:
+    """Refresh the seed-managed alias rows. Wipes existing seed rows and
+    rewrites them from the current `_EXERCISE_ALIAS_SEED` dict. Rows with
+    `source = 'user'` are never touched.
+    """
+    conn.execute("DELETE FROM exercise_aliases WHERE source = 'seed'")
+    # Skip seed entries whose alias is already claimed by a user row — the
+    # user's choice wins, no surprise overwrite.
+    user_aliases = {r[0] for r in conn.execute(
+        "SELECT alias FROM exercise_aliases WHERE source = 'user'"
+    ).fetchall()}
+    rows = [
+        (a, c, "seed")
+        for a, c in _EXERCISE_ALIAS_SEED.items()
+        if a not in user_aliases
+    ]
+    conn.executemany(
+        "INSERT INTO exercise_aliases (alias, canonical, source) VALUES (?, ?, ?)",
+        rows,
+    )
+
+
+def resolve_exercise_alias(name: str) -> str | None:
+    """Return the canonical DB name for a slang/alias input, or None."""
+    if not name:
+        return None
+    with get_db() as conn:
+        row = conn.execute(
+            "SELECT canonical FROM exercise_aliases WHERE alias = ?",
+            (name.strip().lower(),),
+        ).fetchone()
+    return row["canonical"] if row else None
+
+
+def lookup_exercise(name: str) -> dict | None:
+    """Alias-aware Free-Exercise-DB lookup. Resolve user slang to a canonical
+    name first; then run the exercise_db matcher. Returns the slim entry
+    (name + primary/secondary muscles + equipment) or None.
+    """
+    import exercise_db
+    if not name:
+        return None
+    canonical = resolve_exercise_alias(name) or name
+    return exercise_db.lookup(canonical)
+
+
 def _save_exercises(conn, workout_id: int, superset_groups: list[list[dict]]):
    """Insert superset groups and exercises for a workout."""
    for group_pos, group in enumerate(superset_groups):
--- a/exercise_db.py
+++ b/exercise_db.py
@ -82,7 +82,19 @@ def _slim(entry: dict) -> dict:


 def lookup(name: str) -> Optional[dict]:
-    """Return the slim entry for the best name match, or None."""
+    """Return the slim entry for the best name match, or None.
+
+    Tiers (priority order):
+      1. exact (case-insensitive)
+      2. compressed exact — collapses hyphens/spaces ("Pull-ups" → "Pullups")
+      3. lowercase substring, either direction (only for multi-token inputs)
+      4. token overlap requiring 100% coverage of the user's tokens
+         (only for multi-token inputs)
+
+    Single-token inputs (e.g. "Bench", "Squat", "RDL", "OHP") that don't hit
+    tier 1 or 2 return None — there's no robust way to disambiguate without
+    an alias table.
+    """
    if not name:
        return None
    needle = name.strip()
@ -93,48 +105,50 @@ def lookup(name: str) -> Optional[dict]:
    if not compressed:
        return None

-    # 1. Exact (case-insensitive)
+    # 1. Exact (case-insensitive).
    hit = _BY_LOWER_NAME.get(lower)
    if hit:
        return _slim(hit)

-    # 2. Compressed exact — catches "Pull-ups" → "Pullups", etc.
+    # 2. Compressed exact — "Pull-ups" → "Pullups".
    hit = _BY_COMPRESSED.get(compressed)
    if hit:
        return _slim(hit)

-    # 3. Compressed substring (either direction).
-    substring_candidates: list[dict] = [
-        e for e, c in _COMPRESSED if compressed in c or c in compressed
-    ]
-    if substring_candidates:
-        # Single-token generics ("Bench", "Squat", "Deadlift") match too many
-        # specific DB entries. Refuse rather than confidently mislead the
-        # user — the planned alias table will handle these properly.
    needle_toks = _tokens(needle)
-        if len(needle_toks) == 1 and len(substring_candidates) > 2:
+    # Below here, partial matches only — and only for multi-token inputs.
+    # A single token is too easily ambiguous (and short acronyms accidentally
+    # hit character-level substrings of unrelated names).
+    if len(needle_toks) < 2:
        return None
+
+    # 3. Plain substring on lowercased names, either direction ("bench press"
+    # in "bench press with chains"); "rdl"/"hurdle" is blocked by the guard above.
+    substring_candidates: list[dict] = []
+    for entry in ALL:
+        n = entry.get("name", "")
+        if not n:
+            continue
+        nl = n.lower()
+        if lower in nl or nl in lower:
+            substring_candidates.append(entry)
+    if substring_candidates:
+        # Prefer the shortest DB name (most specific to the typed input).
        substring_candidates.sort(key=lambda e: len(e["name"]))
        return _slim(substring_candidates[0])

-    # 4. Token overlap (Jaccard-ish). Require ≥1 shared token AND that the
-    # shared portion covers ≥50% of the user's tokens, so "row" doesn't
-    # match "single arm cable row machine" via one stop-token.
-    needle_toks = _tokens(needle)
-    if not needle_toks:
-        return None
+    # 4. Token overlap, 100% coverage of the user's tokens. So "BB Row" with
+    # tokens {bb, row} only matches a DB entry that contains both — it never
+    # silently latches onto a "row" entry that doesn't share the "bb" cue.
    best: tuple[float, dict] | None = None
    for entry, db_toks in _TOKENS:
        if not db_toks:
            continue
-        overlap = needle_toks & db_toks
-        if not overlap:
+        if not needle_toks <= db_toks:
            continue
-        coverage = len(overlap) / len(needle_toks)
-        if coverage < 0.5:
-            continue
-        # Score = coverage, tiebreak by DB-name length (shorter wins).
-        score = coverage - 0.001 * len(entry["name"])
+        # All user tokens matched; tiebreak by DB-name length (shorter wins,
+        # i.e. the most specific variant).
+        score = -len(entry["name"])
        if best is None or score > best[0]:
            best = (score, entry)

--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -253,6 +253,60 @@ class TestAllExerciseNames:
        assert db.get_all_exercise_names() == ["Apple", "Mango", "Zebra"]


+# ── exercise aliases ─────────────────────────────────────────────
+
+
+class TestExerciseAliases:
+    def test_seed_loaded(self, tmp_db):
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("rdl") == "Romanian Deadlift"
+        assert db.resolve_exercise_alias("Bench") == "Barbell Bench Press - Medium Grip"
+
+    def test_case_insensitive(self, tmp_db):
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("  Squat  ") == "Barbell Squat"
+
+    def test_unknown_returns_none(self, tmp_db):
+        assert db.resolve_exercise_alias("not-a-thing") is None
+        assert db.resolve_exercise_alias("") is None
+        assert db.resolve_exercise_alias(None) is None
+
+    def test_lookup_exercise_uses_aliases(self, tmp_db):
+        info = db.lookup_exercise("OHP")
+        assert info is not None
+        assert info["name"] == "Standing Military Press"
+        assert "shoulders" in info["primary_muscles"]
+
+    def test_lookup_exercise_falls_through_when_no_alias(self, tmp_db):
+        # No alias for "Plank" → goes straight to exercise_db, which has it.
+        info = db.lookup_exercise("Plank")
+        assert info is not None
+        assert info["name"] == "Plank"
+
+    def test_user_overrides_survive_init(self, tmp_db):
+        # A row inserted with source='user' is preserved across init_db calls;
+        # seed rows get refreshed but user rows don't.
+        with db.get_db() as conn:
+            conn.execute(
+                "INSERT OR REPLACE INTO exercise_aliases (alias, canonical, source) VALUES (?, ?, 'user')",
+                ("ohp", "Push Press"),
+            )
+        db.init_db()  # re-runs the seed
+        assert db.resolve_exercise_alias("ohp") == "Push Press"
+
+    def test_seed_rows_refreshed_on_reinit(self, tmp_db):
+        # Manually corrupt a seed row → next init_db should rewrite it from
+        # the seed dict (without needing INSERT OR REPLACE acrobatics).
+        with db.get_db() as conn:
+            conn.execute(
+                "UPDATE exercise_aliases SET canonical = 'WRONG' WHERE alias = 'ohp'"
+            )
+        assert db.resolve_exercise_alias("ohp") == "WRONG"
+        db.init_db()
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+
+
 # ── get_last_exercise ────────────────────────────────────────────


--- a/tests/test_exercise_db.py
+++ b/tests/test_exercise_db.py
@ -44,6 +44,17 @@ class TestLookup:
        assert exercise_db.lookup("") is None
        assert exercise_db.lookup("   ") is None

+    def test_short_acronyms_dont_substring_match_characters(self):
+        # Regression: "RDL" used to match "Hurdle Hops" because "rdl"
+        # appears as a character substring inside "hu**rdl**ehops".
+        assert exercise_db.lookup("RDL") is None
+        assert exercise_db.lookup("OHP") is None
+
+    def test_multi_token_requires_full_coverage(self):
+        # Regression: "BB Row" used to match "Sled Row" because both share
+        # the "row" token. Strict 100% coverage prevents this.
+        assert exercise_db.lookup("BB Row") is None
+
    def test_returned_shape(self):
        m = exercise_db.lookup("Pullups")
        # The slim view drops `instructions` and `images`.