feat: exercise_aliases table + lookup_exercise() alias-aware wrapper

New SQLite table `exercise_aliases (alias, canonical, source)` seeded with ~40 common gym shorthand entries (OHP, RDL, "Bench", "Squat", plural/singular drifts, slang). Lookups go through this table first, then fall through to the strict exercise_db matcher — so the strict matcher's "false negative for ambiguous single tokens" property is preserved while still resolving every-day vocabulary. Schema decision: every seed row is tagged `source='seed'` and re-seeded on every init_db (deleted-then-reinserted), so editing the seed dict in code is the one source of truth. User-inserted rows are tagged `source='user'` and never touched by re-seeding. Migration path covers existing DBs where the `source` column didn't exist (those rows tagged 'seed' on first migration, then refreshed from the current seed). New helper db.lookup_exercise(name) wraps the alias resolution + the exercise_db.lookup() call. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
feat: bundle Free-Exercise-DB + name matcher (step 1)
2026-06-01 10:49:24 +03:00 · 2026-05-24 11:11:34 +02:00
6 changed files with 23031 additions and 0 deletions
--- a/data/exercises.json
+++ b/data/exercises.json
--- a/db.py
+++ b/db.py
@ -86,8 +86,29 @@ def init_db():
                data        TEXT    NOT NULL DEFAULT '{}',
                updated_at  TEXT    NOT NULL DEFAULT (datetime('now'))
            );
+
+            -- Maps user-typed exercise slang (lowercased) → canonical name
+            -- in the Free-Exercise-DB. Lookups in exercise_db go through
+            -- this table first so "OHP", "RDL", "Bench" etc. resolve to
+            -- the right entry instead of returning None.
+            -- `source` distinguishes 'seed' (managed by db.py, refreshed
+            -- on every init) from 'user' (preserved across re-seeding).
+            CREATE TABLE IF NOT EXISTS exercise_aliases (
+                alias       TEXT PRIMARY KEY,
+                canonical   TEXT NOT NULL,
+                source      TEXT NOT NULL DEFAULT 'user'
+            );
        """)

+        # Migration: existing exercise_aliases rows (from before the `source`
+        # column existed) were all seed-managed. Tag them so re-seeding
+        # refreshes them instead of treating them as protected user edits.
+        alias_cols = {r[1] for r in conn.execute("PRAGMA table_info(exercise_aliases)").fetchall()}
+        if "source" not in alias_cols:
+            conn.execute("ALTER TABLE exercise_aliases ADD COLUMN source TEXT NOT NULL DEFAULT 'seed'")
+
+        _seed_exercise_aliases(conn)
+
        # Migrations
        cols = {r[1] for r in conn.execute("PRAGMA table_info(workouts)").fetchall()}
        if "raw_text" not in cols:
@ -100,6 +121,113 @@ def init_db():
            conn.execute("ALTER TABLE exercises ADD COLUMN sets_detail TEXT")


+# Common gym shorthand → canonical name in the Free-Exercise-DB.
+# Keys are lowercase. This dict is the source of truth for source='seed'
+# rows: init_db deletes and re-inserts them on every run. Runtime overrides
+# go into exercise_aliases with source='user' and are never overwritten.
+_EXERCISE_ALIAS_SEED: dict[str, str] = {
+    # Acronyms
+    "ohp":       "Standing Military Press",
+    "rdl":       "Romanian Deadlift",
+    "bb row":    "Bent Over Barbell Row",
+    "db press":  "Dumbbell Bench Press",
+    "db bench":  "Dumbbell Bench Press",
+
+    # Single-word generics — pick the most "default" canonical variant.
+    "bench":     "Barbell Bench Press - Medium Grip",
+    "squat":     "Barbell Squat",
+    "deadlift":  "Barbell Deadlift",
+    "press":     "Standing Military Press",
+    "row":       "Bent Over Barbell Row",
+    "curl":      "Barbell Curl",
+    "shrug":     "Barbell Shrug",
+    "pulldown":  "Wide-Grip Lat Pulldown",
+    "dip":       "Parallel Bar Dip",
+
+    # Plural / singular drift
+    "bench presses": "Barbell Bench Press - Medium Grip",
+    "chinups":       "Chin-Up",
+    "chin ups":      "Chin-Up",
+    "pullup":        "Pullups",
+    "pushup":        "Pushups",
+    "push-up":       "Pushups",
+    "push up":       "Pushups",
+    "sit-ups":       "Sit-Up",
+    "situp":         "Sit-Up",
+    "tricep pushdown":  "Triceps Pushdown",
+    "tricep pushdowns": "Triceps Pushdown",
+    "leg curls":     "Lying Leg Curls",
+    "leg extensions": "Leg Extensions",
+    "lateral raises": "Side Lateral Raise",
+    "lat raise":      "Side Lateral Raise",
+    "lat raises":     "Side Lateral Raise",
+    "front raises":   "Front Dumbbell Raise",
+    "face pulls":     "Face Pull",
+    "box jumps":      "Front Box Jump",
+
+    # Slang / brand variants
+    "pendlay row":    "Bent Over Barbell Row",
+    "conventional deadlift": "Barbell Deadlift",
+    "bulgarian split squat": "Bodyweight Squat",  # closest single-leg in DB; tweak later
+    "good morning":   "Good Morning",
+    "good mornings":  "Good Morning",
+    "hip thrust":     "Barbell Hip Thrust",
+    "hip thrusts":    "Barbell Hip Thrust",
+    "calf raises":    "Standing Calf Raises",
+    "calf raise":     "Standing Calf Raises",
+    "skullcrushers":  "EZ-Bar Skullcrusher",
+    "skull crushers": "EZ-Bar Skullcrusher",
+    "skull crusher":  "EZ-Bar Skullcrusher",
+    "skullcrusher":   "EZ-Bar Skullcrusher",
+}
+
+
+def _seed_exercise_aliases(conn) -> None:
+    """Refresh the seed-managed rows of `exercise_aliases`.
+
+    Deletes every row tagged `source = 'seed'` and re-inserts the current
+    contents of `_EXERCISE_ALIAS_SEED`, so editing the dict is enough to
+    change what a seeded alias resolves to.
+
+    Any row that survives the delete (everything not tagged 'seed', which
+    includes all `source = 'user'` rows) keeps its alias: the matching seed
+    entry is skipped instead of overwriting it.
+
+    Args:
+        conn: open SQLite connection. No commit is issued here.
+    """
+    conn.execute("DELETE FROM exercise_aliases WHERE source = 'seed'")
+    # `alias` is the primary key, so OR IGNORE drops exactly the seed entries
+    # whose alias is still taken after the delete. Unlike pre-filtering on
+    # source = 'user', this cannot raise IntegrityError when a leftover row
+    # carries some other `source` value.
+    conn.executemany(
+        "INSERT OR IGNORE INTO exercise_aliases (alias, canonical, source) "
+        "VALUES (?, ?, 'seed')",
+        _EXERCISE_ALIAS_SEED.items(),
+    )
+
+
+def resolve_exercise_alias(name: str) -> str | None:
+    """Map a user-typed alias to its canonical exercise name.
+
+    The input is trimmed and lowercased before being matched against
+    `exercise_aliases.alias`. Returns the stored `canonical` value, or None
+    when `name` is empty/None or no alias row matches.
+    """
+    if not name:
+        return None
+    key = name.strip().lower()
+    with get_db() as conn:
+        cursor = conn.execute(
+            "SELECT canonical FROM exercise_aliases WHERE alias = ?",
+            (key,),
+        )
+        match = cursor.fetchone()
+    if match is None:
+        return None
+    return match["canonical"]
+
+
+def lookup_exercise(name: str) -> dict | None:
+    """Look up an exercise, translating known aliases first.
+
+    If `name` has a row in the alias table, the canonical name from that row
+    is handed to `exercise_db.lookup`; otherwise `name` itself is. Returns
+    whatever `exercise_db.lookup` returns, or None for an empty/None input.
+    """
+    # Imported inside the function, as in the original.
+    import exercise_db
+
+    if name:
+        resolved = resolve_exercise_alias(name)
+        return exercise_db.lookup(resolved if resolved else name)
+    return None
+
+
 def _save_exercises(conn, workout_id: int, superset_groups: list[list[dict]]):
    """Insert superset groups and exercises for a workout."""
    for group_pos, group in enumerate(superset_groups):
--- a/exercise_db.py
+++ b/exercise_db.py
@ -0,0 +1,155 @@
+"""Static exercise reference data from the Free-Exercise-DB.
+
+Source: https://github.com/yuhonas/free-exercise-db (public domain).
+Bundled at data/exercises.json (~870 entries). Loaded once at import.
+
+Exports:
+- `lookup(name)` — best-effort fuzzy name match → dict with primary/secondary
+  muscles, equipment, etc. Returns None if no plausible match.
+- `ALL` — the raw list (for ad-hoc queries).
+
+Matching, in priority order:
+  1. exact case-insensitive name match
+  2. compressed exact match (separators removed, so "Pull-ups" → "Pullups")
+  3. multi-token inputs only: substring match, then full token coverage
+
+Keep this conservative — a wrong match is worse than no match for the user.
+"""
+from __future__ import annotations
+
+import json
+import pathlib
+import re
+from typing import Optional
+
+_DATA_PATH = pathlib.Path(__file__).parent / "data" / "exercises.json"
+
+
+def _load() -> list[dict]:
+    """Read the bundled exercise list from `_DATA_PATH`.
+
+    Returns the parsed JSON. If the file cannot be opened, is not valid
+    UTF-8, or is not valid JSON, returns an empty list instead of raising.
+    """
+    try:
+        # JSON is UTF-8; don't depend on the platform's locale encoding.
+        with _DATA_PATH.open(encoding="utf-8") as f:
+            return json.load(f)
+    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
+        return []
+
+
+ALL: list[dict] = _load()
+
+# Normalised name → entry (case-insensitive exact key)
+_BY_LOWER_NAME: dict[str, dict] = {e["name"].lower(): e for e in ALL if e.get("name")}
+
+
+_TOKEN_RE = re.compile(r"[a-z0-9]+")
+_NON_ALNUM = re.compile(r"[^a-z0-9]")
+
+
+def _tokens(s: str) -> set[str]:
+    """Return the distinct lowercase alphanumeric runs found in `s`."""
+    return {m.group(0) for m in _TOKEN_RE.finditer(s.lower())}
+
+
+def _compress(s: str) -> str:
+    """Lowercase `s` and strip every character outside a-z / 0-9.
+
+    "Pull-Ups", "Pull Ups" and "Pullups" therefore all become "pullups".
+    """
+    lowered = s.lower()
+    return _NON_ALNUM.sub("", lowered)
+
+
+# Pre-compute token sets and compressed forms (one-time at import).
+# Entries without a name are skipped in all three structures.
+_TOKENS: list[tuple[dict, set[str]]] = [
+    (e, _tokens(e["name"])) for e in ALL if e.get("name")
+]
+# NOTE(review): `_COMPRESSED` is not referenced by anything else in this
+# module (lookup() uses `_BY_COMPRESSED`) — confirm whether it is still needed.
+_COMPRESSED: list[tuple[dict, str]] = [
+    (e, _compress(e["name"])) for e in ALL if e.get("name")
+]
+# Compressed name → entry. If two names compress to the same key, the later
+# entry in ALL overwrites the earlier one.
+_BY_COMPRESSED: dict[str, dict] = {
+    _compress(e["name"]): e for e in ALL if e.get("name")
+}
+
+
+# Public-facing slim shape — drop instructions/images for now (heavy).
+def _slim(entry: dict) -> dict:
+    """Project a raw DB entry onto the fields exposed to callers.
+
+    The two muscle lists are renamed to snake_case and default to `[]` when
+    missing or empty; the remaining fields are copied as-is (None if absent).
+    """
+    slim: dict = {"name": entry.get("name")}
+    for out_key, src_key in (
+        ("primary_muscles", "primaryMuscles"),
+        ("secondary_muscles", "secondaryMuscles"),
+    ):
+        slim[out_key] = entry.get(src_key) or []
+    for key in ("equipment", "category", "level", "force", "mechanic"):
+        slim[key] = entry.get(key)
+    return slim
+
+
+def lookup(name: str) -> Optional[dict]:
+    """Return the slim entry for the best name match, or None.
+
+    Tiers (priority order):
+      1. exact (case-insensitive)
+      2. compressed exact — collapses hyphens/spaces ("Pull-ups" → "Pullups")
+      3. word-boundary substring, in either direction (only for multi-token
+         inputs)
+      4. token overlap requiring 100% coverage of the user's tokens
+         (only for multi-token inputs)
+
+    Single-token inputs (e.g. "Bench", "Squat", "RDL", "OHP") that don't hit
+    tier 1 or 2 return None — there's no robust way to disambiguate without
+    an alias table.
+
+    Args:
+        name: user-typed exercise name; None, empty and whitespace-only
+            values yield None.
+
+    Returns:
+        The `_slim` view of the matched entry, or None when no tier matches.
+    """
+    if not name:
+        return None
+    needle = name.strip()
+    if not needle:
+        return None
+    lower = needle.lower()
+    compressed = _compress(needle)
+    if not compressed:
+        return None
+
+    # 1. Exact (case-insensitive).
+    hit = _BY_LOWER_NAME.get(lower)
+    if hit:
+        return _slim(hit)
+
+    # 2. Compressed exact — "Pull-ups" → "Pullups".
+    hit = _BY_COMPRESSED.get(compressed)
+    if hit:
+        return _slim(hit)
+
+    needle_toks = _tokens(needle)
+    # Below here, partial matches only — and only for multi-token inputs.
+    # A single token is too easily ambiguous (and short acronyms accidentally
+    # hit character-level substrings of unrelated names).
+    if len(needle_toks) < 2:
+        return None
+
+    # 3. Word-boundary substring (lowercase), checked in both directions:
+    # the typed text inside a DB name, or a DB name inside the typed text.
+    # "bench press" in "bench press with chains" — yes. "bench press" in
+    # "bench pressing" — no (the hit does not end on a word break).
+    substring_candidates: list[dict] = []
+    for entry in ALL:
+        nl = (entry.get("name") or "").lower()
+        if not nl:
+            continue
+        if _has_phrase(nl, lower) or _has_phrase(lower, nl):
+            substring_candidates.append(entry)
+    if substring_candidates:
+        # Prefer the shortest DB name; on a tie the earliest entry in ALL wins.
+        return _slim(min(substring_candidates, key=lambda e: len(e["name"])))
+
+    # 4. Token overlap, 100% coverage of the user's tokens. So "BB Row" with
+    # tokens {bb, row} only matches a DB entry that contains both — it never
+    # silently latches onto a "row" entry that doesn't share the "bb" cue.
+    covering = [entry for entry, db_toks in _TOKENS if needle_toks <= db_toks]
+    if not covering:
+        return None
+    # Tiebreak by DB-name length (shorter wins); earliest entry on a tie.
+    return _slim(min(covering, key=lambda e: len(e["name"])))
+
+
+def _has_phrase(text: str, phrase: str) -> bool:
+    """Return True if `phrase` occurs in `text` as a run of whole words.
+
+    A hit only counts when the characters immediately before and after it
+    (if any) are not alphanumeric. Both arguments are expected to be
+    lowercased already.
+    """
+    start = text.find(phrase)
+    while start != -1:
+        end = start + len(phrase)
+        starts_clean = start == 0 or not text[start - 1].isalnum()
+        ends_clean = end == len(text) or not text[end].isalnum()
+        if starts_clean and ends_clean:
+            return True
+        # Keep scanning: a later occurrence may sit on word boundaries.
+        start = text.find(phrase, start + 1)
+    return False
--- a/server.py
+++ b/server.py
@ -18,6 +18,7 @@ from urllib.parse import parse_qs
 from aiohttp import web

 from db import init_db, save_workout, get_workouts, get_workout_count, get_stats_sql, delete_workout, update_workout, export_workouts, get_user_workout_number, get_all_exercise_names, log_event, get_settings, update_settings, get_last_exercise
+import exercise_db
 from parser import parse_workout, format_workout

 logging.basicConfig(
@ -293,6 +294,15 @@ async def api_get_last_exercise(request: web.Request):
    return web.json_response({"last": last})


+@require_auth
+async def api_lookup_exercise(request: web.Request):
+    """Resolve the `name` query parameter against the Free-Exercise-DB data.
+
+    Responds with `{"match": <slim entry or null>}`, or with status 400 and
+    `{"error": "Missing name"}` when the parameter is absent or blank.
+    """
+    # NOTE(review): this calls the strict matcher directly, so rows in
+    # exercise_aliases are not consulted here — confirm that is intended.
+    query_name = request.query.get("name", "").strip()
+    if query_name:
+        return web.json_response({"match": exercise_db.lookup(query_name)})
+    return web.json_response({"error": "Missing name"}, status=400)
+
+
@require_auth
 async def api_get_stats(request: web.Request):
    """Return summary stats for the user."""
@ -377,6 +387,7 @@ def create_app() -> web.Application:
    app.router.add_delete("/api/workouts/{workout_id}", api_delete_workout)
    app.router.add_get("/api/exercises", api_get_exercise_names)
    app.router.add_get("/api/exercises/last", api_get_last_exercise)
+    app.router.add_get("/api/exercises/lookup", api_lookup_exercise)
    app.router.add_get("/api/stats", api_get_stats)
    app.router.add_get("/api/export/json", api_export_json)
    app.router.add_get("/api/export/csv", api_export_csv)
--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -253,6 +253,60 @@ class TestAllExerciseNames:
        assert db.get_all_exercise_names() == ["Apple", "Mango", "Zebra"]


+# ── exercise aliases ─────────────────────────────────────────────
+
+
+class TestExerciseAliases:
+    """Alias table behaviour: seeding, lookup normalisation, and how seed
+    and user rows are treated when init_db runs again.
+
+    Every test takes the `tmp_db` fixture (defined outside this view).
+    """
+
+    def test_seed_loaded(self, tmp_db):
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("rdl") == "Romanian Deadlift"
+        assert db.resolve_exercise_alias("Bench") == "Barbell Bench Press - Medium Grip"
+
+    def test_case_insensitive(self, tmp_db):
+        # Also covers trimming of surrounding whitespace.
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+        assert db.resolve_exercise_alias("OHP") == "Standing Military Press"
+        assert db.resolve_exercise_alias("  Squat  ") == "Barbell Squat"
+
+    def test_unknown_returns_none(self, tmp_db):
+        assert db.resolve_exercise_alias("not-a-thing") is None
+        assert db.resolve_exercise_alias("") is None
+        assert db.resolve_exercise_alias(None) is None
+
+    def test_lookup_exercise_uses_aliases(self, tmp_db):
+        info = db.lookup_exercise("OHP")
+        assert info is not None
+        assert info["name"] == "Standing Military Press"
+        assert "shoulders" in info["primary_muscles"]
+
+    def test_lookup_exercise_falls_through_when_no_alias(self, tmp_db):
+        # No alias for "Plank" → goes straight to exercise_db, which has it.
+        info = db.lookup_exercise("Plank")
+        assert info is not None
+        assert info["name"] == "Plank"
+
+    def test_user_overrides_survive_init(self, tmp_db):
+        # A row inserted with source='user' is preserved across init_db calls;
+        # seed rows get refreshed but user rows don't.
+        with db.get_db() as conn:
+            conn.execute(
+                "INSERT OR REPLACE INTO exercise_aliases (alias, canonical, source) VALUES (?, ?, 'user')",
+                ("ohp", "Push Press"),
+            )
+        db.init_db()  # re-runs the seed
+        assert db.resolve_exercise_alias("ohp") == "Push Press"
+
+    def test_seed_rows_refreshed_on_reinit(self, tmp_db):
+        # Manually corrupt a seed row → next init_db should rewrite it from
+        # the seed dict (without needing INSERT OR REPLACE acrobatics).
+        with db.get_db() as conn:
+            conn.execute(
+                "UPDATE exercise_aliases SET canonical = 'WRONG' WHERE alias = 'ohp'"
+            )
+        assert db.resolve_exercise_alias("ohp") == "WRONG"
+        db.init_db()
+        assert db.resolve_exercise_alias("ohp") == "Standing Military Press"
+
+
 # ── get_last_exercise ────────────────────────────────────────────


--- a/tests/test_exercise_db.py
+++ b/tests/test_exercise_db.py
@ -0,0 +1,66 @@
+"""Tests for the static exercise reference matcher."""
+import exercise_db
+
+
+class TestLookup:
+    """Behaviour of `exercise_db.lookup` against the bundled data file.
+
+    These tests read the real data/exercises.json, so the expected names
+    depend on that file's contents.
+    """
+
+    def test_loads_bundled_data(self):
+        # Free-Exercise-DB ships ~870 entries; just sanity-check it's non-empty.
+        assert len(exercise_db.ALL) > 500
+
+    def test_exact_match(self):
+        m = exercise_db.lookup("Pullups")
+        assert m is not None
+        assert m["name"] == "Pullups"
+        assert "lats" in m["primary_muscles"]
+
+    def test_case_insensitive(self):
+        a = exercise_db.lookup("PULLUPS")
+        b = exercise_db.lookup("pullups")
+        assert a == b
+        assert a["name"] == "Pullups"
+
+    def test_hyphen_matches_compressed(self):
+        # User types "Pull-ups", DB has "Pullups". Compressed form catches it.
+        m = exercise_db.lookup("Pull-ups")
+        assert m is not None
+        assert m["name"] == "Pullups"
+
+    def test_multi_word_substring(self):
+        # NOTE(review): the asserted name equals the input apart from case,
+        # so this resolves at the exact-match tier; the substring tier is
+        # not exercised by this input.
+        m = exercise_db.lookup("Romanian deadlift")
+        assert m is not None
+        assert m["name"] == "Romanian Deadlift"
+        assert "hamstrings" in m["primary_muscles"]
+
+    def test_ambiguous_single_token_returns_none(self):
+        # Lots of DB entries contain "Bench" / "Squat" / "Deadlift" as one
+        # token. Returning the shortest would mislead ("Bench" → "Bench Dips"
+        # → triceps). Refuse instead.
+        assert exercise_db.lookup("Bench") is None
+        assert exercise_db.lookup("Squat") is None
+        assert exercise_db.lookup("Deadlift") is None
+
+    def test_nonsense_returns_none(self):
+        assert exercise_db.lookup("flarbenstompf") is None
+        assert exercise_db.lookup("") is None
+        assert exercise_db.lookup("   ") is None
+
+    def test_short_acronyms_dont_substring_match_characters(self):
+        # Regression: "RDL" used to match "Hurdle Hops" because "rdl"
+        # appears as a character substring inside "hu**rdl**ehops".
+        assert exercise_db.lookup("RDL") is None
+        assert exercise_db.lookup("OHP") is None
+
+    def test_multi_token_requires_full_coverage(self):
+        # Regression: "BB Row" used to match "Sled Row" because both share
+        # the "row" token. Strict 100% coverage prevents this.
+        assert exercise_db.lookup("BB Row") is None
+
+    def test_returned_shape(self):
+        m = exercise_db.lookup("Pullups")
+        # The slim view drops `instructions` and `images`.
+        assert set(m.keys()) >= {
+            "name", "primary_muscles", "secondary_muscles",
+            "equipment", "category", "level",
+        }
+        assert "instructions" not in m
+        assert "images" not in m