feat: bundle Free-Exercise-DB + name matcher (step 1)

Adds the static exercise reference data (~870 entries, public domain, source: github.com/yuhonas/free-exercise-db) plus a conservative name matcher. New endpoint: GET /api/exercises/lookup?name=<name> → {"match": {"name", "primary_muscles", "secondary_muscles", "equipment", "category", "level", ...}}, or {"match": null} when nothing plausibly matches. Matcher tiers (priority order): 1. exact (case-insensitive) 2. compressed exact ("Pull-ups" → "Pullups") 3. compressed substring, with a guard: single-token generics like "Bench"/"Squat" return null instead of misleading the user — the planned alias table will handle these properly. 4. token-overlap with ≥50% coverage of the user's tokens. UI integration ("Trains: chest · shoulders") comes in step 2. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 11:11:34 +02:00 · 2026-05-24 11:11:34 +02:00 · ebd0016a62
commit ebd0016a62
parent 9e50686983
4 changed files with 22824 additions and 0 deletions
--- a/data/exercises.json
+++ b/data/exercises.json
--- a/exercise_db.py
+++ b/exercise_db.py
@ -0,0 +1,141 @@
+"""Static exercise reference data from the Free-Exercise-DB.
+
+Source: https://github.com/yuhonas/free-exercise-db (public domain).
+Bundled at data/exercises.json (~870 entries). Loaded once at import.
+
+Exports:
+- `lookup(name)` — best-effort fuzzy name match → dict with primary/secondary
+  muscles, equipment, etc. Returns None if no plausible match.
+- `ALL` — the raw list (for ad-hoc queries).
+
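+Matching, in priority order:
+  1. exact case-insensitive name match
+  2. compressed exact match (lowercased, non-alphanumerics removed)
+  3. compressed substring (either way); a single-token query with more than
+     two candidates returns None
+  4. token overlap covering at least 50% of the query's tokens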
+
+Keep this conservative — a wrong match is worse than no match for the user.
+"""
+from __future__ import annotations
+
+import json
+import pathlib
+import re
+from typing import Optional
+
+# Bundled dataset location: <directory of this module>/data/exercises.json
+_DATA_PATH = pathlib.Path(__file__).parent.joinpath("data", "exercises.json")
+
+
+def _load() -> list[dict]:
+    """Read the bundled exercise list from `_DATA_PATH`.
+
+    Loading is best-effort so that importing this module never fails: if the
+    file is missing or unreadable, is not valid UTF-8, is not valid JSON, or
+    its top level is not a list, a warning is logged and an empty list is
+    returned. Entries that are not JSON objects are dropped, because the
+    module-level indexes call `.get()` on every entry.
+
+    Returns:
+        The list of exercise dicts, possibly empty.
+    """
+    import logging
+
+    log = logging.getLogger(__name__)
+    try:
+        # JSON is UTF-8; do not depend on the platform's locale encoding.
+        with _DATA_PATH.open(encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError) as exc:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        log.warning("Could not load exercise data from %s: %s", _DATA_PATH, exc)
+        return []
+    if not isinstance(data, list):
+        log.warning("Exercise data at %s is not a JSON list", _DATA_PATH)
+        return []
+    return [entry for entry in data if isinstance(entry, dict)]
+
+
+# Raw dataset, loaded once when the module is imported.
+ALL: list[dict] = _load()
+
+# Lowercased name → entry, for the case-insensitive exact tier. Entries
+# without a truthy "name" are skipped; on duplicate keys the later entry wins.
+_BY_LOWER_NAME: dict[str, dict] = {
+    entry["name"].lower(): entry
+    for entry in ALL
+    if entry.get("name")
+}
+
+
+# Maximal runs of ASCII lowercase letters/digits; `_tokens` applies it to
+# lowercased text.
+_TOKEN_RE = re.compile(r"[a-z0-9]+")
+# Any single character that is not an ASCII lowercase letter or digit;
+# `_compress` deletes these from lowercased text.
+_NON_ALNUM = re.compile(r"[^a-z0-9]")
+
+
+def _tokens(s: str) -> set[str]:
+    """Return the distinct lowercase alphanumeric tokens found in `s`."""
+    lowered = s.lower()
+    return {m.group(0) for m in _TOKEN_RE.finditer(lowered)}
+
+
+def _compress(s: str) -> str:
+    """Lowercase `s` and drop every character that is not a-z or 0-9.
+
+    Separators vanish, so "Pull-Ups", "Pull Ups" and "Pullups" all become
+    "pullups".
+    """
+    kept_pieces = _NON_ALNUM.split(s.lower())
+    return "".join(kept_pieces)
+
+
+# Derived lookup tables, computed once at import. Only entries with a truthy
+# "name" are indexed.
+
+# (entry, token set of its name) — scanned by the token-overlap tier.
+_TOKENS: list[tuple[dict, set[str]]] = [
+    (entry, _tokens(entry["name"]))
+    for entry in ALL
+    if entry.get("name")
+]
+# (entry, compressed name) — scanned by the compressed-substring tier.
+_COMPRESSED: list[tuple[dict, str]] = [
+    (entry, _compress(entry["name"]))
+    for entry in ALL
+    if entry.get("name")
+]
+# compressed name → entry — used by the compressed-exact tier. On duplicate
+# compressed names the later entry wins.
+_BY_COMPRESSED: dict[str, dict] = {
+    squashed: entry for entry, squashed in _COMPRESSED
+}
+
+
+# Public-facing slim shape — instructions/images are left out for now (heavy).
+def _slim(entry: dict) -> dict:
+    """Project a raw DB entry onto the trimmed shape returned to callers.
+
+    The two muscle lists are renamed to snake_case and default to [] when
+    missing or falsy; every other field is copied as-is (None when absent).
+    """
+    slim: dict = {"name": entry.get("name")}
+    for out_key, src_key in (
+        ("primary_muscles", "primaryMuscles"),
+        ("secondary_muscles", "secondaryMuscles"),
+    ):
+        slim[out_key] = entry.get(src_key) or []
+    for key in ("equipment", "category", "level", "force", "mechanic"):
+        slim[key] = entry.get(key)
+    return slim
+
+
+def lookup(name: str) -> Optional[dict]:
+    """Return the slim entry for the best name match, or None.
+
+    Tiers are tried in order and the first hit wins:
+      1. exact case-insensitive name
+      2. compressed exact (lowercase a-z/0-9 only), e.g. "Pull-ups" → "Pullups"
+      3. compressed substring in either direction; the shortest DB name wins,
+         but a single-token query with more than two candidates is refused
+      4. token overlap covering at least half of the query's tokens; the
+         highest coverage wins and a shorter DB name only breaks exact ties
+
+    Args:
+        name: The user-typed exercise name.
+
+    Returns:
+        The slim dict built by `_slim`, or None when nothing plausibly matches
+        (including empty/whitespace input or input with no a-z/0-9 characters).
+    """
+    if not name:
+        return None
+    needle = name.strip()
+    compressed = _compress(needle)
+    if not compressed:
+        # Empty, whitespace-only, or no ASCII alphanumerics at all.
+        return None
+
+    # 1. Exact (case-insensitive)
+    hit = _BY_LOWER_NAME.get(needle.lower())
+    if hit:
+        return _slim(hit)
+
+    # 2. Compressed exact — catches "Pull-ups" → "Pullups", etc.
+    hit = _BY_COMPRESSED.get(compressed)
+    if hit:
+        return _slim(hit)
+
+    # `compressed` is non-empty, so there is always at least one token here.
+    needle_toks = _tokens(needle)
+
+    # 3. Compressed substring (either direction). DB names that compress to
+    # "" are skipped: the empty string is a substring of every query and
+    # would otherwise be a candidate for everything.
+    substring_candidates: list[dict] = [
+        e for e, c in _COMPRESSED
+        if c and (compressed in c or c in compressed)
+    ]
+    if substring_candidates:
+        # Single-token generics ("Bench", "Squat", "Deadlift") match too many
+        # specific DB entries. Refuse rather than confidently mislead the
+        # user — the planned alias table will handle these properly.
+        if len(needle_toks) == 1 and len(substring_candidates) > 2:
+            return None
+        # min() returns the first shortest name, same as a stable sort + [0].
+        return _slim(min(substring_candidates, key=lambda e: len(e["name"])))
+
+    # 4. Token overlap. Require that the shared tokens cover ≥50% of the
+    # user's tokens. The number of user tokens is fixed for this call, so
+    # ranking by shared-token count is ranking by coverage; name length is a
+    # strict tiebreak and can never outrank a higher coverage.
+    total = len(needle_toks)
+    best_key: tuple[int, int] | None = None
+    best_entry: dict | None = None
+    for entry, db_toks in _TOKENS:
+        shared = len(needle_toks & db_toks)
+        # Integer form of "coverage < 0.5"; also rejects zero overlap.
+        if shared == 0 or 2 * shared < total:
+            continue
+        key = (shared, -len(entry["name"]))
+        # Strict ">" keeps the earliest entry on a full tie.
+        if best_key is None or key > best_key:
+            best_key, best_entry = key, entry
+
+    return _slim(best_entry) if best_entry is not None else None
--- a/server.py
+++ b/server.py
@ -18,6 +18,7 @@ from urllib.parse import parse_qs
 from aiohttp import web

 from db import init_db, save_workout, get_workouts, get_workout_count, get_stats_sql, delete_workout, update_workout, export_workouts, get_user_workout_number, get_all_exercise_names, log_event, get_settings, update_settings, get_last_exercise
+import exercise_db
 from parser import parse_workout, format_workout

 logging.basicConfig(
@ -293,6 +294,15 @@ async def api_get_last_exercise(request: web.Request):
    return web.json_response({"last": last})


+@require_auth
+async def api_lookup_exercise(request: web.Request):
+    """Resolve the `name` query parameter against the static exercise data.
+
+    Responds with {"match": <result of exercise_db.lookup>} for a non-blank
+    name, or a 400 {"error": "Missing name"} when it is absent or blank.
+    """
+    query_name = request.query.get("name", "").strip()
+    if query_name:
+        match = exercise_db.lookup(query_name)
+        return web.json_response({"match": match})
+    return web.json_response({"error": "Missing name"}, status=400)
+
+
@require_auth
 async def api_get_stats(request: web.Request):
    """Return summary stats for the user."""
@ -377,6 +387,7 @@ def create_app() -> web.Application:
    app.router.add_delete("/api/workouts/{workout_id}", api_delete_workout)
    app.router.add_get("/api/exercises", api_get_exercise_names)
    app.router.add_get("/api/exercises/last", api_get_last_exercise)
+    app.router.add_get("/api/exercises/lookup", api_lookup_exercise)
    app.router.add_get("/api/stats", api_get_stats)
    app.router.add_get("/api/export/json", api_export_json)
    app.router.add_get("/api/export/csv", api_export_csv)
--- a/tests/test_exercise_db.py
+++ b/tests/test_exercise_db.py
@ -0,0 +1,55 @@
+"""Tests for the static exercise reference matcher."""
+import exercise_db
+
+
+class TestLookup:
+    """Exercises `exercise_db.lookup` against the bundled dataset."""
+
+    def test_loads_bundled_data(self):
+        # Free-Exercise-DB ships ~870 entries; just sanity-check it's non-empty.
+        assert len(exercise_db.ALL) > 500
+
+    def test_exact_match(self):
+        m = exercise_db.lookup("Pullups")
+        assert m is not None
+        assert m["name"] == "Pullups"
+        assert "lats" in m["primary_muscles"]
+
+    def test_case_insensitive(self):
+        a = exercise_db.lookup("PULLUPS")
+        b = exercise_db.lookup("pullups")
+        # Guard first so a missing match fails as an assertion, not TypeError.
+        assert a is not None
+        assert a == b
+        assert a["name"] == "Pullups"
+
+    def test_hyphen_matches_compressed(self):
+        # User types "Pull-ups", DB has "Pullups". Compressed form catches it.
+        m = exercise_db.lookup("Pull-ups")
+        assert m is not None
+        assert m["name"] == "Pullups"
+
+    def test_multi_word_substring(self):
+        m = exercise_db.lookup("Romanian deadlift")
+        assert m is not None
+        assert m["name"] == "Romanian Deadlift"
+        assert "hamstrings" in m["primary_muscles"]
+
+    def test_ambiguous_single_token_returns_none(self):
+        # Lots of DB entries contain "Bench" / "Squat" / "Deadlift" as one
+        # token. Returning the shortest would mislead ("Bench" → "Bench Dips"
+        # → triceps). Refuse instead.
+        assert exercise_db.lookup("Bench") is None
+        assert exercise_db.lookup("Squat") is None
+        assert exercise_db.lookup("Deadlift") is None
+
+    def test_nonsense_returns_none(self):
+        assert exercise_db.lookup("flarbenstompf") is None
+        assert exercise_db.lookup("") is None
+        assert exercise_db.lookup("   ") is None
+
+    def test_returned_shape(self):
+        m = exercise_db.lookup("Pullups")
+        # Guard first so a missing match fails as an assertion, not
+        # AttributeError on `.keys()`.
+        assert m is not None
+        # The slim view drops `instructions` and `images`.
+        assert set(m.keys()) >= {
+            "name", "primary_muscles", "secondary_muscles",
+            "equipment", "category", "level",
+        }
+        assert "instructions" not in m
+        assert "images" not in m