feat: bundle Free-Exercise-DB + name matcher (step 1)

Adds the static exercise reference data (~870 entries, public domain, source: github.com/yuhonas/free-exercise-db) plus a conservative name matcher. New endpoint: GET /api/exercises/lookup?name=<name> → {"match": {"name", "primary_muscles", "secondary_muscles", "equipment", "category", "level", ...}}, or {"match": null} when nothing plausibly matches. Matcher tiers (priority order): 1. exact (case-insensitive) 2. compressed exact ("Pull-ups" → "Pullups") 3. compressed substring, with a guard: single-token generics like "Bench"/"Squat" return null instead of misleading the user — the planned alias table will handle these properly. 4. token-overlap with ≥50% coverage of the user's tokens. UI integration ("Trains: chest · shoulders") comes in step 2. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 11:11:34 +02:00 · 2026-05-24 11:11:34 +02:00 · ebd0016a62
commit ebd0016a62
parent 9e50686983
4 changed files with 22824 additions and 0 deletions
--- a/exercise_db.py
+++ b/exercise_db.py
@ -0,0 +1,141 @@
+"""Static exercise reference data from the Free-Exercise-DB.
+
+Source: https://github.com/yuhonas/free-exercise-db (public domain).
+Bundled at data/exercises.json (~870 entries). Loaded once at import.
+
+Exports:
+- `lookup(name)` — best-effort fuzzy name match → dict with primary/secondary
+  muscles, equipment, etc. Returns None if no plausible match.
+- `ALL` — the raw list (for ad-hoc queries).
+
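+Matching, in priority order:
+  1. exact case-insensitive name match
+  2. exact "compressed" match (lowercased, non-alphanumerics removed), so
+     "Pull-ups" finds "Pullups"
+  3. compressed substring (either way); a single-token query that hits more
+     than two entries returns None rather than guess
+  4. token overlap covering at least 50% of the query's tokens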
+
+Keep this conservative — a wrong match is worse than no match for the user.
+"""
+from __future__ import annotations
+
+import json
+import pathlib
+import re
+from typing import Optional
+
+# Bundled copy of the Free-Exercise-DB, located relative to this module.
+_DATA_PATH = pathlib.Path(__file__).parent / "data" / "exercises.json"
+
+
+def _load(path: Optional[pathlib.Path] = None) -> list[dict]:
+    """Read the exercise list from *path* (default: the bundled JSON file).
+
+    Best-effort by design: a missing, unreadable, wrongly-encoded or
+    malformed file yields an empty list, so importing this module never
+    fails and `lookup()` simply finds no matches.
+
+    Args:
+        path: JSON file to read; `None` means `_DATA_PATH`.
+
+    Returns:
+        The decoded top-level JSON array, or `[]` when the file cannot be
+        read or does not contain an array.
+    """
+    source = _DATA_PATH if path is None else path
+    try:
+        # Decode as UTF-8 explicitly instead of relying on the locale default.
+        with source.open(encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        return []
+    # Anything but an array would break the per-entry indexes built from ALL.
+    return data if isinstance(data, list) else []
+
+
+ALL: list[dict] = _load()
+
+# Normalised name → entry (case-insensitive exact key)
+_BY_LOWER_NAME: dict[str, dict] = {e["name"].lower(): e for e in ALL if e.get("name")}
+
+
+# A token is a run of lowercase ASCII letters and/or digits.
+_TOKEN_RE = re.compile(r"[a-z0-9]+")
+# Any single character that is not a lowercase ASCII letter or digit.
+_NON_ALNUM = re.compile(r"[^a-z0-9]")
+
+
+def _tokens(s: str) -> set[str]:
+    """Return the distinct alphanumeric words of *s*, lowercased."""
+    lowered = s.lower()
+    return {m.group() for m in _TOKEN_RE.finditer(lowered)}
+
+
+def _compress(s: str) -> str:
+    """Lowercase *s* and drop every separator, keeping only [a-z0-9].
+
+    Spelling variants collapse to one key: "Pull-Ups", "Pull Ups" and
+    "Pullups" all become "pullups".
+    """
+    lowered = s.lower()
+    return re.sub(_NON_ALNUM, "", lowered)
+
+
+# Import-time indexes over every entry that has a name; built once so that
+# lookup() never has to re-normalise the database.
+_COMPRESSED: list[tuple[dict, str]] = [
+    (entry, _compress(entry["name"])) for entry in ALL if entry.get("name")
+]
+_TOKENS: list[tuple[dict, set[str]]] = [
+    (entry, _tokens(entry["name"])) for entry, _ in _COMPRESSED
+]
+# When two names compress to the same key, the later entry in ALL wins.
+_BY_COMPRESSED: dict[str, dict] = {key: entry for entry, key in _COMPRESSED}
+
+
+# Public-facing slim shape — drop instructions/images for now (heavy).
+def _slim(entry: dict) -> dict:
+    """Project a raw DB entry onto the fields exposed to callers.
+
+    Args:
+        entry: One record from the exercise list (camelCase keys).
+
+    Returns:
+        A new dict with snake_case keys. Missing scalar fields come back as
+        None; missing or empty muscle lists come back as `[]`. The muscle
+        lists are copies, so mutating the result never touches the shared
+        module-level data.
+    """
+    return {
+        "name": entry.get("name"),
+        # list(...) copies: the originals live in the shared, import-time DB.
+        "primary_muscles": list(entry.get("primaryMuscles") or []),
+        "secondary_muscles": list(entry.get("secondaryMuscles") or []),
+        "equipment": entry.get("equipment"),
+        "category": entry.get("category"),
+        "level": entry.get("level"),
+        "force": entry.get("force"),
+        "mechanic": entry.get("mechanic"),
+    }
+
+
+def lookup(name: str) -> Optional[dict]:
+    """Return the slim entry for the best name match, or None.
+
+    Tiers are tried in order; the first one that produces a result wins:
+
+    1. exact, case-insensitive name;
+    2. exact "compressed" name (lowercase alphanumerics only);
+    3. compressed substring in either direction — the shortest DB name
+       wins, but a single-token query with more than two candidates is
+       refused as too generic;
+    4. token overlap covering at least half of the query's tokens — the
+       most shared tokens wins, a shorter DB name breaks ties.
+
+    Args:
+        name: Exercise name as typed by the user.
+
+    Returns:
+        The `_slim()` view of the matched entry, or None when the input is
+        empty, contains no ASCII letters or digits, or nothing plausibly
+        matches.
+    """
+    if not name:
+        return None
+    needle = name.strip()
+    compressed = _compress(needle)
+    if not compressed:
+        # Covers blank input as well as input made only of punctuation.
+        return None
+
+    # 1. Exact (case-insensitive)
+    hit = _BY_LOWER_NAME.get(needle.lower())
+    if hit:
+        return _slim(hit)
+
+    # 2. Compressed exact — catches "Pull-ups" → "Pullups", etc.
+    hit = _BY_COMPRESSED.get(compressed)
+    if hit:
+        return _slim(hit)
+
+    # A non-empty `compressed` guarantees at least one token here.
+    needle_toks = _tokens(needle)
+
+    # 3. Compressed substring (either direction). `c` must be non-empty:
+    # "" is a substring of every string, so a DB name without any
+    # alphanumerics would otherwise match every query.
+    substring_candidates: list[dict] = [
+        e
+        for e, c in _COMPRESSED
+        if c and (compressed in c or c in compressed)
+    ]
+    if substring_candidates:
+        # Single-token generics ("Bench", "Squat", "Deadlift") match too many
+        # specific DB entries. Refuse rather than confidently mislead the
+        # user — the planned alias table will handle these properly.
+        if len(needle_toks) == 1 and len(substring_candidates) > 2:
+            return None
+        # Shortest name wins; min() keeps the first of equally short names.
+        return _slim(min(substring_candidates, key=lambda e: len(e["name"])))
+
+    # 4. Token overlap. Require ≥1 shared token AND that the shared tokens
+    # cover ≥50% of the user's tokens. Rank by number of shared tokens and
+    # use DB-name length strictly as a tiebreak (shorter wins); comparing
+    # tuples keeps length from ever outranking coverage.
+    best_rank: tuple[int, int] | None = None
+    best_entry: dict | None = None
+    for entry, db_toks in _TOKENS:
+        shared = len(needle_toks & db_toks)
+        if shared == 0 or 2 * shared < len(needle_toks):
+            continue
+        rank = (shared, -len(entry["name"]))
+        if best_rank is None or rank > best_rank:
+            best_rank, best_entry = rank, entry
+
+    return _slim(best_entry) if best_entry is not None else None