# bigbiggerbiggestbot/exercise_db.py

"""Static exercise reference data from the Free-Exercise-DB.

Source: https://github.com/yuhonas/free-exercise-db (public domain).
Bundled at data/exercises.json (~870 entries). Loaded once at import.

Exports:
- `lookup(name)` — best-effort fuzzy name match → dict with primary/secondary
  muscles, equipment, etc. Returns None if no plausible match.
- `ALL` — the raw list (for ad-hoc queries).

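Matching, in priority order:
  1. exact case-insensitive name match
  2. exact match after stripping everything but letters and digits
     ("Pull-ups" → "Pullups")
  3. case-insensitive substring, either way (multi-token inputs only)
  4. token overlap where the DB name contains every input token
     (multi-token inputs only)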

Keep this conservative — a wrong match is worse than no match for the user.
"""
from __future__ import annotations

import json
import pathlib
import re
from typing import Optional

_DATA_PATH = pathlib.Path(__file__).parent / "data" / "exercises.json"


def _load(path: Optional[pathlib.Path] = None) -> list[dict]:
    """Read the exercise records from a JSON file, best-effort.

    `path` is the file to read; when omitted, the bundled `_DATA_PATH` is
    used. The file is expected to hold a top-level JSON array of objects.

    Returns the dict entries of that array in file order. Returns [] when the
    file cannot be opened, is not valid UTF-8, is not valid JSON, or does not
    hold an array. Array items that are not JSON objects are dropped, so
    callers can rely on every element supporting `.get()`.
    """
    source = _DATA_PATH if path is None else path
    try:
        # JSON is UTF-8; spell it out instead of inheriting the platform's
        # locale encoding, which would garble or reject non-ASCII names.
        with source.open(encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        # Deliberate fallback: a missing or corrupt data file means an empty
        # DB rather than a failure to import.
        return []
    if not isinstance(data, list):
        return []
    return [record for record in data if isinstance(record, dict)]


# Every exercise record returned by the loader, in file order. Built once,
# when the module is imported.
ALL: list[dict] = _load()

# Lower-cased name → record, backing the case-insensitive exact-match tier.
# Records without a truthy "name" are left out; if two records share a
# lower-cased name, the later one in ALL wins.
_BY_LOWER_NAME: dict[str, dict] = {
    record["name"].lower(): record
    for record in ALL
    if record.get("name")
}


# A maximal run of ASCII lowercase letters/digits, i.e. one "word".
_TOKEN_RE = re.compile(r"[a-z0-9]+")
# Any single character that is NOT an ASCII lowercase letter or digit.
_NON_ALNUM = re.compile(r"[^a-z0-9]")


def _tokens(s: str) -> set[str]:
    """Return the distinct lowercase alphanumeric words found in `s`.

    "Pull-Ups" → {"pull", "ups"}; a string with no letters or digits gives
    an empty set.
    """
    lowered = s.lower()
    return {match.group(0) for match in _TOKEN_RE.finditer(lowered)}


def _compress(s: str) -> str:
    """Lower-case `s` and strip every character that is not a letter or digit.

    "Pull-Ups", "Pull Ups" and "Pullups" all come out as "pullups".
    """
    lowered = s.lower()
    return _NON_ALNUM.sub("", lowered)


# Derived lookup tables, computed a single time at import. Each one covers
# only the records in ALL that carry a truthy "name".

# (record, token set of its name), in ALL order.
_TOKENS: list[tuple[dict, set[str]]] = [
    (record, _tokens(record["name"]))
    for record in ALL
    if record.get("name")
]

# (record, compressed form of its name), in ALL order.
_COMPRESSED: list[tuple[dict, str]] = [
    (record, _compress(record["name"]))
    for record in ALL
    if record.get("name")
]

# Compressed name → record. Reuses the pairs above; when two names compress
# to the same key, the later record in ALL wins.
_BY_COMPRESSED: dict[str, dict] = {
    squashed: record for record, squashed in _COMPRESSED
}


# Public-facing slim shape — drop instructions/images for now (heavy).
def _slim(entry: dict) -> dict:
    """Return the public, lightweight view of a raw DB record.

    Only the keys built below are carried over; anything else on the record
    (instructions, images, ...) is left out. Scalar fields missing from the
    record come back as None, and missing or empty muscle lists as [].

    The muscle lists are copied, so a caller that mutates the result cannot
    alter the record it was built from.
    """

    def _muscle_list(key: str):
        value = entry.get(key) or []
        # Copy real lists; pass any other truthy value through untouched so
        # unexpected data keeps the shape it had.
        return list(value) if isinstance(value, list) else value

    return {
        "name": entry.get("name"),
        "primary_muscles": _muscle_list("primaryMuscles"),
        "secondary_muscles": _muscle_list("secondaryMuscles"),
        "equipment": entry.get("equipment"),
        "category": entry.get("category"),
        "level": entry.get("level"),
        "force": entry.get("force"),
        "mechanic": entry.get("mechanic"),
    }


# Characters that count as "inside a word". Same alphabet as the tokenizer's
# [a-z0-9], so the substring tier and the token tier agree on word edges.
_WORD_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789")


def _occurs_on_word_boundaries(haystack: str, needle: str) -> bool:
    """Return True if `needle` occurs in `haystack` as whole words.

    Both strings are expected to be lower-cased already. An occurrence counts
    only when the character just before it and the character just after it
    (where they exist) are not in `_WORD_CHARS`: "arm curl" is found in
    "one arm curl" but not in "forearm curl".
    """
    width = len(needle)
    start = haystack.find(needle)
    while start != -1:
        end = start + width
        opens_word = start == 0 or haystack[start - 1] not in _WORD_CHARS
        closes_word = end == len(haystack) or haystack[end] not in _WORD_CHARS
        if opens_word and closes_word:
            return True
        # This occurrence was glued to a neighbouring word; try the next one.
        start = haystack.find(needle, start + 1)
    return False


def lookup(name: str) -> Optional[dict]:
    """Return the slim entry for the best name match, or None.

    Tiers (priority order):
      1. exact (case-insensitive)
      2. compressed exact — collapses hyphens/spaces ("Pull-ups" → "Pullups")
      3. word-boundary substring, either direction (only for multi-token
         inputs); among several hits, the DB name closest in length to the
         input wins
      4. token overlap requiring 100% coverage of the user's tokens
         (only for multi-token inputs); the shortest DB name wins

    Single-token inputs (e.g. "Bench", "Squat", "RDL", "OHP") that don't hit
    tier 1 or 2 return None — there's no robust way to disambiguate without
    an alias table.

    Empty, whitespace-only, punctuation-only and non-string inputs return
    None.
    """
    if not name or not isinstance(name, str):
        return None
    needle = name.strip()
    if not needle:
        return None
    lower = needle.lower()
    compressed = _compress(needle)
    if not compressed:
        # Nothing but punctuation/whitespace: no tier could match sensibly.
        return None

    # 1. Exact (case-insensitive).
    hit = _BY_LOWER_NAME.get(lower)
    if hit is not None:
        return _slim(hit)

    # 2. Compressed exact — "Pull-ups" → "Pullups".
    hit = _BY_COMPRESSED.get(compressed)
    if hit is not None:
        return _slim(hit)

    needle_toks = _tokens(needle)
    # Below here, partial matches only — and only for multi-token inputs.
    # A single token is too easily ambiguous (and short acronyms accidentally
    # hit character-level substrings of unrelated names).
    if len(needle_toks) < 2:
        return None

    # 3. Word-boundary substring (lowercase), in either direction.
    # "bench press" in "bench press with chains" — yes.
    # "arm curl" in "forearm curl" — no (the match starts mid-word).
    substring_candidates: list[dict] = []
    for entry in ALL:
        db_name = entry.get("name", "")
        if not db_name:
            continue
        db_lower = db_name.lower()
        if _occurs_on_word_boundaries(db_lower, lower) or _occurs_on_word_boundaries(
            lower, db_lower
        ):
            substring_candidates.append(entry)
    if substring_candidates:
        # Prefer the DB name closest in length to the typed input. When the
        # input sits inside the DB name that is the shortest name; when the
        # DB name sits inside the input it is the longest one, i.e. the entry
        # that accounts for most of what the user typed. Ties keep DB order.
        target = len(needle)
        return _slim(
            min(substring_candidates, key=lambda e: abs(len(e["name"]) - target))
        )

    # 4. Token overlap, 100% coverage of the user's tokens. So "BB Row" with
    # tokens {bb, row} only matches a DB entry that contains both — it never
    # silently latches onto a "row" entry that doesn't share the "bb" cue.
    covering = [
        entry for entry, db_toks in _TOKENS if db_toks and needle_toks <= db_toks
    ]
    if not covering:
        return None
    # All user tokens matched; the shortest DB name (the most specific
    # variant) wins, and ties keep DB order.
    return _slim(min(covering, key=lambda e: len(e["name"])))