journal-bot/tests/test_prompt_regression.py

81 lines
3.0 KiB
Python

"""Regression tests that pin down our processor contract via mocked LLM output.
These tests don't call a real model. They simulate what a *correct* model would
return for each input and ensure our schema + wiring accepts it. If we later
change the prompt or schema, these tests surface silent regressions.
"""
import json
from pathlib import Path
import yaml
import httpx
import respx
from journal_bot.processor_lmstudio import LMStudioProcessor
from journal_bot.processor_protocol import ProcessorInput
FIXTURE = Path(__file__).parent / "fixtures" / "prompt_regression" / "inputs.yaml"
def _golden_response(case: dict) -> dict:
"""Construct the JSON a well-behaved model would return for the input case."""
today = case["today"]
target_date = case.get("expected", {}).get("target_date", today)
text = case["text"]
clarifications: list[str] = []
raw_excluded: list[str] = []
entry = f"## {case['received_time']}\n{text}"
if "Schreib ins Journal" in text:
raw_excluded.append("Schreib ins Journal, dass")
entry = f"## {case['received_time']}\nIch habe gut geschlafen"
if case["name"] == "ambiguous_person_clarification":
clarifications.append("Welcher Steffen?")
entry = f"## {case['received_time']}\nTreffen mit Steffen besprochen"
if case["name"] == "gestern_resolves_to_yesterday":
entry = f"## {case['received_time']}\nParty gefeiert"
return {
"target_date": target_date,
"target_path": f"05 Daily Notes/{target_date}.md",
"entry_markdown": entry,
"clarifications": clarifications,
"raw_excluded": raw_excluded,
}
def _load_cases():
    """Read the regression fixture file and return its parsed YAML content."""
    raw_yaml = FIXTURE.read_text(encoding="utf-8")
    return yaml.safe_load(raw_yaml)
@respx.mock
def test_prompt_regression_cases():
    """Run every fixture case through the processor against a mocked endpoint.

    For each case the chat-completions URL is mocked to answer with the golden
    response for that case, the processor is invoked, and only the
    expectations the case declares under ``expected`` are asserted.
    """
    cases = _load_cases()
    lm_processor = LMStudioProcessor(
        base_url="http://localhost:1234/v1",
        model="qwen/qwen3-vl-8b",
        system_prompt="SYS",
    )

    for case in cases:
        # Point the mocked chat-completions route at this case's golden answer.
        route = respx.post("http://localhost:1234/v1/chat/completions")
        completion = {
            "choices": [
                {"message": {"content": json.dumps(_golden_response(case))}}
            ]
        }
        route.mock(return_value=httpx.Response(200, json=completion))

        request = ProcessorInput(
            today=case["today"],
            weekday="Sonntag",
            received_time=case["received_time"],
            persons=case.get("persons", []),
            projects=[],
            text=case["text"],
        )
        result = lm_processor.process(request)

        # The case name is used as the assertion message to identify failures.
        label = case["name"]
        expected = case.get("expected", {})

        if "target_date" in expected:
            assert result.target_date == expected["target_date"], label

        if expected.get("raw_excluded_contains"):
            needle = expected["raw_excluded_contains"]
            assert any(needle in item for item in result.raw_excluded), label

        if expected.get("entry_excludes"):
            forbidden = expected["entry_excludes"]
            assert forbidden not in result.entry_markdown, label

        if expected.get("clarifications_nonempty"):
            assert len(result.clarifications) >= 1, label

        # NOTE(review): the reset is assumed to run once per case; confirm
        # against the original layout.
        respx.reset()