# brain/scripts/import_notes.py

"""
Import UpNote 'N:' notes into the Obsidian vault.

Classifies by title keywords into existing vault folders.
Unknown -> 01 Inbox/. All imports tagged with 'upnote-import' for later triage.

Usage:
  python import_notes.py --dry-run     # preview classifications
  python import_notes.py --test        # write to _import_test/<target folder>/ in the vault
  python import_notes.py               # full import
"""

from __future__ import annotations

import argparse
import re
import sys

# Force UTF-8 on stdout so emoji-laden titles can be printed.
# Best effort: if stdout cannot be reconfigured, any error is ignored and the
# stream keeps its current encoding.
try:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
except Exception:
    pass
from dataclasses import dataclass
from pathlib import Path

# Folder holding the UpNote markdown files that collect_source_files() scans
# (machine-specific absolute path).
UPNOTE_ROOT = Path(
    r"C:\Users\d-chrka\AppData\Roaming\UpNote\UpNote Backup"
    r"\HtgSdi2hYyUfnYq3OZkBwx13H5q2\Markdown\General Space"
)
# Root of the Obsidian vault; every target folder is resolved relative to it.
VAULT = Path(r"D:\projects\chrka\brain")
# NOTE(review): INBOX is not referenced anywhere else in this file;
# classify() returns the literal "01 Inbox" instead.
INBOX = VAULT / "01 Inbox"

# Heading line with one to three '#', then "N:"; group 1 is the title without
# an optional trailing bookmark emoji and trailing whitespace.
TITLE_RE = re.compile(r"^#{1,3}\s*N:\s*(.+?)(?:\s*📑)?\s*$", re.MULTILINE)
# Markdown pipe table: header row, separator row, and at least one body row,
# each terminated by a newline.
META_TABLE_RE = re.compile(r"^\|.*\|\s*\n\|[\s\-:|]+\|\s*\n(?:\|.*\|\s*\n)+", re.MULTILINE)
# Line made of exactly three asterisks, optionally separated by whitespace
# (markdown horizontal rule).
HR_RE = re.compile(r"^\*\s*\*\s*\*\s*$", re.MULTILINE)
# Line containing nothing but a <br> / <br/> / <br /> tag.
BR_RE = re.compile(r"^\s*<br\s*/?>\s*$", re.MULTILINE)
# Line containing nothing but a [[LP:...]] wikilink.
LP_LINK_RE = re.compile(r"^\s*\[\[LP:[^\]]+\]\]\s*$", re.MULTILINE)

# Classifier: (regex on lowercased title, target folder relative to VAULT)
# First match wins. Order matters.
#
# NOTE(review): some alternatives below can never decide a classification:
#   - "openSence" contains an upper-case letter, but classify() searches the
#     lowercased title ("opensence" right next to it covers the same input);
#   - "wahlabend ucw" (archive rule) is shadowed by "wahlabend" in the
#     politics rule, which comes first;
#   - "dashboard für pv" (house rule) is shadowed by the same alternative in
#     the IT rule;
#   - "hausrat hetty" (miscellaneous rule) is shadowed by "hausrat" in the
#     finance rule.
RULES: list[tuple[str, str]] = [
    # --- Insurance & finances (family)
    (r"versicherung|haftpflicht|hausrat|rente|rürup|riester|krankenzusatz|berufsunfähig|basisrente|metallrente|direktversicherung|steuererklärung|volkswohlbund|canadalife|union investment|gebäudeversicherung|jagdhaftpflicht|gesetzliche krankenversicherung",
     "03 Bereiche/Finanzen"),

    # --- Health
    (r"supplemente|schmerzmittel|sportübung|7mind|bike-fitting|fitting|präventionskurs|stressmanagement",
     "03 Bereiche/Gesundheit"),

    # --- Politics (UCW/UWG/council group)
    (r"\bucw\b|\buwg\b|fraktionssitzung|mitgliederversammlung|wählergruppen|wahlabend|pressemitteilung|sitzungsvorbereitung|plakatierung|haushalt 20|feuerwehr drolshagen|uwg meeting|terma",
     "04 Ressourcen/Politik"),

    # --- Home network & Home Assistant (private)
    (r"home-?assistant|home-?assistent|müllkalender|zigbee|haproxy|opnsense|adguard|nginx ?proxy|\bnpm\b|multicast dns|macvlan|vlan|sophos|openSence|opensence|heimnetz|reverse-proxy|mqtt|teleport|admin-netz|netzwerk-interface|acme|full cert chain|straso|koogle",
     "03 Bereiche/Heimnetz & Home Assistant"),

    # --- Projects (finished / holidays / celebrations -> archive)
    (r"bretagne|slowenien|italien|kegeltour|ostpreußen|hausbooturlaub|südholland|planung geburtstag|partyplanung|urlaubsplanung|wahlabend ucw",
     "06 Archiv"),

    # --- KIT / IT management (work)
    (r"tisax|pc-migration|berechtigungskonzept|personalgespräch|personalentwicklung|jour fixe|witec|key-user|hydra|mes-support|desktop central|krah-app|verlagerung|docuware",
     "03 Bereiche/KIT"),

    # --- SAP
    (r"sap-transaktionen|sap |\bspn\b|kerberos",
     "03 Bereiche/SAP"),

    # --- IT resources (how-tos, technical notes)
    (r"jenkins|docker|grafana|paperless|immich|gitea|blazor|resharper|jetbrains|wsl|powershell|sql-transaktion|sql-server|appflowy|ec2|claude|prompt|prompot|os-ticket|reject tickets|custom field|autoassign|subscribe longrunning|ssl-zertifikat|hetty|http-interception|dashboard für pv|netzwerkverbindung|fast typing|getting started|notes-organisation|second brain|meta-framework|mail automizer",
     "04 Ressourcen/IT"),

    # --- Psychology / personal
    (r"persönlichkeitstest|die 3 ks",
     "04 Ressourcen/Psychologie"),

    # --- Buchhagen / house
    (r"\bpool\b|teich|klärgrube|zapfanlage|pv-anlage|pv anlage|wechselrichter|stromzähler|dashboard für pv",
     "03 Bereiche/Familie"),

    # --- Miscellaneous resources (ideas, lists)
    (r"bücher|lesestoff|geschenkideen|ideen essen|kindernamen|fahrradtouren|fahrrdtouren|hausrat hetty",
     "04 Ressourcen/divers"),

    # --- Legal / bookings
    (r"agb|geschäftsbedingungen|buchungsbestätigung|chaterbedingungen",
     "04 Ressourcen/divers"),

    # --- Templates
    (r"@@titel@@",
     "04 Ressourcen/divers"),
]


@dataclass
class Note:
    """One parsed source note together with its chosen vault folder."""

    uuid: str         # stem of the source file (parse_note passes src.stem)
    title: str        # heading text after "N:", surrounding asterisks removed
    body: str         # note text after the title line, cleaned by parse_note
    target_rel: str   # folder relative to VAULT


def slugify(title: str) -> str:
    """Turn a note title into a file-name stem.

    Characters that are not allowed in file names (``\\ / : * ? " < > |``)
    become ``-``, runs of whitespace collapse to a single space, the result
    is cut to 120 characters and leading/trailing spaces, dots and dashes
    are trimmed.  Non-ASCII letters (e.g. German umlauts) are kept.

    Args:
        title: The raw note title.

    Returns:
        The cleaned stem, or ``"Untitled"`` when nothing usable is left
        (empty title, or a title made only of replaced/trimmed characters).
        Without the fallback the caller would build a file named ``.md``.
    """
    # keep German chars; remove filesystem-hostile chars
    cleaned = re.sub(r"[\\/:*?\"<>|]", "-", title.strip())
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Trim after truncating so a cut in the middle of "foo - bar" cannot
    # leave a trailing space, dot or dash behind.
    stem = cleaned[:120].strip(" .-")
    return stem or "Untitled"


def classify(title: str) -> str:
    """Return the vault folder (relative to VAULT) for a note title.

    The title is lowercased and tested against each RULES pattern in order;
    the folder of the first pattern found anywhere in the title is returned.
    Titles matching no rule go to ``"01 Inbox"``.
    """
    lowered = title.lower()
    return next(
        (destination for pattern, destination in RULES if re.search(pattern, lowered)),
        "01 Inbox",
    )


def parse_note(src: Path) -> Note | None:
    """Read one source file and turn it into a classified Note.

    The first heading matched by TITLE_RE provides the title; everything
    after that heading becomes the body once the first metadata table, the
    first horizontal rule, all ``[[LP:...]]`` link lines and all ``<br>``
    lines have been removed and blank-line runs have been collapsed.

    Returns:
        The Note, or None when the file has no matching title heading or
        fewer than three characters of body remain.
    """
    raw = src.read_text(encoding="utf-8")
    heading = TITLE_RE.search(raw)
    if heading is None:
        return None

    # strip surrounding markdown bold/italic from title
    title = re.sub(r"^\*+|\*+$", "", heading.group(1).strip()).strip()

    body = raw[heading.end():]
    # (pattern, count) pairs applied in order; count=0 removes every match.
    cleanup_steps = (
        (META_TABLE_RE, 1),
        (HR_RE, 1),
        (LP_LINK_RE, 0),
        (BR_RE, 0),
    )
    for pattern, limit in cleanup_steps:
        body = pattern.sub("", body, count=limit)
    body = re.sub(r"\n{3,}", "\n\n", body).strip()

    if len(body) < 3:
        return None

    return Note(
        uuid=src.stem,
        title=title,
        body=body,
        target_rel=classify(title),
    )


def render(note: Note) -> str:
    """Build the markdown file content for a note.

    The output is a YAML front-matter block carrying the ``upnote-import``
    tag, a level-1 heading with the note title, and the note body followed
    by a final newline.
    """
    front_matter = "---\ntags:\n  - upnote-import\n---\n"
    heading = f"# {note.title}\n"
    return "\n".join((front_matter, heading, f"{note.body}\n"))


def collect_source_files(trash: bool = False) -> list[Path]:
    """Return the source files that look like 'N:' notes.

    Scans the ``*.md`` files directly inside one folder (not recursive):
    UPNOTE_ROOT by default, or its ``trash`` subfolder *instead* when
    ``trash`` is true.  A file qualifies when its text starts with a heading
    of one to three ``#`` followed by ``N:`` and a whitespace character.

    Args:
        trash: Scan ``UPNOTE_ROOT / "trash"`` rather than UPNOTE_ROOT.

    Returns:
        Paths of the qualifying files, in directory-listing order.  Files
        that cannot be opened or read are skipped.
    """
    source_dir = UPNOTE_ROOT / "trash" if trash else UPNOTE_ROOT
    found: list[Path] = []
    for path in source_dir.glob("*.md"):
        try:
            # Only the first characters are needed for the check below, so
            # do not load the whole note.  Undecodable bytes are dropped.
            with path.open(encoding="utf-8", errors="ignore") as fh:
                head = fh.read(200)
        except OSError:
            continue
        if re.match(r"^#{1,3}\s*N:\s", head):
            found.append(path)
    return found


def existing_titles_in_vault() -> set[str]:
    """Collect the lowercased file stems of every ``.md`` file below VAULT.

    Files with a ``.obsidian``, ``node_modules`` or ``_import_test``
    component anywhere in their path are left out.
    """
    ignored_parts = {".obsidian", "node_modules", "_import_test"}
    return {
        md_file.stem.lower()
        for md_file in VAULT.rglob("*.md")
        if ignored_parts.isdisjoint(md_file.parts)
    }


def main() -> int:
    """Command-line entry point: parse, classify and optionally write notes.

    Options:
        --dry-run  Print the classification summary and each note's target
                   folder, then stop without writing anything.
        --test     Write below ``_import_test/`` inside the vault instead of
                   into the real target folders.
        --trash    Read from the ``trash`` source folder and skip notes whose
                   slugified title already exists as a file stem in the vault.
        --limit N  Keep only the first N notes after sorting by target folder
                   and title; 0 or a negative value means no limit.

    Existing files are never overwritten.  When the plain file name is taken
    by a file with different content, the note is written under a name
    carrying the first eight characters of its UUID; if that name is taken
    too, or the existing file already holds exactly the rendered note, the
    note is counted as skipped.

    Returns:
        Always 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--test", action="store_true")
    ap.add_argument("--trash", action="store_true", help="source from trash/, skip titles already in vault")
    ap.add_argument("--limit", type=int, default=0)
    args = ap.parse_args()

    existing = existing_titles_in_vault() if args.trash else set()

    notes: list[Note] = []
    skipped_empty = 0
    skipped_dupe = 0
    for src in collect_source_files(trash=args.trash):
        n = parse_note(src)
        if n is None:
            skipped_empty += 1
            continue
        if args.trash and slugify(n.title).lower() in existing:
            skipped_dupe += 1
            print(f"  skip (already in vault): {n.title}")
            continue
        notes.append(n)

    notes.sort(key=lambda n: (n.target_rel, n.title.lower()))
    # A negative limit would otherwise slice notes off the end of the list.
    if args.limit > 0:
        notes = notes[: args.limit]

    # stats
    counts: dict[str, int] = {}
    for n in notes:
        counts[n.target_rel] = counts.get(n.target_rel, 0) + 1
    print(
        f"Parsed notes: {len(notes)} "
        f"(skipped empty: {skipped_empty}, skipped dupes: {skipped_dupe})"
    )
    print("Classification:")
    # Most populated folders first; ties in alphabetical order.
    for k in sorted(counts, key=lambda x: (-counts[x], x)):
        print(f"  {counts[k]:3d}  {k}")

    if args.dry_run:
        print("\n--- Dry-run detail ---")
        for n in notes:
            print(f"  [{n.target_rel}]  {n.title}")
        return 0

    written = skipped_exists = 0
    base_dir = VAULT / "_import_test" if args.test else VAULT
    for n in notes:
        out_dir = base_dir / n.target_rel
        out_dir.mkdir(parents=True, exist_ok=True)
        stem = slugify(n.title)
        content = render(n)
        target = out_dir / f"{stem}.md"
        if target.exists():
            # A previous run may already have written exactly this note;
            # writing it again under the UUID name would only duplicate it.
            try:
                unchanged = target.read_text(encoding="utf-8") == content
            except (OSError, UnicodeDecodeError):
                unchanged = False
            if unchanged:
                skipped_exists += 1
                continue
            # avoid overwriting existing files; disambiguate with UUID
            target = out_dir / f"{stem} ({n.uuid[:8]}).md"
            if target.exists():
                skipped_exists += 1
                continue
        target.write_text(content, encoding="utf-8")
        written += 1

    print(f"\nWritten: {written}, skipped (exists): {skipped_exists}")
    return 0


if __name__ == "__main__":
    sys.exit(main())