brain/scripts/import_notes.py

246 lines
8.5 KiB
Python

"""
Import UpNote 'N:' notes into the Obsidian vault.
Classifies by title keywords into existing vault folders.
Unknown -> 01 Inbox/. All imports tagged with 'upnote-import' for later triage.
Usage:
python import_notes.py --dry-run # preview classifications
python import_notes.py --test # write below _import_test/ in the vault root (target folders preserved)
python import_notes.py # full import
"""
from __future__ import annotations
import argparse
import re
import sys
# Force UTF-8 on stdout so that titles containing emoji can be printed.
try:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
except Exception:
    # Best effort: if stdout cannot be reconfigured, keep its current encoding.
    pass
from dataclasses import dataclass
from pathlib import Path
# Folder of the UpNote Markdown backup that is scanned for source notes
# (absolute, machine-specific path).
UPNOTE_ROOT = Path(
    r"C:\Users\d-chrka\AppData\Roaming\UpNote\UpNote Backup"
    r"\HtgSdi2hYyUfnYq3OZkBwx13H5q2\Markdown\General Space"
)
# Root of the Obsidian vault; classification folders are relative to it.
VAULT = Path(r"D:\projects\chrka\brain")
# Inbox folder inside the vault.
# NOTE(review): not referenced anywhere else in the visible source.
INBOX = VAULT / "01 Inbox"
# Heading line (1-3 '#') that starts with "N:"; group 1 is the title, without
# an optional trailing bookmark emoji.
TITLE_RE = re.compile(r"^#{1,3}\s*N:\s*(.+?)(?:\s*📑)?\s*$", re.MULTILINE)
# Markdown pipe table: header row, separator row, then one or more body rows.
META_TABLE_RE = re.compile(r"^\|.*\|\s*\n\|[\s\-:|]+\|\s*\n(?:\|.*\|\s*\n)+", re.MULTILINE)
# Horizontal rule written as three asterisks on their own line ("***" or "* * *").
HR_RE = re.compile(r"^\*\s*\*\s*\*\s*$", re.MULTILINE)
# Line that contains nothing but a <br> / <br/> tag.
BR_RE = re.compile(r"^\s*<br\s*/?>\s*$", re.MULTILINE)
# Line that contains nothing but a [[LP:...]] wiki link.
LP_LINK_RE = re.compile(r"^\s*\[\[LP:[^\]]+\]\]\s*$", re.MULTILINE)
# Classifier: (regex applied to the lowercased title, target folder relative to VAULT).
# First match wins, so order matters: a specific phrase must be listed BEFORE any
# rule containing a more general alternative that would also match it.
RULES: list[tuple[str, str]] = [
    # --- Specific overrides. These phrases also appear in the category rules
    #     below, but there they can never match because an earlier, more general
    #     alternative ("wahlabend" / "\bucw\b" in politics, "hausrat" in finance)
    #     wins first. Listing them here makes the intended routing effective.
    (r"wahlabend ucw",
     "06 Archiv"),
    (r"hausrat hetty",
     "04 Ressourcen/divers"),
    # --- Insurance & finances (family)
    (r"versicherung|haftpflicht|hausrat|rente|rürup|riester|krankenzusatz|berufsunfähig|basisrente|metallrente|direktversicherung|steuererklärung|volkswohlbund|canadalife|union investment|gebäudeversicherung|jagdhaftpflicht|gesetzliche krankenversicherung",
     "03 Bereiche/Finanzen"),
    # --- Health
    (r"supplemente|schmerzmittel|sportübung|7mind|bike-fitting|fitting|präventionskurs|stressmanagement",
     "03 Bereiche/Gesundheit"),
    # --- Politics (UCW/UWG/council group)
    (r"\bucw\b|\buwg\b|fraktionssitzung|mitgliederversammlung|wählergruppen|wahlabend|pressemitteilung|sitzungsvorbereitung|plakatierung|haushalt 20|feuerwehr drolshagen|uwg meeting|terma",
     "04 Ressourcen/Politik"),
    # --- Home network & Home Assistant (private)
    # NOTE(review): "openSence" contains an uppercase letter and is matched
    # against a lowercased title, so only the "opensence" spelling is effective.
    (r"home-?assistant|home-?assistent|müllkalender|zigbee|haproxy|opnsense|adguard|nginx ?proxy|\bnpm\b|multicast dns|macvlan|vlan|sophos|openSence|opensence|heimnetz|reverse-proxy|mqtt|teleport|admin-netz|netzwerk-interface|acme|full cert chain|straso|koogle",
     "03 Bereiche/Heimnetz & Home Assistant"),
    # --- Projects (finished / holidays / celebrations -> archive)
    (r"bretagne|slowenien|italien|kegeltour|ostpreußen|hausbooturlaub|südholland|planung geburtstag|partyplanung|urlaubsplanung|wahlabend ucw",
     "06 Archiv"),
    # --- KIT / IT management (work)
    (r"tisax|pc-migration|berechtigungskonzept|personalgespräch|personalentwicklung|jour fixe|witec|key-user|hydra|mes-support|desktop central|krah-app|verlagerung|docuware",
     "03 Bereiche/KIT"),
    # --- SAP
    (r"sap-transaktionen|sap |\bspn\b|kerberos",
     "03 Bereiche/SAP"),
    # --- IT resources (how-tos, technical notes)
    (r"jenkins|docker|grafana|paperless|immich|gitea|blazor|resharper|jetbrains|wsl|powershell|sql-transaktion|sql-server|appflowy|ec2|claude|prompt|prompot|os-ticket|reject tickets|custom field|autoassign|subscribe longrunning|ssl-zertifikat|hetty|http-interception|dashboard für pv|netzwerkverbindung|fast typing|getting started|notes-organisation|second brain|meta-framework|mail automizer",
     "04 Ressourcen/IT"),
    # --- Psychology / personal
    (r"persönlichkeitstest|die 3 ks",
     "04 Ressourcen/Psychologie"),
    # --- Buchhagen / house
    # NOTE(review): "dashboard für pv" is also part of the IT rule above, which
    # wins; decide which folder is intended and drop the other occurrence.
    (r"\bpool\b|teich|klärgrube|zapfanlage|pv-anlage|pv anlage|wechselrichter|stromzähler|dashboard für pv",
     "03 Bereiche/Familie"),
    # --- Miscellaneous resources (ideas, lists)
    (r"bücher|lesestoff|geschenkideen|ideen essen|kindernamen|fahrradtouren|fahrrdtouren|hausrat hetty",
     "04 Ressourcen/divers"),
    # --- Legal / bookings
    (r"agb|geschäftsbedingungen|buchungsbestätigung|chaterbedingungen",
     "04 Ressourcen/divers"),
    # --- Templates
    (r"@@titel@@",
     "04 Ressourcen/divers"),
]
@dataclass
class Note:
    """One parsed UpNote note together with its classification result."""

    uuid: str  # stem of the source file name (parse_note passes ``src.stem``)
    title: str  # title captured from the "N:" heading, surrounding '*' removed
    body: str  # text after the heading, cleaned up by parse_note
    target_rel: str  # folder relative to VAULT
def slugify(title: str) -> str:
# keep German chars; remove filesystem-hostile chars
t = title.strip()
t = re.sub(r"[\\/:*?\"<>|]", "-", t)
t = re.sub(r"\s+", " ", t)
return t[:120].strip(" .-")
def classify(title: str) -> str:
    """Return the vault folder (relative to VAULT) for a note title.

    The title is lowercased and tested against each RULES pattern in order;
    the folder of the first matching rule is returned. Titles that match no
    rule go to ``01 Inbox``.
    """
    lowered = title.lower()
    matching_folders = (
        folder for pattern, folder in RULES if re.search(pattern, lowered)
    )
    return next(matching_folders, "01 Inbox")
def parse_note(src: Path) -> Note | None:
    """Parse one UpNote markdown file into a :class:`Note`.

    The title is taken from the first heading matched by ``TITLE_RE``. From the
    text after that heading the first metadata table and the first ``* * *``
    rule are removed, as are all lines holding only an ``[[LP:...]]`` link or a
    ``<br>`` tag; runs of three or more newlines collapse to one blank line.

    Args:
        src: Path of the markdown file to read (UTF-8).

    Returns:
        The parsed note, or ``None`` when the file cannot be read or decoded,
        has no ``N:`` heading, or has fewer than three characters of body left.
    """
    try:
        text = src.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # One unreadable file must not abort the whole import run; report it
        # and let the caller skip it like any other unusable note.
        print(f"  skip (unreadable): {src.name}: {exc}", file=sys.stderr)
        return None

    heading = TITLE_RE.search(text)
    if heading is None:
        return None

    # Strip surrounding markdown bold/italic markers from the title.
    title = re.sub(r"^\*+|\*+$", "", heading.group(1).strip()).strip()

    body = text[heading.end():]
    body = META_TABLE_RE.sub("", body, count=1)
    body = HR_RE.sub("", body, count=1)
    body = LP_LINK_RE.sub("", body)
    body = BR_RE.sub("", body)
    body = re.sub(r"\n{3,}", "\n\n", body).strip()
    if len(body) < 3:
        # Nothing worth importing (also covers the empty body).
        return None

    return Note(uuid=src.stem, title=title, body=body, target_rel=classify(title))
def render(note: Note) -> str:
    """Return the markdown text written to the vault for *note*.

    The output is a YAML front-matter block carrying the ``upnote-import``
    tag, a blank line, the title as a level-1 heading, a blank line, and the
    note body followed by a final newline.
    """
    front_matter = "---\ntags:\n - upnote-import\n---\n"
    heading = f"# {note.title}"
    return "\n".join([front_matter, heading, "", f"{note.body}\n"])
def collect_source_files(trash: bool = False) -> list[Path]:
    """Return the ``*.md`` files that begin with an ``N:`` heading.

    Args:
        trash: When false (default) the files directly inside ``UPNOTE_ROOT``
            are scanned. When true, ``UPNOTE_ROOT / "trash"`` is scanned
            *instead of* (not in addition to) the root folder.

    Returns:
        The matching paths in the order ``glob`` yields them. Files that
        cannot be opened or read are skipped.
    """
    source_dir = UPNOTE_ROOT / "trash" if trash else UPNOTE_ROOT
    heading_at_start = re.compile(r"^#{1,3}\s*N:\s")
    found: list[Path] = []
    for path in source_dir.glob("*.md"):
        try:
            # Only the first 200 characters are needed to sniff the heading,
            # so avoid loading the whole note. Undecodable bytes are dropped.
            with path.open(encoding="utf-8", errors="ignore") as fh:
                head = fh.read(200)
        except OSError:
            continue
        if heading_at_start.match(head):
            found.append(path)
    return found
def existing_titles_in_vault() -> set[str]:
    """Collect the lowercased file-name stems of every ``.md`` file in the vault.

    Files located below a ``.obsidian``, ``node_modules`` or ``_import_test``
    directory are left out.
    """
    ignored_dirs = {".obsidian", "node_modules", "_import_test"}
    return {
        md_file.stem.lower()
        for md_file in VAULT.rglob("*.md")
        if ignored_dirs.isdisjoint(md_file.parts)
    }
def _same_content(path: Path, content: str) -> bool:
    """Return True when *path* is readable as UTF-8 and already holds *content*."""
    try:
        return path.read_text(encoding="utf-8") == content
    except (OSError, UnicodeDecodeError):
        # Unreadable or differently encoded file: treat it as a different note.
        return False


def main() -> int:
    """Run the import from the command line.

    Options:
        --dry-run  Print the classification of every note and write nothing.
        --test     Write below ``VAULT/_import_test/`` (target folders are
                   preserved) instead of the real vault folders.
        --trash    Read notes from the trash folder and skip those whose
                   slugified title already exists as a file stem in the vault.
        --limit N  Keep only the first N notes after sorting; 0 or a negative
                   value means no limit.

    Existing files are never overwritten. If the target file already holds
    exactly the rendered note (e.g. from a previous run) the note is skipped,
    which keeps repeated runs from creating duplicates. If it holds something
    else, the note is written next to it with the first 8 characters of its
    UUID appended; if that file exists too, the note is skipped.

    Returns:
        Process exit code (always 0).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--test", action="store_true")
    ap.add_argument("--trash", action="store_true", help="source from trash/, skip titles already in vault")
    ap.add_argument("--limit", type=int, default=0)
    args = ap.parse_args()

    # The vault scan is only needed for the trash de-duplication.
    existing = existing_titles_in_vault() if args.trash else set()

    notes: list[Note] = []
    skipped_empty = 0
    skipped_dupe = 0
    for src in collect_source_files(trash=args.trash):
        n = parse_note(src)
        if n is None:
            skipped_empty += 1
            continue
        if args.trash and slugify(n.title).lower() in existing:
            skipped_dupe += 1
            print(f" skip (already in vault): {n.title}")
            continue
        notes.append(n)

    notes.sort(key=lambda n: (n.target_rel, n.title.lower()))
    if args.limit > 0:
        # A negative limit would otherwise silently drop notes from the end.
        notes = notes[: args.limit]

    # stats
    counts: dict[str, int] = {}
    for n in notes:
        counts[n.target_rel] = counts.get(n.target_rel, 0) + 1
    print(f"Parsed notes: {len(notes)} (skipped empty: {skipped_empty})")
    if args.trash:
        print(f"Skipped (already in vault): {skipped_dupe}")
    print("Classification:")
    for k in sorted(counts, key=lambda x: (-counts[x], x)):
        print(f" {counts[k]:3d} {k}")

    if args.dry_run:
        print("\n--- Dry-run detail ---")
        for n in notes:
            print(f" [{n.target_rel}] {n.title}")
        return 0

    written = skipped_exists = 0
    base_dir = VAULT / "_import_test" if args.test else VAULT
    for n in notes:
        out_dir = base_dir / n.target_rel
        out_dir.mkdir(parents=True, exist_ok=True)
        slug = slugify(n.title)
        content = render(n)
        target = out_dir / f"{slug}.md"
        if target.exists():
            if _same_content(target, content):
                # Same note imported by an earlier run: nothing to do.
                skipped_exists += 1
                continue
            # Avoid overwriting a different file; disambiguate with the UUID.
            target = out_dir / f"{slug} ({n.uuid[:8]}).md"
            if target.exists():
                skipped_exists += 1
                continue
        target.write_text(content, encoding="utf-8")
        written += 1

    print(f"\nWritten: {written}, skipped (exists): {skipped_exists}")
    return 0
# Script entry point: the return value of main() becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())