feat: initial whisper-dictation repo, migrated from OneDrive

2026-03-19 17:55:40 +01:00 · 2026-03-19 17:55:40 +01:00 · e1a3eba05a
commit e1a3eba05a
10 changed files with 781 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,11 @@
+.venv-windows/
+.venv-linux/
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+config_local.json
+models/
+*.log
+.DS_Store
+Thumbs.db
--- a/config.json
+++ b/config.json
@ -0,0 +1,6 @@
+{
+  "hotkey": "ctrl+shift+space",
+  "model": "medium",
+  "language": "de",
+  "sample_rate": 16000
+}
--- a/dictate.py
+++ b/dictate.py
@ -0,0 +1,688 @@
+"""
+Whisper Dictation — local GPU speech-to-text with system tray and settings GUI.
+Hold hotkey to record, release to transcribe and type into active window.
+"""
+import json
+import os
+import threading
+import time
+import tkinter as tk
+from tkinter import ttk
+
+import numpy as np
+import sounddevice as sd
+import keyboard
+import pystray
+from PIL import Image, ImageDraw
+from pynput.keyboard import Controller as KeyboardController
+from faster_whisper import WhisperModel
+
+# Shared data dir: script directory (= git repo root, synced via git pull).
+# Setting the WHISPER_DATA_DIR environment variable overrides the location.
+_script_dir = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR    = os.environ.get("WHISPER_DATA_DIR", _script_dir)
+os.makedirs(DATA_DIR, exist_ok=True)
+
+# Local config dir: machine-specific settings (audio device, device, compute_type).
+# Resolution order:
+#   1. WHISPER_LOCAL_DIR, when set to a non-empty value
+#   2. Windows (os.name == "nt"): %LOCALAPPDATA%\WhisperDictation, or
+#      <script dir>\WhisperDictation when LOCALAPPDATA is not set
+#   3. every other OS: ~/.local/share/WhisperDictation
+_env_local = os.environ.get("WHISPER_LOCAL_DIR")
+if _env_local:
+    _local_dir = _env_local
+elif os.name == "nt":
+    _local_dir = os.path.join(os.environ.get("LOCALAPPDATA", _script_dir), "WhisperDictation")
+else:
+    _local_dir = os.path.join(os.path.expanduser("~"), ".local", "share", "WhisperDictation")
+os.makedirs(_local_dir, exist_ok=True)
+
+# Both directories are created at import time, before any file below is touched.
+CONFIG_FILE       = os.path.join(DATA_DIR,   "config.json")      # shared via git
+CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json") # machine-specific, not in git
+VOCAB_FILE        = os.path.join(DATA_DIR,   "vocabulary.json")   # shared via git
+
+# Baseline settings; load_config() copies this dict and then overlays the
+# values found in config.json and config_local.json.
+DEFAULT_CONFIG = {
+    "hotkey": "ctrl+shift+space",
+    "model": "medium",
+    "device": "cuda",
+    "compute_type": "float16",
+    "language": "de",
+    "audio_device": None,  # None is shown as "Standard" in the settings window
+    "sample_rate": 16000,
+}
+
+# Choices offered by the settings window.
+MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"]
+# Display label -> language code stored in the config ("Auto" stores None).
+LANGUAGES = {"Deutsch": "de", "English": "en", "Français": "fr", "Español": "es", "Italiano": "it", "Auto": None}
+DEVICES = ["cuda", "cpu"]
+# Display label -> compute_type value stored in the config.
+COMPUTE_TYPES = {"float16 (GPU)": "float16", "int8 (CPU/GPU)": "int8", "float32": "float32"}
+
+
+# ── State ─────────────────────────────────────────────────────────────────────
+
+class AppState:
+    """String constants naming the three phases of a dictation cycle.
+
+    The values are used as keys of ``ICONS`` and compared against the
+    module-level ``state`` variable.
+    """
+    IDLE = "idle"
+    RECORDING = "recording"
+    TRANSCRIBING = "transcribing"
+
+# Module-level mutable state shared by the hotkey, audio, tray and Tk code.
+state = AppState.IDLE          # current phase; changed only through set_state()
+audio_chunks = []              # blocks appended by audio_callback() while recording
+model = None                   # WhisperModel instance, assigned by load_model()
+typer = KeyboardController()   # pynput controller used to type the transcript
+config = {}                    # effective settings, filled by load_config()
+tray_icon = None               # pystray.Icon, assigned in main()
+overlay_window = None          # recording indicator Toplevel, assigned by create_overlay()
+overlay_tk = None              # hidden Tk root, assigned in main()
+
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+# save_config() writes these keys to CONFIG_LOCAL_FILE and every other key to CONFIG_FILE.
+LOCAL_KEYS = {"audio_device", "device", "compute_type"}  # keys stored only in config_local.json
+
+def load_config():
+    """Rebuild the global ``config`` from the defaults plus both JSON files.
+
+    Layers are applied in this order, later ones overriding earlier ones:
+    ``DEFAULT_CONFIG``, ``CONFIG_FILE`` (shared), ``CONFIG_LOCAL_FILE``
+    (machine-specific). A file that does not exist is skipped.
+
+    Raises:
+        json.JSONDecodeError: if an existing file is not valid JSON.
+    """
+    global config
+    config = dict(DEFAULT_CONFIG)
+    for path in (CONFIG_FILE, CONFIG_LOCAL_FILE):
+        if not os.path.exists(path):
+            continue
+        # Decode as UTF-8 explicitly: the shared file travels between
+        # machines whose locale default encodings differ.
+        with open(path, encoding="utf-8") as f:
+            config.update(json.load(f))
+
+def save_config():
+    """Write the global ``config`` back to its two JSON files.
+
+    Keys listed in ``LOCAL_KEYS`` go to ``CONFIG_LOCAL_FILE``; all remaining
+    keys go to ``CONFIG_FILE``. Both files are overwritten.
+    """
+    shared = {k: v for k, v in config.items() if k not in LOCAL_KEYS}
+    local  = {k: v for k, v in config.items() if k in LOCAL_KEYS}
+    # Explicit UTF-8 keeps the files independent of the platform's locale
+    # default encoding.
+    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
+        json.dump(shared, f, indent=2)
+    with open(CONFIG_LOCAL_FILE, "w", encoding="utf-8") as f:
+        json.dump(local, f, indent=2)
+
+
+# ── Vocabulary ────────────────────────────────────────────────────────────────
+
+# In-memory vocabulary: "words" feed get_initial_prompt(); each entry of
+# "replacements" is a dict with the keys "from" and "to" used by apply_vocab().
+vocab = {"words": [], "replacements": []}  # {from, to}
+
+def load_vocab():
+    """Load the global ``vocab`` from ``VOCAB_FILE``.
+
+    When the file does not exist an empty vocabulary is used. The result
+    always contains the keys ``"words"`` and ``"replacements"``, because the
+    vocabulary window indexes both directly.
+
+    Raises:
+        json.JSONDecodeError: if the file exists but is not valid JSON.
+    """
+    global vocab
+    loaded = {}
+    if os.path.exists(VOCAB_FILE):
+        # save_vocab() writes non-ASCII characters verbatim, so the file must
+        # be decoded as UTF-8 rather than with the locale default.
+        with open(VOCAB_FILE, encoding="utf-8") as f:
+            loaded = json.load(f)
+    loaded.setdefault("words", [])
+    loaded.setdefault("replacements", [])
+    vocab = loaded
+
+def save_vocab():
+    """Write the global ``vocab`` to ``VOCAB_FILE`` as indented JSON."""
+    # ensure_ascii=False emits umlauts and other non-ASCII characters as-is;
+    # the file therefore needs a fixed encoding so that it can be read back
+    # on a machine with a different locale default.
+    with open(VOCAB_FILE, "w", encoding="utf-8") as f:
+        json.dump(vocab, f, indent=2, ensure_ascii=False)
+
+def apply_vocab(text: str) -> str:
+    """Apply every replacement rule from ``vocab`` to *text*, in list order.
+
+    Each rule is a dict with the keys ``"from"`` and ``"to"``; all occurrences
+    of ``from`` are replaced by ``to``. Rules that lack either key or have an
+    empty ``from`` are ignored.
+    """
+    for rule in vocab.get("replacements", []):
+        wrong = rule.get("from")
+        # str.replace("", x) would insert x between every character, so an
+        # empty search string must never reach it.
+        if not wrong or "to" not in rule:
+            continue
+        text = text.replace(wrong, rule["to"])
+    return text
+
+def get_initial_prompt() -> str:
+    """Return the vocabulary words joined by ", ", or "" when there are none."""
+    entries = vocab.get("words", [])
+    if not entries:
+        return ""
+    return ", ".join(entries)
+
+
+# ── Tray icon ─────────────────────────────────────────────────────────────────
+
+def make_icon(color):
+    """Return a 64x64 transparent RGBA image with a filled circle in *color*."""
+    canvas = Image.new("RGBA", (64, 64), (0, 0, 0, 0))
+    ImageDraw.Draw(canvas).ellipse([4, 4, 60, 60], fill=color)
+    return canvas
+
+# Tray image per application state, built once at import time; set_state()
+# looks the image up by the new state.
+ICONS = {
+    AppState.IDLE:         make_icon((40, 200, 80)),
+    AppState.RECORDING:    make_icon((220, 50, 50)),
+    AppState.TRANSCRIBING: make_icon((220, 180, 30)),
+}
+
+def set_state(new_state):
+    """Store *new_state* and bring the tray icon and overlay in line with it.
+
+    The overlay is shown for ``AppState.RECORDING`` and hidden for every
+    other state. The tray icon is only updated once it exists.
+    """
+    global state
+    state = new_state
+    if tray_icon:
+        tray_icon.icon = ICONS[new_state]
+    toggle_overlay = show_overlay if new_state == AppState.RECORDING else hide_overlay
+    toggle_overlay()
+
+
+# ── Overlay window ────────────────────────────────────────────────────────────
+
+def show_overlay():
+    """Schedule ``_show_overlay_main`` on the Tk root; no-op before the root exists."""
+    if overlay_tk is not None:
+        overlay_tk.after(0, _show_overlay_main)
+
+def hide_overlay():
+    """Schedule ``_hide_overlay_main`` on the Tk root; no-op before the root exists."""
+    if overlay_tk is not None:
+        overlay_tk.after(0, _hide_overlay_main)
+
+def _show_overlay_main():
+    """Un-hide the overlay, place it near the bottom-right corner and raise it."""
+    overlay_window.deiconify()
+    # 220x54 window, offset 240 px from the right and 100 px from the bottom edge
+    left = overlay_tk.winfo_screenwidth() - 240
+    top = overlay_tk.winfo_screenheight() - 100
+    overlay_window.geometry(f"220x54+{left}+{top}")
+    overlay_window.lift()
+
+def _hide_overlay_main():
+    """Withdraw (hide) the overlay window; scheduled via ``hide_overlay``."""
+    overlay_window.withdraw()
+
+def create_overlay(root):
+    """Build the hidden recording indicator and store it in ``overlay_window``.
+
+    The window is a borderless, always-on-top, slightly transparent child of
+    *root* holding a red dot and the caption text. It starts withdrawn.
+    """
+    global overlay_window
+    dark = "#1a1a1a"
+
+    popup = tk.Toplevel(root)
+    popup.withdraw()
+    popup.overrideredirect(True)
+    for option, value in (("-topmost", True), ("-alpha", 0.92)):
+        popup.attributes(option, value)
+    popup.configure(bg=dark)
+
+    body = tk.Frame(popup, bg=dark, padx=12, pady=10)
+    body.pack(fill="both", expand=True)
+
+    # Red dot on the left ...
+    indicator = tk.Canvas(body, width=14, height=14, bg=dark, highlightthickness=0)
+    indicator.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
+    indicator.pack(side="left", padx=(0, 8))
+
+    # ... followed by the caption.
+    caption = tk.Label(body, text="Aufnahme läuft …", fg="white", bg=dark,
+                       font=("Segoe UI", 11))
+    caption.pack(side="left")
+
+    overlay_window = popup
+
+
+# ── Audio ─────────────────────────────────────────────────────────────────────
+
+def audio_callback(indata, frames, time_info, status):
+    """Stream callback: keep a copy of *indata* while the app is recording.
+
+    *frames*, *time_info* and *status* are accepted but not used.
+    """
+    if state != AppState.RECORDING:
+        return
+    audio_chunks.append(indata.copy())
+
+def get_audio_stream():
+    """Return a mono ``sd.InputStream`` configured from ``config``.
+
+    The stream uses the configured sample rate and input device and delivers
+    its data to ``audio_callback``. It is returned unstarted.
+    """
+    stream_options = {
+        "samplerate": config["sample_rate"],
+        "channels": 1,
+        "device": config.get("audio_device"),
+        "callback": audio_callback,
+    }
+    return sd.InputStream(**stream_options)
+
+
+# ── Recording & transcription ─────────────────────────────────────────────────
+
+def start_recording():
+    """Hotkey callback: start a new recording when the app is idle.
+
+    Does nothing unless the current state is ``AppState.IDLE``. That covers
+    repeated hotkey events during a recording and also a hotkey press while a
+    transcription is still running; the latter would otherwise start a
+    recording that the transcription worker's final ``set_state(IDLE)``
+    silently throws away.
+    """
+    global audio_chunks
+    if state != AppState.IDLE:
+        return
+    audio_chunks = []
+    set_state(AppState.RECORDING)
+    print("Recording...", flush=True)
+
+def stop_and_transcribe():
+    """Finish the current recording, transcribe it and type the result.
+
+    Runs on a worker thread started by ``on_space_release``. Does nothing
+    unless the app is recording. Recordings shorter than 0.3 s or with an RMS
+    level below 0.0005 are discarded. The state always returns to
+    ``AppState.IDLE`` before anything is typed, including when transcription
+    raises; in that case the error is printed and nothing is typed.
+    """
+    if state != AppState.RECORDING:
+        return
+    set_state(AppState.TRANSCRIBING)
+    chunks = list(audio_chunks)  # copy of the blocks recorded so far
+
+    text = ""
+    try:
+        if not chunks:
+            return
+
+        audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
+        duration = len(audio) / config["sample_rate"]
+        rms = float(np.sqrt(np.mean(audio ** 2)))
+        print(f"Audio: {duration:.1f}s  RMS: {rms:.5f}", flush=True)
+
+        if duration < 0.3 or rms < 0.0005:
+            print("Too short or silent — skipped.", flush=True)
+            return
+
+        lang = config["language"] if config["language"] else None
+        prompt = get_initial_prompt()
+        segments, _ = model.transcribe(
+            audio, language=lang, beam_size=5, vad_filter=True,
+            initial_prompt=prompt if prompt else None,
+        )
+        text = " ".join(s.text for s in segments).strip()
+        text = apply_vocab(text)
+        print(f"Result: {repr(text)}", flush=True)
+    except Exception as exc:
+        # Outermost frame of a daemon thread: report the failure instead of
+        # letting the thread die with the state stuck at TRANSCRIBING.
+        print(f"Transcription failed: {exc!r}", flush=True)
+        return
+    finally:
+        set_state(AppState.IDLE)
+
+    if text:
+        # Short pause before typing into the active window.
+        time.sleep(0.15)
+        typer.type(text)
+
+def on_space_release(e):
+    """Key-release hook: hand a running recording to a transcription thread.
+
+    The event argument *e* is not used.
+    """
+    if state != AppState.RECORDING:
+        return
+    worker = threading.Thread(target=stop_and_transcribe, daemon=True)
+    worker.start()
+
+
+# ── Model loading ─────────────────────────────────────────────────────────────
+
+def load_model():
+    """Create the ``WhisperModel`` described by ``config`` and store it in ``model``.
+
+    If the configured device is not ``"cpu"`` and the model cannot be created
+    on it, loading is retried once on the CPU with ``int8``, so that the
+    application still starts on a machine without a usable GPU. ``config`` is
+    not modified by the fallback. The global ``model`` is only replaced after
+    a model was created successfully.
+
+    Raises:
+        RuntimeError, ValueError: if the model cannot be created on the CPU
+            either (propagated from ``WhisperModel``).
+    """
+    global model
+    name = config["model"]
+    device = config["device"]
+    print(f"Loading {name} on {device}...", flush=True)
+    try:
+        loaded = WhisperModel(name, device=device, compute_type=config["compute_type"])
+    except (RuntimeError, ValueError) as exc:
+        if device == "cpu":
+            raise
+        print(f"Could not load on {device} ({exc}); falling back to cpu / int8.", flush=True)
+        loaded = WhisperModel(name, device="cpu", compute_type="int8")
+    model = loaded
+    print("Model ready.", flush=True)
+
+
+# ── Settings window ───────────────────────────────────────────────────────────
+
+def open_settings():
+    """Schedule ``_open_settings_main`` on the Tk root; no-op before the root exists."""
+    if overlay_tk is not None:
+        overlay_tk.after(0, _open_settings_main)
+
+def _open_settings_main():
+    """Build and show the settings window as a child of the hidden Tk root.
+
+    The window is fixed-size (680x660), centred on the screen and kept on
+    top. It offers dropdowns for microphone, model, language, device and
+    compute type plus a text entry for the hotkey, all pre-filled from
+    ``config``.
+
+    "Speichern & Neuladen" copies the selections into ``config``, calls
+    ``save_config()``, closes the window and starts
+    ``reload_model_and_hotkey`` on a daemon thread. "Abbrechen" only closes
+    the window.
+    """
+    # ── Palette: "Precision Audio" ──────────────────────────────────────────
+    BG      = "#18181f"   # deep void
+    BG2     = "#22222c"   # panel
+    BG3     = "#2c2c38"   # elevated
+    BORDER  = "#38384a"
+    FG      = "#e8e8f0"
+    FG2     = "#7878a0"
+    AMBER   = "#f5a623"
+    AMBER2  = "#c8831a"
+    GREEN   = "#4ade80"   # not referenced anywhere else in this function
+    FONT    = ("Consolas", 11)
+    FONT_UI = ("Segoe UI", 11)
+    FONT_B  = ("Segoe UI", 11, "bold")
+    FONT_S  = ("Segoe UI", 9)
+    FONT_H  = ("Segoe UI Semibold", 16)
+
+    win = tk.Toplevel(overlay_tk)
+    win.title("Whisper Dictation")
+    win.configure(bg=BG)
+    win.attributes("-topmost", True)
+    win.resizable(False, False)
+
+    # Center
+    W, H = 680, 660
+    win.update_idletasks()
+    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
+    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+
+    # Global option for OptionMenu dropdowns (dark listbox)
+    win.option_add("*Menu.background",       BG3)
+    win.option_add("*Menu.foreground",        FG)
+    win.option_add("*Menu.activeBackground",  AMBER)
+    win.option_add("*Menu.activeForeground",  BG)
+    win.option_add("*Menu.font",              FONT_UI)
+
+    # ── Header ──
+    hdr = tk.Frame(win, bg=BG2, pady=20)
+    hdr.pack(fill="x")
+    # Amber accent bar
+    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
+    tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H,
+             bg=BG2, fg=FG, pady=12).pack()
+    tk.Label(hdr, text="Lokale GPU-Transkription  ·  offline  ·  privat",
+             font=FONT_S, bg=BG2, fg=FG2).pack()
+
+    # ── Scrollable content ──
+    # NOTE(review): no scrollbar or scrollregion is configured for this canvas
+    # in this function; it only hosts the content frame.
+    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
+    canvas.pack(fill="both", expand=True)
+    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
+    canvas.create_window((0, 0), window=content, anchor="nw")
+
+    def section(label):
+        """Add an amber section heading followed by a thin divider line."""
+        f = tk.Frame(content, bg=BG)
+        f.pack(fill="x", pady=(18, 6))
+        tk.Label(f, text=label, font=("Consolas", 9, "bold"),
+                 bg=BG, fg=AMBER).pack(side="left")
+        tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True, padx=(10, 0), pady=6)
+
+    def dd(frame, var, values, width=14):
+        """Create dark OptionMenu directly in frame as parent."""
+        m = tk.OptionMenu(frame, var, *values)
+        m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG,
+                 highlightbackground=BORDER, highlightthickness=1,
+                 relief="flat", font=FONT_UI, anchor="w", bd=0, width=width)
+        m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER,
+                         activeforeground=BG, relief="flat", bd=0)
+        return m
+
+    def row(label, hint=None):
+        """Returns frame — add controls to frame after calling."""
+        f = tk.Frame(content, bg=BG)
+        f.pack(fill="x", pady=5)
+        tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI,
+                 bg=BG, fg=FG2).pack(side="left")
+        if hint:
+            tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right")
+        return f
+
+    # ── AUDIO ──
+    section("AUDIO")
+    # Only devices that report at least one input channel are offered; each
+    # entry is labelled "<index>: <name>" so save() can recover the index.
+    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
+               if d["max_input_channels"] > 0]
+    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
+    dev_var = tk.StringVar()
+    cur_dev = config.get("audio_device")
+    # Fall back to "Standard" when the stored device index is not in the list.
+    dev_var.set("Standard" if cur_dev is None else
+                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
+    f = row("Mikrofon")
+    dd(f, dev_var, dev_names, width=44).pack(side="left")
+
+    # ── MODELL ──
+    section("MODELL")
+    model_hints = {
+        "tiny":     "~1 GB VRAM  ·  sehr schnell",
+        "base":     "~1 GB VRAM",
+        "small":    "~2 GB VRAM",
+        "medium":   "~5 GB VRAM  ·  empfohlen ✓",
+        "large-v2": "~10 GB VRAM",
+        "large-v3": "~10 GB VRAM  ·  bestes Ergebnis",
+    }
+    model_var = tk.StringVar(value=config["model"])
+    f_model = row("Modell")
+    dd(f_model, model_var, MODELS, 14).pack(side="left")
+    hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""),
+                        font=FONT_S, bg=BG, fg=FG2)
+    hint_lbl.pack(side="left", padx=(14, 0))
+    # Keep the hint text in sync with the selected model.
+    model_var.trace_add("write", lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), "")))
+
+    # Reverse lookup: stored language code -> display label.
+    lang_display = {v: k for k, v in LANGUAGES.items()}
+    lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch"))
+    f = row("Sprache")
+    dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left")
+
+    # ── LEISTUNG ──
+    section("LEISTUNG")
+    device_var = tk.StringVar(value=config["device"])
+    f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen")
+    dd(f, device_var, DEVICES, 8).pack(side="left")
+
+    # Reverse lookup: stored compute_type -> display label.
+    ct_display = {v: k for k, v in COMPUTE_TYPES.items()}
+    ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)"))
+    f = row("Compute Type")
+    dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left")
+
+    # ── STEUERUNG ──
+    section("STEUERUNG")
+    hotkey_var = tk.StringVar(value=config["hotkey"])
+    f_hk = row("Hotkey", hint="z.B.  ctrl+shift+space")
+    tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24,
+             bg=BG3, fg=FG, insertbackground=AMBER,
+             relief="flat", bd=6,
+             highlightbackground=BORDER, highlightthickness=1).pack(side="left")
+
+    # ── Buttons ──
+    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
+    btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32)
+    btn_bar.pack(fill="x")
+
+    def save():
+        """Copy the widget values into config, persist them and reload."""
+        sel = dev_var.get()
+        # Device labels look like "<index>: <name>"; "Standard" maps to None.
+        config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
+        config["model"] = model_var.get()
+        config["language"] = LANGUAGES[lang_var.get()]
+        config["device"] = device_var.get()
+        config["compute_type"] = COMPUTE_TYPES[ct_var.get()]
+        # The hotkey text is stored exactly as typed; it is not validated here.
+        config["hotkey"] = hotkey_var.get()
+        save_config()
+        win.destroy()
+        # Reload on a separate daemon thread rather than inside this Tk callback.
+        threading.Thread(target=reload_model_and_hotkey, daemon=True).start()
+
+    def btn_hover(b, c_in, c_out):
+        """Switch button b to colour c_in on mouse enter and c_out on leave."""
+        b.bind("<Enter>", lambda _: b.config(bg=c_in))
+        b.bind("<Leave>", lambda _: b.config(bg=c_out))
+
+    save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save,
+                         bg=AMBER, fg=BG, font=FONT_B,
+                         relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
+    save_btn.pack(side="right")
+    btn_hover(save_btn, AMBER2, AMBER)
+
+    cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy,
+                           bg=BG3, fg=FG2, font=FONT_UI,
+                           relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
+    cancel_btn.pack(side="right", padx=(0, 10))
+    btn_hover(cancel_btn, BORDER, BG3)
+
+
+def open_vocab():
+    """Schedule ``_open_vocab_main`` on the Tk root; no-op before the root exists."""
+    if overlay_tk is not None:
+        overlay_tk.after(0, _open_vocab_main)
+
+def _open_vocab_main():
+    """Build and show the vocabulary editor as a child of the hidden Tk root.
+
+    The window is fixed-size (600x620), centred and kept on top. A form at the
+    top adds either a plain word or, with the checkbox ticked, a
+    "from -> to" replacement. Two list boxes below show the words and the
+    replacements, each with a remove button. Additions and removals modify
+    the global ``vocab`` and are written out with ``save_vocab()``.
+    """
+    BG     = "#18181f"
+    BG2    = "#22222c"
+    BG3    = "#2c2c38"
+    BORDER = "#38384a"
+    FG     = "#e8e8f0"
+    FG2    = "#7878a0"
+    AMBER  = "#f5a623"
+    AMBER2 = "#c8831a"
+    RED    = "#f87171"
+    FONT   = ("Segoe UI", 11)
+    FONT_B = ("Segoe UI", 11, "bold")
+    FONT_S = ("Segoe UI", 9)
+    FONT_H = ("Segoe UI Semibold", 14)
+    FONT_M = ("Consolas", 10)
+
+    win = tk.Toplevel(overlay_tk)
+    win.title("Vokabular")
+    win.configure(bg=BG)
+    win.attributes("-topmost", True)
+    win.resizable(False, False)
+    # Centre the fixed-size window on the screen.
+    W, H = 600, 620
+    win.update_idletasks()
+    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
+    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+    win.option_add("*Menu.background", BG3)
+    win.option_add("*Menu.foreground", FG)
+    win.option_add("*Menu.activeBackground", AMBER)
+    win.option_add("*Menu.activeForeground", BG)
+
+    # ── Header ──
+    hdr = tk.Frame(win, bg=BG2)
+    hdr.pack(fill="x")
+    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
+    tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H,
+             bg=BG2, fg=FG, pady=14).pack()
+    tk.Label(hdr, text="Wörter lernen  ·  Ersetzungen definieren",
+             font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10))
+
+    content = tk.Frame(win, bg=BG, padx=28, pady=12)
+    content.pack(fill="both", expand=True)
+
+    # ── Add-word form ─────────────────────────────────────────────────────────
+    # False: the form adds a plain word; True: it adds a replacement pair.
+    is_correction = tk.BooleanVar(value=False)
+
+    form = tk.Frame(content, bg=BG3, padx=16, pady=14)
+    form.pack(fill="x", pady=(0, 16))
+
+    # Toggle row
+    tog_row = tk.Frame(form, bg=BG3)
+    tog_row.pack(fill="x", pady=(0, 10))
+    tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT,
+             bg=BG3, fg=FG).pack(side="left")
+
+    def toggle_form(*_):
+        """Show either the single word entry or the from/arrow/to entries."""
+        # The entry widgets are created further down; this only runs later,
+        # as the checkbox command.
+        if is_correction.get():
+            entry_from.pack(side="left", padx=(0, 6))
+            arrow_lbl.pack(side="left", padx=4)
+            entry_to.pack(side="left")
+            entry_word.pack_forget()
+        else:
+            entry_word.pack(side="left", fill="x", expand=True)
+            entry_from.pack_forget()
+            arrow_lbl.pack_forget()
+            entry_to.pack_forget()
+
+    tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form,
+                             bg=BG3, fg=FG2, activebackground=BG3,
+                             selectcolor=AMBER, relief="flat", bd=0,
+                             indicatoron=True)
+    tog_btn.pack(side="right")
+
+    # Input row
+    inp_row = tk.Frame(form, bg=BG3)
+    inp_row.pack(fill="x")
+
+    # Options shared by all three entry widgets.
+    entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER,
+                       relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1)
+
+    entry_word = tk.Entry(inp_row, width=32, **entry_style)
+    entry_word.insert(0, "")
+    entry_from = tk.Entry(inp_row, width=14, **entry_style)
+    arrow_lbl  = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER)
+    entry_to   = tk.Entry(inp_row, width=14, **entry_style)
+    # Only the word entry is packed initially (plain-word mode).
+    entry_word.pack(side="left", fill="x", expand=True)
+
+    def add_entry():
+        """Add the form's content to vocab, then save and refresh the lists."""
+        if is_correction.get():
+            frm = entry_from.get().strip()
+            to  = entry_to.get().strip()
+            # Both sides must be non-empty.
+            if frm and to:
+                vocab["replacements"].append({"from": frm, "to": to})
+                entry_from.delete(0, tk.END)
+                entry_to.delete(0, tk.END)
+        else:
+            w = entry_word.get().strip()
+            # Empty input and duplicate words are ignored.
+            if w and w not in vocab["words"]:
+                vocab["words"].append(w)
+                entry_word.delete(0, tk.END)
+        # Runs on every call, also when nothing was added.
+        save_vocab()
+        refresh_lists()
+
+    # Pressing Return anywhere in the window behaves like the add button.
+    win.bind("<Return>", lambda _: add_entry())
+
+    add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry,
+                        bg=AMBER, fg=BG, font=FONT_B,
+                        relief="flat", padx=14, pady=5, cursor="hand2", bd=0)
+    add_btn.pack(side="right", padx=(10, 0))
+    add_btn.bind("<Enter>", lambda _: add_btn.config(bg=AMBER2))
+    add_btn.bind("<Leave>", lambda _: add_btn.config(bg=AMBER))
+
+    # ── Lists ─────────────────────────────────────────────────────────────────
+    lists_frame = tk.Frame(content, bg=BG)
+    lists_frame.pack(fill="both", expand=True)
+    # The replacements column gets twice the weight of the words column.
+    lists_frame.columnconfigure(0, weight=1)
+    lists_frame.columnconfigure(1, weight=2)
+
+    def section_label(parent, text):
+        """Add an amber column heading to parent."""
+        tk.Label(parent, text=text, font=("Consolas", 9, "bold"),
+                 bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6))
+
+    # Words column
+    col_w = tk.Frame(lists_frame, bg=BG)
+    col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12))
+    section_label(col_w, "WÖRTER")
+
+    words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG,
+                           selectbackground=AMBER, selectforeground=BG,
+                           relief="flat", bd=0, highlightthickness=0,
+                           activestyle="none", height=10)
+    words_box.pack(fill="both", expand=True)
+
+    def del_word():
+        """Remove the selected word from vocab, then save and refresh."""
+        sel = words_box.curselection()
+        if sel:
+            # refresh_lists() inserts rows in vocab order, so the list box
+            # index is also the index into vocab["words"].
+            vocab["words"].pop(sel[0])
+            save_vocab()
+            refresh_lists()
+
+    tk.Button(col_w, text="− Entfernen", command=del_word,
+              bg=BG3, fg=RED, font=FONT_S, relief="flat",
+              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))
+
+    # Replacements column
+    col_r = tk.Frame(lists_frame, bg=BG)
+    col_r.grid(row=0, column=1, sticky="nsew")
+    section_label(col_r, "KORREKTUREN")
+
+    repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG,
+                          selectbackground=AMBER, selectforeground=BG,
+                          relief="flat", bd=0, highlightthickness=0,
+                          activestyle="none", height=10)
+    repl_box.pack(fill="both", expand=True)
+
+    def del_repl():
+        """Remove the selected replacement from vocab, then save and refresh."""
+        sel = repl_box.curselection()
+        if sel:
+            # Same index correspondence as in del_word().
+            vocab["replacements"].pop(sel[0])
+            save_vocab()
+            refresh_lists()
+
+    tk.Button(col_r, text="− Entfernen", command=del_repl,
+              bg=BG3, fg=RED, font=FONT_S, relief="flat",
+              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))
+
+    def refresh_lists():
+        """Rebuild both list boxes from the current content of vocab."""
+        words_box.delete(0, tk.END)
+        for w in vocab.get("words", []):
+            words_box.insert(tk.END, f"  {w}")
+        repl_box.delete(0, tk.END)
+        for r in vocab.get("replacements", []):
+            repl_box.insert(tk.END, f"  {r['from']}  →  {r['to']}")
+
+    # Initial fill.
+    refresh_lists()
+
+    # ── Footer ──
+    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
+    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein  ·  Korrekturen werden nach der Transkription angewendet",
+             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
+
+
+def reload_model_and_hotkey():
+    """Apply changed settings: reload the model and re-register the hotkey.
+
+    All keyboard hooks are removed first and registered again afterwards
+    from ``config["hotkey"]``. A failing model load is reported and does not
+    prevent the hotkey from being registered again; ``load_model`` only
+    replaces the global ``model`` after a successful load, so the previously
+    loaded model stays in use in that case.
+
+    Raises:
+        ValueError: propagated from the ``keyboard`` library when the
+            configured hotkey cannot be parsed.
+    """
+    keyboard.unhook_all()
+    try:
+        load_model()
+    except Exception as exc:
+        # Outermost frame of a daemon thread: report and carry on, otherwise
+        # the application would be left without any hotkey.
+        print(f"Model reload failed, keeping the previous model: {exc!r}", flush=True)
+
+    hotkey = config["hotkey"]
+    # The release hook watches the last key of the combination; strip it so
+    # that a hotkey written as "ctrl + shift + space" yields "space".
+    release_key = hotkey.split("+")[-1].strip()
+    keyboard.add_hotkey(hotkey, start_recording, suppress=True)
+    keyboard.on_release_key(release_key, on_space_release)
+    print(f"Hotkey updated: {config['hotkey']}", flush=True)
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    """Start the application and block in the Tk main loop until it quits.
+
+    Start-up order: load config and vocabulary, load the model, create the
+    hidden Tk root with the overlay, start the audio input stream, register
+    the hotkey, then start the tray icon on a daemon thread.
+    """
+    global tray_icon, overlay_tk
+
+    load_config()
+    load_vocab()
+    # Loaded before any window exists.
+    load_model()
+
+    # Tkinter root (hidden) for overlay and settings
+    root = tk.Tk()
+    root.withdraw()
+    overlay_tk = root
+    create_overlay(root)
+
+    # Audio stream
+    # Started once here; audio_callback() only stores data while recording.
+    stream = get_audio_stream()
+    stream.start()
+
+    # Hotkey
+    # Pressing the full combination starts recording; releasing its last key
+    # triggers the transcription.
+    last_key = config["hotkey"].split("+")[-1]
+    keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
+    keyboard.on_release_key(last_key, on_space_release)
+
+    # Tray
+    menu = pystray.Menu(
+        pystray.MenuItem("Einstellungen", lambda: open_settings()),
+        pystray.MenuItem("Vokabular", lambda: open_vocab()),
+        pystray.Menu.SEPARATOR,
+        pystray.MenuItem("Beenden", lambda: quit_app(stream)),
+    )
+    tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu)
+
+    # The tray icon runs on its own daemon thread; this thread runs the Tk loop.
+    threading.Thread(target=tray_icon.run, daemon=True).start()
+
+    print(f"Ready. Hotkey: {config['hotkey']}", flush=True)
+    root.mainloop()
+
+    # Reached once the Tk main loop has ended.
+    stream.stop()
+
+def quit_app(stream):
+    """Tray "Beenden" handler: stop *stream* and the tray icon, then end the Tk loop."""
+    stream.stop()
+    tray_icon.stop()
+    # Ask the Tk root to quit via after() instead of calling quit() directly here.
+    overlay_tk.after(0, overlay_tk.quit)
+
+
+if __name__ == "__main__":
+    main()
--- a/install.bat
+++ b/install.bat
@ -0,0 +1,17 @@
+@echo off
+rem Create the Windows virtual environment (.venv-windows) next to this script
+rem and install the Python and CUDA dependencies into it.
+rem Every step is checked; the script stops at the first failure.
+cd /d "%~dp0"
+
+echo Creating Windows venv (.venv-windows)...
+py -3.13 -m venv .venv-windows
+if errorlevel 1 goto :fail
+
+set "VENV=%~dp0.venv-windows"
+echo Installing dependencies...
+rem pip.exe cannot replace itself while it is running on Windows, so pip is
+rem upgraded through the interpreter instead.
+"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
+if errorlevel 1 goto :fail
+"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
+if errorlevel 1 goto :fail
+
+echo Installing CUDA 12 DLLs (required for GPU acceleration)...
+"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
+if errorlevel 1 goto :fail
+
+echo.
+echo Done. Run start.bat to launch.
+pause
+exit /b 0
+
+:fail
+echo.
+echo Installation failed - see the error message above.
+pause
+exit /b 1
--- a/install.sh
+++ b/install.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+# Create the Linux virtual environment (.venv-linux) next to this script and
+# install the Python dependencies into it.
+set -e  # stop at the first failing command
+cd "$(dirname "$0")"  # work relative to the directory containing this script
+
+echo "Creating Linux venv (.venv-linux)..."
+python3 -m venv .venv-linux
+
+echo "Installing dependencies..."
+.venv-linux/bin/pip install --upgrade pip
+.venv-linux/bin/pip install -r requirements.txt
+# No CUDA deps on Linux — runs on CPU
+# (requirements-cuda.txt is deliberately not installed here)
+
+echo "Done. Run ./start.sh to launch."
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@ -0,0 +1,4 @@
+# Windows CUDA 12 DLLs required by ctranslate2 (faster-whisper backend)
+# Install after requirements.txt on Windows with NVIDIA GPU
+nvidia-cublas-cu12
+nvidia-cudnn-cu12
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,7 @@
+faster-whisper>=1.0.2
+sounddevice>=0.4.6
+numpy>=1.24
+keyboard>=0.13
+pystray>=0.19
+Pillow>=9.5
+pynput>=1.7.6
--- a/start.bat
+++ b/start.bat
@ -0,0 +1,14 @@
+@echo off
+rem Launch Whisper Dictation from the Windows virtual environment created by install.bat.
+cd /d "%~dp0"
+
+set "VENV=%~dp0.venv-windows"
+
+rem Machine-local config dir (device, compute_type, audio_device - not in git)
+rem dictate.py reads this location from the WHISPER_LOCAL_DIR variable set below.
+if not exist "%LOCALAPPDATA%\WhisperDictation" mkdir "%LOCALAPPDATA%\WhisperDictation"
+set "WHISPER_LOCAL_DIR=%LOCALAPPDATA%\WhisperDictation"
+
+rem CUDA 12 DLLs required by ctranslate2
+rem The cublas and cudnn bin directories of the venv are put first on PATH.
+set "PATH=%VENV%\Lib\site-packages\nvidia\cublas\bin;%VENV%\Lib\site-packages\nvidia\cudnn\bin;%PATH%"
+
+rem -u: unbuffered output, so the program's messages appear immediately.
+"%VENV%\Scripts\python.exe" -u "%~dp0dictate.py"
+rem Keep the console window open after the program exits.
+pause
--- a/start.sh
+++ b/start.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+# Launch Whisper Dictation from the Linux virtual environment created by install.sh.
+# Abort when the script directory cannot be entered; otherwise the relative
+# paths below would be resolved against the caller's working directory.
+cd "$(dirname "$0")" || exit 1
+# -u: unbuffered output, so the program's messages appear immediately.
+.venv-linux/bin/python -u dictate.py
--- a/vocabulary.json
+++ b/vocabulary.json
@ -0,0 +1,18 @@
+{
+  "words": [],
+  "replacements": [
+    {"from": "KRA", "to": "KRAH"},
+    {"from": "Atos", "to": "ATHOS"},
+    {"from": "Resistec", "to": "RESISTEC"},
+    {"from": "Resistek", "to": "RESISTEC"},
+    {"from": "HES", "to": "HEES"},
+    {"from": "Ackerschot", "to": "Ackerschott"},
+    {"from": "Carrois", "to": "Kauer"},
+    {"from": "Jouer fixe", "to": "Jour-Fixe"},
+    {"from": "Docuware", "to": "DocuWare"},
+    {"from": "Nates", "to": "Nejc"},
+    {"from": "Bittzeit", "to": "BitSight"},
+    {"from": "Kalmikow", "to": "Kalmykov"},
+    {"from": "Leifert", "to": "Leifer"}
+  ]
+}