"""
Whisper Dictation — local GPU speech-to-text with system tray and settings GUI.
Hold hotkey to record, release to transcribe and type into active window.
"""

import json
import os
import threading
import time
import tkinter as tk
from tkinter import ttk

import numpy as np
import sounddevice as sd
import keyboard
import pystray
from PIL import Image, ImageDraw
from pynput.keyboard import Controller as KeyboardController
from faster_whisper import WhisperModel

# Shared data dir: script directory (= git repo root, synced via git pull).
# Can be redirected with the WHISPER_DATA_DIR environment variable.
_script_dir = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir)
os.makedirs(DATA_DIR, exist_ok=True)


def _default_local_dir():
    """Return the per-machine settings directory used when no override is set.

    Windows: %LOCALAPPDATA%\\WhisperDictation (script dir if the variable is unset)
    Other:   ~/.local/share/WhisperDictation
    """
    if os.name == "nt":
        base = os.environ.get("LOCALAPPDATA", _script_dir)
        return os.path.join(base, "WhisperDictation")
    return os.path.join(os.path.expanduser("~"), ".local", "share", "WhisperDictation")


# Local config dir: machine-specific settings (audio device, device, compute_type).
# A non-empty WHISPER_LOCAL_DIR environment variable takes precedence.
_env_local = os.environ.get("WHISPER_LOCAL_DIR")
_local_dir = _env_local or _default_local_dir()
os.makedirs(_local_dir, exist_ok=True)

CONFIG_FILE = os.path.join(DATA_DIR, "config.json")  # shared via git
CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json")  # machine-specific, not in git
VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json")  # shared via git

# Values used for any key that neither config file provides.
DEFAULT_CONFIG = {
    "hotkey": "ctrl+shift+space",
    "model": "medium",
    "device": "cuda",
    "compute_type": "float16",
    "language": "de",
    "audio_device": None,
    "sample_rate": 16000,
}

# Choices offered in the settings window.
MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"]
LANGUAGES = {
    "Deutsch": "de",
    "English": "en",
    "Français": "fr",
    "Español": "es",
    "Italiano": "it",
    "Auto": None,
}
DEVICES = ["cuda", "cpu"]
COMPUTE_TYPES = {
    "float16 (GPU)": "float16",
    "int8 (CPU/GPU)": "int8",
    "float32": "float32",
}

# ── State ─────────────────────────────────────────────────────────────────────
class AppState:
    """Names of the phases the dictation loop moves through."""

    IDLE = "idle"
    RECORDING = "recording"
    TRANSCRIBING = "transcribing"


# Module-wide runtime state, mutated by the functions below.
state = AppState.IDLE
audio_chunks = []       # numpy blocks appended by audio_callback while recording
model = None            # WhisperModel instance, set by load_model()
typer = KeyboardController()
config = {}
tray_icon = None
overlay_window = None   # Toplevel showing the "recording" indicator
overlay_tk = None       # hidden Tk root; all GUI work is scheduled on it


# ── Config ────────────────────────────────────────────────────────────────────

LOCAL_KEYS = {"audio_device", "device", "compute_type"}  # keys stored only in config_local.json


def _read_json(path):
    """Return the parsed JSON content of *path*, or None if the file is missing.

    Files are read as UTF-8. If that fails, the platform default encoding is
    tried once so that files written before UTF-8 was enforced still load.
    """
    if not os.path.exists(path):
        return None
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except UnicodeDecodeError:
        with open(path) as f:
            return json.load(f)


def _write_json(path, data, *, ensure_ascii=True):
    """Write *data* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=ensure_ascii)


def load_config():
    """Rebuild the global ``config`` from defaults, shared file, then local file.

    Later sources override earlier ones, so machine-specific values win.
    """
    global config
    config = dict(DEFAULT_CONFIG)
    for path in (CONFIG_FILE, CONFIG_LOCAL_FILE):
        stored = _read_json(path)
        if stored:
            config.update(stored)


def save_config():
    """Persist ``config``, splitting it into the shared and the local file."""
    shared = {k: v for k, v in config.items() if k not in LOCAL_KEYS}
    local = {k: v for k, v in config.items() if k in LOCAL_KEYS}
    _write_json(CONFIG_FILE, shared)
    _write_json(CONFIG_LOCAL_FILE, local)


# ── Vocabulary ────────────────────────────────────────────────────────────────

vocab = {"words": [], "replacements": []}  # replacements: list of {"from", "to"}


def load_vocab():
    """Load the vocabulary file into the global ``vocab``.

    A missing file, a non-object JSON value or missing keys all result in a
    dict that is guaranteed to contain "words" and "replacements" lists, so
    the editor window can append to them without further checks.
    """
    global vocab
    loaded = _read_json(VOCAB_FILE)
    vocab = loaded if isinstance(loaded, dict) else {}
    vocab.setdefault("words", [])
    vocab.setdefault("replacements", [])


def save_vocab():
    """Write the global ``vocab`` to the vocabulary file (non-ASCII kept as is)."""
    _write_json(VOCAB_FILE, vocab, ensure_ascii=False)


def apply_vocab(text: str) -> str:
    """Apply every stored replacement rule to *text*, in list order.

    Rules without a non-empty "from" or without a "to" are skipped; an empty
    search string would otherwise insert the replacement between every
    character.
    """
    for rule in vocab.get("replacements", []):
        wrong = rule.get("from")
        if not wrong or "to" not in rule:
            continue
        text = text.replace(wrong, rule["to"])
    return text


def get_initial_prompt() -> str:
    """Return the vocabulary words joined by ", " ("" when there are none)."""
    return ", ".join(vocab.get("words", []))


# ── Tray icon ─────────────────────────────────────────────────────────────────

def make_icon(color):
    """Return a 64x64 transparent RGBA image with a filled circle in *color*."""
    img = Image.new("RGBA", (64, 64), (0, 0, 0, 0))
    ImageDraw.Draw(img).ellipse([4, 4, 60, 60], fill=color)
    return img


# One tray image per application state.
ICONS = {
    AppState.IDLE: make_icon((40, 200, 80)),
    AppState.RECORDING: make_icon((220, 50, 50)),
    AppState.TRANSCRIBING: make_icon((220, 180, 30)),
}


def set_state(new_state):
    """Switch the global state and update tray icon and recording overlay."""
    global state
    state = new_state
    if tray_icon:
        tray_icon.icon = ICONS[new_state]
    if new_state == AppState.RECORDING:
        show_overlay()
    else:
        hide_overlay()


# ── Overlay window ────────────────────────────────────────────────────────────

def show_overlay():
    """Schedule showing the overlay on the Tk root; no-op before the GUI exists."""
    if overlay_tk is None:
        return
    overlay_tk.after(0, _show_overlay_main)


def hide_overlay():
    """Schedule hiding the overlay on the Tk root; no-op before the GUI exists."""
    if overlay_tk is None:
        return
    overlay_tk.after(0, _hide_overlay_main)


def _show_overlay_main():
    """Show the overlay and place it near the bottom-right screen corner."""
    overlay_window.deiconify()
    # Position bottom-right
    sw = overlay_tk.winfo_screenwidth()
    sh = overlay_tk.winfo_screenheight()
    overlay_window.geometry(f"220x54+{sw - 240}+{sh - 100}")
    overlay_window.lift()


def _hide_overlay_main():
    """Withdraw the overlay window."""
    overlay_window.withdraw()


def create_overlay(root):
    """Build the hidden, borderless, always-on-top overlay and store it globally."""
    global overlay_window
    win = tk.Toplevel(root)
    win.withdraw()
    win.overrideredirect(True)
    win.attributes("-topmost", True)
    win.attributes("-alpha", 0.92)
    win.configure(bg="#1a1a1a")

    frame = tk.Frame(win, bg="#1a1a1a", padx=12, pady=10)
    frame.pack(fill="both", expand=True)

    dot = tk.Canvas(frame, width=14, height=14, bg="#1a1a1a", highlightthickness=0)
    dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
    dot.pack(side="left", padx=(0, 8))

    tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
             font=("Segoe UI", 11)).pack(side="left")
    overlay_window = win


# ── Audio ─────────────────────────────────────────────────────────────────────

def audio_callback(indata, frames, time_info, status):
    """Stream callback: keep a copy of each input block while recording."""
    if state == AppState.RECORDING:
        audio_chunks.append(indata.copy())


def get_audio_stream():
    """Create (but do not start) a mono input stream from the current config."""
    return sd.InputStream(
        samplerate=config["sample_rate"],
        channels=1,
        device=config.get("audio_device"),
        callback=audio_callback,
    )


# ── Recording & transcription ─────────────────────────────────────────────────

def start_recording():
    """Hotkey handler: begin a new recording if the app is idle.

    Presses while already recording or while a transcription is running are
    ignored; starting during a transcription would be cancelled as soon as
    that transcription finishes and resets the state.
    """
    global audio_chunks
    if state != AppState.IDLE:
        return
    audio_chunks = []
    set_state(AppState.RECORDING)
    print("Recording...", flush=True)


def _transcribe_chunks(chunks) -> str:
    """Turn recorded blocks into post-processed text; "" when nothing usable.

    Recordings shorter than 0.3 s or with an RMS below 0.0005 are skipped.
    """
    if not chunks:
        return ""
    audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
    duration = len(audio) / config["sample_rate"]
    rms = float(np.sqrt(np.mean(audio ** 2)))
    print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
    if duration < 0.3 or rms < 0.0005:
        print("Too short or silent — skipped.", flush=True)
        return ""

    prompt = get_initial_prompt()
    segments, _ = model.transcribe(
        audio,
        language=config["language"] or None,
        beam_size=5,
        vad_filter=True,
        initial_prompt=prompt or None,
    )
    text = apply_vocab(" ".join(s.text for s in segments).strip())
    print(f"Result: {repr(text)}", flush=True)
    return text


def stop_and_transcribe():
    """Finish the current recording, transcribe it and type the result.

    The state always returns to IDLE, even when transcription raises, so a
    failure cannot leave the app stuck in TRANSCRIBING.
    """
    if state != AppState.RECORDING:
        return
    set_state(AppState.TRANSCRIBING)
    try:
        text = _transcribe_chunks(list(audio_chunks))
    except Exception as exc:  # worker-thread boundary: report, don't die silently
        print(f"Transcription failed: {exc!r}", flush=True)
        text = ""
    finally:
        set_state(AppState.IDLE)
    if text:
        # NOTE(review): short pause before typing — presumably lets the
        # hotkey keys be fully released first; confirm before changing.
        time.sleep(0.15)
        typer.type(text)


def on_space_release(e):
    """Key-release handler: transcribe on a background thread if recording."""
    if state == AppState.RECORDING:
        threading.Thread(target=stop_and_transcribe, daemon=True).start()


# ── Model loading ─────────────────────────────────────────────────────────────

def load_model():
    """Create the WhisperModel described by ``config`` and store it globally.

    The global is only replaced after construction succeeds.
    """
    global model
    print(f"Loading {config['model']} on {config['device']}...", flush=True)
    model = WhisperModel(
        config["model"],
        device=config["device"],
        compute_type=config["compute_type"],
    )
    print("Model ready.", flush=True)


# ── Shared GUI helper ─────────────────────────────────────────────────────────

def _bind_hover(widget, hover_bg, normal_bg):
    """Switch *widget*'s background while the pointer is over it."""
    widget.bind("<Enter>", lambda _: widget.config(bg=hover_bg))
    widget.bind("<Leave>", lambda _: widget.config(bg=normal_bg))


# ── Settings window ───────────────────────────────────────────────────────────

def open_settings():
    """Schedule opening the settings window on the Tk root."""
    if overlay_tk is None:
        return
    overlay_tk.after(0, _open_settings_main)


def _open_settings_main():
    """Build the settings window; saving writes the config and reloads the model."""
    # ── Palette: "Precision Audio" ──────────────────────────────────────────
    BG = "#18181f"      # deep void
    BG2 = "#22222c"     # panel
    BG3 = "#2c2c38"     # elevated
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"

    FONT = ("Consolas", 11)
    FONT_UI = ("Segoe UI", 11)
    FONT_B = ("Segoe UI", 11, "bold")
    FONT_S = ("Segoe UI", 9)
    FONT_H = ("Segoe UI Semibold", 16)

    win = tk.Toplevel(overlay_tk)
    win.title("Whisper Dictation")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)

    # Center
    W, H = 680, 660
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")

    # Global option for OptionMenu dropdowns (dark listbox)
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)
    win.option_add("*Menu.font", FONT_UI)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2, pady=20)
    hdr.pack(fill="x")
    # Amber accent bar
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H,
             bg=BG2, fg=FG, pady=12).pack()
    tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
             font=FONT_S, bg=BG2, fg=FG2).pack()

    # ── Content area (frame embedded in a canvas) ──
    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
    canvas.pack(fill="both", expand=True)
    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
    canvas.create_window((0, 0), window=content, anchor="nw")

    def section(label):
        """Add a section heading followed by a horizontal rule."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=(18, 6))
        tk.Label(f, text=label, font=("Consolas", 9, "bold"),
                 bg=BG, fg=AMBER).pack(side="left")
        tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True,
                                              padx=(10, 0), pady=6)

    def dd(frame, var, values, width=14):
        """Create dark OptionMenu directly in frame as parent."""
        m = tk.OptionMenu(frame, var, *values)
        m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG,
                 highlightbackground=BORDER, highlightthickness=1,
                 relief="flat", font=FONT_UI, anchor="w", bd=0, width=width)
        m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER,
                         activeforeground=BG, relief="flat", bd=0)
        return m

    def row(label, hint=None):
        """Returns frame — add controls to frame after calling."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=5)
        tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI,
                 bg=BG, fg=FG2).pack(side="left")
        if hint:
            tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right")
        return f

    # ── AUDIO ──
    section("AUDIO")
    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
               if d["max_input_channels"] > 0]
    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
    dev_var = tk.StringVar()
    cur_dev = config.get("audio_device")
    # Fall back to "Standard" when the stored device index no longer exists.
    dev_var.set("Standard" if cur_dev is None else
                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
    f = row("Mikrofon")
    dd(f, dev_var, dev_names, width=44).pack(side="left")

    # ── MODELL ──
    section("MODELL")
    model_hints = {
        "tiny": "~1 GB VRAM · sehr schnell",
        "base": "~1 GB VRAM",
        "small": "~2 GB VRAM",
        "medium": "~5 GB VRAM · empfohlen ✓",
        "large-v2": "~10 GB VRAM",
        "large-v3": "~10 GB VRAM · bestes Ergebnis",
    }
    model_var = tk.StringVar(value=config["model"])
    f_model = row("Modell")
    dd(f_model, model_var, MODELS, 14).pack(side="left")
    hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""),
                        font=FONT_S, bg=BG, fg=FG2)
    hint_lbl.pack(side="left", padx=(14, 0))
    # Keep the hint text in sync with the selected model.
    model_var.trace_add(
        "write",
        lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), "")),
    )

    lang_display = {v: k for k, v in LANGUAGES.items()}
    lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch"))
    f = row("Sprache")
    dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left")

    # ── LEISTUNG ──
    section("LEISTUNG")
    device_var = tk.StringVar(value=config["device"])
    f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen")
    dd(f, device_var, DEVICES, 8).pack(side="left")

    ct_display = {v: k for k, v in COMPUTE_TYPES.items()}
    ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)"))
    f = row("Compute Type")
    dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left")

    # ── STEUERUNG ──
    section("STEUERUNG")
    hotkey_var = tk.StringVar(value=config["hotkey"])
    f_hk = row("Hotkey", hint="z.B. ctrl+shift+space")
    tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24,
             bg=BG3, fg=FG, insertbackground=AMBER, relief="flat", bd=6,
             highlightbackground=BORDER, highlightthickness=1).pack(side="left")

    # ── Buttons ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32)
    btn_bar.pack(fill="x")

    def save():
        """Copy the form into ``config``, persist it and reload in the background."""
        sel = dev_var.get()
        # Device entries look like "<index>: <name>".
        config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
        config["model"] = model_var.get()
        config["language"] = LANGUAGES[lang_var.get()]
        config["device"] = device_var.get()
        config["compute_type"] = COMPUTE_TYPES[ct_var.get()]
        # An empty entry would make hotkey registration fail; keep the old one.
        config["hotkey"] = hotkey_var.get().strip() or config["hotkey"]
        save_config()
        win.destroy()
        threading.Thread(target=reload_model_and_hotkey, daemon=True).start()

    save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save,
                         bg=AMBER, fg=BG, font=FONT_B, relief="flat",
                         padx=20, pady=9, cursor="hand2", bd=0)
    save_btn.pack(side="right")
    _bind_hover(save_btn, AMBER2, AMBER)

    cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy,
                           bg=BG3, fg=FG2, font=FONT_UI, relief="flat",
                           padx=20, pady=9, cursor="hand2", bd=0)
    cancel_btn.pack(side="right", padx=(0, 10))
    _bind_hover(cancel_btn, BORDER, BG3)


# ── Vocabulary window ─────────────────────────────────────────────────────────

def open_vocab():
    """Schedule opening the vocabulary editor on the Tk root."""
    if overlay_tk is None:
        return
    overlay_tk.after(0, _open_vocab_main)


def _open_vocab_main():
    """Build the vocabulary editor: prompt words and post-transcription fixes."""
    BG = "#18181f"
    BG2 = "#22222c"
    BG3 = "#2c2c38"
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"
    RED = "#f87171"

    FONT = ("Segoe UI", 11)
    FONT_B = ("Segoe UI", 11, "bold")
    FONT_S = ("Segoe UI", 9)
    FONT_H = ("Segoe UI Semibold", 14)
    FONT_M = ("Consolas", 10)

    win = tk.Toplevel(overlay_tk)
    win.title("Vokabular")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)

    W, H = 600, 620
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")

    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2)
    hdr.pack(fill="x")
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H,
             bg=BG2, fg=FG, pady=14).pack()
    tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren",
             font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10))

    content = tk.Frame(win, bg=BG, padx=28, pady=12)
    content.pack(fill="both", expand=True)

    # ── Add-word form ─────────────────────────────────────────────────────────
    is_correction = tk.BooleanVar(value=False)

    form = tk.Frame(content, bg=BG3, padx=16, pady=14)
    form.pack(fill="x", pady=(0, 16))

    # Toggle row
    tog_row = tk.Frame(form, bg=BG3)
    tog_row.pack(fill="x", pady=(0, 10))
    tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT,
             bg=BG3, fg=FG).pack(side="left")

    def toggle_form(*_):
        """Swap between the single word entry and the from → to entry pair."""
        if is_correction.get():
            entry_from.pack(side="left", padx=(0, 6))
            arrow_lbl.pack(side="left", padx=4)
            entry_to.pack(side="left")
            entry_word.pack_forget()
        else:
            entry_word.pack(side="left", fill="x", expand=True)
            entry_from.pack_forget()
            arrow_lbl.pack_forget()
            entry_to.pack_forget()

    tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form,
                             bg=BG3, fg=FG2, activebackground=BG3,
                             selectcolor=AMBER, relief="flat", bd=0,
                             indicatoron=True)
    tog_btn.pack(side="right")

    # Input row
    inp_row = tk.Frame(form, bg=BG3)
    inp_row.pack(fill="x")

    entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER,
                       relief="flat", bd=6, highlightbackground=BORDER,
                       highlightthickness=1)
    entry_word = tk.Entry(inp_row, width=32, **entry_style)
    entry_from = tk.Entry(inp_row, width=14, **entry_style)
    arrow_lbl = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER)
    entry_to = tk.Entry(inp_row, width=14, **entry_style)
    entry_word.pack(side="left", fill="x", expand=True)

    def add_entry():
        """Add the form content to the vocabulary; ignore empty or duplicate input."""
        if is_correction.get():
            frm = entry_from.get().strip()
            to = entry_to.get().strip()
            if not (frm and to):
                return
            vocab["replacements"].append({"from": frm, "to": to})
            entry_from.delete(0, tk.END)
            entry_to.delete(0, tk.END)
        else:
            w = entry_word.get().strip()
            if not w or w in vocab["words"]:
                return
            vocab["words"].append(w)
            entry_word.delete(0, tk.END)
        save_vocab()
        refresh_lists()

    # Pressing Return anywhere in the window submits the form.
    win.bind("<Return>", lambda _: add_entry())

    add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry,
                        bg=AMBER, fg=BG, font=FONT_B, relief="flat",
                        padx=14, pady=5, cursor="hand2", bd=0)
    add_btn.pack(side="right", padx=(10, 0))
    _bind_hover(add_btn, AMBER2, AMBER)

    # ── Lists ─────────────────────────────────────────────────────────────────
    lists_frame = tk.Frame(content, bg=BG)
    lists_frame.pack(fill="both", expand=True)
    lists_frame.columnconfigure(0, weight=1)
    lists_frame.columnconfigure(1, weight=2)

    def section_label(parent, text):
        """Add an amber column heading to *parent*."""
        tk.Label(parent, text=text, font=("Consolas", 9, "bold"),
                 bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6))

    list_style = dict(font=FONT_M, bg=BG3, fg=FG, selectbackground=AMBER,
                      selectforeground=BG, relief="flat", bd=0,
                      highlightthickness=0, activestyle="none", height=10)
    del_style = dict(bg=BG3, fg=RED, font=FONT_S, relief="flat",
                     padx=8, pady=3, cursor="hand2", bd=0)

    # Words column
    col_w = tk.Frame(lists_frame, bg=BG)
    col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12))
    section_label(col_w, "WÖRTER")
    words_box = tk.Listbox(col_w, **list_style)
    words_box.pack(fill="both", expand=True)

    def del_word():
        """Remove the selected word (list index equals vocabulary index)."""
        sel = words_box.curselection()
        if sel:
            vocab["words"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_w, text="− Entfernen", command=del_word,
              **del_style).pack(anchor="e", pady=(4, 0))

    # Replacements column
    col_r = tk.Frame(lists_frame, bg=BG)
    col_r.grid(row=0, column=1, sticky="nsew")
    section_label(col_r, "KORREKTUREN")
    repl_box = tk.Listbox(col_r, **list_style)
    repl_box.pack(fill="both", expand=True)

    def del_repl():
        """Remove the selected replacement rule."""
        sel = repl_box.curselection()
        if sel:
            vocab["replacements"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_r, text="− Entfernen", command=del_repl,
              **del_style).pack(anchor="e", pady=(4, 0))

    def refresh_lists():
        """Re-fill both list boxes from the current vocabulary."""
        words_box.delete(0, tk.END)
        for w in vocab.get("words", []):
            words_box.insert(tk.END, f" {w}")
        repl_box.delete(0, tk.END)
        for r in vocab.get("replacements", []):
            repl_box.insert(tk.END, f" {r['from']} → {r['to']}")

    refresh_lists()

    # ── Footer ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()


# ── Hotkeys ───────────────────────────────────────────────────────────────────

def _register_hotkeys():
    """Register the configured hotkey: press starts, last-key release stops."""
    hotkey = config["hotkey"]
    keyboard.add_hotkey(hotkey, start_recording, suppress=True)
    # Only the release of the final key in the combination ends the recording.
    keyboard.on_release_key(hotkey.split("+")[-1], on_space_release)


def reload_model_and_hotkey():
    """Apply saved settings: reload the model and re-register the hotkey.

    Hooks are removed first, so no recording can start during the reload.
    If loading fails, the previously loaded model stays in place and the
    hotkey is still re-registered, so the app remains usable.

    NOTE: the audio input stream is not reopened here, so a changed
    microphone only takes effect after the application is restarted.
    """
    keyboard.unhook_all()
    try:
        load_model()
    except Exception as exc:  # worker-thread boundary: report and carry on
        print(f"Model reload failed, keeping previous model: {exc!r}", flush=True)
    _register_hotkeys()
    print(f"Hotkey updated: {config['hotkey']}", flush=True)


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Load settings and model, start audio, hotkey and tray, then run Tk."""
    global tray_icon, overlay_tk
    load_config()
    load_vocab()
    load_model()

    # Tkinter root (hidden) for overlay and settings
    root = tk.Tk()
    root.withdraw()
    overlay_tk = root
    create_overlay(root)

    # Audio stream
    stream = get_audio_stream()
    stream.start()

    # Hotkey
    _register_hotkeys()

    # Tray
    menu = pystray.Menu(
        pystray.MenuItem("Einstellungen", lambda: open_settings()),
        pystray.MenuItem("Vokabular", lambda: open_vocab()),
        pystray.Menu.SEPARATOR,
        pystray.MenuItem("Beenden", lambda: quit_app(stream)),
    )
    tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu)
    # The tray runs on its own thread; the Tk main loop owns the main thread.
    threading.Thread(target=tray_icon.run, daemon=True).start()

    print(f"Ready. Hotkey: {config['hotkey']}", flush=True)
    root.mainloop()
    stream.stop()


def quit_app(stream):
    """Stop audio and tray, then ask the Tk main loop to exit."""
    stream.stop()
    tray_icon.stop()
    overlay_tk.after(0, overlay_tk.quit)


if __name__ == "__main__":
    main()