diff --git a/.gitignore b/.gitignore index 39629ff..f78af01 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,11 @@ -.venv-windows/ -.venv-linux/ -.venv/ -__pycache__/ -*.pyc -*.pyo -config_local.json -models/ -*.log -.DS_Store -Thumbs.db +.venv-windows/ +.venv-linux/ +.venv/ +__pycache__/ +*.pyc +*.pyo +config_local.json +models/ +*.log +.DS_Store +Thumbs.db diff --git a/config.json b/config.json index f8fdbcb..3168cd1 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ -{ - "hotkey": "ctrl+shift+space", - "model": "medium", - "language": "de", - "sample_rate": 16000 -} +{ + "hotkey": "ctrl+shift+space", + "model": "medium", + "language": "de", + "sample_rate": 16000 +} \ No newline at end of file diff --git a/dictate.py b/dictate.py index bc5bb10..bc600bf 100644 --- a/dictate.py +++ b/dictate.py @@ -1,688 +1,790 @@ -""" -Whisper Dictation — local GPU speech-to-text with system tray and settings GUI. -Hold hotkey to record, release to transcribe and type into active window. -""" -import json -import os -import threading -import time -import tkinter as tk -from tkinter import ttk - -import numpy as np -import sounddevice as sd -import keyboard -import pystray -from PIL import Image, ImageDraw -from pynput.keyboard import Controller as KeyboardController -from faster_whisper import WhisperModel - -# Shared data dir: script directory (= git repo root, synced via git pull). -_script_dir = os.path.dirname(os.path.abspath(__file__)) -DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir) -os.makedirs(DATA_DIR, exist_ok=True) - -# Local config dir: machine-specific settings (audio device, device, compute_type). -# Windows: %LOCALAPPDATA%\WhisperDictation -# Linux: ~/.local/share/WhisperDictation -_env_local = os.environ.get("WHISPER_LOCAL_DIR") -if _env_local: - _local_dir = _env_local -elif os.name == "nt": - _local_dir = os.path.join(os.environ.get("LOCALAPPDATA", _script_dir), "WhisperDictation") -else: - _local_dir = os.path.join(os.path.expanduser("~"), ".local", "share", "WhisperDictation") -os.makedirs(_local_dir, exist_ok=True) - -CONFIG_FILE = os.path.join(DATA_DIR, "config.json") # shared via git -CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json") # machine-specific, not in git -VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json") # shared via git - -DEFAULT_CONFIG = { - "hotkey": "ctrl+shift+space", - "model": "medium", - "device": "cuda", - "compute_type": "float16", - "language": "de", - "audio_device": None, - "sample_rate": 16000, -} - -MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"] -LANGUAGES = {"Deutsch": "de", "English": "en", "Français": "fr", "Español": "es", "Italiano": "it", "Auto": None} -DEVICES = ["cuda", "cpu"] -COMPUTE_TYPES = {"float16 (GPU)": "float16", "int8 (CPU/GPU)": "int8", "float32": "float32"} - - -# ── State ───────────────────────────────────────────────────────────────────── - -class AppState: - IDLE = "idle" - RECORDING = "recording" - TRANSCRIBING = "transcribing" - -state = AppState.IDLE -audio_chunks = [] -model = None -typer = KeyboardController() -config = {} -tray_icon = None -overlay_window = None -overlay_tk = None - - -# ── Config ──────────────────────────────────────────────────────────────────── - -LOCAL_KEYS = {"audio_device", "device", "compute_type"} # keys stored only in config_local.json - -def load_config(): - global config - config = dict(DEFAULT_CONFIG) - if os.path.exists(CONFIG_FILE): - with open(CONFIG_FILE) as f: - config.update(json.load(f)) - if os.path.exists(CONFIG_LOCAL_FILE): - with open(CONFIG_LOCAL_FILE) as f: - config.update(json.load(f)) - -def save_config(): - shared = {k: v for k, v in config.items() if k not in LOCAL_KEYS} - local = {k: v for k, v in config.items() if k in LOCAL_KEYS} - with open(CONFIG_FILE, "w") as f: - json.dump(shared, f, indent=2) - with open(CONFIG_LOCAL_FILE, "w") as f: - json.dump(local, f, indent=2) - - -# ── Vocabulary ──────────────────────────────────────────────────────────────── - -vocab = {"words": [], "replacements": []} # {from, to} - -def load_vocab(): - global vocab - if os.path.exists(VOCAB_FILE): - with open(VOCAB_FILE) as f: - vocab = json.load(f) - else: - vocab = {"words": [], "replacements": []} - -def save_vocab(): - with open(VOCAB_FILE, "w") as f: - json.dump(vocab, f, indent=2, ensure_ascii=False) - -def apply_vocab(text: str) -> str: - for r in vocab.get("replacements", []): - text = text.replace(r["from"], r["to"]) - return text - -def get_initial_prompt() -> str: - words = vocab.get("words", []) - return ", ".join(words) if words else "" - - -# ── Tray icon ───────────────────────────────────────────────────────────────── - -def make_icon(color): - img = Image.new("RGBA", (64, 64), (0, 0, 0, 0)) - d = ImageDraw.Draw(img) - d.ellipse([4, 4, 60, 60], fill=color) - return img - -ICONS = { - AppState.IDLE: make_icon((40, 200, 80)), - AppState.RECORDING: make_icon((220, 50, 50)), - AppState.TRANSCRIBING: make_icon((220, 180, 30)), -} - -def set_state(new_state): - global state - state = new_state - if tray_icon: - tray_icon.icon = ICONS[new_state] - if new_state == AppState.RECORDING: - show_overlay() - else: - hide_overlay() - - -# ── Overlay window ──────────────────────────────────────────────────────────── - -def show_overlay(): - if overlay_tk is None: - return - overlay_tk.after(0, _show_overlay_main) - -def hide_overlay(): - if overlay_tk is None: - return - overlay_tk.after(0, _hide_overlay_main) - -def _show_overlay_main(): - overlay_window.deiconify() - # Position bottom-right - sw = overlay_tk.winfo_screenwidth() - sh = overlay_tk.winfo_screenheight() - overlay_window.geometry(f"220x54+{sw - 240}+{sh - 100}") - overlay_window.lift() - -def _hide_overlay_main(): - overlay_window.withdraw() - -def create_overlay(root): - global overlay_window - win = tk.Toplevel(root) - win.withdraw() - win.overrideredirect(True) - win.attributes("-topmost", True) - win.attributes("-alpha", 0.92) - win.configure(bg="#1a1a1a") - - frame = tk.Frame(win, bg="#1a1a1a", padx=12, pady=10) - frame.pack(fill="both", expand=True) - - dot = tk.Canvas(frame, width=14, height=14, bg="#1a1a1a", highlightthickness=0) - dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="") - dot.pack(side="left", padx=(0, 8)) - - tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a", - font=("Segoe UI", 11)).pack(side="left") - - overlay_window = win - - -# ── Audio ───────────────────────────────────────────────────────────────────── - -def audio_callback(indata, frames, time_info, status): - if state == AppState.RECORDING: - audio_chunks.append(indata.copy()) - -def get_audio_stream(): - device = config.get("audio_device") - return sd.InputStream( - samplerate=config["sample_rate"], - channels=1, - device=device, - callback=audio_callback, - ) - - -# ── Recording & transcription ───────────────────────────────────────────────── - -def start_recording(): - global audio_chunks - if state == AppState.RECORDING: - return - audio_chunks = [] - set_state(AppState.RECORDING) - print("Recording...", flush=True) - -def stop_and_transcribe(): - if state != AppState.RECORDING: - return - set_state(AppState.TRANSCRIBING) - chunks = list(audio_chunks) - - if not chunks: - set_state(AppState.IDLE) - return - - audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32) - duration = len(audio) / config["sample_rate"] - rms = float(np.sqrt(np.mean(audio ** 2))) - print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True) - - if duration < 0.3 or rms < 0.0005: - print("Too short or silent — skipped.", flush=True) - set_state(AppState.IDLE) - return - - lang = config["language"] if config["language"] else None - prompt = get_initial_prompt() - segments, _ = model.transcribe( - audio, language=lang, beam_size=5, vad_filter=True, - initial_prompt=prompt if prompt else None, - ) - text = " ".join(s.text for s in segments).strip() - text = apply_vocab(text) - print(f"Result: {repr(text)}", flush=True) - - set_state(AppState.IDLE) - if text: - time.sleep(0.15) - typer.type(text) - -def on_space_release(e): - if state == AppState.RECORDING: - threading.Thread(target=stop_and_transcribe, daemon=True).start() - - -# ── Model loading ───────────────────────────────────────────────────────────── - -def load_model(): - global model - print(f"Loading {config['model']} on {config['device']}...", flush=True) - model = WhisperModel( - config["model"], - device=config["device"], - compute_type=config["compute_type"], - ) - print("Model ready.", flush=True) - - -# ── Settings window ─────────────────────────────────────────────────────────── - -def open_settings(): - if overlay_tk is None: - return - overlay_tk.after(0, _open_settings_main) - -def _open_settings_main(): - # ── Palette: "Precision Audio" ────────────────────────────────────────── - BG = "#18181f" # deep void - BG2 = "#22222c" # panel - BG3 = "#2c2c38" # elevated - BORDER = "#38384a" - FG = "#e8e8f0" - FG2 = "#7878a0" - AMBER = "#f5a623" - AMBER2 = "#c8831a" - GREEN = "#4ade80" - FONT = ("Consolas", 11) - FONT_UI = ("Segoe UI", 11) - FONT_B = ("Segoe UI", 11, "bold") - FONT_S = ("Segoe UI", 9) - FONT_H = ("Segoe UI Semibold", 16) - - win = tk.Toplevel(overlay_tk) - win.title("Whisper Dictation") - win.configure(bg=BG) - win.attributes("-topmost", True) - win.resizable(False, False) - - # Center - W, H = 680, 660 - win.update_idletasks() - sw, sh = win.winfo_screenwidth(), win.winfo_screenheight() - win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}") - - # Global option for OptionMenu dropdowns (dark listbox) - win.option_add("*Menu.background", BG3) - win.option_add("*Menu.foreground", FG) - win.option_add("*Menu.activeBackground", AMBER) - win.option_add("*Menu.activeForeground", BG) - win.option_add("*Menu.font", FONT_UI) - - # ── Header ── - hdr = tk.Frame(win, bg=BG2, pady=20) - hdr.pack(fill="x") - # Amber accent bar - tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x") - tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H, - bg=BG2, fg=FG, pady=12).pack() - tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat", - font=FONT_S, bg=BG2, fg=FG2).pack() - - # ── Scrollable content ── - canvas = tk.Canvas(win, bg=BG, highlightthickness=0) - canvas.pack(fill="both", expand=True) - content = tk.Frame(canvas, bg=BG, padx=36, pady=16) - canvas.create_window((0, 0), window=content, anchor="nw") - - def section(label): - f = tk.Frame(content, bg=BG) - f.pack(fill="x", pady=(18, 6)) - tk.Label(f, text=label, font=("Consolas", 9, "bold"), - bg=BG, fg=AMBER).pack(side="left") - tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True, padx=(10, 0), pady=6) - - def dd(frame, var, values, width=14): - """Create dark OptionMenu directly in frame as parent.""" - m = tk.OptionMenu(frame, var, *values) - m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG, - highlightbackground=BORDER, highlightthickness=1, - relief="flat", font=FONT_UI, anchor="w", bd=0, width=width) - m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER, - activeforeground=BG, relief="flat", bd=0) - return m - - def row(label, hint=None): - """Returns frame — add controls to frame after calling.""" - f = tk.Frame(content, bg=BG) - f.pack(fill="x", pady=5) - tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI, - bg=BG, fg=FG2).pack(side="left") - if hint: - tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right") - return f - - # ── AUDIO ── - section("AUDIO") - devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices()) - if d["max_input_channels"] > 0] - dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices] - dev_var = tk.StringVar() - cur_dev = config.get("audio_device") - dev_var.set("Standard" if cur_dev is None else - next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard")) - f = row("Mikrofon") - dd(f, dev_var, dev_names, width=44).pack(side="left") - - # ── MODELL ── - section("MODELL") - model_hints = { - "tiny": "~1 GB VRAM · sehr schnell", - "base": "~1 GB VRAM", - "small": "~2 GB VRAM", - "medium": "~5 GB VRAM · empfohlen ✓", - "large-v2": "~10 GB VRAM", - "large-v3": "~10 GB VRAM · bestes Ergebnis", - } - model_var = tk.StringVar(value=config["model"]) - f_model = row("Modell") - dd(f_model, model_var, MODELS, 14).pack(side="left") - hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""), - font=FONT_S, bg=BG, fg=FG2) - hint_lbl.pack(side="left", padx=(14, 0)) - model_var.trace_add("write", lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), ""))) - - lang_display = {v: k for k, v in LANGUAGES.items()} - lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch")) - f = row("Sprache") - dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left") - - # ── LEISTUNG ── - section("LEISTUNG") - device_var = tk.StringVar(value=config["device"]) - f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen") - dd(f, device_var, DEVICES, 8).pack(side="left") - - ct_display = {v: k for k, v in COMPUTE_TYPES.items()} - ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)")) - f = row("Compute Type") - dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left") - - # ── STEUERUNG ── - section("STEUERUNG") - hotkey_var = tk.StringVar(value=config["hotkey"]) - f_hk = row("Hotkey", hint="z.B. ctrl+shift+space") - tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24, - bg=BG3, fg=FG, insertbackground=AMBER, - relief="flat", bd=6, - highlightbackground=BORDER, highlightthickness=1).pack(side="left") - - # ── Buttons ── - tk.Frame(win, bg=BORDER, height=1).pack(fill="x") - btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32) - btn_bar.pack(fill="x") - - def save(): - sel = dev_var.get() - config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0]) - config["model"] = model_var.get() - config["language"] = LANGUAGES[lang_var.get()] - config["device"] = device_var.get() - config["compute_type"] = COMPUTE_TYPES[ct_var.get()] - config["hotkey"] = hotkey_var.get() - save_config() - win.destroy() - threading.Thread(target=reload_model_and_hotkey, daemon=True).start() - - def btn_hover(b, c_in, c_out): - b.bind("", lambda _: b.config(bg=c_in)) - b.bind("", lambda _: b.config(bg=c_out)) - - save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save, - bg=AMBER, fg=BG, font=FONT_B, - relief="flat", padx=20, pady=9, cursor="hand2", bd=0) - save_btn.pack(side="right") - btn_hover(save_btn, AMBER2, AMBER) - - cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy, - bg=BG3, fg=FG2, font=FONT_UI, - relief="flat", padx=20, pady=9, cursor="hand2", bd=0) - cancel_btn.pack(side="right", padx=(0, 10)) - btn_hover(cancel_btn, BORDER, BG3) - - -def open_vocab(): - if overlay_tk is None: - return - overlay_tk.after(0, _open_vocab_main) - -def _open_vocab_main(): - BG = "#18181f" - BG2 = "#22222c" - BG3 = "#2c2c38" - BORDER = "#38384a" - FG = "#e8e8f0" - FG2 = "#7878a0" - AMBER = "#f5a623" - AMBER2 = "#c8831a" - RED = "#f87171" - FONT = ("Segoe UI", 11) - FONT_B = ("Segoe UI", 11, "bold") - FONT_S = ("Segoe UI", 9) - FONT_H = ("Segoe UI Semibold", 14) - FONT_M = ("Consolas", 10) - - win = tk.Toplevel(overlay_tk) - win.title("Vokabular") - win.configure(bg=BG) - win.attributes("-topmost", True) - win.resizable(False, False) - W, H = 600, 620 - win.update_idletasks() - sw, sh = win.winfo_screenwidth(), win.winfo_screenheight() - win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}") - win.option_add("*Menu.background", BG3) - win.option_add("*Menu.foreground", FG) - win.option_add("*Menu.activeBackground", AMBER) - win.option_add("*Menu.activeForeground", BG) - - # ── Header ── - hdr = tk.Frame(win, bg=BG2) - hdr.pack(fill="x") - tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x") - tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H, - bg=BG2, fg=FG, pady=14).pack() - tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren", - font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10)) - - content = tk.Frame(win, bg=BG, padx=28, pady=12) - content.pack(fill="both", expand=True) - - # ── Add-word form ───────────────────────────────────────────────────────── - is_correction = tk.BooleanVar(value=False) - - form = tk.Frame(content, bg=BG3, padx=16, pady=14) - form.pack(fill="x", pady=(0, 16)) - - # Toggle row - tog_row = tk.Frame(form, bg=BG3) - tog_row.pack(fill="x", pady=(0, 10)) - tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT, - bg=BG3, fg=FG).pack(side="left") - - def toggle_form(*_): - if is_correction.get(): - entry_from.pack(side="left", padx=(0, 6)) - arrow_lbl.pack(side="left", padx=4) - entry_to.pack(side="left") - entry_word.pack_forget() - else: - entry_word.pack(side="left", fill="x", expand=True) - entry_from.pack_forget() - arrow_lbl.pack_forget() - entry_to.pack_forget() - - tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form, - bg=BG3, fg=FG2, activebackground=BG3, - selectcolor=AMBER, relief="flat", bd=0, - indicatoron=True) - tog_btn.pack(side="right") - - # Input row - inp_row = tk.Frame(form, bg=BG3) - inp_row.pack(fill="x") - - entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER, - relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1) - - entry_word = tk.Entry(inp_row, width=32, **entry_style) - entry_word.insert(0, "") - entry_from = tk.Entry(inp_row, width=14, **entry_style) - arrow_lbl = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER) - entry_to = tk.Entry(inp_row, width=14, **entry_style) - entry_word.pack(side="left", fill="x", expand=True) - - def add_entry(): - if is_correction.get(): - frm = entry_from.get().strip() - to = entry_to.get().strip() - if frm and to: - vocab["replacements"].append({"from": frm, "to": to}) - entry_from.delete(0, tk.END) - entry_to.delete(0, tk.END) - else: - w = entry_word.get().strip() - if w and w not in vocab["words"]: - vocab["words"].append(w) - entry_word.delete(0, tk.END) - save_vocab() - refresh_lists() - - win.bind("", lambda _: add_entry()) - - add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry, - bg=AMBER, fg=BG, font=FONT_B, - relief="flat", padx=14, pady=5, cursor="hand2", bd=0) - add_btn.pack(side="right", padx=(10, 0)) - add_btn.bind("", lambda _: add_btn.config(bg=AMBER2)) - add_btn.bind("", lambda _: add_btn.config(bg=AMBER)) - - # ── Lists ───────────────────────────────────────────────────────────────── - lists_frame = tk.Frame(content, bg=BG) - lists_frame.pack(fill="both", expand=True) - lists_frame.columnconfigure(0, weight=1) - lists_frame.columnconfigure(1, weight=2) - - def section_label(parent, text): - tk.Label(parent, text=text, font=("Consolas", 9, "bold"), - bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6)) - - # Words column - col_w = tk.Frame(lists_frame, bg=BG) - col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12)) - section_label(col_w, "WÖRTER") - - words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG, - selectbackground=AMBER, selectforeground=BG, - relief="flat", bd=0, highlightthickness=0, - activestyle="none", height=10) - words_box.pack(fill="both", expand=True) - - def del_word(): - sel = words_box.curselection() - if sel: - vocab["words"].pop(sel[0]) - save_vocab() - refresh_lists() - - tk.Button(col_w, text="− Entfernen", command=del_word, - bg=BG3, fg=RED, font=FONT_S, relief="flat", - padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0)) - - # Replacements column - col_r = tk.Frame(lists_frame, bg=BG) - col_r.grid(row=0, column=1, sticky="nsew") - section_label(col_r, "KORREKTUREN") - - repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG, - selectbackground=AMBER, selectforeground=BG, - relief="flat", bd=0, highlightthickness=0, - activestyle="none", height=10) - repl_box.pack(fill="both", expand=True) - - def del_repl(): - sel = repl_box.curselection() - if sel: - vocab["replacements"].pop(sel[0]) - save_vocab() - refresh_lists() - - tk.Button(col_r, text="− Entfernen", command=del_repl, - bg=BG3, fg=RED, font=FONT_S, relief="flat", - padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0)) - - def refresh_lists(): - words_box.delete(0, tk.END) - for w in vocab.get("words", []): - words_box.insert(tk.END, f" {w}") - repl_box.delete(0, tk.END) - for r in vocab.get("replacements", []): - repl_box.insert(tk.END, f" {r['from']} → {r['to']}") - - refresh_lists() - - # ── Footer ── - tk.Frame(win, bg=BORDER, height=1).pack(fill="x") - tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet", - font=FONT_S, bg=BG2, fg=FG2, pady=8).pack() - - -def reload_model_and_hotkey(): - keyboard.unhook_all() - load_model() - keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True) - keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release) - print(f"Hotkey updated: {config['hotkey']}", flush=True) - - -# ── Main ────────────────────────────────────────────────────────────────────── - -def main(): - global tray_icon, overlay_tk - - load_config() - load_vocab() - load_model() - - # Tkinter root (hidden) for overlay and settings - root = tk.Tk() - root.withdraw() - overlay_tk = root - create_overlay(root) - - # Audio stream - stream = get_audio_stream() - stream.start() - - # Hotkey - last_key = config["hotkey"].split("+")[-1] - keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True) - keyboard.on_release_key(last_key, on_space_release) - - # Tray - menu = pystray.Menu( - pystray.MenuItem("Einstellungen", lambda: open_settings()), - pystray.MenuItem("Vokabular", lambda: open_vocab()), - pystray.Menu.SEPARATOR, - pystray.MenuItem("Beenden", lambda: quit_app(stream)), - ) - tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu) - - threading.Thread(target=tray_icon.run, daemon=True).start() - - print(f"Ready. Hotkey: {config['hotkey']}", flush=True) - root.mainloop() - - stream.stop() - -def quit_app(stream): - stream.stop() - tray_icon.stop() - overlay_tk.after(0, overlay_tk.quit) - - -if __name__ == "__main__": - main() +""" +Whisper Dictation — local GPU speech-to-text with system tray and settings GUI. +Hold hotkey to record, release to transcribe and type into active window. +""" +import json +import os +import threading +import time +import tkinter as tk +from tkinter import ttk + +import numpy as np +import sounddevice as sd +import pystray +from PIL import Image, ImageDraw +from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode +from faster_whisper import WhisperModel + +# Shared data dir: script directory (= git repo root, synced via git pull). +_script_dir = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir) +os.makedirs(DATA_DIR, exist_ok=True) + +# Local config dir: machine-specific settings (audio device, device, compute_type). +# Windows: %LOCALAPPDATA%\WhisperDictation +# Linux: ~/.local/share/WhisperDictation +_env_local = os.environ.get("WHISPER_LOCAL_DIR") +if _env_local: + _local_dir = _env_local +elif os.name == "nt": + _local_dir = os.path.join(os.environ.get("LOCALAPPDATA", _script_dir), "WhisperDictation") +else: + _local_dir = os.path.join(os.path.expanduser("~"), ".local", "share", "WhisperDictation") +os.makedirs(_local_dir, exist_ok=True) + +CONFIG_FILE = os.path.join(DATA_DIR, "config.json") # shared via git +CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json") # machine-specific, not in git +VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json") # shared via git + +DEFAULT_CONFIG = { + "hotkey": "ctrl+shift+space", + "model": "medium", + "device": "cuda", + "compute_type": "float16", + "language": "de", + "audio_device": None, + "sample_rate": 16000, +} + +MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"] +LANGUAGES = {"Deutsch": "de", "English": "en", "Français": "fr", "Español": "es", "Italiano": "it", "Auto": None} +DEVICES = ["cuda", "cpu"] +COMPUTE_TYPES = {"float16 (GPU)": "float16", "int8 (CPU/GPU)": "int8", "float32": "float32"} + + +# ── State ───────────────────────────────────────────────────────────────────── + +class AppState: + IDLE = "idle" + RECORDING = "recording" + TRANSCRIBING = "transcribing" + +state = AppState.IDLE +audio_chunks = [] +model = None +typer = KeyboardController() +config = {} +tray_icon = None +overlay_window = None +overlay_tk = None +hotkey_listener = None + + +# ── Hotkey via pynput ──────────────────────────────────────────────────────── + +_MODIFIER_MAP = { + "ctrl": {Key.ctrl_l, Key.ctrl_r}, + "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r}, + "shift": {Key.shift_l, Key.shift_r}, + "shift_l": {Key.shift_l}, "shift_r": {Key.shift_r}, + "alt": {Key.alt_l, Key.alt_r}, + "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r}, +} + +_KEY_MAP = { + "space": Key.space, "tab": Key.tab, "enter": Key.enter, + "esc": Key.esc, "escape": Key.esc, + "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right, + "home": Key.home, "end": Key.end, "page_up": Key.page_up, "page_down": Key.page_down, + "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace, +} +for i in range(1, 13): + _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}") + + +def _parse_hotkey(hotkey_str): + """Parse hotkey string into (modifier_sets, trigger_key). + Returns: (list of sets-of-pynput-keys for each modifier, pynput key for trigger) + """ + parts = [p.strip().lower() for p in hotkey_str.split("+")] + modifiers = [] + for p in parts[:-1]: + if p in _MODIFIER_MAP: + modifiers.append(_MODIFIER_MAP[p]) + elif p in _KEY_MAP: + modifiers.append({_KEY_MAP[p]}) + else: + modifiers.append({KeyCode.from_char(p)}) + trigger_part = parts[-1] + if trigger_part in _KEY_MAP: + trigger = _KEY_MAP[trigger_part] + elif trigger_part in _MODIFIER_MAP: + trigger = next(iter(_MODIFIER_MAP[trigger_part])) + else: + trigger = KeyCode.from_char(trigger_part) + return modifiers, trigger + + +class HotkeyListener: + """Hold-to-record hotkey using pynput. No root required on X11.""" + + def __init__(self, hotkey_str, on_press, on_release): + self._modifiers, self._trigger = _parse_hotkey(hotkey_str) + self._on_press = on_press + self._on_release = on_release + self._pressed = set() + self._active = False + self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up) + self._listener.daemon = True + self._listener.start() + + def _matches_trigger(self, key): + return key == self._trigger + + def _modifiers_held(self): + return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers) + + def _key_down(self, key): + self._pressed.add(key) + if not self._active and self._matches_trigger(key) and self._modifiers_held(): + self._active = True + self._on_press() + + def _key_up(self, key): + self._pressed.discard(key) + if self._active and self._matches_trigger(key): + self._active = False + self._on_release() + + def stop(self): + self._listener.stop() + + +# ── Config ──────────────────────────────────────────────────────────────────── + +LOCAL_KEYS = {"audio_device", "device", "compute_type"} # keys stored only in config_local.json + +def load_config(): + global config + config = dict(DEFAULT_CONFIG) + if os.path.exists(CONFIG_FILE): + with open(CONFIG_FILE) as f: + config.update(json.load(f)) + if os.path.exists(CONFIG_LOCAL_FILE): + with open(CONFIG_LOCAL_FILE) as f: + config.update(json.load(f)) + +def save_config(): + shared = {k: v for k, v in config.items() if k not in LOCAL_KEYS} + local = {k: v for k, v in config.items() if k in LOCAL_KEYS} + with open(CONFIG_FILE, "w") as f: + json.dump(shared, f, indent=2) + with open(CONFIG_LOCAL_FILE, "w") as f: + json.dump(local, f, indent=2) + + +# ── Vocabulary ──────────────────────────────────────────────────────────────── + +vocab = {"words": [], "replacements": []} # {from, to} + +def load_vocab(): + global vocab + if os.path.exists(VOCAB_FILE): + with open(VOCAB_FILE) as f: + vocab = json.load(f) + else: + vocab = {"words": [], "replacements": []} + +def save_vocab(): + with open(VOCAB_FILE, "w") as f: + json.dump(vocab, f, indent=2, ensure_ascii=False) + +def apply_vocab(text: str) -> str: + for r in vocab.get("replacements", []): + text = text.replace(r["from"], r["to"]) + return text + +def get_initial_prompt() -> str: + words = vocab.get("words", []) + return ", ".join(words) if words else "" + + +# ── Tray icon ───────────────────────────────────────────────────────────────── + +def make_icon(color): + img = Image.new("RGBA", (64, 64), (0, 0, 0, 0)) + d = ImageDraw.Draw(img) + d.ellipse([4, 4, 60, 60], fill=color) + return img + +ICONS = { + AppState.IDLE: make_icon((40, 200, 80)), + AppState.RECORDING: make_icon((220, 50, 50)), + AppState.TRANSCRIBING: make_icon((220, 180, 30)), +} + +def set_state(new_state): + global state + state = new_state + if tray_icon: + tray_icon.icon = ICONS[new_state] + if new_state == AppState.RECORDING: + show_overlay() + else: + hide_overlay() + + +# ── Overlay window ──────────────────────────────────────────────────────────── + +def show_overlay(): + if overlay_tk is None: + return + overlay_tk.after(0, _show_overlay_main) + +def hide_overlay(): + if overlay_tk is None: + return + overlay_tk.after(0, _hide_overlay_main) + +def _show_overlay_main(): + overlay_window.deiconify() + # Position bottom-right + sw = overlay_tk.winfo_screenwidth() + sh = overlay_tk.winfo_screenheight() + overlay_window.geometry(f"220x54+{sw - 240}+{sh - 100}") + overlay_window.lift() + +def _hide_overlay_main(): + overlay_window.withdraw() + +def create_overlay(root): + global overlay_window + win = tk.Toplevel(root) + win.withdraw() + win.overrideredirect(True) + win.attributes("-topmost", True) + win.attributes("-alpha", 0.92) + win.configure(bg="#1a1a1a") + + frame = tk.Frame(win, bg="#1a1a1a", padx=12, pady=10) + frame.pack(fill="both", expand=True) + + dot = tk.Canvas(frame, width=14, height=14, bg="#1a1a1a", highlightthickness=0) + dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="") + dot.pack(side="left", padx=(0, 8)) + + _sans = "Segoe UI" if os.name == "nt" else "sans-serif" + tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a", + font=(_sans, 11)).pack(side="left") + + overlay_window = win + + +# ── Audio ───────────────────────────────────────────────────────────────────── + +def audio_callback(indata, frames, time_info, status): + if state == AppState.RECORDING: + audio_chunks.append(indata.copy()) + +def get_audio_stream(): + device = config.get("audio_device") + return sd.InputStream( + samplerate=config["sample_rate"], + channels=1, + device=device, + callback=audio_callback, + ) + + +# ── Recording & transcription ───────────────────────────────────────────────── + +def start_recording(): + global audio_chunks + if state == AppState.RECORDING: + return + audio_chunks = [] + set_state(AppState.RECORDING) + print("Recording...", flush=True) + +def stop_and_transcribe(): + if state != AppState.RECORDING: + return + set_state(AppState.TRANSCRIBING) + chunks = list(audio_chunks) + + if not chunks: + set_state(AppState.IDLE) + return + + audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32) + duration = len(audio) / config["sample_rate"] + rms = float(np.sqrt(np.mean(audio ** 2))) + print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True) + + if duration < 0.3 or rms < 0.0001: + print("Too short or silent — skipped.", flush=True) + set_state(AppState.IDLE) + return + + # Normalize to target RMS so Whisper gets consistent signal level + target_rms = 0.05 + if rms > 0: + audio = audio * (target_rms / rms) + audio = np.clip(audio, -1.0, 1.0) + + lang = config["language"] if config["language"] else None + prompt = get_initial_prompt() + segments, _ = model.transcribe( + audio, language=lang, beam_size=5, vad_filter=True, + initial_prompt=prompt if prompt else None, + ) + text = " ".join(s.text for s in segments).strip() + text = apply_vocab(text) + print(f"Result: {repr(text)}", flush=True) + + set_state(AppState.IDLE) + if text: + time.sleep(0.15) + typer.type(text) + + + +# ── Model loading ───────────────────────────────────────────────────────────── + +def load_model(): + global model + print(f"Loading {config['model']} on {config['device']}...", flush=True) + model = WhisperModel( + config["model"], + device=config["device"], + compute_type=config["compute_type"], + ) + print("Model ready.", flush=True) + + +# ── Settings window ─────────────────────────────────────────────────────────── + +def open_settings(): + if overlay_tk is None: + return + overlay_tk.after(0, _open_settings_main) + +def _open_settings_main(): + # ── Palette: "Precision Audio" ────────────────────────────────────────── + BG = "#18181f" # deep void + BG2 = "#22222c" # panel + BG3 = "#2c2c38" # elevated + BORDER = "#38384a" + FG = "#e8e8f0" + FG2 = "#7878a0" + AMBER = "#f5a623" + AMBER2 = "#c8831a" + GREEN = "#4ade80" + _mono = "Consolas" if os.name == "nt" else "monospace" + _sans = "Segoe UI" if os.name == "nt" else "sans-serif" + FONT = (_mono, 11) + FONT_UI = (_sans, 11) + FONT_B = (_sans, 11, "bold") + FONT_S = (_sans, 9) + FONT_H = (_sans, 16, "bold") + + win = tk.Toplevel(overlay_tk) + win.title("Whisper Dictation") + win.configure(bg=BG) + win.attributes("-topmost", True) + win.resizable(False, False) + win.minsize(700, 0) + + # Global option for OptionMenu dropdowns (dark listbox) + win.option_add("*Menu.background", BG3) + win.option_add("*Menu.foreground", FG) + win.option_add("*Menu.activeBackground", AMBER) + win.option_add("*Menu.activeForeground", BG) + win.option_add("*Menu.font", FONT_UI) + + # ── Header ── + hdr = tk.Frame(win, bg=BG2, pady=20) + hdr.pack(fill="x") + # Amber accent bar + tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x") + tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H, + bg=BG2, fg=FG, pady=12).pack() + tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat", + font=FONT_S, bg=BG2, fg=FG2).pack() + + # ── Content ── + content = tk.Frame(win, bg=BG, padx=36, pady=16) + content.pack(fill="both", expand=True) + + def section(label): + f = tk.Frame(content, bg=BG) + f.pack(fill="x", pady=(18, 6)) + tk.Label(f, text=label, font=("Consolas", 9, "bold"), + bg=BG, fg=AMBER).pack(side="left") + tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True, padx=(10, 0), pady=6) + + def dd(frame, var, values, width=14): + """Create dark OptionMenu directly in frame as parent.""" + m = tk.OptionMenu(frame, var, *values) + m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG, + highlightbackground=BORDER, highlightthickness=1, + relief="flat", font=FONT_UI, anchor="w", bd=0, width=width) + m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER, + activeforeground=BG, relief="flat", bd=0) + return m + + def row(label, hint=None): + """Returns frame — add controls to frame after calling.""" + f = tk.Frame(content, bg=BG) + f.pack(fill="x", pady=5) + tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI, + bg=BG, fg=FG2).pack(side="left") + if hint: + tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right") + return f + + # ── AUDIO ── + section("AUDIO") + devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices()) + if d["max_input_channels"] > 0] + dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices] + dev_var = tk.StringVar() + cur_dev = config.get("audio_device") + dev_var.set("Standard" if cur_dev is None else + next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard")) + f = row("Mikrofon") + dd(f, dev_var, dev_names, width=44).pack(side="left") + + # ── MODELL ── + section("MODELL") + model_hints = { + "tiny": "~1 GB VRAM · sehr schnell", + "base": "~1 GB VRAM", + "small": "~2 GB VRAM", + "medium": "~5 GB VRAM · empfohlen ✓", + "large-v2": "~10 GB VRAM", + "large-v3": "~10 GB VRAM · bestes Ergebnis", + } + model_var = tk.StringVar(value=config["model"]) + f_model = row("Modell") + dd(f_model, model_var, MODELS, 14).pack(side="left") + hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""), + font=FONT_S, bg=BG, fg=FG2) + hint_lbl.pack(side="left", padx=(14, 0)) + model_var.trace_add("write", lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), ""))) + + lang_display = {v: k for k, v in LANGUAGES.items()} + lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch")) + f = row("Sprache") + dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left") + + # ── LEISTUNG ── + section("LEISTUNG") + device_var = tk.StringVar(value=config["device"]) + f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen") + dd(f, device_var, DEVICES, 8).pack(side="left") + + ct_display = {v: k for k, v in COMPUTE_TYPES.items()} + ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)")) + f = row("Compute Type") + dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left") + + # ── STEUERUNG ── + section("STEUERUNG") + hotkey_var = tk.StringVar(value=config["hotkey"]) + f_hk = row("Hotkey", hint="z.B. ctrl+shift+space") + tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24, + bg=BG3, fg=FG, insertbackground=AMBER, + relief="flat", bd=6, + highlightbackground=BORDER, highlightthickness=1).pack(side="left") + + # ── Buttons ── + tk.Frame(win, bg=BORDER, height=1).pack(fill="x") + btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32) + btn_bar.pack(fill="x") + + def save(): + sel = dev_var.get() + config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0]) + config["model"] = model_var.get() + config["language"] = LANGUAGES[lang_var.get()] + config["device"] = device_var.get() + config["compute_type"] = COMPUTE_TYPES[ct_var.get()] + config["hotkey"] = hotkey_var.get() + save_config() + win.destroy() + threading.Thread(target=reload_model_and_hotkey, daemon=True).start() + + def btn_hover(b, c_in, c_out): + b.bind("", lambda _: b.config(bg=c_in)) + b.bind("", lambda _: b.config(bg=c_out)) + + save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save, + bg=AMBER, fg=BG, font=FONT_B, + relief="flat", padx=20, pady=9, cursor="hand2", bd=0) + save_btn.pack(side="right") + btn_hover(save_btn, AMBER2, AMBER) + + cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy, + bg=BG3, fg=FG2, font=FONT_UI, + relief="flat", padx=20, pady=9, cursor="hand2", bd=0) + cancel_btn.pack(side="right", padx=(0, 10)) + btn_hover(cancel_btn, BORDER, BG3) + + # Center on screen after layout + win.update_idletasks() + sw = win.winfo_screenwidth() + sh = win.winfo_screenheight() + w = win.winfo_reqwidth() + h = win.winfo_reqheight() + win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}") + + +def open_vocab(): + if overlay_tk is None: + return + overlay_tk.after(0, _open_vocab_main) + +def _open_vocab_main(): + BG = "#18181f" + BG2 = "#22222c" + BG3 = "#2c2c38" + BORDER = "#38384a" + FG = "#e8e8f0" + FG2 = "#7878a0" + AMBER = "#f5a623" + AMBER2 = "#c8831a" + RED = "#f87171" + _mono = "Consolas" if os.name == "nt" else "monospace" + _sans = "Segoe UI" if os.name == "nt" else "sans-serif" + FONT = (_sans, 11) + FONT_B = (_sans, 11, "bold") + FONT_S = (_sans, 9) + FONT_H = (_sans, 14, "bold") + FONT_M = (_mono, 10) + + win = tk.Toplevel(overlay_tk) + win.title("Vokabular") + win.configure(bg=BG) + win.attributes("-topmost", True) + win.resizable(False, False) + win.minsize(600, 0) + win.option_add("*Menu.background", BG3) + win.option_add("*Menu.foreground", FG) + win.option_add("*Menu.activeBackground", AMBER) + win.option_add("*Menu.activeForeground", BG) + + # ── Header ── + hdr = tk.Frame(win, bg=BG2) + hdr.pack(fill="x") + tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x") + tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H, + bg=BG2, fg=FG, pady=14).pack() + tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren", + font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10)) + + content = tk.Frame(win, bg=BG, padx=28, pady=12) + content.pack(fill="both", expand=True) + + # ── Add-word form ───────────────────────────────────────────────────────── + is_correction = tk.BooleanVar(value=False) + + form = tk.Frame(content, bg=BG3, padx=16, pady=14) + form.pack(fill="x", pady=(0, 16)) + + # Toggle row + tog_row = tk.Frame(form, bg=BG3) + tog_row.pack(fill="x", pady=(0, 10)) + tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT, + bg=BG3, fg=FG).pack(side="left") + + def toggle_form(*_): + if is_correction.get(): + entry_from.pack(side="left", padx=(0, 6)) + arrow_lbl.pack(side="left", padx=4) + entry_to.pack(side="left") + entry_word.pack_forget() + else: + entry_word.pack(side="left", fill="x", expand=True) + entry_from.pack_forget() + arrow_lbl.pack_forget() + entry_to.pack_forget() + + tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form, + bg=BG3, fg=FG2, activebackground=BG3, + selectcolor=AMBER, relief="flat", bd=0, + indicatoron=True) + tog_btn.pack(side="right") + + # Input row + inp_row = tk.Frame(form, bg=BG3) + inp_row.pack(fill="x") + + entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER, + relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1) + + entry_word = tk.Entry(inp_row, width=32, **entry_style) + entry_word.insert(0, "") + entry_from = tk.Entry(inp_row, width=14, **entry_style) + arrow_lbl = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER) + entry_to = tk.Entry(inp_row, width=14, **entry_style) + entry_word.pack(side="left", fill="x", expand=True) + + def add_entry(): + if is_correction.get(): + frm = entry_from.get().strip() + to = entry_to.get().strip() + if frm and to: + vocab["replacements"].append({"from": frm, "to": to}) + entry_from.delete(0, tk.END) + entry_to.delete(0, tk.END) + else: + w = entry_word.get().strip() + if w and w not in vocab["words"]: + vocab["words"].append(w) + entry_word.delete(0, tk.END) + save_vocab() + refresh_lists() + + win.bind("", lambda _: add_entry()) + + add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry, + bg=AMBER, fg=BG, font=FONT_B, + relief="flat", padx=14, pady=5, cursor="hand2", bd=0) + add_btn.pack(side="right", padx=(10, 0)) + add_btn.bind("", lambda _: add_btn.config(bg=AMBER2)) + add_btn.bind("", lambda _: add_btn.config(bg=AMBER)) + + # ── Lists ───────────────────────────────────────────────────────────────── + lists_frame = tk.Frame(content, bg=BG) + lists_frame.pack(fill="both", expand=True) + lists_frame.columnconfigure(0, weight=1) + lists_frame.columnconfigure(1, weight=2) + + def section_label(parent, text): + tk.Label(parent, text=text, font=("Consolas", 9, "bold"), + bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6)) + + # Words column + col_w = tk.Frame(lists_frame, bg=BG) + col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12)) + section_label(col_w, "WÖRTER") + + words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG, + selectbackground=AMBER, selectforeground=BG, + relief="flat", bd=0, highlightthickness=0, + activestyle="none", height=10) + words_box.pack(fill="both", expand=True) + + def del_word(): + sel = words_box.curselection() + if sel: + vocab["words"].pop(sel[0]) + save_vocab() + refresh_lists() + + tk.Button(col_w, text="− Entfernen", command=del_word, + bg=BG3, fg=RED, font=FONT_S, relief="flat", + padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0)) + + # Replacements column + col_r = tk.Frame(lists_frame, bg=BG) + col_r.grid(row=0, column=1, sticky="nsew") + section_label(col_r, "KORREKTUREN") + + repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG, + selectbackground=AMBER, selectforeground=BG, + relief="flat", bd=0, highlightthickness=0, + activestyle="none", height=10) + repl_box.pack(fill="both", expand=True) + + def del_repl(): + sel = repl_box.curselection() + if sel: + vocab["replacements"].pop(sel[0]) + save_vocab() + refresh_lists() + + tk.Button(col_r, text="− Entfernen", command=del_repl, + bg=BG3, fg=RED, font=FONT_S, relief="flat", + padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0)) + + def refresh_lists(): + words_box.delete(0, tk.END) + for w in vocab.get("words", []): + words_box.insert(tk.END, f" {w}") + repl_box.delete(0, tk.END) + for r in vocab.get("replacements", []): + repl_box.insert(tk.END, f" {r['from']} → {r['to']}") + + refresh_lists() + + # ── Footer ── + tk.Frame(win, bg=BORDER, height=1).pack(fill="x") + tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet", + font=FONT_S, bg=BG2, fg=FG2, pady=8).pack() + + # Center on screen after layout + win.update_idletasks() + sw = win.winfo_screenwidth() + sh = win.winfo_screenheight() + w = win.winfo_reqwidth() + h = win.winfo_reqheight() + win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}") + + +def reload_model_and_hotkey(): + global hotkey_listener + if hotkey_listener: + hotkey_listener.stop() + load_model() + hotkey_listener = HotkeyListener( + config["hotkey"], + on_press=start_recording, + on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(), + ) + print(f"Hotkey updated: {config['hotkey']}", flush=True) + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + global tray_icon, overlay_tk + + load_config() + load_vocab() + load_model() + + # Tkinter root (hidden) for overlay and settings + root = tk.Tk() + root.withdraw() + overlay_tk = root + create_overlay(root) + + # Audio stream + stream = get_audio_stream() + stream.start() + + # Hotkey + global hotkey_listener + hotkey_listener = HotkeyListener( + config["hotkey"], + on_press=start_recording, + on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(), + ) + + # Tray + menu = pystray.Menu( + pystray.MenuItem("Einstellungen", lambda: open_settings()), + pystray.MenuItem("Vokabular", lambda: open_vocab()), + pystray.Menu.SEPARATOR, + pystray.MenuItem("Beenden", lambda: quit_app(stream)), + ) + tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu) + + threading.Thread(target=tray_icon.run, daemon=True).start() + + print(f"Ready. Hotkey: {config['hotkey']}", flush=True) + root.mainloop() + + stream.stop() + +def quit_app(stream): + stream.stop() + tray_icon.stop() + overlay_tk.after(0, overlay_tk.quit) + + +if __name__ == "__main__": + main() diff --git a/install.bat b/install.bat index 7af6d26..df3df9b 100644 --- a/install.bat +++ b/install.bat @@ -1,17 +1,17 @@ -@echo off -cd /d "%~dp0" - -echo Creating Windows venv (.venv-windows)... -py -3.13 -m venv .venv-windows - -set "VENV=%~dp0.venv-windows" -echo Installing dependencies... -"%VENV%\Scripts\pip" install --upgrade pip -"%VENV%\Scripts\pip" install -r requirements.txt - -echo Installing CUDA 12 DLLs (required for GPU acceleration)... -"%VENV%\Scripts\pip" install -r requirements-cuda.txt - -echo. -echo Done. Run start.bat to launch. -pause +@echo off +cd /d "%~dp0" + +echo Creating Windows venv (.venv-windows)... +py -3.13 -m venv .venv-windows + +set "VENV=%~dp0.venv-windows" +echo Installing dependencies... +"%VENV%\Scripts\python.exe" -m pip install --upgrade pip +"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt + +echo Installing CUDA 12 DLLs (required for GPU acceleration)... +"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt + +echo. +echo Done. Run start.bat to launch. +pause diff --git a/install.sh b/install.sh index 55a1713..9b8f8c4 100644 --- a/install.sh +++ b/install.sh @@ -3,7 +3,7 @@ set -e cd "$(dirname "$0")" echo "Creating Linux venv (.venv-linux)..." -python3 -m venv .venv-linux +python3 -m venv --system-site-packages .venv-linux echo "Installing dependencies..." .venv-linux/bin/pip install --upgrade pip diff --git a/requirements-cuda.txt b/requirements-cuda.txt index c0cfd24..666439c 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -1,4 +1,4 @@ -# Windows CUDA 12 DLLs required by ctranslate2 (faster-whisper backend) -# Install after requirements.txt on Windows with NVIDIA GPU -nvidia-cublas-cu12 -nvidia-cudnn-cu12 +# Windows CUDA 12 DLLs required by ctranslate2 (faster-whisper backend) +# Install after requirements.txt on Windows with NVIDIA GPU +nvidia-cublas-cu12 +nvidia-cudnn-cu12 diff --git a/requirements.txt b/requirements.txt index 3a767f6..29e7381 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -faster-whisper>=1.0.2 -sounddevice>=0.4.6 -numpy>=1.24 -keyboard>=0.13 -pystray>=0.19 -Pillow>=9.5 -pynput>=1.7.6 +faster-whisper>=1.0.2 +sounddevice>=0.4.6 +numpy>=1.24 +pystray>=0.19 +Pillow>=9.5 +pynput>=1.7.6 diff --git a/start.bat b/start.bat index 4e9189a..1ec49af 100644 --- a/start.bat +++ b/start.bat @@ -1,14 +1,14 @@ -@echo off -cd /d "%~dp0" - -set "VENV=%~dp0.venv-windows" - -rem Machine-local config dir (device, compute_type, audio_device - not in git) -if not exist "%LOCALAPPDATA%\WhisperDictation" mkdir "%LOCALAPPDATA%\WhisperDictation" -set "WHISPER_LOCAL_DIR=%LOCALAPPDATA%\WhisperDictation" - -rem CUDA 12 DLLs required by ctranslate2 -set "PATH=%VENV%\Lib\site-packages\nvidia\cublas\bin;%VENV%\Lib\site-packages\nvidia\cudnn\bin;%PATH%" - -"%VENV%\Scripts\python.exe" -u "%~dp0dictate.py" -pause +@echo off +cd /d "%~dp0" + +set "VENV=%~dp0.venv-windows" + +rem Machine-local config dir (device, compute_type, audio_device - not in git) +if not exist "%LOCALAPPDATA%\WhisperDictation" mkdir "%LOCALAPPDATA%\WhisperDictation" +set "WHISPER_LOCAL_DIR=%LOCALAPPDATA%\WhisperDictation" + +rem CUDA 12 DLLs required by ctranslate2 +set "PATH=%VENV%\Lib\site-packages\nvidia\cublas\bin;%VENV%\Lib\site-packages\nvidia\cudnn\bin;%PATH%" + +"%VENV%\Scripts\python.exe" -u "%~dp0dictate.py" +pause diff --git a/vocabulary.json b/vocabulary.json index 63d41be..c457ac9 100644 --- a/vocabulary.json +++ b/vocabulary.json @@ -1,18 +1,18 @@ -{ - "words": [], - "replacements": [ - {"from": "KRA", "to": "KRAH"}, - {"from": "Atos", "to": "ATHOS"}, - {"from": "Resistec", "to": "RESISTEC"}, - {"from": "Resistek", "to": "RESISTEC"}, - {"from": "HES", "to": "HEES"}, - {"from": "Ackerschot", "to": "Ackerschott"}, - {"from": "Carrois", "to": "Kauer"}, - {"from": "Jouer fixe", "to": "Jour-Fixe"}, - {"from": "Docuware", "to": "DocuWare"}, - {"from": "Nates", "to": "Nejc"}, - {"from": "Bittzeit", "to": "BitSight"}, - {"from": "Kalmikow", "to": "Kalmykov"}, - {"from": "Leifert", "to": "Leifer"} - ] -} +{ + "words": [], + "replacements": [ + {"from": "KRA", "to": "KRAH"}, + {"from": "Atos", "to": "ATHOS"}, + {"from": "Resistec", "to": "RESISTEC"}, + {"from": "Resistek", "to": "RESISTEC"}, + {"from": "HES", "to": "HEES"}, + {"from": "Ackerschot", "to": "Ackerschott"}, + {"from": "Carrois", "to": "Kauer"}, + {"from": "Jouer fixe", "to": "Jour-Fixe"}, + {"from": "Docuware", "to": "DocuWare"}, + {"from": "Nates", "to": "Nejc"}, + {"from": "Bittzeit", "to": "BitSight"}, + {"from": "Kalmikow", "to": "Kalmykov"}, + {"from": "Leifert", "to": "Leifer"} + ] +}