commit e1a3eba05af4f1a6cb4b3059c03fc207fc03e81f
Author: beo3000
Date:   Thu Mar 19 17:55:40 2026 +0100

    feat: initial whisper-dictation repo, migrated from OneDrive

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..39629ff
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+.venv-windows/
+.venv-linux/
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+config_local.json
+models/
+*.log
+.DS_Store
+Thumbs.db
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..f8fdbcb
--- /dev/null
+++ b/config.json
@@ -0,0 +1,6 @@
+{
+  "hotkey": "ctrl+shift+space",
+  "model": "medium",
+  "language": "de",
+  "sample_rate": 16000
+}
diff --git a/dictate.py b/dictate.py
new file mode 100644
index 0000000..bc5bb10
--- /dev/null
+++ b/dictate.py
@@ -0,0 +1,770 @@
+"""
+Whisper Dictation — local GPU speech-to-text with system tray and settings GUI.
+Hold hotkey to record, release to transcribe and type into active window.
+"""
+import json
+import os
+import threading
+import time
+import tkinter as tk
+from tkinter import ttk
+
+import numpy as np
+import sounddevice as sd
+import keyboard
+import pystray
+from PIL import Image, ImageDraw
+from pynput.keyboard import Controller as KeyboardController
+from faster_whisper import WhisperModel
+
+# Shared data dir: script directory (= git repo root, synced via git pull).
+_script_dir = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir)
+os.makedirs(DATA_DIR, exist_ok=True)
+
+# Local config dir: machine-specific settings (audio device, device, compute_type).
+# Windows: %LOCALAPPDATA%\WhisperDictation
+# Linux: ~/.local/share/WhisperDictation
+_env_local = os.environ.get("WHISPER_LOCAL_DIR")
+if _env_local:
+    _local_dir = _env_local
+elif os.name == "nt":
+    _local_dir = os.path.join(os.environ.get("LOCALAPPDATA", _script_dir), "WhisperDictation")
+else:
+    _local_dir = os.path.join(os.path.expanduser("~"), ".local", "share", "WhisperDictation")
+os.makedirs(_local_dir, exist_ok=True)
+
+CONFIG_FILE = os.path.join(DATA_DIR, "config.json")  # shared via git
+CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json")  # machine-specific, not in git
+VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json")  # shared via git
+
+DEFAULT_CONFIG = {
+    "hotkey": "ctrl+shift+space",
+    "model": "medium",
+    "device": "cuda",
+    "compute_type": "float16",
+    "language": "de",
+    "audio_device": None,
+    "sample_rate": 16000,
+}
+
+MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"]
+LANGUAGES = {"Deutsch": "de", "English": "en", "Français": "fr", "Español": "es", "Italiano": "it", "Auto": None}
+DEVICES = ["cuda", "cpu"]
+COMPUTE_TYPES = {"float16 (GPU)": "float16", "int8 (CPU/GPU)": "int8", "float32": "float32"}
+
+
+# ── State ─────────────────────────────────────────────────────────────────────
+
+class AppState:
+    """Names of the three states of the dictation state machine."""
+    IDLE = "idle"
+    RECORDING = "recording"
+    TRANSCRIBING = "transcribing"
+
+state = AppState.IDLE
+audio_chunks = []
+audio_stream = None  # running sd.InputStream; replaced when settings are reloaded
+model = None
+typer = KeyboardController()
+config = {}
+tray_icon = None
+overlay_window = None
+overlay_tk = None
+
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+LOCAL_KEYS = {"audio_device", "device", "compute_type"}  # keys stored only in config_local.json
+
+def load_config():
+    """Build config from defaults, the shared file, then the machine-local file."""
+    global config
+    config = dict(DEFAULT_CONFIG)
+    # Read as UTF-8 explicitly: the platform default encoding differs between
+    # the Windows and Linux machines that share config.json via git.
+    for path in (CONFIG_FILE, CONFIG_LOCAL_FILE):
+        if os.path.exists(path):
+            with open(path, encoding="utf-8") as f:
+                config.update(json.load(f))
+
+def save_config():
+    """Write config back, split into the shared and the machine-local file."""
+    shared = {k: v for k, v in config.items() if k not in LOCAL_KEYS}
+    local = {k: v for k, v in config.items() if k in LOCAL_KEYS}
+    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
+        json.dump(shared, f, indent=2)
+    with open(CONFIG_LOCAL_FILE, "w", encoding="utf-8") as f:
+        json.dump(local, f, indent=2)
+
+
+# ── Vocabulary ────────────────────────────────────────────────────────────────
+
+vocab = {"words": [], "replacements": []}  # {from, to}
+
+def load_vocab():
+    """Load vocabulary.json and guarantee that both expected keys exist."""
+    global vocab
+    loaded = {}
+    if os.path.exists(VOCAB_FILE):
+        # UTF-8 to match save_vocab(), which writes non-ASCII text unescaped.
+        with open(VOCAB_FILE, encoding="utf-8") as f:
+            loaded = json.load(f)
+    vocab = dict(loaded)
+    # The vocabulary editor appends to these lists, so they must be present.
+    vocab.setdefault("words", [])
+    vocab.setdefault("replacements", [])
+
+def save_vocab():
+    """Persist the vocabulary as UTF-8 JSON."""
+    with open(VOCAB_FILE, "w", encoding="utf-8") as f:
+        json.dump(vocab, f, indent=2, ensure_ascii=False)
+
+def apply_vocab(text: str) -> str:
+    """Apply every configured replacement to text, in list order."""
+    for r in vocab.get("replacements", []):
+        src = r.get("from")
+        if not src:
+            # str.replace("", x) would insert x between all characters.
+            continue
+        text = text.replace(src, r.get("to", ""))
+    return text
+
+def get_initial_prompt() -> str:
+    """Return the vocabulary words as one comma-separated prompt string."""
+    words = vocab.get("words", [])
+    return ", ".join(words) if words else ""
+
+
+# ── Tray icon ─────────────────────────────────────────────────────────────────
+
+def make_icon(color):
+    """Return a 64x64 transparent image with a filled circle in color."""
+    img = Image.new("RGBA", (64, 64), (0, 0, 0, 0))
+    d = ImageDraw.Draw(img)
+    d.ellipse([4, 4, 60, 60], fill=color)
+    return img
+
+ICONS = {
+    AppState.IDLE: make_icon((40, 200, 80)),
+    AppState.RECORDING: make_icon((220, 50, 50)),
+    AppState.TRANSCRIBING: make_icon((220, 180, 30)),
+}
+
+def set_state(new_state):
+    """Switch the app state and update the tray icon and recording overlay."""
+    global state
+    state = new_state
+    if tray_icon:
+        tray_icon.icon = ICONS[new_state]
+    if new_state == AppState.RECORDING:
+        show_overlay()
+    else:
+        hide_overlay()
+
+
+# ── Overlay window ────────────────────────────────────────────────────────────
+
+def show_overlay():
+    """Ask the Tk root to show the overlay (no-op before the root exists)."""
+    if overlay_tk is None:
+        return
+    overlay_tk.after(0, _show_overlay_main)
+
+def hide_overlay():
+    """Ask the Tk root to hide the overlay (no-op before the root exists)."""
+    if overlay_tk is None:
+        return
+    overlay_tk.after(0, _hide_overlay_main)
+
+def _show_overlay_main():
+    overlay_window.deiconify()
+    # Position bottom-right
+    sw = overlay_tk.winfo_screenwidth()
+    sh = overlay_tk.winfo_screenheight()
+    overlay_window.geometry(f"220x54+{sw - 240}+{sh - 100}")
+    overlay_window.lift()
+
+def _hide_overlay_main():
+    overlay_window.withdraw()
+
+def create_overlay(root):
+    """Build the hidden, borderless "recording" indicator window."""
+    global overlay_window
+    win = tk.Toplevel(root)
+    win.withdraw()
+    win.overrideredirect(True)
+    win.attributes("-topmost", True)
+    win.attributes("-alpha", 0.92)
+    win.configure(bg="#1a1a1a")
+
+    frame = tk.Frame(win, bg="#1a1a1a", padx=12, pady=10)
+    frame.pack(fill="both", expand=True)
+
+    dot = tk.Canvas(frame, width=14, height=14, bg="#1a1a1a", highlightthickness=0)
+    dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
+    dot.pack(side="left", padx=(0, 8))
+
+    tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
+             font=("Segoe UI", 11)).pack(side="left")
+
+    overlay_window = win
+
+
+# ── Audio ─────────────────────────────────────────────────────────────────────
+
+def audio_callback(indata, frames, time_info, status):
+    """Stream callback: keep a copy of each block while recording."""
+    if state == AppState.RECORDING:
+        audio_chunks.append(indata.copy())
+
+def get_audio_stream():
+    """Create a mono input stream for the configured device and sample rate."""
+    device = config.get("audio_device")
+    return sd.InputStream(
+        samplerate=config["sample_rate"],
+        channels=1,
+        device=device,
+        callback=audio_callback,
+    )
+
+def restart_audio_stream():
+    """Open and start a stream for the current config, replacing any old one."""
+    global audio_stream
+    old, audio_stream = audio_stream, None
+    if old is not None:
+        old.stop()
+        old.close()
+    audio_stream = get_audio_stream()
+    audio_stream.start()
+
+
+# ── Recording & transcription ─────────────────────────────────────────────────
+
+def start_recording():
+    """Begin a new recording; ignored unless the app is idle."""
+    global audio_chunks
+    # A recording started during a transcription would be cut off when that
+    # transcription finishes and resets the state to IDLE.
+    if state != AppState.IDLE:
+        return
+    audio_chunks = []
+    set_state(AppState.RECORDING)
+    print("Recording...", flush=True)
+
+def _transcribe(chunks):
+    """Return the corrected transcript of chunks, or "" if nothing usable."""
+    if not chunks:
+        return ""
+
+    audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
+    duration = len(audio) / config["sample_rate"]
+    rms = float(np.sqrt(np.mean(audio ** 2)))
+    print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
+
+    if duration < 0.3 or rms < 0.0005:
+        print("Too short or silent — skipped.", flush=True)
+        return ""
+
+    lang = config["language"] if config["language"] else None
+    prompt = get_initial_prompt()
+    segments, _ = model.transcribe(
+        audio, language=lang, beam_size=5, vad_filter=True,
+        initial_prompt=prompt if prompt else None,
+    )
+    text = " ".join(s.text for s in segments).strip()
+    text = apply_vocab(text)
+    print(f"Result: {repr(text)}", flush=True)
+    return text
+
+def stop_and_transcribe():
+    """Finish the recording, transcribe it and type the result."""
+    if state != AppState.RECORDING:
+        return
+    set_state(AppState.TRANSCRIBING)
+    chunks = list(audio_chunks)
+
+    text = ""
+    try:
+        text = _transcribe(chunks)
+    except Exception as exc:
+        # Runs on a worker thread: report the failure instead of dying silently.
+        print(f"Transcription failed: {exc!r}", flush=True)
+    finally:
+        # Always leave TRANSCRIBING so the tray icon and hotkey recover.
+        set_state(AppState.IDLE)
+
+    if text:
+        time.sleep(0.15)
+        typer.type(text)
+
+def on_space_release(e):
+    """Key-release hook: transcribe on a worker thread if we were recording."""
+    if state == AppState.RECORDING:
+        threading.Thread(target=stop_and_transcribe, daemon=True).start()
+
+
+# ── Model loading ─────────────────────────────────────────────────────────────
+
+def load_model():
+    """Load the configured Whisper model, retrying on CPU if the device fails."""
+    global model
+    print(f"Loading {config['model']} on {config['device']}...", flush=True)
+    try:
+        model = WhisperModel(
+            config["model"],
+            device=config["device"],
+            compute_type=config["compute_type"],
+        )
+    except (RuntimeError, ValueError) as exc:
+        if config["device"] == "cpu":
+            raise
+        # install.sh sets up a CPU-only environment while DEFAULT_CONFIG asks
+        # for cuda/float16, so retry with settings that need no GPU.
+        print(f"Loading on {config['device']} failed ({exc}); using cpu/int8.", flush=True)
+        model = WhisperModel(config["model"], device="cpu", compute_type="int8")
+    print("Model ready.", flush=True)
+
+
+# ── Settings window ───────────────────────────────────────────────────────────
+
+def open_settings():
+    """Ask the Tk root to open the settings window."""
+    if overlay_tk is None:
+        return
+    overlay_tk.after(0, _open_settings_main)
+
+def _open_settings_main():
+    """Build the settings window; "save" writes config and reloads the app."""
+    # ── Palette: "Precision Audio" ──────────────────────────────────────────
+    BG = "#18181f"  # deep void
+    BG2 = "#22222c"  # panel
+    BG3 = "#2c2c38"  # elevated
+    BORDER = "#38384a"
+    FG = "#e8e8f0"
+    FG2 = "#7878a0"
+    AMBER = "#f5a623"
+    AMBER2 = "#c8831a"
+    GREEN = "#4ade80"
+    FONT = ("Consolas", 11)
+    FONT_UI = ("Segoe UI", 11)
+    FONT_B = ("Segoe UI", 11, "bold")
+    FONT_S = ("Segoe UI", 9)
+    FONT_H = ("Segoe UI Semibold", 16)
+
+    win = tk.Toplevel(overlay_tk)
+    win.title("Whisper Dictation")
+    win.configure(bg=BG)
+    win.attributes("-topmost", True)
+    win.resizable(False, False)
+
+    # Center
+    W, H = 680, 660
+    win.update_idletasks()
+    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
+    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+
+    # Global option for OptionMenu dropdowns (dark listbox)
+    win.option_add("*Menu.background", BG3)
+    win.option_add("*Menu.foreground", FG)
+    win.option_add("*Menu.activeBackground", AMBER)
+    win.option_add("*Menu.activeForeground", BG)
+    win.option_add("*Menu.font", FONT_UI)
+
+    # ── Header ──
+    hdr = tk.Frame(win, bg=BG2, pady=20)
+    hdr.pack(fill="x")
+    # Amber accent bar
+    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
+    tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H,
+             bg=BG2, fg=FG, pady=12).pack()
+    tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
+             font=FONT_S, bg=BG2, fg=FG2).pack()
+
+    # ── Scrollable content ──
+    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
+    canvas.pack(fill="both", expand=True)
+    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
+    canvas.create_window((0, 0), window=content, anchor="nw")
+
+    def section(label):
+        f = tk.Frame(content, bg=BG)
+        f.pack(fill="x", pady=(18, 6))
+        tk.Label(f, text=label, font=("Consolas", 9, "bold"),
+                 bg=BG, fg=AMBER).pack(side="left")
+        tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True, padx=(10, 0), pady=6)
+
+    def dd(frame, var, values, width=14):
+        """Create dark OptionMenu directly in frame as parent."""
+        m = tk.OptionMenu(frame, var, *values)
+        m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG,
+                 highlightbackground=BORDER, highlightthickness=1,
+                 relief="flat", font=FONT_UI, anchor="w", bd=0, width=width)
+        m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER,
+                         activeforeground=BG, relief="flat", bd=0)
+        return m
+
+    def row(label, hint=None):
+        """Returns frame — add controls to frame after calling."""
+        f = tk.Frame(content, bg=BG)
+        f.pack(fill="x", pady=5)
+        tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI,
+                 bg=BG, fg=FG2).pack(side="left")
+        if hint:
+            tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right")
+        return f
+
+    # ── AUDIO ──
+    section("AUDIO")
+    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
+               if d["max_input_channels"] > 0]
+    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
+    dev_var = tk.StringVar()
+    cur_dev = config.get("audio_device")
+    dev_var.set("Standard" if cur_dev is None else
+                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
+    f = row("Mikrofon")
+    dd(f, dev_var, dev_names, width=44).pack(side="left")
+
+    # ── MODELL ──
+    section("MODELL")
+    model_hints = {
+        "tiny": "~1 GB VRAM · sehr schnell",
+        "base": "~1 GB VRAM",
+        "small": "~2 GB VRAM",
+        "medium": "~5 GB VRAM · empfohlen ✓",
+        "large-v2": "~10 GB VRAM",
+        "large-v3": "~10 GB VRAM · bestes Ergebnis",
+    }
+    model_var = tk.StringVar(value=config["model"])
+    f_model = row("Modell")
+    dd(f_model, model_var, MODELS, 14).pack(side="left")
+    hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""),
+                        font=FONT_S, bg=BG, fg=FG2)
+    hint_lbl.pack(side="left", padx=(14, 0))
+    model_var.trace_add("write", lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), "")))
+
+    lang_display = {v: k for k, v in LANGUAGES.items()}
+    lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch"))
+    f = row("Sprache")
+    dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left")
+
+    # ── LEISTUNG ──
+    section("LEISTUNG")
+    device_var = tk.StringVar(value=config["device"])
+    f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen")
+    dd(f, device_var, DEVICES, 8).pack(side="left")
+
+    ct_display = {v: k for k, v in COMPUTE_TYPES.items()}
+    ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)"))
+    f = row("Compute Type")
+    dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left")
+
+    # ── STEUERUNG ──
+    section("STEUERUNG")
+    hotkey_var = tk.StringVar(value=config["hotkey"])
+    f_hk = row("Hotkey", hint="z.B. ctrl+shift+space")
+    tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24,
+             bg=BG3, fg=FG, insertbackground=AMBER,
+             relief="flat", bd=6,
+             highlightbackground=BORDER, highlightthickness=1).pack(side="left")
+
+    # ── Buttons ──
+    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
+    btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32)
+    btn_bar.pack(fill="x")
+
+    def save():
+        sel = dev_var.get()
+        config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
+        config["model"] = model_var.get()
+        config["language"] = LANGUAGES[lang_var.get()]
+        config["device"] = device_var.get()
+        config["compute_type"] = COMPUTE_TYPES[ct_var.get()]
+        config["hotkey"] = hotkey_var.get()
+        save_config()
+        win.destroy()
+        threading.Thread(target=reload_model_and_hotkey, daemon=True).start()
+
+    def btn_hover(b, c_in, c_out):
+        # <Enter>/<Leave> fire when the pointer crosses the widget border.
+        b.bind("<Enter>", lambda _: b.config(bg=c_in))
+        b.bind("<Leave>", lambda _: b.config(bg=c_out))
+
+    save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save,
+                         bg=AMBER, fg=BG, font=FONT_B,
+                         relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
+    save_btn.pack(side="right")
+    btn_hover(save_btn, AMBER2, AMBER)
+
+    cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy,
+                           bg=BG3, fg=FG2, font=FONT_UI,
+                           relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
+    cancel_btn.pack(side="right", padx=(0, 10))
+    btn_hover(cancel_btn, BORDER, BG3)
+
+
+def open_vocab():
+    """Ask the Tk root to open the vocabulary editor."""
+    if overlay_tk is None:
+        return
+    overlay_tk.after(0, _open_vocab_main)
+
+def _open_vocab_main():
+    """Build the vocabulary editor; every change is saved immediately."""
+    BG = "#18181f"
+    BG2 = "#22222c"
+    BG3 = "#2c2c38"
+    BORDER = "#38384a"
+    FG = "#e8e8f0"
+    FG2 = "#7878a0"
+    AMBER = "#f5a623"
+    AMBER2 = "#c8831a"
+    RED = "#f87171"
+    FONT = ("Segoe UI", 11)
+    FONT_B = ("Segoe UI", 11, "bold")
+    FONT_S = ("Segoe UI", 9)
+    FONT_H = ("Segoe UI Semibold", 14)
+    FONT_M = ("Consolas", 10)
+
+    win = tk.Toplevel(overlay_tk)
+    win.title("Vokabular")
+    win.configure(bg=BG)
+    win.attributes("-topmost", True)
+    win.resizable(False, False)
+    W, H = 600, 620
+    win.update_idletasks()
+    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
+    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+    win.option_add("*Menu.background", BG3)
+    win.option_add("*Menu.foreground", FG)
+    win.option_add("*Menu.activeBackground", AMBER)
+    win.option_add("*Menu.activeForeground", BG)
+
+    # ── Header ──
+    hdr = tk.Frame(win, bg=BG2)
+    hdr.pack(fill="x")
+    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
+    tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H,
+             bg=BG2, fg=FG, pady=14).pack()
+    tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren",
+             font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10))
+
+    content = tk.Frame(win, bg=BG, padx=28, pady=12)
+    content.pack(fill="both", expand=True)
+
+    # ── Add-word form ─────────────────────────────────────────────────────────
+    is_correction = tk.BooleanVar(value=False)
+
+    form = tk.Frame(content, bg=BG3, padx=16, pady=14)
+    form.pack(fill="x", pady=(0, 16))
+
+    # Toggle row
+    tog_row = tk.Frame(form, bg=BG3)
+    tog_row.pack(fill="x", pady=(0, 10))
+    tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT,
+             bg=BG3, fg=FG).pack(side="left")
+
+    def toggle_form(*_):
+        if is_correction.get():
+            entry_from.pack(side="left", padx=(0, 6))
+            arrow_lbl.pack(side="left", padx=4)
+            entry_to.pack(side="left")
+            entry_word.pack_forget()
+        else:
+            entry_word.pack(side="left", fill="x", expand=True)
+            entry_from.pack_forget()
+            arrow_lbl.pack_forget()
+            entry_to.pack_forget()
+
+    tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form,
+                             bg=BG3, fg=FG2, activebackground=BG3,
+                             selectcolor=AMBER, relief="flat", bd=0,
+                             indicatoron=True)
+    tog_btn.pack(side="right")
+
+    # Input row
+    inp_row = tk.Frame(form, bg=BG3)
+    inp_row.pack(fill="x")
+
+    entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER,
+                       relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1)
+
+    entry_word = tk.Entry(inp_row, width=32, **entry_style)
+    entry_word.insert(0, "")
+    entry_from = tk.Entry(inp_row, width=14, **entry_style)
+    arrow_lbl = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER)
+    entry_to = tk.Entry(inp_row, width=14, **entry_style)
+    entry_word.pack(side="left", fill="x", expand=True)
+
+    def add_entry():
+        if is_correction.get():
+            frm = entry_from.get().strip()
+            to = entry_to.get().strip()
+            if frm and to:
+                vocab["replacements"].append({"from": frm, "to": to})
+                entry_from.delete(0, tk.END)
+                entry_to.delete(0, tk.END)
+        else:
+            w = entry_word.get().strip()
+            if w and w not in vocab["words"]:
+                vocab["words"].append(w)
+                entry_word.delete(0, tk.END)
+        save_vocab()
+        refresh_lists()
+
+    win.bind("<Return>", lambda _: add_entry())
+
+    add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry,
+                        bg=AMBER, fg=BG, font=FONT_B,
+                        relief="flat", padx=14, pady=5, cursor="hand2", bd=0)
+    add_btn.pack(side="right", padx=(10, 0))
+    add_btn.bind("<Enter>", lambda _: add_btn.config(bg=AMBER2))
+    add_btn.bind("<Leave>", lambda _: add_btn.config(bg=AMBER))
+
+    # ── Lists ─────────────────────────────────────────────────────────────────
+    lists_frame = tk.Frame(content, bg=BG)
+    lists_frame.pack(fill="both", expand=True)
+    lists_frame.columnconfigure(0, weight=1)
+    lists_frame.columnconfigure(1, weight=2)
+
+    def section_label(parent, text):
+        tk.Label(parent, text=text, font=("Consolas", 9, "bold"),
+                 bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6))
+
+    # Words column
+    col_w = tk.Frame(lists_frame, bg=BG)
+    col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12))
+    section_label(col_w, "WÖRTER")
+
+    words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG,
+                           selectbackground=AMBER, selectforeground=BG,
+                           relief="flat", bd=0, highlightthickness=0,
+                           activestyle="none", height=10)
+    words_box.pack(fill="both", expand=True)
+
+    def del_word():
+        sel = words_box.curselection()
+        if sel:
+            vocab["words"].pop(sel[0])
+            save_vocab()
+            refresh_lists()
+
+    tk.Button(col_w, text="− Entfernen", command=del_word,
+              bg=BG3, fg=RED, font=FONT_S, relief="flat",
+              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))
+
+    # Replacements column
+    col_r = tk.Frame(lists_frame, bg=BG)
+    col_r.grid(row=0, column=1, sticky="nsew")
+    section_label(col_r, "KORREKTUREN")
+
+    repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG,
+                          selectbackground=AMBER, selectforeground=BG,
+                          relief="flat", bd=0, highlightthickness=0,
+                          activestyle="none", height=10)
+    repl_box.pack(fill="both", expand=True)
+
+    def del_repl():
+        sel = repl_box.curselection()
+        if sel:
+            vocab["replacements"].pop(sel[0])
+            save_vocab()
+            refresh_lists()
+
+    tk.Button(col_r, text="− Entfernen", command=del_repl,
+              bg=BG3, fg=RED, font=FONT_S, relief="flat",
+              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))
+
+    def refresh_lists():
+        words_box.delete(0, tk.END)
+        for w in vocab.get("words", []):
+            words_box.insert(tk.END, f" {w}")
+        repl_box.delete(0, tk.END)
+        for r in vocab.get("replacements", []):
+            repl_box.insert(tk.END, f" {r['from']} → {r['to']}")
+
+    refresh_lists()
+
+    # ── Footer ──
+    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
+    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
+             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
+
+
+def register_hotkey():
+    """Hook the configured hotkey: press starts, release stops a recording."""
+    hotkey = config["hotkey"]
+    # Releasing the last key of the combination ends the recording.
+    last_key = hotkey.split("+")[-1].strip()
+    keyboard.add_hotkey(hotkey, start_recording, suppress=True)
+    keyboard.on_release_key(last_key, on_space_release)
+
+def reload_model_and_hotkey():
+    """Apply saved settings: reload model, reopen microphone, rebind hotkey."""
+    keyboard.unhook_all()
+    load_model()
+    try:
+        # Microphone and sample rate are settings too; without this a newly
+        # selected device would only take effect after an app restart.
+        restart_audio_stream()
+    except sd.PortAudioError as exc:
+        print(f"Could not open audio device: {exc}", flush=True)
+    try:
+        register_hotkey()
+    except ValueError as exc:
+        print(f"Invalid hotkey {config['hotkey']!r}: {exc}", flush=True)
+        return
+    print(f"Hotkey updated: {config['hotkey']}", flush=True)
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    """Load everything, start audio/hotkey/tray, and run the Tk main loop."""
+    global tray_icon, overlay_tk
+
+    load_config()
+    load_vocab()
+    load_model()
+
+    # Tkinter root (hidden) for overlay and settings
+    root = tk.Tk()
+    root.withdraw()
+    overlay_tk = root
+    create_overlay(root)
+
+    # Audio stream
+    restart_audio_stream()
+
+    # Hotkey
+    register_hotkey()
+
+    # Tray
+    menu = pystray.Menu(
+        pystray.MenuItem("Einstellungen", lambda: open_settings()),
+        pystray.MenuItem("Vokabular", lambda: open_vocab()),
+        pystray.Menu.SEPARATOR,
+        # Look the stream up at click time; a settings reload replaces it.
+        pystray.MenuItem("Beenden", lambda: quit_app(audio_stream)),
+    )
+    tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu)
+
+    threading.Thread(target=tray_icon.run, daemon=True).start()
+
+    print(f"Ready. Hotkey: {config['hotkey']}", flush=True)
+    root.mainloop()
+
+    if audio_stream is not None:
+        audio_stream.stop()
+
+def quit_app(stream):
+    """Stop audio and tray, then end the Tk main loop."""
+    if stream is not None:
+        stream.stop()
+    tray_icon.stop()
+    overlay_tk.after(0, overlay_tk.quit)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/install.bat b/install.bat
new file mode 100644
index 0000000..7af6d26
--- /dev/null
+++ b/install.bat
@@ -0,0 +1,17 @@
+@echo off
+cd /d "%~dp0"
+
+echo Creating Windows venv (.venv-windows)...
+py -3.13 -m venv .venv-windows
+
+set "VENV=%~dp0.venv-windows"
+echo Installing dependencies...
+"%VENV%\Scripts\pip" install --upgrade pip
+"%VENV%\Scripts\pip" install -r requirements.txt
+
+echo Installing CUDA 12 DLLs (required for GPU acceleration)...
+"%VENV%\Scripts\pip" install -r requirements-cuda.txt
+
+echo.
+echo Done. Run start.bat to launch.
+pause
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..55a1713
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+cd "$(dirname "$0")"
+
+echo "Creating Linux venv (.venv-linux)..."
+python3 -m venv .venv-linux
+
+echo "Installing dependencies..."
+.venv-linux/bin/pip install --upgrade pip
+.venv-linux/bin/pip install -r requirements.txt
+# No CUDA deps on Linux — runs on CPU
+
+echo "Done. Run ./start.sh to launch."
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
new file mode 100644
index 0000000..c0cfd24
--- /dev/null
+++ b/requirements-cuda.txt
@@ -0,0 +1,4 @@
+# Windows CUDA 12 DLLs required by ctranslate2 (faster-whisper backend)
+# Install after requirements.txt on Windows with NVIDIA GPU
+nvidia-cublas-cu12
+nvidia-cudnn-cu12
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3a767f6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+faster-whisper>=1.0.2
+sounddevice>=0.4.6
+numpy>=1.24
+keyboard>=0.13
+pystray>=0.19
+Pillow>=9.5
+pynput>=1.7.6
diff --git a/start.bat b/start.bat
new file mode 100644
index 0000000..4e9189a
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,14 @@
+@echo off
+cd /d "%~dp0"
+
+set "VENV=%~dp0.venv-windows"
+
+rem Machine-local config dir (device, compute_type, audio_device - not in git)
+if not exist "%LOCALAPPDATA%\WhisperDictation" mkdir "%LOCALAPPDATA%\WhisperDictation"
+set "WHISPER_LOCAL_DIR=%LOCALAPPDATA%\WhisperDictation"
+
+rem CUDA 12 DLLs required by ctranslate2
+set "PATH=%VENV%\Lib\site-packages\nvidia\cublas\bin;%VENV%\Lib\site-packages\nvidia\cudnn\bin;%PATH%"
+
+"%VENV%\Scripts\python.exe" -u "%~dp0dictate.py"
+pause
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000..2dd9742
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")"
+.venv-linux/bin/python -u dictate.py
diff --git a/vocabulary.json b/vocabulary.json
new file mode 100644
index 0000000..63d41be
--- /dev/null
+++ b/vocabulary.json
@@ -0,0 +1,18 @@
+{
+  "words": [],
+  "replacements": [
+    {"from": "KRA", "to": "KRAH"},
+    {"from": "Atos", "to": "ATHOS"},
+    {"from": "Resistec", "to": "RESISTEC"},
+    {"from": "Resistek", "to": "RESISTEC"},
+    {"from": "HES", "to": "HEES"},
+    {"from": "Ackerschot", "to": "Ackerschott"},
+    {"from": "Carrois", "to": "Kauer"},
+    {"from": "Jouer fixe", "to": "Jour-Fixe"},
+    {"from": "Docuware", "to": "DocuWare"},
+    {"from": "Nates", "to": "Nejc"},
+    {"from": "Bittzeit", "to": "BitSight"},
+    {"from": "Kalmikow", "to": "Kalmykov"},
+    {"from": "Leifert", "to": "Leifer"}
+  ]
+}