feat: initial whisper-dictation repo, migrated from OneDrive

This commit is contained in:
beo3000 2026-03-19 17:55:40 +01:00
commit e1a3eba05a
10 changed files with 781 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
.venv-windows/
.venv-linux/
.venv/
__pycache__/
*.pyc
*.pyo
config_local.json
models/
*.log
.DS_Store
Thumbs.db

6
config.json Normal file
View File

@ -0,0 +1,6 @@
{
"hotkey": "ctrl+shift+space",
"model": "medium",
"language": "de",
"sample_rate": 16000
}

688
dictate.py Normal file
View File

@ -0,0 +1,688 @@
"""
Whisper Dictation — local GPU speech-to-text with system tray and settings GUI.
Hold hotkey to record, release to transcribe and type into active window.
"""
import json
import os
import threading
import time
import tkinter as tk
from tkinter import ttk
import numpy as np
import sounddevice as sd
import keyboard
import pystray
from PIL import Image, ImageDraw
from pynput.keyboard import Controller as KeyboardController
from faster_whisper import WhisperModel
# Shared data dir: defaults to the script directory (the git checkout) and can
# be redirected with the WHISPER_DATA_DIR environment variable.
_script_dir = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir)
os.makedirs(DATA_DIR, exist_ok=True)

# Local config dir: machine-specific settings (audio device, device, compute_type).
# WHISPER_LOCAL_DIR wins when set and non-empty; otherwise
#   Windows: %LOCALAPPDATA%\WhisperDictation (script dir if LOCALAPPDATA is unset)
#   Linux:   ~/.local/share/WhisperDictation
_env_local = os.environ.get("WHISPER_LOCAL_DIR")
if os.name == "nt":
    _platform_base = os.environ.get("LOCALAPPDATA", _script_dir)
else:
    _platform_base = os.path.join(os.path.expanduser("~"), ".local", "share")
_local_dir = _env_local or os.path.join(_platform_base, "WhisperDictation")
os.makedirs(_local_dir, exist_ok=True)

CONFIG_FILE = os.path.join(DATA_DIR, "config.json")  # shared via git
CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json")  # machine-specific, not in git
VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json")  # shared via git
# Baseline settings; load_config() copies this dict and overlays the JSON files.
DEFAULT_CONFIG = {
    "hotkey": "ctrl+shift+space",
    "model": "medium",
    "device": "cuda",
    "compute_type": "float16",
    "language": "de",
    "audio_device": None,
    "sample_rate": 16000,
}

# Choices offered by the settings dialog. The dict keys are the labels shown in
# the dropdowns; the values are what gets stored in the config.
MODELS = [
    "tiny",
    "base",
    "small",
    "medium",
    "large-v2",
    "large-v3",
]
LANGUAGES = {
    "Deutsch": "de",
    "English": "en",
    "Français": "fr",
    "Español": "es",
    "Italiano": "it",
    "Auto": None,
}
DEVICES = ["cuda", "cpu"]
COMPUTE_TYPES = {
    "float16 (GPU)": "float16",
    "int8 (CPU/GPU)": "int8",
    "float32": "float32",
}
# ── State ─────────────────────────────────────────────────────────────────────
class AppState:
    """String constants naming the three phases of the dictation cycle."""

    IDLE, RECORDING, TRANSCRIBING = "idle", "recording", "transcribing"
# Module-level runtime state. The functions in this file rebind these names via
# ``global`` or mutate the objects in place.
config = {}  # effective settings, filled by load_config()
model = None  # WhisperModel instance, set by load_model()
typer = KeyboardController()  # types the transcribed text

state = AppState.IDLE  # current phase of the record/transcribe cycle
audio_chunks = []  # input blocks appended by audio_callback() while recording

tray_icon = None  # pystray icon, created in main()
overlay_tk = None  # hidden Tk root, created in main()
overlay_window = None  # recording overlay, created by create_overlay()
# ── Config ────────────────────────────────────────────────────────────────────
# Settings that save_config() writes to config_local.json instead of the
# shared config.json.
LOCAL_KEYS = {
    "audio_device",
    "device",
    "compute_type",
}
def load_config():
    """Rebuild the global ``config`` from defaults plus the two JSON files.

    Precedence, lowest to highest: DEFAULT_CONFIG, CONFIG_FILE (shared),
    CONFIG_LOCAL_FILE (machine-specific). Missing files are skipped.

    Raises:
        json.JSONDecodeError: if an existing file is not valid JSON. In that
            case the previous ``config`` is left untouched.
    """
    global config
    merged = dict(DEFAULT_CONFIG)
    for path in (CONFIG_FILE, CONFIG_LOCAL_FILE):
        if not os.path.exists(path):
            continue
        # Decode explicitly instead of relying on the platform default: the
        # shared file travels between Windows and Linux. "utf-8-sig" also
        # accepts the BOM that some Windows editors prepend.
        with open(path, encoding="utf-8-sig") as f:
            merged.update(json.load(f))
    config = merged
def save_config():
    """Write ``config`` to disk, split into a shared and a machine-local file.

    Keys listed in LOCAL_KEYS go to CONFIG_LOCAL_FILE; all others go to
    CONFIG_FILE. Both files are rewritten on every call.
    """
    shared, local = {}, {}
    for key, value in config.items():
        bucket = local if key in LOCAL_KEYS else shared
        bucket[key] = value
    for path, payload in ((CONFIG_FILE, shared), (CONFIG_LOCAL_FILE, local)):
        with open(path, "w") as f:
            json.dump(payload, f, indent=2)
# ── Vocabulary ────────────────────────────────────────────────────────────────
# "words" feed get_initial_prompt(); "replacements" is a list of
# {"from": ..., "to": ...} dicts consumed by apply_vocab().
vocab = {
    "words": [],
    "replacements": [],
}
def load_vocab():
    """Load the global ``vocab`` from VOCAB_FILE, or reset it if the file is absent.

    The result always has list-valued "words" and "replacements" entries, so
    callers can index and append without checking for missing keys.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    global vocab
    loaded = {}
    if os.path.exists(VOCAB_FILE):
        # The file is shared between machines and contains non-ASCII text, so
        # decode it explicitly rather than with the platform default encoding.
        # "utf-8-sig" also tolerates a BOM added by Windows editors.
        with open(VOCAB_FILE, encoding="utf-8-sig") as f:
            loaded = json.load(f)
    for key in ("words", "replacements"):
        if loaded.get(key) is None:
            loaded[key] = []
    vocab = loaded
def save_vocab():
    """Write the global ``vocab`` to VOCAB_FILE as human-readable UTF-8 JSON."""
    # ensure_ascii=False emits umlauts etc. verbatim, so the encoding must be
    # pinned. The platform default differs between Windows and Linux, and the
    # file is shared between both.
    with open(VOCAB_FILE, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
def apply_vocab(text: str) -> str:
    """Apply the vocabulary's replacement rules to ``text``, in list order.

    Each rule is a dict with "from" and "to". Matching is by plain substring,
    with two safeguards:

    * If "from" occurs inside "to" (e.g. ``KRA -> KRAH``), text that already
      contains "to" is left alone. A naive replace would turn a correct
      ``KRAH`` into ``KRAHH``.
    * Rules with a missing or empty "from" are skipped. ``str.replace("", x)``
      would insert ``x`` between every character.

    Args:
        text: the transcribed text.

    Returns:
        The text with all applicable replacements made.
    """
    import re  # local import: only needed here

    for rule in vocab.get("replacements") or []:
        source = rule.get("from") or ""
        if not source:
            continue
        target = rule.get("to") or ""
        if source in target:
            # Try the full target first at each position so existing correct
            # occurrences are consumed unchanged instead of being re-replaced.
            pattern = re.escape(target) + "|" + re.escape(source)
            # A function replacement avoids backslash interpretation in target.
            text = re.sub(pattern, lambda _match: target, text)
        else:
            text = text.replace(source, target)
    return text
def get_initial_prompt() -> str:
    """Return the vocabulary words joined by ", ", or "" when there are none."""
    hint_terms = vocab.get("words", [])
    if not hint_terms:
        return ""
    return ", ".join(hint_terms)
# ── Tray icon ─────────────────────────────────────────────────────────────────
def make_icon(color):
    """Return a 64x64 transparent RGBA image with a filled circle in ``color``."""
    size, margin = 64, 4
    icon = Image.new("RGBA", (size, size), (0, 0, 0, 0))
    ImageDraw.Draw(icon).ellipse(
        [margin, margin, size - margin, size - margin],
        fill=color,
    )
    return icon
# One pre-rendered tray icon per application state.
ICONS = {
    app_state: make_icon(rgb)
    for app_state, rgb in (
        (AppState.IDLE, (40, 200, 80)),  # green
        (AppState.RECORDING, (220, 50, 50)),  # red
        (AppState.TRANSCRIBING, (220, 180, 30)),  # yellow
    )
}
def set_state(new_state):
    """Switch the global ``state`` and update tray icon and overlay to match.

    The overlay is requested only for AppState.RECORDING; every other state
    hides it. The tray icon is updated only once it exists.
    """
    global state
    state = new_state
    if tray_icon:
        tray_icon.icon = ICONS[new_state]
    update_overlay = show_overlay if new_state == AppState.RECORDING else hide_overlay
    update_overlay()
# ── Overlay window ────────────────────────────────────────────────────────────
def show_overlay():
    """Schedule _show_overlay_main on the Tk root; no-op before the root exists."""
    if overlay_tk is not None:
        overlay_tk.after(0, _show_overlay_main)
def hide_overlay():
    """Schedule _hide_overlay_main on the Tk root; no-op before the root exists."""
    if overlay_tk is not None:
        overlay_tk.after(0, _hide_overlay_main)
def _show_overlay_main():
    """Un-hide the overlay, place it near the bottom-right corner and raise it."""
    width, height = 220, 54
    overlay_window.deiconify()
    # 240 px in from the right edge, 100 px up from the bottom edge.
    x = overlay_tk.winfo_screenwidth() - 240
    y = overlay_tk.winfo_screenheight() - 100
    overlay_window.geometry(f"{width}x{height}+{x}+{y}")
    overlay_window.lift()
def _hide_overlay_main():
    """Withdraw the overlay window; it stays alive for the next recording."""
    overlay_window.withdraw()
def create_overlay(root):
    """Build the hidden, borderless recording overlay as a child of ``root``.

    The window is stored in the global ``overlay_window`` and starts out
    withdrawn; _show_overlay_main / _hide_overlay_main toggle it later.
    """
    global overlay_window
    dark = "#1a1a1a"

    popup = tk.Toplevel(root)
    popup.withdraw()
    popup.overrideredirect(True)  # no title bar or window frame
    popup.attributes("-topmost", True)
    popup.attributes("-alpha", 0.92)
    popup.configure(bg=dark)

    body = tk.Frame(popup, bg=dark, padx=12, pady=10)
    body.pack(fill="both", expand=True)

    # Red dot to the left of the caption.
    indicator = tk.Canvas(body, width=14, height=14, bg=dark, highlightthickness=0)
    indicator.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
    indicator.pack(side="left", padx=(0, 8))

    caption = tk.Label(
        body,
        text="Aufnahme läuft …",
        fg="white",
        bg=dark,
        font=("Segoe UI", 11),
    )
    caption.pack(side="left")

    overlay_window = popup
# ── Audio ─────────────────────────────────────────────────────────────────────
def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: keep a copy of each block while recording.

    Blocks arriving in any other state are dropped. ``frames``, ``time_info``
    and ``status`` are accepted but not used.
    """
    if state != AppState.RECORDING:
        return
    audio_chunks.append(indata.copy())
def get_audio_stream():
    """Create (but do not start) a mono input stream from the current config.

    Uses ``config["sample_rate"]`` and the optional ``config["audio_device"]``;
    captured blocks are delivered to audio_callback.
    """
    stream_options = {
        "samplerate": config["sample_rate"],
        "channels": 1,
        "device": config.get("audio_device"),
        "callback": audio_callback,
    }
    return sd.InputStream(**stream_options)
# ── Recording & transcription ─────────────────────────────────────────────────
def start_recording():
    """Hotkey handler: start a fresh recording unless one is already running."""
    global audio_chunks
    if state != AppState.RECORDING:
        audio_chunks = []  # discard anything captured earlier
        set_state(AppState.RECORDING)
        print("Recording...", flush=True)
def stop_and_transcribe():
    """Finish the current recording, transcribe it and type the result.

    Does nothing unless the state is RECORDING. Recordings shorter than 0.3 s
    or with an RMS level below 0.0005 are skipped. The state is always reset
    to IDLE before returning, even if transcription raises. This function runs
    on a worker thread, so an unhandled exception would otherwise leave the
    app showing "transcribing" indefinitely.
    """
    if state != AppState.RECORDING:
        return
    set_state(AppState.TRANSCRIBING)

    text = ""
    try:
        chunks = list(audio_chunks)  # snapshot; the callback owns the live list
        if not chunks:
            return
        audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
        duration = len(audio) / config["sample_rate"]
        rms = float(np.sqrt(np.mean(audio ** 2)))
        print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
        if duration < 0.3 or rms < 0.0005:
            print("Too short or silent — skipped.", flush=True)
            return

        lang = config["language"] or None
        prompt = get_initial_prompt()
        segments, _ = model.transcribe(
            audio, language=lang, beam_size=5, vad_filter=True,
            initial_prompt=prompt if prompt else None,
        )
        # faster-whisper yields segments lazily, so decoding errors can surface
        # here as well, which is why the join sits inside the try block.
        # Segment texts usually carry their own leading space; strip each one
        # so joining with " " does not produce double spaces.
        pieces = (segment.text.strip() for segment in segments)
        text = " ".join(piece for piece in pieces if piece)
        text = apply_vocab(text)
        print(f"Result: {repr(text)}", flush=True)
    except Exception as exc:  # thread boundary: report, then recover
        print(f"Transcription failed: {exc!r}", flush=True)
        text = ""
    finally:
        set_state(AppState.IDLE)

    if text:
        time.sleep(0.15)  # short pause before typing into the active window
        typer.type(text)
def on_space_release(e):
    """Key-release hook: hand a running recording to a background worker.

    ``e`` is the event passed by the keyboard hook; it is not inspected.
    """
    if state != AppState.RECORDING:
        return
    worker = threading.Thread(target=stop_and_transcribe, daemon=True)
    worker.start()
# ── Model loading ─────────────────────────────────────────────────────────────
def load_model():
    """Load the configured Whisper model into the global ``model``.

    Tries ``config["device"]`` / ``config["compute_type"]`` first. If the
    backend rejects that combination (for example the default cuda/float16 on
    a machine without a usable GPU), it falls back to cpu/int8 so the app
    still starts. The saved configuration is not modified by the fallback.

    Raises:
        RuntimeError, ValueError: if the cpu/int8 load fails as well.
        Other exceptions from model creation propagate unchanged.
    """
    global model
    name = config["model"]
    device = config["device"]
    compute_type = config["compute_type"]
    print(f"Loading {name} on {device}...", flush=True)
    try:
        loaded = WhisperModel(name, device=device, compute_type=compute_type)
    except (RuntimeError, ValueError) as exc:
        if (device, compute_type) == ("cpu", "int8"):
            raise  # already the most conservative setting; nothing to fall back to
        print(
            f"Could not load on {device}/{compute_type} ({exc}); "
            "falling back to cpu/int8.",
            flush=True,
        )
        loaded = WhisperModel(name, device="cpu", compute_type="int8")
    # Assign only after success so a failed reload keeps the previous model.
    model = loaded
    print("Model ready.", flush=True)
# ── Settings window ───────────────────────────────────────────────────────────
def open_settings():
    """Schedule the settings dialog on the Tk root; no-op before the root exists."""
    if overlay_tk is not None:
        overlay_tk.after(0, _open_settings_main)
def _open_settings_main():
    """Build and show the settings dialog as a child of the hidden Tk root.

    The dialog edits microphone, model, language, device, compute type and
    hotkey. "Speichern & Neuladen" writes the values into ``config``, persists
    them with save_config(), closes the dialog and starts
    reload_model_and_hotkey() on a background thread. "Abbrechen" closes the
    dialog without changes.
    """
    # ── Palette: "Precision Audio" ──
    BG = "#18181f"  # deep void
    BG2 = "#22222c"  # panel
    BG3 = "#2c2c38"  # elevated
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"
    FONT = ("Consolas", 11)
    FONT_UI = ("Segoe UI", 11)
    FONT_B = ("Segoe UI", 11, "bold")
    FONT_S = ("Segoe UI", 9)
    FONT_H = ("Segoe UI Semibold", 16)

    win = tk.Toplevel(overlay_tk)
    win.title("Whisper Dictation")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)

    # Center on screen
    W, H = 680, 660
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")

    # Global options for the OptionMenu dropdowns (dark listbox)
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)
    win.option_add("*Menu.font", FONT_UI)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2, pady=20)
    hdr.pack(fill="x")
    # Amber accent bar
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H,
             bg=BG2, fg=FG, pady=12).pack()
    tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
             font=FONT_S, bg=BG2, fg=FG2).pack()

    # ── Content area ──
    # NOTE: the content frame lives inside a canvas, but no scrollbar or
    # scrollregion is configured, so the area does not actually scroll.
    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
    canvas.pack(fill="both", expand=True)
    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
    canvas.create_window((0, 0), window=content, anchor="nw")

    def section(label):
        """Add a section heading followed by a horizontal rule."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=(18, 6))
        tk.Label(f, text=label, font=("Consolas", 9, "bold"),
                 bg=BG, fg=AMBER).pack(side="left")
        tk.Frame(f, bg=BORDER, height=1).pack(
            side="left", fill="x", expand=True, padx=(10, 0), pady=6)

    def dd(frame, var, values, width=14):
        """Create a dark-themed OptionMenu with ``frame`` as its parent."""
        m = tk.OptionMenu(frame, var, *values)
        m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG,
                 highlightbackground=BORDER, highlightthickness=1,
                 relief="flat", font=FONT_UI, anchor="w", bd=0, width=width)
        m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER,
                         activeforeground=BG, relief="flat", bd=0)
        return m

    def row(label, hint=None):
        """Add a labelled row and return its frame; callers add the controls."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=5)
        tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI,
                 bg=BG, fg=FG2).pack(side="left")
        if hint:
            tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right")
        return f

    # ── AUDIO ──
    section("AUDIO")
    # Only devices that can record; entries are shown as "<index>: <name>".
    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
               if d["max_input_channels"] > 0]
    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
    dev_var = tk.StringVar()
    cur_dev = config.get("audio_device")
    dev_var.set("Standard" if cur_dev is None else
                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
    f = row("Mikrofon")
    dd(f, dev_var, dev_names, width=44).pack(side="left")

    # ── MODEL ──
    section("MODELL")
    model_hints = {
        "tiny": "~1 GB VRAM · sehr schnell",
        "base": "~1 GB VRAM",
        "small": "~2 GB VRAM",
        "medium": "~5 GB VRAM · empfohlen ✓",
        "large-v2": "~10 GB VRAM",
        "large-v3": "~10 GB VRAM · bestes Ergebnis",
    }
    model_var = tk.StringVar(value=config["model"])
    f_model = row("Modell")
    dd(f_model, model_var, MODELS, 14).pack(side="left")
    hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""),
                        font=FONT_S, bg=BG, fg=FG2)
    hint_lbl.pack(side="left", padx=(14, 0))
    # Keep the hint text in sync with the selected model.
    model_var.trace_add(
        "write",
        lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), "")))

    # Map the stored language code back to its display label.
    lang_display = {v: k for k, v in LANGUAGES.items()}
    lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch"))
    f = row("Sprache")
    dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left")

    # ── PERFORMANCE ──
    section("LEISTUNG")
    device_var = tk.StringVar(value=config["device"])
    f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen")
    dd(f, device_var, DEVICES, 8).pack(side="left")
    ct_display = {v: k for k, v in COMPUTE_TYPES.items()}
    ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)"))
    f = row("Compute Type")
    dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left")

    # ── CONTROLS ──
    section("STEUERUNG")
    hotkey_var = tk.StringVar(value=config["hotkey"])
    f_hk = row("Hotkey", hint="z.B. ctrl+shift+space")
    tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24,
             bg=BG3, fg=FG, insertbackground=AMBER,
             relief="flat", bd=6,
             highlightbackground=BORDER, highlightthickness=1).pack(side="left")

    # ── Buttons ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32)
    btn_bar.pack(fill="x")

    def save():
        """Copy the dialog values into ``config``, persist them and reload."""
        sel = dev_var.get()
        config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
        config["model"] = model_var.get()
        config["language"] = LANGUAGES[lang_var.get()]
        config["device"] = device_var.get()
        config["compute_type"] = COMPUTE_TYPES[ct_var.get()]
        # Keep the previous hotkey when the field is blank: the reload step
        # unhooks everything and then registers config["hotkey"], so an empty
        # value would leave the app without a working hotkey.
        new_hotkey = hotkey_var.get().strip()
        if new_hotkey:
            config["hotkey"] = new_hotkey
        save_config()
        win.destroy()
        # Model loading is slow; keep it off the Tk thread.
        threading.Thread(target=reload_model_and_hotkey, daemon=True).start()

    def btn_hover(b, c_in, c_out):
        """Swap button ``b``'s background on mouse enter/leave."""
        b.bind("<Enter>", lambda _: b.config(bg=c_in))
        b.bind("<Leave>", lambda _: b.config(bg=c_out))

    save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save,
                         bg=AMBER, fg=BG, font=FONT_B,
                         relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
    save_btn.pack(side="right")
    btn_hover(save_btn, AMBER2, AMBER)

    cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy,
                           bg=BG3, fg=FG2, font=FONT_UI,
                           relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
    cancel_btn.pack(side="right", padx=(0, 10))
    btn_hover(cancel_btn, BORDER, BG3)
def open_vocab():
    """Schedule the vocabulary dialog on the Tk root; no-op before the root exists."""
    if overlay_tk is not None:
        overlay_tk.after(0, _open_vocab_main)
def _open_vocab_main():
    """Build and show the vocabulary dialog as a child of the hidden Tk root.

    The dialog has an entry form and two lists. With the checkbox off, the
    form adds a word to ``vocab["words"]``; with it on, it adds a
    ``{"from", "to"}`` correction to ``vocab["replacements"]``. Every add or
    delete is persisted immediately with save_vocab().
    """
    BG = "#18181f"
    BG2 = "#22222c"
    BG3 = "#2c2c38"
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"
    RED = "#f87171"
    FONT = ("Segoe UI", 11)
    FONT_B = ("Segoe UI", 11, "bold")
    FONT_S = ("Segoe UI", 9)
    FONT_H = ("Segoe UI Semibold", 14)
    FONT_M = ("Consolas", 10)

    win = tk.Toplevel(overlay_tk)
    win.title("Vokabular")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)

    # Center on screen
    W, H = 600, 620
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")

    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2)
    hdr.pack(fill="x")
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H,
             bg=BG2, fg=FG, pady=14).pack()
    tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren",
             font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10))

    content = tk.Frame(win, bg=BG, padx=28, pady=12)
    content.pack(fill="both", expand=True)

    # ── Add-entry form ──
    is_correction = tk.BooleanVar(value=False)
    form = tk.Frame(content, bg=BG3, padx=16, pady=14)
    form.pack(fill="x", pady=(0, 16))

    # Toggle row
    tog_row = tk.Frame(form, bg=BG3)
    tog_row.pack(fill="x", pady=(0, 10))
    tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT,
             bg=BG3, fg=FG).pack(side="left")

    def toggle_form(*_):
        """Show either the single word entry or the from/to entry pair."""
        if is_correction.get():
            entry_from.pack(side="left", padx=(0, 6))
            arrow_lbl.pack(side="left", padx=4)
            entry_to.pack(side="left")
            entry_word.pack_forget()
        else:
            entry_word.pack(side="left", fill="x", expand=True)
            entry_from.pack_forget()
            arrow_lbl.pack_forget()
            entry_to.pack_forget()

    tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form,
                             bg=BG3, fg=FG2, activebackground=BG3,
                             selectcolor=AMBER, relief="flat", bd=0,
                             indicatoron=True)
    tog_btn.pack(side="right")

    # Input row
    inp_row = tk.Frame(form, bg=BG3)
    inp_row.pack(fill="x")
    entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER,
                       relief="flat", bd=6, highlightbackground=BORDER,
                       highlightthickness=1)
    entry_word = tk.Entry(inp_row, width=32, **entry_style)
    entry_from = tk.Entry(inp_row, width=14, **entry_style)
    # The label between the two correction fields had an empty caption; show
    # the same arrow used in the toggle caption above.
    arrow_lbl = tk.Label(inp_row, text="→", font=("Segoe UI", 14), bg=BG3, fg=AMBER)
    entry_to = tk.Entry(inp_row, width=14, **entry_style)
    entry_word.pack(side="left", fill="x", expand=True)

    def add_entry():
        """Add the form contents to the vocabulary, then save and refresh."""
        if is_correction.get():
            frm = entry_from.get().strip()
            to = entry_to.get().strip()
            if frm and to:
                # setdefault: tolerate a vocabulary that lacks this key.
                vocab.setdefault("replacements", []).append({"from": frm, "to": to})
                entry_from.delete(0, tk.END)
                entry_to.delete(0, tk.END)
        else:
            w = entry_word.get().strip()
            words = vocab.setdefault("words", [])
            if w and w not in words:
                words.append(w)
            entry_word.delete(0, tk.END)
        save_vocab()
        refresh_lists()

    win.bind("<Return>", lambda _: add_entry())

    add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry,
                        bg=AMBER, fg=BG, font=FONT_B,
                        relief="flat", padx=14, pady=5, cursor="hand2", bd=0)
    add_btn.pack(side="right", padx=(10, 0))
    add_btn.bind("<Enter>", lambda _: add_btn.config(bg=AMBER2))
    add_btn.bind("<Leave>", lambda _: add_btn.config(bg=AMBER))

    # ── Lists ──
    lists_frame = tk.Frame(content, bg=BG)
    lists_frame.pack(fill="both", expand=True)
    lists_frame.columnconfigure(0, weight=1)
    lists_frame.columnconfigure(1, weight=2)

    def section_label(parent, text):
        """Add a small amber column heading to ``parent``."""
        tk.Label(parent, text=text, font=("Consolas", 9, "bold"),
                 bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6))

    # Words column
    col_w = tk.Frame(lists_frame, bg=BG)
    col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12))
    section_label(col_w, "WÖRTER")
    words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG,
                           selectbackground=AMBER, selectforeground=BG,
                           relief="flat", bd=0, highlightthickness=0,
                           activestyle="none", height=10)
    words_box.pack(fill="both", expand=True)

    def del_word():
        """Remove the selected word; list rows map 1:1 to vocab["words"]."""
        sel = words_box.curselection()
        if sel:
            vocab["words"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_w, text=" Entfernen", command=del_word,
              bg=BG3, fg=RED, font=FONT_S, relief="flat",
              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))

    # Replacements column
    col_r = tk.Frame(lists_frame, bg=BG)
    col_r.grid(row=0, column=1, sticky="nsew")
    section_label(col_r, "KORREKTUREN")
    repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG,
                          selectbackground=AMBER, selectforeground=BG,
                          relief="flat", bd=0, highlightthickness=0,
                          activestyle="none", height=10)
    repl_box.pack(fill="both", expand=True)

    def del_repl():
        """Remove the selected correction; rows map 1:1 to vocab["replacements"]."""
        sel = repl_box.curselection()
        if sel:
            vocab["replacements"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_r, text=" Entfernen", command=del_repl,
              bg=BG3, fg=RED, font=FONT_S, relief="flat",
              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))

    def refresh_lists():
        """Re-fill both list boxes from the current ``vocab``."""
        words_box.delete(0, tk.END)
        for w in vocab.get("words", []):
            words_box.insert(tk.END, f" {w}")
        repl_box.delete(0, tk.END)
        for r in vocab.get("replacements", []):
            # Separate source and target; previously they ran together.
            repl_box.insert(tk.END, f" {r['from']} → {r['to']}")

    refresh_lists()

    # ── Footer ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
def reload_model_and_hotkey():
    """Reload the Whisper model and re-register the hotkey from ``config``.

    All keyboard hooks are removed while the model loads. They are registered
    again in a ``finally`` block, so a failed model load (which propagates to
    the caller) does not leave the app without a hotkey.

    NOTE: the audio input stream is not touched here, so a changed microphone
    or sample rate is not applied by this function.
    """
    keyboard.unhook_all()
    try:
        load_model()
    finally:
        hotkey = config["hotkey"]
        keyboard.add_hotkey(hotkey, start_recording, suppress=True)
        # Releasing the last key of the combination ends the recording.
        keyboard.on_release_key(hotkey.split("+")[-1], on_space_release)
        print(f"Hotkey updated: {hotkey}", flush=True)
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Start the application and block until the Tk main loop exits.

    Order: load config, vocabulary and model; create the hidden Tk root and
    the overlay; start the audio stream; register the hotkey; start the tray
    icon on a daemon thread; run the Tk main loop; stop the audio stream.
    """
    global tray_icon, overlay_tk

    for init_step in (load_config, load_vocab, load_model):
        init_step()

    # Tkinter root (hidden) for overlay and dialogs
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    overlay_tk = hidden_root
    create_overlay(hidden_root)

    # Audio stream
    stream = get_audio_stream()
    stream.start()

    # Hotkey: pressing the combination starts recording, releasing its last
    # key stops it.
    hotkey = config["hotkey"]
    release_key = hotkey.rpartition("+")[2]
    keyboard.add_hotkey(hotkey, start_recording, suppress=True)
    keyboard.on_release_key(release_key, on_space_release)

    # Tray
    menu_items = [
        pystray.MenuItem("Einstellungen", lambda: open_settings()),
        pystray.MenuItem("Vokabular", lambda: open_vocab()),
        pystray.Menu.SEPARATOR,
        pystray.MenuItem("Beenden", lambda: quit_app(stream)),
    ]
    tray_icon = pystray.Icon(
        "whisper",
        ICONS[AppState.IDLE],
        "Whisper Dictation",
        pystray.Menu(*menu_items),
    )
    tray_thread = threading.Thread(target=tray_icon.run, daemon=True)
    tray_thread.start()

    print(f"Ready. Hotkey: {hotkey}", flush=True)
    hidden_root.mainloop()
    stream.stop()
def quit_app(stream):
    """Tray "Beenden" handler: stop audio and tray, then end the Tk main loop.

    Args:
        stream: the running audio input stream created in main().
    """
    stream.stop()
    tray_icon.stop()
    # Ask the Tk root to quit via its own event queue.
    overlay_tk.after(0, overlay_tk.quit)
# Start the application only when this file is run as a script.
if __name__ == "__main__":
    main()

17
install.bat Normal file
View File

@ -0,0 +1,17 @@
@echo off
rem Create the Windows virtual environment and install all dependencies.
rem Stops at the first failing step instead of reporting "Done" regardless.
cd /d "%~dp0"
set "VENV=%~dp0.venv-windows"

echo Creating Windows venv (.venv-windows)...
py -3.13 -m venv "%VENV%"
if errorlevel 1 goto :fail

echo Installing dependencies...
rem Upgrade pip through the interpreter: pip.exe cannot replace itself while running.
"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
if errorlevel 1 goto :fail
"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
if errorlevel 1 goto :fail

echo Installing CUDA 12 DLLs (required for GPU acceleration)...
"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
if errorlevel 1 goto :fail

echo.
echo Done. Run start.bat to launch.
pause
exit /b 0

:fail
echo.
echo Installation failed - see the messages above.
pause
exit /b 1

13
install.sh Normal file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Create the Linux virtual environment and install the Python dependencies.
set -e
cd "$(dirname "$0")"

venv_dir=".venv-linux"
pip_bin="$venv_dir/bin/pip"

echo "Creating Linux venv ($venv_dir)..."
python3 -m venv "$venv_dir"

echo "Installing dependencies..."
"$pip_bin" install --upgrade pip
"$pip_bin" install -r requirements.txt

# No CUDA deps on Linux — runs on CPU
echo "Done. Run ./start.sh to launch."

4
requirements-cuda.txt Normal file
View File

@ -0,0 +1,4 @@
# Windows CUDA 12 DLLs required by ctranslate2 (faster-whisper backend)
# Install after requirements.txt on Windows with NVIDIA GPU
nvidia-cublas-cu12
nvidia-cudnn-cu12

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
faster-whisper>=1.0.2
sounddevice>=0.4.6
numpy>=1.24
keyboard>=0.13
pystray>=0.19
Pillow>=9.5
pynput>=1.7.6

14
start.bat Normal file
View File

@ -0,0 +1,14 @@
@echo off
rem Launch Whisper Dictation from the Windows virtual environment.
cd /d "%~dp0"
set "VENV=%~dp0.venv-windows"

rem Give a clear hint instead of a cryptic path error when setup was skipped.
if not exist "%VENV%\Scripts\python.exe" (
    echo Virtual environment not found. Run install.bat first.
    pause
    exit /b 1
)

rem Machine-local config dir (device, compute_type, audio_device - not in git)
if not exist "%LOCALAPPDATA%\WhisperDictation" mkdir "%LOCALAPPDATA%\WhisperDictation"
set "WHISPER_LOCAL_DIR=%LOCALAPPDATA%\WhisperDictation"

rem CUDA 12 DLLs required by ctranslate2
set "PATH=%VENV%\Lib\site-packages\nvidia\cublas\bin;%VENV%\Lib\site-packages\nvidia\cudnn\bin;%PATH%"

"%VENV%\Scripts\python.exe" -u "%~dp0dictate.py"
pause

3
start.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Launch Whisper Dictation from the Linux virtual environment.
# Abort if the script directory cannot be entered; the relative paths below
# would otherwise resolve against the caller's working directory.
cd "$(dirname "$0")" || exit 1
# exec: replace the shell so signals and the exit status go straight to Python.
exec .venv-linux/bin/python -u dictate.py

18
vocabulary.json Normal file
View File

@ -0,0 +1,18 @@
{
"words": [],
"replacements": [
{"from": "KRA", "to": "KRAH"},
{"from": "Atos", "to": "ATHOS"},
{"from": "Resistec", "to": "RESISTEC"},
{"from": "Resistek", "to": "RESISTEC"},
{"from": "HES", "to": "HEES"},
{"from": "Ackerschot", "to": "Ackerschott"},
{"from": "Carrois", "to": "Kauer"},
{"from": "Jouer fixe", "to": "Jour-Fixe"},
{"from": "Docuware", "to": "DocuWare"},
{"from": "Nates", "to": "Nejc"},
{"from": "Bittzeit", "to": "BitSight"},
{"from": "Kalmikow", "to": "Kalmykov"},
{"from": "Leifert", "to": "Leifer"}
]
}