808 lines
28 KiB
Python
808 lines
28 KiB
Python
"""
|
||
Whisper Dictation — local GPU speech-to-text with system tray and settings GUI.

Hold the hotkey to record; release it to transcribe and type into the active window.
|
||
"""
|
||
import json
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import threading
|
||
import time
|
||
import tkinter as tk
|
||
from tkinter import ttk
|
||
|
||
import numpy as np
|
||
import sounddevice as sd
|
||
import pystray
|
||
from PIL import Image, ImageDraw
|
||
from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
|
||
from faster_whisper import WhisperModel
|
||
|
||
# Shared data directory. Defaults to the directory containing this script
# (the git repo root, synced via git pull); WHISPER_DATA_DIR overrides it.
_script_dir = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.environ.get("WHISPER_DATA_DIR", _script_dir)
os.makedirs(DATA_DIR, exist_ok=True)

# Local config directory for machine-specific settings (audio device, device,
# compute_type). WHISPER_LOCAL_DIR overrides the per-platform default:
#   Windows: %LOCALAPPDATA%/WhisperDictation (script dir if LOCALAPPDATA is unset)
#   other:   ~/.local/share/WhisperDictation
_env_local = os.environ.get("WHISPER_LOCAL_DIR")
_local_dir = _env_local or os.path.join(
    os.environ.get("LOCALAPPDATA", _script_dir)
    if os.name == "nt"
    else os.path.join(os.path.expanduser("~"), ".local", "share"),
    "WhisperDictation",
)
os.makedirs(_local_dir, exist_ok=True)

# Files kept in the shared directory travel with the repo; the local file does not.
CONFIG_FILE = os.path.join(DATA_DIR, "config.json")  # shared via git
CONFIG_LOCAL_FILE = os.path.join(_local_dir, "config_local.json")  # machine-specific, not in git
VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json")  # shared via git
|
||
|
||
# Baseline settings; values loaded from the config files override these.
DEFAULT_CONFIG = dict(
    hotkey="ctrl+shift+space",
    model="medium",
    device="cuda",
    compute_type="float16",
    language="de",
    audio_device=None,  # None selects the default input device
    sample_rate=16000,
)

# Choices offered by the settings window.
MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"]

# Display name -> language code passed to the model (None = auto-detect).
LANGUAGES = {
    "Deutsch": "de",
    "English": "en",
    "Français": "fr",
    "Español": "es",
    "Italiano": "it",
    "Auto": None,
}

DEVICES = ["cuda", "cpu"]

# Display name -> compute_type value passed to the model constructor.
COMPUTE_TYPES = {
    "float16 (GPU)": "float16",
    "int8 (CPU/GPU)": "int8",
    "float32": "float32",
}
|
||
|
||
|
||
# ── State ─────────────────────────────────────────────────────────────────────
|
||
|
||
class AppState:
    """String constants naming the three phases of the dictation cycle."""

    IDLE, RECORDING, TRANSCRIBING = "idle", "recording", "transcribing"
|
||
|
||
# Mutable module-level state shared by the hotkey, audio, tray and Tk code.
state = AppState.IDLE  # current phase, one of the AppState constants
audio_chunks = []      # audio blocks collected while recording
model = None           # WhisperModel instance, assigned by load_model()
config = {}            # effective settings, filled by load_config()

tray_icon = None        # pystray icon, created in main()
overlay_window = None   # "recording" popup, created by create_overlay()
overlay_tk = None       # hidden Tk root used to schedule UI work
hotkey_listener = None  # active HotkeyListener
|
||
|
||
|
||
def type_text(text):
    """Type *text* into the active window, cross-platform.

    Strategy:
      * Windows: pynput keyboard controller.
      * Wayland session with both ``wl-copy`` and ``xdotool`` installed:
        copy to the clipboard, then send Ctrl+V.
      * ``xdotool`` installed: ``xdotool type``.
      * otherwise: pynput keyboard controller.

    Empty text is a no-op. Helper processes are run best-effort
    (``check=False``), so a non-zero exit status is ignored.
    """
    if not text:
        return  # nothing to type; do not spawn helper processes

    if os.name == "nt":
        KeyboardController().type(text)
        return

    session = os.environ.get("XDG_SESSION_TYPE", "")
    has_xdotool = shutil.which("xdotool") is not None

    # The paste path needs BOTH tools: subprocess.run raises FileNotFoundError
    # for a missing executable even with check=False.
    if session == "wayland" and has_xdotool and shutil.which("wl-copy"):
        subprocess.run(["wl-copy", "--", text], check=False)
        time.sleep(0.05)  # brief pause between the clipboard copy and the paste keystroke
        subprocess.run(["xdotool", "key", "ctrl+v"], check=False)
    elif has_xdotool:
        subprocess.run(["xdotool", "type", "--clearmodifiers", "--", text], check=False)
    else:
        KeyboardController().type(text)
|
||
|
||
|
||
# ── Hotkey via pynput ────────────────────────────────────────────────────────
|
||
|
||
# Hotkey token -> set of pynput keys that satisfy it. The generic names accept
# either the left or the right physical key.
_MODIFIER_MAP = {
    "ctrl": {Key.ctrl_l, Key.ctrl_r},
    "ctrl_l": {Key.ctrl_l},
    "ctrl_r": {Key.ctrl_r},
    "shift": {Key.shift_l, Key.shift_r},
    "shift_l": {Key.shift_l},
    "shift_r": {Key.shift_r},
    "alt": {Key.alt_l, Key.alt_r},
    "alt_l": {Key.alt_l},
    "alt_r": {Key.alt_r},
}

# Hotkey token -> pynput special key. f1..f12 are generated inside the literal
# so no loop variable is left behind in the module namespace.
_KEY_MAP = {
    "space": Key.space,
    "tab": Key.tab,
    "enter": Key.enter,
    "esc": Key.esc,
    "escape": Key.esc,
    "up": Key.up,
    "down": Key.down,
    "left": Key.left,
    "right": Key.right,
    "home": Key.home,
    "end": Key.end,
    "page_up": Key.page_up,
    "page_down": Key.page_down,
    "insert": Key.insert,
    "delete": Key.delete,
    "backspace": Key.backspace,
    **{f"f{n}": getattr(Key, f"f{n}") for n in range(1, 13)},
}
|
||
|
||
|
||
def _parse_hotkey(hotkey_str):
    """Parse a hotkey string such as ``"ctrl+shift+space"``.

    The string is split on ``+``; every part is stripped and lower-cased.
    All parts except the last are modifiers, the last part is the trigger.

    Returns:
        (modifiers, trigger) where ``modifiers`` is a list of sets of pynput
        keys (any key of a set satisfies that modifier) and ``trigger`` is a
        single pynput key.

    Names found in neither ``_MODIFIER_MAP`` nor ``_KEY_MAP`` are passed to
    ``KeyCode.from_char`` unchanged.
    """
    *modifier_parts, trigger_part = [p.strip().lower() for p in hotkey_str.split("+")]

    modifiers = []
    for name in modifier_parts:
        if name in _MODIFIER_MAP:
            modifiers.append(_MODIFIER_MAP[name])
        elif name in _KEY_MAP:
            modifiers.append({_KEY_MAP[name]})
        else:
            modifiers.append({KeyCode.from_char(name)})

    if trigger_part in _KEY_MAP:
        trigger = _KEY_MAP[trigger_part]
    elif trigger_part in _MODIFIER_MAP:
        # The trigger must be ONE key. Choose by name so the result does not
        # depend on set iteration order, which is not guaranteed to be stable;
        # for "ctrl"/"shift"/"alt" this selects the "_l" variant.
        trigger = min(_MODIFIER_MAP[trigger_part], key=lambda k: k.name)
    else:
        trigger = KeyCode.from_char(trigger_part)
    return modifiers, trigger
|
||
|
||
|
||
class HotkeyListener:
    """Hold-to-record hotkey using pynput. No root required on X11.

    ``on_press`` is called once when the trigger key goes down while every
    modifier is held; ``on_release`` is called when the trigger key goes up
    again. Key auto-repeat does not fire ``on_press`` a second time.
    """

    def __init__(self, hotkey_str, on_press, on_release):
        """Parse *hotkey_str* and start the daemon keyboard listener."""
        self._modifiers, self._trigger = _parse_hotkey(hotkey_str)
        self._on_press = on_press
        self._on_release = on_release
        self._pressed = set()   # keys currently held down
        self._active = False    # True between on_press and on_release
        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
        self._listener.daemon = True
        self._listener.start()

    def _matches_trigger(self, key):
        """Return whether *key* is the configured trigger key."""
        return key == self._trigger

    def _modifiers_held(self):
        """Return True when each modifier set has at least one key held."""
        for mod_set in self._modifiers:
            if self._pressed.isdisjoint(mod_set):
                return False
        return True

    def _key_down(self, key):
        """Record *key* as held and fire ``on_press`` when the hotkey completes."""
        self._pressed.add(key)
        if self._active:
            return
        if not self._matches_trigger(key):
            return
        if not self._modifiers_held():
            return
        self._active = True
        self._on_press()

    def _key_up(self, key):
        """Forget *key* and fire ``on_release`` if it ends an active hotkey."""
        self._pressed.discard(key)
        if not (self._active and self._matches_trigger(key)):
            return
        self._active = False
        self._on_release()

    def stop(self):
        """Stop the underlying keyboard listener."""
        self._listener.stop()
|
||
|
||
|
||
# ── Config ────────────────────────────────────────────────────────────────────
|
||
|
||
LOCAL_KEYS = {"audio_device", "device", "compute_type"}  # keys stored only in config_local.json


def load_config():
    """Rebuild the global ``config`` from defaults plus the config files.

    Layers are applied in order: ``DEFAULT_CONFIG``, then ``CONFIG_FILE``,
    then ``CONFIG_LOCAL_FILE``; later layers win. A missing file is skipped
    silently. A file that cannot be read, is not valid JSON, or does not
    contain a JSON object is reported on stdout and skipped, so a damaged
    config cannot prevent startup.
    """
    global config
    merged = dict(DEFAULT_CONFIG)
    for path in (CONFIG_FILE, CONFIG_LOCAL_FILE):
        try:
            with open(path, encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            continue  # optional layer
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Ignoring unreadable config {path}: {exc}", flush=True)
            continue
        if not isinstance(loaded, dict):
            print(f"Ignoring config {path}: expected a JSON object", flush=True)
            continue
        merged.update(loaded)
    config = merged
|
||
|
||
def save_config():
    """Write ``config`` to disk, split into a shared and a local file.

    Keys listed in ``LOCAL_KEYS`` go to ``CONFIG_LOCAL_FILE``; every other
    key goes to ``CONFIG_FILE``. Both files are rewritten on each call.
    """
    shared, local = {}, {}
    for key, value in config.items():
        bucket = local if key in LOCAL_KEYS else shared
        bucket[key] = value

    for path, payload in ((CONFIG_FILE, shared), (CONFIG_LOCAL_FILE, local)):
        with open(path, "w") as f:
            json.dump(payload, f, indent=2)
|
||
|
||
|
||
# ── Vocabulary ────────────────────────────────────────────────────────────────
|
||
|
||
# "words": list of strings; "replacements": list of {"from": ..., "to": ...}
vocab = {"words": [], "replacements": []}


def load_vocab():
    """Load the global ``vocab`` from ``VOCAB_FILE``.

    The file is read as UTF-8, matching ``save_vocab``. If it is not valid
    UTF-8 (for example a file written earlier with a different platform
    default encoding), it is re-read with the platform default encoding.
    A missing file, or a file whose top level is not a JSON object, yields
    an empty vocabulary. The ``"words"`` and ``"replacements"`` keys are
    always present afterwards.
    """
    global vocab
    loaded = {}
    try:
        with open(VOCAB_FILE, encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        pass  # no vocabulary saved yet
    except UnicodeDecodeError:
        with open(VOCAB_FILE) as f:
            loaded = json.load(f)

    if not isinstance(loaded, dict):
        loaded = {}
    loaded.setdefault("words", [])
    loaded.setdefault("replacements", [])
    vocab = loaded


def save_vocab():
    """Write ``vocab`` to ``VOCAB_FILE`` as UTF-8 JSON.

    ``ensure_ascii=False`` keeps non-ASCII characters readable in the file,
    so the encoding is fixed explicitly instead of following the platform
    default.
    """
    with open(VOCAB_FILE, "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
|
||
|
||
def apply_vocab(text: str) -> str:
    """Apply every replacement rule from ``vocab`` to *text*, in list order.

    Each rule is a mapping with a ``"from"`` and a ``"to"`` string; all
    occurrences of ``from`` are replaced. Rules with a missing or empty
    ``"from"`` are skipped, because replacing the empty string would insert
    the target between every character. A missing ``"to"`` deletes the match.
    """
    for rule in vocab.get("replacements", []):
        source = rule.get("from")
        if not source:
            continue
        text = text.replace(source, rule.get("to", ""))
    return text
|
||
|
||
def get_initial_prompt() -> str:
    """Return the vocabulary words joined by ", ", or "" when there are none."""
    words = vocab.get("words", [])
    if not words:
        return ""
    return ", ".join(words)
|
||
|
||
|
||
# ── Tray icon ─────────────────────────────────────────────────────────────────
|
||
|
||
def make_icon(color):
    """Return a 64x64 transparent RGBA image with a filled circle in *color*."""
    canvas = Image.new("RGBA", (64, 64), (0, 0, 0, 0))
    ImageDraw.Draw(canvas).ellipse([4, 4, 60, 60], fill=color)
    return canvas
|
||
|
||
# One tray icon per application state, built once at import time.
ICONS = {
    icon_state: make_icon(rgb)
    for icon_state, rgb in (
        (AppState.IDLE, (40, 200, 80)),
        (AppState.RECORDING, (220, 50, 50)),
        (AppState.TRANSCRIBING, (220, 180, 30)),
    )
}
|
||
|
||
def set_state(new_state):
    """Switch the global ``state`` and update the tray icon and overlay.

    The overlay is shown only while recording and hidden in every other state.
    """
    global state
    state = new_state
    if tray_icon:
        tray_icon.icon = ICONS[new_state]

    update_overlay = show_overlay if new_state == AppState.RECORDING else hide_overlay
    update_overlay()
|
||
|
||
|
||
# ── Overlay window ────────────────────────────────────────────────────────────
|
||
|
||
def show_overlay():
    """Schedule the recording overlay to be shown via the Tk root; no-op without a root."""
    if overlay_tk is not None:
        overlay_tk.after(0, _show_overlay_main)
|
||
|
||
def hide_overlay():
    """Schedule the recording overlay to be hidden via the Tk root; no-op without a root."""
    if overlay_tk is not None:
        overlay_tk.after(0, _hide_overlay_main)
|
||
|
||
def _show_overlay_main():
    """Show the overlay at the bottom-right of the screen and raise it."""
    overlay_window.deiconify()
    # Fixed 220x54 popup, offset 240 px from the right and 100 px from the bottom edge.
    x = overlay_tk.winfo_screenwidth() - 240
    y = overlay_tk.winfo_screenheight() - 100
    overlay_window.geometry(f"220x54+{x}+{y}")
    overlay_window.lift()
|
||
|
||
def _hide_overlay_main():
    """Withdraw (hide) the overlay window."""
    overlay_window.withdraw()
|
||
|
||
def create_overlay(root):
    """Create the hidden "recording" popup and store it in ``overlay_window``.

    The popup is a borderless, always-on-top, slightly transparent Toplevel of
    *root* containing a red dot and a caption. It starts withdrawn.
    """
    global overlay_window
    background = "#1a1a1a"

    popup = tk.Toplevel(root)
    popup.withdraw()
    popup.overrideredirect(True)  # no window decorations
    popup.attributes("-topmost", True)
    popup.attributes("-alpha", 0.92)
    popup.configure(bg=background)

    body = tk.Frame(popup, bg=background, padx=12, pady=10)
    body.pack(fill="both", expand=True)

    # Red recording indicator
    indicator = tk.Canvas(body, width=14, height=14, bg=background, highlightthickness=0)
    indicator.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
    indicator.pack(side="left", padx=(0, 8))

    font_family = "Segoe UI" if os.name == "nt" else "sans-serif"
    caption = tk.Label(body, text="Aufnahme läuft …", fg="white", bg=background,
                       font=(font_family, 11))
    caption.pack(side="left")

    overlay_window = popup
|
||
|
||
|
||
# ── Audio ─────────────────────────────────────────────────────────────────────
|
||
|
||
def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: keep a copy of each block while recording.

    Blocks that arrive in any other state are dropped. ``frames``,
    ``time_info`` and ``status`` are not used.
    """
    if state != AppState.RECORDING:
        return
    audio_chunks.append(indata.copy())
|
||
|
||
def get_audio_stream():
    """Create (but do not start) a mono input stream from the current config.

    Uses ``config["audio_device"]`` (``None`` when unset) and
    ``config["sample_rate"]``; blocks are delivered to ``audio_callback``.
    """
    stream_options = {
        "samplerate": config["sample_rate"],
        "channels": 1,
        "device": config.get("audio_device"),
        "callback": audio_callback,
    }
    return sd.InputStream(**stream_options)
|
||
|
||
|
||
# ── Recording & transcription ─────────────────────────────────────────────────
|
||
|
||
def start_recording():
    """Begin a new recording unless one is already running.

    Replaces the chunk buffer with a fresh list and switches to RECORDING.
    """
    global audio_chunks
    if state != AppState.RECORDING:
        audio_chunks = []
        set_state(AppState.RECORDING)
        print("Recording...", flush=True)
|
||
|
||
def stop_and_transcribe():
    """Finish the current recording, transcribe it and type the result.

    Does nothing unless the state is RECORDING. Recordings shorter than
    0.3 s or with an RMS below 0.0001 are skipped. The audio is scaled to a
    fixed RMS and clipped to [-1, 1] before transcription. Vocabulary
    replacements are applied to the transcript, and a non-empty result is
    typed into the active window.

    The state always returns to IDLE, including when transcription raises;
    in that case the error is printed and nothing is typed.
    """
    if state != AppState.RECORDING:
        return
    set_state(AppState.TRANSCRIBING)
    chunks = list(audio_chunks)  # snapshot of the blocks recorded so far

    if not chunks:
        set_state(AppState.IDLE)
        return

    audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
    duration = len(audio) / config["sample_rate"]
    rms = float(np.sqrt(np.mean(audio ** 2)))
    print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)

    if duration < 0.3 or rms < 0.0001:
        print("Too short or silent — skipped.", flush=True)
        set_state(AppState.IDLE)
        return

    # Normalize to target RMS so Whisper gets consistent signal level.
    # rms is at least 0.0001 here, so the division is safe.
    target_rms = 0.05
    audio = np.clip(audio * (target_rms / rms), -1.0, 1.0)

    lang = config["language"] or None
    prompt = get_initial_prompt()
    try:
        segments, _ = model.transcribe(
            audio, language=lang, beam_size=5, vad_filter=True,
            initial_prompt=prompt or None,
        )
        # Consume the segments inside the try block: per the faster-whisper
        # docs they are produced lazily, so decoding errors surface here.
        # Each piece is stripped before joining so that whitespace carried by
        # a segment's text does not produce doubled spaces.
        pieces = (s.text.strip() for s in segments)
        text = " ".join(p for p in pieces if p)
    except Exception as exc:  # boundary of the worker thread: report and recover
        print(f"Transcription failed: {exc!r}", flush=True)
        set_state(AppState.IDLE)
        return

    text = apply_vocab(text)
    print(f"Result: {repr(text)}", flush=True)

    set_state(AppState.IDLE)
    if text:
        time.sleep(0.15)  # short pause before typing
        type_text(text)
|
||
|
||
|
||
|
||
# ── Model loading ─────────────────────────────────────────────────────────────
|
||
|
||
def load_model():
    """Create the Whisper model from ``config`` and store it in ``model``.

    Reads the ``model``, ``device`` and ``compute_type`` settings. The global
    is assigned only after construction succeeds.
    """
    global model
    name, device = config["model"], config["device"]
    print(f"Loading {name} on {device}...", flush=True)
    model = WhisperModel(name, device=device, compute_type=config["compute_type"])
    print("Model ready.", flush=True)
|
||
|
||
|
||
# ── Settings window ───────────────────────────────────────────────────────────
|
||
|
||
def open_settings():
    """Schedule the settings window to open via the Tk root; no-op without a root."""
    if overlay_tk is not None:
        overlay_tk.after(0, _open_settings_main)
|
||
|
||
def _open_settings_main():
    """Build and show the settings window as a Toplevel of ``overlay_tk``.

    The window edits the microphone, model, language, device, compute type
    and hotkey. Saving writes the values into ``config``, persists them with
    ``save_config()``, closes the window and reloads the model and hotkey on
    a background thread.
    """
    # ── Palette: "Precision Audio" ──────────────────────────────────────────
    BG = "#18181f"      # deep void
    BG2 = "#22222c"     # panel
    BG3 = "#2c2c38"     # elevated
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"
    _mono = "Consolas" if os.name == "nt" else "monospace"
    _sans = "Segoe UI" if os.name == "nt" else "sans-serif"
    FONT = (_mono, 11)
    FONT_UI = (_sans, 11)
    FONT_B = (_sans, 11, "bold")
    FONT_S = (_sans, 9)
    FONT_H = (_sans, 16, "bold")
    # Section captions use the platform monospace family like every other font here.
    FONT_SECTION = (_mono, 9, "bold")

    win = tk.Toplevel(overlay_tk)
    win.title("Whisper Dictation")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
    win.minsize(700, 0)

    # Global option for OptionMenu dropdowns (dark listbox)
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)
    win.option_add("*Menu.font", FONT_UI)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2, pady=20)
    hdr.pack(fill="x")
    # Amber accent bar
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="WHISPER DICTATION", font=FONT_H,
             bg=BG2, fg=FG, pady=12).pack()
    tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
             font=FONT_S, bg=BG2, fg=FG2).pack()

    # ── Content ──
    content = tk.Frame(win, bg=BG, padx=36, pady=16)
    content.pack(fill="both", expand=True)

    def section(label):
        """Add a section caption followed by a horizontal rule."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=(18, 6))
        tk.Label(f, text=label, font=FONT_SECTION,
                 bg=BG, fg=AMBER).pack(side="left")
        tk.Frame(f, bg=BORDER, height=1).pack(side="left", fill="x", expand=True, padx=(10, 0), pady=6)

    def dd(frame, var, values, width=14):
        """Create dark OptionMenu directly in frame as parent."""
        m = tk.OptionMenu(frame, var, *values)
        m.config(bg=BG3, fg=FG, activebackground=BG3, activeforeground=FG,
                 highlightbackground=BORDER, highlightthickness=1,
                 relief="flat", font=FONT_UI, anchor="w", bd=0, width=width)
        m["menu"].config(bg=BG3, fg=FG, activebackground=AMBER,
                         activeforeground=BG, relief="flat", bd=0)
        return m

    def row(label, hint=None):
        """Returns frame — add controls to frame after calling."""
        f = tk.Frame(content, bg=BG)
        f.pack(fill="x", pady=5)
        tk.Label(f, text=label, width=17, anchor="w", font=FONT_UI,
                 bg=BG, fg=FG2).pack(side="left")
        if hint:
            tk.Label(f, text=hint, font=FONT_S, bg=BG, fg=FG2).pack(side="right")
        return f

    # ── AUDIO ──
    section("AUDIO")
    # Only devices with at least one input channel; labels are "<index>: <name>".
    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
               if d["max_input_channels"] > 0]
    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
    dev_var = tk.StringVar()
    cur_dev = config.get("audio_device")
    # Fall back to "Standard" when the stored device index is not in the list.
    dev_var.set("Standard" if cur_dev is None else
                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
    f = row("Mikrofon")
    dd(f, dev_var, dev_names, width=44).pack(side="left")

    # ── MODELL ──
    section("MODELL")
    model_hints = {
        "tiny": "~1 GB VRAM · sehr schnell",
        "base": "~1 GB VRAM",
        "small": "~2 GB VRAM",
        "medium": "~5 GB VRAM · empfohlen ✓",
        "large-v2": "~10 GB VRAM",
        "large-v3": "~10 GB VRAM · bestes Ergebnis",
    }
    model_var = tk.StringVar(value=config["model"])
    f_model = row("Modell")
    dd(f_model, model_var, MODELS, 14).pack(side="left")
    hint_lbl = tk.Label(f_model, text=model_hints.get(config["model"], ""),
                        font=FONT_S, bg=BG, fg=FG2)
    hint_lbl.pack(side="left", padx=(14, 0))
    # Keep the hint in sync with the selected model.
    model_var.trace_add("write", lambda *_: hint_lbl.config(text=model_hints.get(model_var.get(), "")))

    lang_display = {v: k for k, v in LANGUAGES.items()}
    lang_var = tk.StringVar(value=lang_display.get(config["language"], "Deutsch"))
    f = row("Sprache")
    dd(f, lang_var, list(LANGUAGES.keys()), 14).pack(side="left")

    # ── LEISTUNG ──
    section("LEISTUNG")
    device_var = tk.StringVar(value=config["device"])
    f = row("Gerät (GPU/CPU)", hint="cuda = NVIDIA GPU empfohlen")
    dd(f, device_var, DEVICES, 8).pack(side="left")

    ct_display = {v: k for k, v in COMPUTE_TYPES.items()}
    ct_var = tk.StringVar(value=ct_display.get(config["compute_type"], "float16 (GPU)"))
    f = row("Compute Type")
    dd(f, ct_var, list(COMPUTE_TYPES.keys()), 18).pack(side="left")

    # ── STEUERUNG ──
    section("STEUERUNG")
    hotkey_var = tk.StringVar(value=config["hotkey"])
    f_hk = row("Hotkey", hint="z.B. ctrl+shift+space")
    tk.Entry(f_hk, textvariable=hotkey_var, font=FONT, width=24,
             bg=BG3, fg=FG, insertbackground=AMBER,
             relief="flat", bd=6,
             highlightbackground=BORDER, highlightthickness=1).pack(side="left")

    # ── Buttons ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32)
    btn_bar.pack(fill="x")

    def save():
        """Copy the widget values into config, persist them, close and reload."""
        sel = dev_var.get()
        # Device labels are "<index>: <name>"; "Standard" means the default device.
        config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
        config["model"] = model_var.get()
        config["language"] = LANGUAGES[lang_var.get()]
        config["device"] = device_var.get()
        config["compute_type"] = COMPUTE_TYPES[ct_var.get()]
        # An empty or whitespace-only entry keeps the previous hotkey.
        config["hotkey"] = hotkey_var.get().strip() or config["hotkey"]
        save_config()
        win.destroy()
        threading.Thread(target=reload_model_and_hotkey, daemon=True).start()

    def btn_hover(b, c_in, c_out):
        """Swap the button background while the pointer is over it."""
        b.bind("<Enter>", lambda _: b.config(bg=c_in))
        b.bind("<Leave>", lambda _: b.config(bg=c_out))

    save_btn = tk.Button(btn_bar, text="Speichern & Neuladen", command=save,
                         bg=AMBER, fg=BG, font=FONT_B,
                         relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
    save_btn.pack(side="right")
    btn_hover(save_btn, AMBER2, AMBER)

    cancel_btn = tk.Button(btn_bar, text="Abbrechen", command=win.destroy,
                           bg=BG3, fg=FG2, font=FONT_UI,
                           relief="flat", padx=20, pady=9, cursor="hand2", bd=0)
    cancel_btn.pack(side="right", padx=(0, 10))
    btn_hover(cancel_btn, BORDER, BG3)

    # Center on screen after layout
    win.update_idletasks()
    sw = win.winfo_screenwidth()
    sh = win.winfo_screenheight()
    w = win.winfo_reqwidth()
    h = win.winfo_reqheight()
    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||
|
||
|
||
def open_vocab():
    """Schedule the vocabulary window to open via the Tk root; no-op without a root."""
    if overlay_tk is not None:
        overlay_tk.after(0, _open_vocab_main)
|
||
|
||
def _open_vocab_main():
    """Build and show the vocabulary editor as a Toplevel of ``overlay_tk``.

    The window lets the user add plain words (used for the transcription
    prompt) or corrections (``from`` → ``to`` replacements) and remove
    existing entries. Every change is written immediately with
    ``save_vocab()``.
    """
    BG = "#18181f"
    BG2 = "#22222c"
    BG3 = "#2c2c38"
    BORDER = "#38384a"
    FG = "#e8e8f0"
    FG2 = "#7878a0"
    AMBER = "#f5a623"
    AMBER2 = "#c8831a"
    RED = "#f87171"
    _mono = "Consolas" if os.name == "nt" else "monospace"
    _sans = "Segoe UI" if os.name == "nt" else "sans-serif"
    FONT = (_sans, 11)
    FONT_B = (_sans, 11, "bold")
    FONT_S = (_sans, 9)
    FONT_H = (_sans, 14, "bold")
    FONT_M = (_mono, 10)
    # Derived from the platform families instead of hard-coded Windows font names.
    FONT_SECTION = (_mono, 9, "bold")
    FONT_ARROW = (_sans, 14)

    win = tk.Toplevel(overlay_tk)
    win.title("Vokabular")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
    win.minsize(600, 0)
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
    win.option_add("*Menu.activeForeground", BG)

    # ── Header ──
    hdr = tk.Frame(win, bg=BG2)
    hdr.pack(fill="x")
    tk.Frame(hdr, bg=AMBER, height=3).pack(fill="x")
    tk.Label(hdr, text="VOKABULAR & KORREKTUREN", font=FONT_H,
             bg=BG2, fg=FG, pady=14).pack()
    tk.Label(hdr, text="Wörter lernen · Ersetzungen definieren",
             font=FONT_S, bg=BG2, fg=FG2).pack(pady=(0, 10))

    content = tk.Frame(win, bg=BG, padx=28, pady=12)
    content.pack(fill="both", expand=True)

    # ── Add-word form ─────────────────────────────────────────────────────────
    is_correction = tk.BooleanVar(value=False)

    form = tk.Frame(content, bg=BG3, padx=16, pady=14)
    form.pack(fill="x", pady=(0, 16))

    # Toggle row
    tog_row = tk.Frame(form, bg=BG3)
    tog_row.pack(fill="x", pady=(0, 10))
    tk.Label(tog_row, text="Korrektur (falsch → richtig)", font=FONT,
             bg=BG3, fg=FG).pack(side="left")

    def toggle_form(*_):
        """Switch between the single word entry and the from/to entry pair."""
        if is_correction.get():
            entry_from.pack(side="left", padx=(0, 6))
            arrow_lbl.pack(side="left", padx=4)
            entry_to.pack(side="left")
            entry_word.pack_forget()
        else:
            entry_word.pack(side="left", fill="x", expand=True)
            entry_from.pack_forget()
            arrow_lbl.pack_forget()
            entry_to.pack_forget()

    tog_btn = tk.Checkbutton(tog_row, variable=is_correction, command=toggle_form,
                             bg=BG3, fg=FG2, activebackground=BG3,
                             selectcolor=AMBER, relief="flat", bd=0,
                             indicatoron=True)
    tog_btn.pack(side="right")

    # Input row
    inp_row = tk.Frame(form, bg=BG3)
    inp_row.pack(fill="x")

    entry_style = dict(font=FONT_M, bg=BG, fg=FG, insertbackground=AMBER,
                       relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1)

    entry_word = tk.Entry(inp_row, width=32, **entry_style)
    entry_from = tk.Entry(inp_row, width=14, **entry_style)
    arrow_lbl = tk.Label(inp_row, text="→", font=FONT_ARROW, bg=BG3, fg=AMBER)
    entry_to = tk.Entry(inp_row, width=14, **entry_style)
    entry_word.pack(side="left", fill="x", expand=True)

    def add_entry():
        """Add the current form input to the vocabulary, then save and refresh."""
        if is_correction.get():
            frm = entry_from.get().strip()
            to = entry_to.get().strip()
            if frm and to:
                # setdefault: the loaded vocabulary may lack this key.
                vocab.setdefault("replacements", []).append({"from": frm, "to": to})
                entry_from.delete(0, tk.END)
                entry_to.delete(0, tk.END)
        else:
            w = entry_word.get().strip()
            words = vocab.setdefault("words", [])
            if w and w not in words:
                words.append(w)
                entry_word.delete(0, tk.END)
        save_vocab()
        refresh_lists()

    win.bind("<Return>", lambda _: add_entry())

    add_btn = tk.Button(inp_row, text="Hinzufügen", command=add_entry,
                        bg=AMBER, fg=BG, font=FONT_B,
                        relief="flat", padx=14, pady=5, cursor="hand2", bd=0)
    add_btn.pack(side="right", padx=(10, 0))
    add_btn.bind("<Enter>", lambda _: add_btn.config(bg=AMBER2))
    add_btn.bind("<Leave>", lambda _: add_btn.config(bg=AMBER))

    # ── Lists ─────────────────────────────────────────────────────────────────
    lists_frame = tk.Frame(content, bg=BG)
    lists_frame.pack(fill="both", expand=True)
    lists_frame.columnconfigure(0, weight=1)
    lists_frame.columnconfigure(1, weight=2)

    def section_label(parent, text):
        """Add an amber column caption to *parent*."""
        tk.Label(parent, text=text, font=FONT_SECTION,
                 bg=BG, fg=AMBER).pack(anchor="w", pady=(0, 6))

    # Words column
    col_w = tk.Frame(lists_frame, bg=BG)
    col_w.grid(row=0, column=0, sticky="nsew", padx=(0, 12))
    section_label(col_w, "WÖRTER")

    words_box = tk.Listbox(col_w, font=FONT_M, bg=BG3, fg=FG,
                           selectbackground=AMBER, selectforeground=BG,
                           relief="flat", bd=0, highlightthickness=0,
                           activestyle="none", height=10)
    words_box.pack(fill="both", expand=True)

    def del_word():
        """Remove the selected word, then save and refresh."""
        sel = words_box.curselection()
        if sel:
            # Listbox rows are inserted in vocab order, so the row index is the list index.
            vocab["words"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_w, text="− Entfernen", command=del_word,
              bg=BG3, fg=RED, font=FONT_S, relief="flat",
              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))

    # Replacements column
    col_r = tk.Frame(lists_frame, bg=BG)
    col_r.grid(row=0, column=1, sticky="nsew")
    section_label(col_r, "KORREKTUREN")

    repl_box = tk.Listbox(col_r, font=FONT_M, bg=BG3, fg=FG,
                          selectbackground=AMBER, selectforeground=BG,
                          relief="flat", bd=0, highlightthickness=0,
                          activestyle="none", height=10)
    repl_box.pack(fill="both", expand=True)

    def del_repl():
        """Remove the selected replacement, then save and refresh."""
        sel = repl_box.curselection()
        if sel:
            vocab["replacements"].pop(sel[0])
            save_vocab()
            refresh_lists()

    tk.Button(col_r, text="− Entfernen", command=del_repl,
              bg=BG3, fg=RED, font=FONT_S, relief="flat",
              padx=8, pady=3, cursor="hand2", bd=0).pack(anchor="e", pady=(4, 0))

    def refresh_lists():
        """Rebuild both listboxes from the current vocabulary."""
        words_box.delete(0, tk.END)
        for w in vocab.get("words", []):
            words_box.insert(tk.END, f" {w}")
        repl_box.delete(0, tk.END)
        for r in vocab.get("replacements", []):
            repl_box.insert(tk.END, f" {r['from']} → {r['to']}")

    refresh_lists()

    # ── Footer ──
    tk.Frame(win, bg=BORDER, height=1).pack(fill="x")
    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()

    # Center on screen after layout
    win.update_idletasks()
    sw = win.winfo_screenwidth()
    sh = win.winfo_screenheight()
    w = win.winfo_reqwidth()
    h = win.winfo_reqheight()
    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||
|
||
|
||
def reload_model_and_hotkey():
    """Reload the model and re-register the hotkey from the current config.

    The existing hotkey listener is stopped before the model is loaded. If
    loading fails, the error is printed and a new listener is still created,
    so the hotkey is not left disabled.
    """
    global hotkey_listener
    if hotkey_listener:
        hotkey_listener.stop()

    try:
        load_model()
    except Exception as exc:  # boundary of the reload thread: report and continue
        print(f"Model reload failed: {exc!r}", flush=True)

    hotkey_listener = HotkeyListener(
        config["hotkey"],
        on_press=start_recording,
        # Transcribe on a worker thread rather than in the listener callback.
        on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
    )
    print(f"Hotkey updated: {config['hotkey']}", flush=True)
|
||
|
||
|
||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
def main():
    """Start the application and block until the Tk main loop exits.

    Loads config, vocabulary and model; creates the hidden Tk root and the
    overlay; starts the audio stream, the hotkey listener and the tray icon
    (on a daemon thread); then runs the Tk main loop. The audio stream is
    stopped and closed on the way out, including when setup or the main loop
    raises.
    """
    global tray_icon, overlay_tk, hotkey_listener

    load_config()
    load_vocab()
    load_model()

    # Tkinter root (hidden) for overlay and settings
    root = tk.Tk()
    root.withdraw()
    overlay_tk = root
    create_overlay(root)

    # Audio stream
    stream = get_audio_stream()
    stream.start()
    try:
        # Hotkey
        hotkey_listener = HotkeyListener(
            config["hotkey"],
            on_press=start_recording,
            # Transcribe on a worker thread rather than in the listener callback.
            on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
        )

        # Tray
        menu = pystray.Menu(
            pystray.MenuItem("Einstellungen", lambda: open_settings()),
            pystray.MenuItem("Vokabular", lambda: open_vocab()),
            pystray.Menu.SEPARATOR,
            pystray.MenuItem("Beenden", lambda: quit_app(stream)),
        )
        tray_icon = pystray.Icon("whisper", ICONS[AppState.IDLE], "Whisper Dictation", menu)

        threading.Thread(target=tray_icon.run, daemon=True).start()

        print(f"Ready. Hotkey: {config['hotkey']}", flush=True)
        root.mainloop()
    finally:
        # quit_app() may already have stopped the stream.
        stream.stop()
        stream.close()
|
||
|
||
def quit_app(stream):
    """Shut down: stop *stream* and the tray icon, then ask Tk to quit.

    The Tk ``quit`` call is scheduled with ``after`` instead of being called
    directly.
    """
    stream.stop()
    tray_icon.stop()
    request_quit = overlay_tk.quit
    overlay_tk.after(0, request_quit)
|
||
|
||
|
||
# Start the application only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
|