feat: add Linux support, replace keyboard lib with pynput

- Replace the `keyboard` package (requires root on Linux) with a pynput Listener for hotkey handling — works without root on X11/Wayland
- Enable system-site-packages in the venv for PyGObject/AppIndicator so pystray uses the StatusNotifierItem backend on KDE Wayland
- Use platform-appropriate fonts (sans-serif/monospace on Linux)
- Auto-size the settings and vocabulary windows instead of using hardcoded dimensions
- Update install.sh with the --system-site-packages flag

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 19:13:26 +01:00 · 2026-03-19 19:13:26 +01:00 · 4efa66fc79
parent e1a3eba05a
commit 4efa66fc79
9 changed files with 867 additions and 766 deletions
--- a/dictate.py
+++ b/dictate.py
@ -11,10 +11,9 @@ from tkinter import ttk
 import numpy as np
 import sounddevice as sd
 import keyboard
 import pystray
 from PIL import Image, ImageDraw
-from pynput.keyboard import Controller as KeyboardController
+from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
 from faster_whisper import WhisperModel
 # Shared data dir: script directory (= git repo root, synced via git pull).
@ -69,6 +68,87 @@ config = {}
 tray_icon = None
 overlay_window = None
 overlay_tk = None
 hotkey_listener = None
 # ── Hotkey via pynput ────────────────────────────────────────────────────────
 # Hotkey names that stand for modifier keys. Each name maps to the set of
 # pynput keys that satisfy it, so a generic name ("ctrl") is satisfied by
 # either physical key while a sided name ("ctrl_l") requires that exact key.
 _MODIFIER_MAP = {
    "ctrl": {Key.ctrl_l, Key.ctrl_r},
    "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r},
    "shift": {Key.shift_l, Key.shift_r},
    "shift_l": {Key.shift_l}, "shift_r": {Key.shift_r},
    "alt": {Key.alt_l, Key.alt_r},
    "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r},
    "alt_gr": {Key.alt_gr}, "altgr": {Key.alt_gr},
    # Super / Windows / Command key; pynput calls it "cmd" on every platform.
    "cmd": {Key.cmd_l, Key.cmd_r},
    "cmd_l": {Key.cmd_l}, "cmd_r": {Key.cmd_r},
    "win": {Key.cmd_l, Key.cmd_r},
    "super": {Key.cmd_l, Key.cmd_r},
 }
 # Hotkey names for non-character keys. Any name found neither here nor in
 # _MODIFIER_MAP is treated as a literal character by the hotkey parser.
 _KEY_MAP = {
    "space": Key.space, "tab": Key.tab, "enter": Key.enter,
    "esc": Key.esc, "escape": Key.esc,
    "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right,
    "home": Key.home, "end": Key.end, "page_up": Key.page_up, "page_down": Key.page_down,
    "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace,
 }
 # Function keys f1..f12. A comprehension keeps the loop variable out of the
 # module namespace.
 _KEY_MAP.update({f"f{n}": getattr(Key, f"f{n}") for n in range(1, 13)})
 def _parse_hotkey(hotkey_str):
    """Parse a hotkey string such as "ctrl+shift+space" into pynput keys.

    The string is split on "+". Every part except the last names a key that
    must be held; the last part names the trigger key. Names are matched
    case-insensitively with surrounding whitespace ignored. A name found in
    neither ``_MODIFIER_MAP`` nor ``_KEY_MAP`` is treated as a literal
    character.

    Args:
        hotkey_str: The hotkey description, parts separated by "+".

    Returns:
        A ``(modifiers, trigger)`` tuple. ``modifiers`` is a list with one
        set of pynput keys per held key; any member of a set satisfies that
        modifier. ``trigger`` is a single pynput key.
    """
    *held_names, trigger_name = [
        part.strip().lower() for part in hotkey_str.split("+")
    ]

    def char_key(name):
        # Anything other than a single character cannot come from a real key
        # event, so the hotkey would silently never fire; say so instead.
        if len(name) != 1:
            print(f"Warning: unknown key name {name!r} in hotkey {hotkey_str!r}",
                  flush=True)
        return KeyCode.from_char(name)

    modifiers = []
    for name in held_names:
        if name in _MODIFIER_MAP:
            modifiers.append(_MODIFIER_MAP[name])
        elif name in _KEY_MAP:
            modifiers.append({_KEY_MAP[name]})
        else:
            modifiers.append({char_key(name)})

    if trigger_name in _KEY_MAP:
        trigger = _KEY_MAP[trigger_name]
    elif trigger_name in _MODIFIER_MAP:
        # Set iteration order is not stable between runs, so choose by name:
        # a generic modifier used as trigger always resolves to the same key
        # (the left-hand one, which sorts first).
        trigger = min(_MODIFIER_MAP[trigger_name], key=lambda k: k.name)
    else:
        trigger = char_key(trigger_name)
    return modifiers, trigger
 class HotkeyListener:
    """Hold-to-record hotkey using pynput. No root required on X11.

    ``on_press`` is called once when the trigger key goes down while every
    modifier of the hotkey is held. ``on_release`` is called when the trigger
    key comes back up. Repeated key-down events while the hotkey is active
    (keyboard auto-repeat) do not call ``on_press`` again.
    """

    def __init__(self, hotkey_str, on_press, on_release):
        """Parse ``hotkey_str`` and start the background keyboard listener.

        Args:
            hotkey_str: Hotkey description such as "ctrl+space".
            on_press: Zero-argument callable invoked when the hotkey engages.
            on_release: Zero-argument callable invoked when the trigger key
                is released after the hotkey engaged.
        """
        modifiers, trigger = _parse_hotkey(hotkey_str)
        self._modifiers = [
            {self._normalize(k) for k in mod_set} for mod_set in modifiers
        ]
        self._trigger = trigger
        self._trigger_keys = self._expand_trigger(hotkey_str, trigger)
        self._on_press = on_press
        self._on_release = on_release
        self._pressed = set()
        self._active = False
        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
        self._listener.daemon = True
        self._listener.start()

    @staticmethod
    def _normalize(key):
        """Return a case-insensitive token for ``key``.

        Keys that carry a character are reduced to that character in lower
        case, because the listener reports the shifted character ("A") while
        Shift is held even though the hotkey was parsed as "a". All other
        keys are returned unchanged.
        """
        char = getattr(key, "char", None)
        if isinstance(char, str) and char:
            return char.lower()
        return key

    @classmethod
    def _expand_trigger(cls, hotkey_str, trigger):
        """Return the set of normalized keys that count as the trigger.

        When the trigger part of ``hotkey_str`` names a modifier group (for
        example "ctrl"), every key of that group is accepted rather than only
        the single key the parser returned.
        """
        name = hotkey_str.split("+")[-1].strip().lower()
        group = _MODIFIER_MAP.get(name, ())
        keys = group if trigger in group else (trigger,)
        return frozenset(cls._normalize(k) for k in keys)

    def _matches_trigger(self, key):
        """Return True if ``key`` is (one of) the trigger key(s)."""
        return self._normalize(key) in self._trigger_keys

    def _modifiers_held(self):
        """Return True if at least one key of every modifier set is down."""
        return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers)

    def _key_down(self, key):
        """Listener callback: record the key and engage the hotkey if complete."""
        key = self._normalize(key)
        self._pressed.add(key)
        if not self._active and self._matches_trigger(key) and self._modifiers_held():
            self._active = True
            self._on_press()

    def _key_up(self, key):
        """Listener callback: forget the key and disengage on trigger release."""
        # Normalizing here as well means a key pressed as "A" and released as
        # "a" (Shift let go first) is still removed from the pressed set.
        key = self._normalize(key)
        self._pressed.discard(key)
        if self._active and self._matches_trigger(key):
            self._active = False
            self._on_release()

    def stop(self):
        """Stop the underlying keyboard listener."""
        self._listener.stop()
 # ── Config ────────────────────────────────────────────────────────────────────
@ -184,8 +264,9 @@ def create_overlay(root):
    dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
    dot.pack(side="left", padx=(0, 8))
    _sans = "Segoe UI" if os.name == "nt" else "sans-serif"
    tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
-             font=("Segoe UI", 11)).pack(side="left")
+             font=(_sans, 11)).pack(side="left")
    overlay_window = win
@ -231,11 +312,17 @@ def stop_and_transcribe():
    rms = float(np.sqrt(np.mean(audio ** 2)))
    print(f"Audio: {duration:.1f}s  RMS: {rms:.5f}", flush=True)
-    if duration < 0.3 or rms < 0.0005:
+    if duration < 0.3 or rms < 0.0001:
        print("Too short or silent — skipped.", flush=True)
        set_state(AppState.IDLE)
        return
    # Normalize to target RMS so Whisper gets consistent signal level
    target_rms = 0.05
    if rms > 0:
        audio = audio * (target_rms / rms)
    audio = np.clip(audio, -1.0, 1.0)
    lang = config["language"] if config["language"] else None
    prompt = get_initial_prompt()
    segments, _ = model.transcribe(
@ -251,9 +338,6 @@ def stop_and_transcribe():
        time.sleep(0.15)
        typer.type(text)
 def on_space_release(e):
    """Start transcription on a daemon thread if a recording is in progress.

    Does nothing unless ``state`` is ``AppState.RECORDING``. The argument
    ``e`` is not used; presumably it is the event object supplied by the
    key-release hook this function is registered with.
    """
    if state == AppState.RECORDING:
        threading.Thread(target=stop_and_transcribe, daemon=True).start()
 # ── Model loading ─────────────────────────────────────────────────────────────
@ -287,23 +371,20 @@ def _open_settings_main():
    AMBER   = "#f5a623"
    AMBER2  = "#c8831a"
    GREEN   = "#4ade80"
-    FONT    = ("Consolas", 11)
+    _mono   = "Consolas" if os.name == "nt" else "monospace"
-    FONT_UI = ("Segoe UI", 11)
+    _sans   = "Segoe UI" if os.name == "nt" else "sans-serif"
-    FONT_B  = ("Segoe UI", 11, "bold")
+    FONT    = (_mono, 11)
-    FONT_S  = ("Segoe UI", 9)
+    FONT_UI = (_sans, 11)
-    FONT_H  = ("Segoe UI Semibold", 16)
+    FONT_B  = (_sans, 11, "bold")
    FONT_S  = (_sans, 9)
    FONT_H  = (_sans, 16, "bold")
    win = tk.Toplevel(overlay_tk)
    win.title("Whisper Dictation")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
-
+    win.minsize(700, 0)
    # Center
    W, H = 680, 660
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
    # Global option for OptionMenu dropdowns (dark listbox)
    win.option_add("*Menu.background",       BG3)
@ -322,11 +403,9 @@ def _open_settings_main():
    tk.Label(hdr, text="Lokale GPU-Transkription  ·  offline  ·  privat",
             font=FONT_S, bg=BG2, fg=FG2).pack()
-    # ── Scrollable content ──
+    # ── Content ──
-    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
+    content = tk.Frame(win, bg=BG, padx=36, pady=16)
-    canvas.pack(fill="both", expand=True)
+    content.pack(fill="both", expand=True)
    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
    canvas.create_window((0, 0), window=content, anchor="nw")
    def section(label):
        f = tk.Frame(content, bg=BG)
@ -443,6 +522,14 @@ def _open_settings_main():
    cancel_btn.pack(side="right", padx=(0, 10))
    btn_hover(cancel_btn, BORDER, BG3)
    # Center on screen after layout
    win.update_idletasks()
    sw = win.winfo_screenwidth()
    sh = win.winfo_screenheight()
    w = win.winfo_reqwidth()
    h = win.winfo_reqheight()
    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
 def open_vocab():
    if overlay_tk is None:
@ -459,21 +546,20 @@ def _open_vocab_main():
    AMBER  = "#f5a623"
    AMBER2 = "#c8831a"
    RED    = "#f87171"
-    FONT   = ("Segoe UI", 11)
+    _mono   = "Consolas" if os.name == "nt" else "monospace"
-    FONT_B = ("Segoe UI", 11, "bold")
+    _sans   = "Segoe UI" if os.name == "nt" else "sans-serif"
-    FONT_S = ("Segoe UI", 9)
+    FONT   = (_sans, 11)
-    FONT_H = ("Segoe UI Semibold", 14)
+    FONT_B = (_sans, 11, "bold")
-    FONT_M = ("Consolas", 10)
+    FONT_S = (_sans, 9)
    FONT_H = (_sans, 14, "bold")
    FONT_M = (_mono, 10)
    win = tk.Toplevel(overlay_tk)
    win.title("Vokabular")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
-    W, H = 600, 620
+    win.minsize(600, 0)
    win.update_idletasks()
    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
@ -629,12 +715,25 @@ def _open_vocab_main():
    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein  ·  Korrekturen werden nach der Transkription angewendet",
             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
    # Center on screen after layout
    win.update_idletasks()
    sw = win.winfo_screenwidth()
    sh = win.winfo_screenheight()
    w = win.winfo_reqwidth()
    h = win.winfo_reqheight()
    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
 def reload_model_and_hotkey():
-    keyboard.unhook_all()
+    global hotkey_listener
    if hotkey_listener:
        hotkey_listener.stop()
    load_model()
-    keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
+    hotkey_listener = HotkeyListener(
-    keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release)
+        config["hotkey"],
        on_press=start_recording,
        on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
    )
    print(f"Hotkey updated: {config['hotkey']}", flush=True)
@ -658,9 +757,12 @@ def main():
    stream.start()
    # Hotkey
-    last_key = config["hotkey"].split("+")[-1]
+    global hotkey_listener
-    keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
+    hotkey_listener = HotkeyListener(
-    keyboard.on_release_key(last_key, on_space_release)
+        config["hotkey"],
        on_press=start_recording,
        on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
    )
    # Tray
    menu = pystray.Menu(
--- a/install.bat
+++ b/install.bat
@ -6,11 +6,11 @@ py -3.13 -m venv .venv-windows
 set "VENV=%~dp0.venv-windows"
 echo Installing dependencies...
-"%VENV%\Scripts\pip" install --upgrade pip
+"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
-"%VENV%\Scripts\pip" install -r requirements.txt
+"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
 echo Installing CUDA 12 DLLs (required for GPU acceleration)...
-"%VENV%\Scripts\pip" install -r requirements-cuda.txt
+"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
 echo.
 echo Done. Run start.bat to launch.
--- a/install.sh
+++ b/install.sh
@ -3,7 +3,7 @@ set -e
 cd "$(dirname "$0")"
 echo "Creating Linux venv (.venv-linux)..."
-python3 -m venv .venv-linux
+python3 -m venv --system-site-packages .venv-linux
 echo "Installing dependencies..."
 .venv-linux/bin/pip install --upgrade pip
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,6 @@
 faster-whisper>=1.0.2
 sounddevice>=0.4.6
 numpy>=1.24
 keyboard>=0.13
 pystray>=0.19
 Pillow>=9.5
 pynput>=1.7.6