feat: add Linux support, replace keyboard lib with pynput

- Replace `keyboard` package (requires root on Linux) with pynput Listener for hotkey handling — works without root on X11/Wayland
- Enable system-site-packages in venv for PyGObject/AppIndicator so pystray uses the StatusNotifierItem backend on KDE Wayland
- Use platform-appropriate fonts (sans-serif/monospace on Linux)
- Auto-size settings and vocabulary windows instead of hardcoded dims
- Update install.sh with --system-site-packages flag

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 19:13:26 +01:00 · 2026-03-19 19:13:26 +01:00 · 4efa66fc79
parent e1a3eba05a
commit 4efa66fc79
9 changed files with 867 additions and 766 deletions
--- a/dictate.py
+++ b/dictate.py
@ -11,10 +11,9 @@ from tkinter import ttk

 import numpy as np
 import sounddevice as sd
-import keyboard
 import pystray
 from PIL import Image, ImageDraw
-from pynput.keyboard import Controller as KeyboardController
+from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
 from faster_whisper import WhisperModel

 # Shared data dir: script directory (= git repo root, synced via git pull).
@ -69,6 +68,87 @@ config = {}
 tray_icon = None
 overlay_window = None
 overlay_tk = None
+hotkey_listener = None
+
+
+# ── Hotkey via pynput ────────────────────────────────────────────────────────
+
+_MODIFIER_MAP = {  # hotkey-string modifier name -> set of pynput keys, any one of which satisfies it
+    "ctrl": {Key.ctrl_l, Key.ctrl_r},  # generic name: either side counts
+    "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r},
+    "shift": {Key.shift_l, Key.shift_r},
+    "shift_l": {Key.shift_l}, "shift_r": {Key.shift_r},
+    "alt": {Key.alt_l, Key.alt_r},
+    "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r},
+}  # NOTE(review): no "cmd"/"win"/"alt_gr" names here; such names fall through to KeyCode.from_char in _parse_hotkey — confirm that is intended
+
+_KEY_MAP = {  # hotkey-string name -> single pynput special key
+    "space": Key.space, "tab": Key.tab, "enter": Key.enter,
+    "esc": Key.esc, "escape": Key.esc,  # two spellings for the same key
+    "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right,
+    "home": Key.home, "end": Key.end, "page_up": Key.page_up, "page_down": Key.page_down,
+    "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace,
+}
+for i in range(1, 13):  # registers "f1" .. "f12"; leaves the loop variable i at module level
+    _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}")
+
+
+def _parse_hotkey(hotkey_str):  # hotkey_str: key names separated by "+"
+    """Parse hotkey string into (modifier_sets, trigger_key).
+    Returns: (list of sets-of-pynput-keys for each modifier, pynput key for trigger)
+    """
+    parts = [p.strip().lower() for p in hotkey_str.split("+")]  # names are lower-cased and stripped of surrounding whitespace
+    modifiers = []
+    for p in parts[:-1]:  # every part except the last is treated as a key that must be held
+        if p in _MODIFIER_MAP:
+            modifiers.append(_MODIFIER_MAP[p])  # appends the map's own set object (not a copy)
+        elif p in _KEY_MAP:
+            modifiers.append({_KEY_MAP[p]})
+        else:
+            modifiers.append({KeyCode.from_char(p)})  # NOTE(review): unknown multi-character names also land here — confirm such a KeyCode can ever match a real key event
+    trigger_part = parts[-1]  # the last part is the trigger key
+    if trigger_part in _KEY_MAP:
+        trigger = _KEY_MAP[trigger_part]
+    elif trigger_part in _MODIFIER_MAP:
+        trigger = next(iter(_MODIFIER_MAP[trigger_part]))  # NOTE(review): takes one arbitrary member, so a generic "ctrl"/"shift"/"alt" trigger binds to only one side
+    else:
+        trigger = KeyCode.from_char(trigger_part)  # NOTE(review): an empty part (empty string or trailing "+") reaches this branch too
+    return modifiers, trigger
+
+
+class HotkeyListener:
+    """Hold-to-record hotkey using pynput. No root required on X11."""
+
+    def __init__(self, hotkey_str, on_press, on_release):  # on_press / on_release are called with no arguments
+        self._modifiers, self._trigger = _parse_hotkey(hotkey_str)
+        self._on_press = on_press
+        self._on_release = on_release
+        self._pressed = set()  # keys seen pressed and not yet released
+        self._active = False  # True between a matched trigger press and the trigger's release
+        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
+        self._listener.daemon = True
+        self._listener.start()  # listening begins as soon as the object is constructed
+
+    def _matches_trigger(self, key):  # NOTE(review): plain equality on the raw key — a shifted character (e.g. "A" vs the parsed lower-case "a") may not match; confirm
+        return key == self._trigger
+
+    def _modifiers_held(self):  # True when every modifier set has at least one member in _pressed (also True when there are no modifiers)
+        return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers)
+
+    def _key_down(self, key):  # press callback registered with the listener
+        self._pressed.add(key)
+        if not self._active and self._matches_trigger(key) and self._modifiers_held():  # the _active flag makes on_press fire once per hold even if further press events arrive
+            self._active = True
+            self._on_press()  # invoked synchronously from within this callback
+
+    def _key_up(self, key):  # release callback registered with the listener
+        self._pressed.discard(key)
+        if self._active and self._matches_trigger(key):  # only the trigger's release ends the hold; modifier state is not re-checked here
+            self._active = False
+            self._on_release()
+
+    def stop(self):  # stops the underlying listener; this class exposes no way to start it again
+        self._listener.stop()


 # ── Config ────────────────────────────────────────────────────────────────────
@ -184,8 +264,9 @@ def create_overlay(root):
    dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
    dot.pack(side="left", padx=(0, 8))

+    _sans = "Segoe UI" if os.name == "nt" else "sans-serif"
    tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
-             font=("Segoe UI", 11)).pack(side="left")
+             font=(_sans, 11)).pack(side="left")

    overlay_window = win

@ -231,11 +312,17 @@ def stop_and_transcribe():
    rms = float(np.sqrt(np.mean(audio ** 2)))
    print(f"Audio: {duration:.1f}s  RMS: {rms:.5f}", flush=True)

-    if duration < 0.3 or rms < 0.0005:
+    if duration < 0.3 or rms < 0.0001:
        print("Too short or silent — skipped.", flush=True)
        set_state(AppState.IDLE)
        return

+    # Normalize to target RMS so Whisper gets consistent signal level
+    target_rms = 0.05
+    if rms > 0:
+        audio = audio * (target_rms / rms)
+    audio = np.clip(audio, -1.0, 1.0)
+
    lang = config["language"] if config["language"] else None
    prompt = get_initial_prompt()
    segments, _ = model.transcribe(
@ -251,9 +338,6 @@ def stop_and_transcribe():
        time.sleep(0.15)
        typer.type(text)

-def on_space_release(e):
-    if state == AppState.RECORDING:
-        threading.Thread(target=stop_and_transcribe, daemon=True).start()


 # ── Model loading ─────────────────────────────────────────────────────────────
@ -287,23 +371,20 @@ def _open_settings_main():
    AMBER   = "#f5a623"
    AMBER2  = "#c8831a"
    GREEN   = "#4ade80"
-    FONT    = ("Consolas", 11)
-    FONT_UI = ("Segoe UI", 11)
-    FONT_B  = ("Segoe UI", 11, "bold")
-    FONT_S  = ("Segoe UI", 9)
-    FONT_H  = ("Segoe UI Semibold", 16)
+    _mono   = "Consolas" if os.name == "nt" else "monospace"
+    _sans   = "Segoe UI" if os.name == "nt" else "sans-serif"
+    FONT    = (_mono, 11)
+    FONT_UI = (_sans, 11)
+    FONT_B  = (_sans, 11, "bold")
+    FONT_S  = (_sans, 9)
+    FONT_H  = (_sans, 16, "bold")

    win = tk.Toplevel(overlay_tk)
    win.title("Whisper Dictation")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
-
-    # Center
-    W, H = 680, 660
-    win.update_idletasks()
-    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
-    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+    win.minsize(700, 0)

    # Global option for OptionMenu dropdowns (dark listbox)
    win.option_add("*Menu.background",       BG3)
@ -322,11 +403,9 @@ def _open_settings_main():
    tk.Label(hdr, text="Lokale GPU-Transkription  ·  offline  ·  privat",
             font=FONT_S, bg=BG2, fg=FG2).pack()

-    # ── Scrollable content ──
-    canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
-    canvas.pack(fill="both", expand=True)
-    content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
-    canvas.create_window((0, 0), window=content, anchor="nw")
+    # ── Content ──
+    content = tk.Frame(win, bg=BG, padx=36, pady=16)
+    content.pack(fill="both", expand=True)

    def section(label):
        f = tk.Frame(content, bg=BG)
@ -443,6 +522,14 @@ def _open_settings_main():
    cancel_btn.pack(side="right", padx=(0, 10))
    btn_hover(cancel_btn, BORDER, BG3)

+    # Center on screen after layout
+    win.update_idletasks()
+    sw = win.winfo_screenwidth()
+    sh = win.winfo_screenheight()
+    w = win.winfo_reqwidth()
+    h = win.winfo_reqheight()
+    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
+

 def open_vocab():
    if overlay_tk is None:
@ -459,21 +546,20 @@ def _open_vocab_main():
    AMBER  = "#f5a623"
    AMBER2 = "#c8831a"
    RED    = "#f87171"
-    FONT   = ("Segoe UI", 11)
-    FONT_B = ("Segoe UI", 11, "bold")
-    FONT_S = ("Segoe UI", 9)
-    FONT_H = ("Segoe UI Semibold", 14)
-    FONT_M = ("Consolas", 10)
+    _mono   = "Consolas" if os.name == "nt" else "monospace"
+    _sans   = "Segoe UI" if os.name == "nt" else "sans-serif"
+    FONT   = (_sans, 11)
+    FONT_B = (_sans, 11, "bold")
+    FONT_S = (_sans, 9)
+    FONT_H = (_sans, 14, "bold")
+    FONT_M = (_mono, 10)

    win = tk.Toplevel(overlay_tk)
    win.title("Vokabular")
    win.configure(bg=BG)
    win.attributes("-topmost", True)
    win.resizable(False, False)
-    W, H = 600, 620
-    win.update_idletasks()
-    sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
-    win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
+    win.minsize(600, 0)
    win.option_add("*Menu.background", BG3)
    win.option_add("*Menu.foreground", FG)
    win.option_add("*Menu.activeBackground", AMBER)
@ -629,12 +715,25 @@ def _open_vocab_main():
    tk.Label(win, text="Wörter fließen als Kontext in Whisper ein  ·  Korrekturen werden nach der Transkription angewendet",
             font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()

+    # Center on screen after layout
+    win.update_idletasks()
+    sw = win.winfo_screenwidth()
+    sh = win.winfo_screenheight()
+    w = win.winfo_reqwidth()
+    h = win.winfo_reqheight()
+    win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
+

 def reload_model_and_hotkey():
-    keyboard.unhook_all()
+    global hotkey_listener
+    if hotkey_listener:
+        hotkey_listener.stop()
    load_model()
-    keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
-    keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release)
+    hotkey_listener = HotkeyListener(
+        config["hotkey"],
+        on_press=start_recording,
+        on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
+    )
    print(f"Hotkey updated: {config['hotkey']}", flush=True)


@ -658,9 +757,12 @@ def main():
    stream.start()

    # Hotkey
-    last_key = config["hotkey"].split("+")[-1]
-    keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
-    keyboard.on_release_key(last_key, on_space_release)
+    global hotkey_listener
+    hotkey_listener = HotkeyListener(
+        config["hotkey"],
+        on_press=start_recording,
+        on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
+    )

    # Tray
    menu = pystray.Menu(
--- a/install.bat
+++ b/install.bat
@ -6,11 +6,11 @@ py -3.13 -m venv .venv-windows

 set "VENV=%~dp0.venv-windows"
 echo Installing dependencies...
-"%VENV%\Scripts\pip" install --upgrade pip
-"%VENV%\Scripts\pip" install -r requirements.txt
+"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
+"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt

 echo Installing CUDA 12 DLLs (required for GPU acceleration)...
-"%VENV%\Scripts\pip" install -r requirements-cuda.txt
+"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt

 echo.
 echo Done. Run start.bat to launch.
--- a/install.sh
+++ b/install.sh
@ -3,7 +3,7 @@ set -e
 cd "$(dirname "$0")"

 echo "Creating Linux venv (.venv-linux)..."
-python3 -m venv .venv-linux
+python3 -m venv --system-site-packages .venv-linux

 echo "Installing dependencies..."
 .venv-linux/bin/pip install --upgrade pip
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,6 @@
 faster-whisper>=1.0.2
 sounddevice>=0.4.6
 numpy>=1.24
-keyboard>=0.13
 pystray>=0.19
 Pillow>=9.5
 pynput>=1.7.6