feat: add Linux support, replace keyboard lib with pynput

- Replace `keyboard` package (requires root on Linux) with pynput
  Listener for hotkey handling — works without root on X11/Wayland
- Enable system-site-packages in venv for PyGObject/AppIndicator
  so pystray uses StatusNotifierItem backend on KDE Wayland
- Use platform-appropriate fonts (sans-serif/monospace on Linux)
- Auto-size settings and vocabulary windows instead of hardcoded dims
- Update install.sh with --system-site-packages flag

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
d-chrka 2026-03-19 19:13:26 +01:00
parent e1a3eba05a
commit 4efa66fc79
9 changed files with 867 additions and 766 deletions

View File

@ -11,10 +11,9 @@ from tkinter import ttk
import numpy as np
import sounddevice as sd
import keyboard
import pystray
from PIL import Image, ImageDraw
from pynput.keyboard import Controller as KeyboardController
from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
from faster_whisper import WhisperModel
# Shared data dir: script directory (= git repo root, synced via git pull).
@ -69,6 +68,87 @@ config = {}
tray_icon = None
overlay_window = None
overlay_tk = None
hotkey_listener = None
# ── Hotkey via pynput ────────────────────────────────────────────────────────
# Modifier names accepted in a hotkey string. Each name maps to the SET of
# pynput keys that satisfy it, so a generic name such as "ctrl" is fulfilled
# by either the left or the right physical key.
_MODIFIER_MAP = {
    "ctrl": {Key.ctrl_l, Key.ctrl_r},
    "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r},
    "shift": {Key.shift_l, Key.shift_r},
    "shift_l": {Key.shift_l}, "shift_r": {Key.shift_r},
    "alt": {Key.alt_l, Key.alt_r},
    "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r},
    "alt_gr": {Key.alt_gr},
    # Super / Windows / Command key. Without these aliases a hotkey such as
    # "win+space" would be parsed as a literal multi-character "key" that no
    # keyboard event can ever equal.
    "cmd": {Key.cmd_l, Key.cmd_r},
    "win": {Key.cmd_l, Key.cmd_r},
    "windows": {Key.cmd_l, Key.cmd_r},
    "super": {Key.cmd_l, Key.cmd_r},
    "cmd_l": {Key.cmd_l}, "cmd_r": {Key.cmd_r},
}

# Names of non-character keys accepted in a hotkey string.
_KEY_MAP = {
    "space": Key.space, "tab": Key.tab,
    "enter": Key.enter, "return": Key.enter,
    "esc": Key.esc, "escape": Key.esc,
    "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right,
    "home": Key.home, "end": Key.end,
    "page_up": Key.page_up, "pageup": Key.page_up,
    "page_down": Key.page_down, "pagedown": Key.page_down,
    "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace,
}

# Function keys "f1" .. "f12".
for i in range(1, 13):
    _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}")
def _parse_hotkey(hotkey_str):
    """Parse a hotkey string such as ``"ctrl+shift+space"``.

    The string is split on ``"+"``; every part is stripped and lower-cased.
    All parts but the last are modifiers, the last part is the trigger key.

    Args:
        hotkey_str: Hotkey description, parts separated by ``"+"``.

    Returns:
        A tuple ``(modifiers, trigger)`` where ``modifiers`` is a list of
        sets of pynput keys (one set per modifier part; any key of a set
        satisfies that modifier) and ``trigger`` is a single pynput key.
    """

    def char_key(name):
        # KeyCode.from_char is meant for a single character. Any other name
        # reaching this point is unknown to both lookup tables and is not
        # expected to equal any real key event, so say so instead of
        # producing a hotkey that silently never fires.
        if len(name) != 1:
            print(f"Warning: unknown key name {name!r} in hotkey {hotkey_str!r}",
                  flush=True)
        return KeyCode.from_char(name)

    parts = [p.strip().lower() for p in hotkey_str.split("+")]

    modifiers = []
    for name in parts[:-1]:
        if name in _MODIFIER_MAP:
            modifiers.append(_MODIFIER_MAP[name])
        elif name in _KEY_MAP:
            modifiers.append({_KEY_MAP[name]})
        else:
            modifiers.append({char_key(name)})

    trigger_name = parts[-1]
    if trigger_name in _KEY_MAP:
        trigger = _KEY_MAP[trigger_name]
    elif trigger_name in _MODIFIER_MAP:
        # The trigger is a single key, so one member of the modifier set has
        # to be chosen. Set iteration order is not stable between runs, so
        # pick by sorted string form to get the same key every time.
        trigger = min(_MODIFIER_MAP[trigger_name], key=str)
    else:
        trigger = char_key(trigger_name)
    return modifiers, trigger
class HotkeyListener:
    """Hold-to-record hotkey using pynput. No root required on X11.

    ``on_press`` is called once when the trigger key goes down while every
    modifier group is held. ``on_release`` is called once when the trigger
    key goes up again. Both are invoked from the pynput listener callbacks.
    """

    def __init__(self, hotkey_str, on_press, on_release):
        """Parse ``hotkey_str`` and start listening immediately.

        Args:
            hotkey_str: Hotkey description understood by ``_parse_hotkey``.
            on_press: Zero-argument callable run when the hotkey engages.
            on_release: Zero-argument callable run when the trigger is released.
        """
        self._modifiers, self._trigger = _parse_hotkey(hotkey_str)
        self._on_press = on_press
        self._on_release = on_release
        self._pressed = set()   # keys currently reported as held down
        self._active = False    # True between on_press and on_release
        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
        self._listener.daemon = True
        self._listener.start()

    def _matches_trigger(self, key):
        """Return True if ``key`` is the trigger key.

        Character keys are compared case-insensitively: while Shift is held
        the event carries the shifted character (e.g. "A"), which would
        otherwise never equal the lower-cased trigger parsed from the config.
        """
        if key == self._trigger:
            return True
        key_char = getattr(key, "char", None)
        trigger_char = getattr(self._trigger, "char", None)
        if not isinstance(key_char, str) or not isinstance(trigger_char, str):
            return False
        # Keep dead keys distinct from their plain counterparts.
        if getattr(key, "is_dead", False) != getattr(self._trigger, "is_dead", False):
            return False
        return key_char.lower() == trigger_char.lower()

    def _modifiers_held(self):
        """Return True if, for every modifier group, at least one key is held."""
        return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers)

    def _key_down(self, key):
        """Track the pressed key and engage the hotkey if it is now complete."""
        self._pressed.add(key)
        # The _active guard keeps key auto-repeat from re-firing on_press.
        if not self._active and self._matches_trigger(key) and self._modifiers_held():
            self._active = True
            self._on_press()

    def _key_up(self, key):
        """Track the released key and disengage when the trigger goes up."""
        self._pressed.discard(key)
        if self._active and self._matches_trigger(key):
            self._active = False
            self._on_release()

    def stop(self):
        """Stop the underlying pynput listener."""
        self._listener.stop()
# ── Config ────────────────────────────────────────────────────────────────────
@ -184,8 +264,9 @@ def create_overlay(root):
dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
dot.pack(side="left", padx=(0, 8))
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
font=("Segoe UI", 11)).pack(side="left")
font=(_sans, 11)).pack(side="left")
overlay_window = win
@ -231,11 +312,17 @@ def stop_and_transcribe():
rms = float(np.sqrt(np.mean(audio ** 2)))
print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
if duration < 0.3 or rms < 0.0005:
if duration < 0.3 or rms < 0.0001:
print("Too short or silent — skipped.", flush=True)
set_state(AppState.IDLE)
return
# Normalize to target RMS so Whisper gets consistent signal level
target_rms = 0.05
if rms > 0:
audio = audio * (target_rms / rms)
audio = np.clip(audio, -1.0, 1.0)
lang = config["language"] if config["language"] else None
prompt = get_initial_prompt()
segments, _ = model.transcribe(
@ -251,9 +338,6 @@ def stop_and_transcribe():
time.sleep(0.15)
typer.type(text)
def on_space_release(e):
if state == AppState.RECORDING:
threading.Thread(target=stop_and_transcribe, daemon=True).start()
# ── Model loading ─────────────────────────────────────────────────────────────
@ -287,23 +371,20 @@ def _open_settings_main():
AMBER = "#f5a623"
AMBER2 = "#c8831a"
GREEN = "#4ade80"
FONT = ("Consolas", 11)
FONT_UI = ("Segoe UI", 11)
FONT_B = ("Segoe UI", 11, "bold")
FONT_S = ("Segoe UI", 9)
FONT_H = ("Segoe UI Semibold", 16)
_mono = "Consolas" if os.name == "nt" else "monospace"
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
FONT = (_mono, 11)
FONT_UI = (_sans, 11)
FONT_B = (_sans, 11, "bold")
FONT_S = (_sans, 9)
FONT_H = (_sans, 16, "bold")
win = tk.Toplevel(overlay_tk)
win.title("Whisper Dictation")
win.configure(bg=BG)
win.attributes("-topmost", True)
win.resizable(False, False)
# Center
W, H = 680, 660
win.update_idletasks()
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
win.minsize(700, 0)
# Global option for OptionMenu dropdowns (dark listbox)
win.option_add("*Menu.background", BG3)
@ -322,11 +403,9 @@ def _open_settings_main():
tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
font=FONT_S, bg=BG2, fg=FG2).pack()
# ── Scrollable content ──
canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
canvas.pack(fill="both", expand=True)
content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
canvas.create_window((0, 0), window=content, anchor="nw")
# ── Content ──
content = tk.Frame(win, bg=BG, padx=36, pady=16)
content.pack(fill="both", expand=True)
def section(label):
f = tk.Frame(content, bg=BG)
@ -443,6 +522,14 @@ def _open_settings_main():
cancel_btn.pack(side="right", padx=(0, 10))
btn_hover(cancel_btn, BORDER, BG3)
# Center on screen after layout
win.update_idletasks()
sw = win.winfo_screenwidth()
sh = win.winfo_screenheight()
w = win.winfo_reqwidth()
h = win.winfo_reqheight()
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
def open_vocab():
if overlay_tk is None:
@ -459,21 +546,20 @@ def _open_vocab_main():
AMBER = "#f5a623"
AMBER2 = "#c8831a"
RED = "#f87171"
FONT = ("Segoe UI", 11)
FONT_B = ("Segoe UI", 11, "bold")
FONT_S = ("Segoe UI", 9)
FONT_H = ("Segoe UI Semibold", 14)
FONT_M = ("Consolas", 10)
_mono = "Consolas" if os.name == "nt" else "monospace"
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
FONT = (_sans, 11)
FONT_B = (_sans, 11, "bold")
FONT_S = (_sans, 9)
FONT_H = (_sans, 14, "bold")
FONT_M = (_mono, 10)
win = tk.Toplevel(overlay_tk)
win.title("Vokabular")
win.configure(bg=BG)
win.attributes("-topmost", True)
win.resizable(False, False)
W, H = 600, 620
win.update_idletasks()
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
win.minsize(600, 0)
win.option_add("*Menu.background", BG3)
win.option_add("*Menu.foreground", FG)
win.option_add("*Menu.activeBackground", AMBER)
@ -629,12 +715,25 @@ def _open_vocab_main():
tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
# Center on screen after layout
win.update_idletasks()
sw = win.winfo_screenwidth()
sh = win.winfo_screenheight()
w = win.winfo_reqwidth()
h = win.winfo_reqheight()
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
def reload_model_and_hotkey():
keyboard.unhook_all()
global hotkey_listener
if hotkey_listener:
hotkey_listener.stop()
load_model()
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release)
hotkey_listener = HotkeyListener(
config["hotkey"],
on_press=start_recording,
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
)
print(f"Hotkey updated: {config['hotkey']}", flush=True)
@ -658,9 +757,12 @@ def main():
stream.start()
# Hotkey
last_key = config["hotkey"].split("+")[-1]
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
keyboard.on_release_key(last_key, on_space_release)
global hotkey_listener
hotkey_listener = HotkeyListener(
config["hotkey"],
on_press=start_recording,
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
)
# Tray
menu = pystray.Menu(

View File

@ -6,11 +6,11 @@ py -3.13 -m venv .venv-windows
set "VENV=%~dp0.venv-windows"
echo Installing dependencies...
"%VENV%\Scripts\pip" install --upgrade pip
"%VENV%\Scripts\pip" install -r requirements.txt
"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
echo Installing CUDA 12 DLLs (required for GPU acceleration)...
"%VENV%\Scripts\pip" install -r requirements-cuda.txt
"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
echo.
echo Done. Run start.bat to launch.

View File

@ -3,7 +3,7 @@ set -e
cd "$(dirname "$0")"
echo "Creating Linux venv (.venv-linux)..."
python3 -m venv .venv-linux
python3 -m venv --system-site-packages .venv-linux
echo "Installing dependencies..."
.venv-linux/bin/pip install --upgrade pip

View File

@ -1,7 +1,6 @@
faster-whisper>=1.0.2
sounddevice>=0.4.6
numpy>=1.24
keyboard>=0.13
pystray>=0.19
Pillow>=9.5
pynput>=1.7.6