feat: add Linux support, replace keyboard lib with pynput
- Replace `keyboard` package (requires root on Linux) with pynput Listener for hotkey handling — works without root on X11/Wayland
- Enable system-site-packages in venv for PyGObject/AppIndicator so pystray uses StatusNotifierItem backend on KDE Wayland
- Use platform-appropriate fonts (sans-serif/monospace on Linux)
- Auto-size settings and vocabulary windows instead of hardcoded dims
- Update install.sh with --system-site-packages flag

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e1a3eba05a
commit
4efa66fc79
178
dictate.py
178
dictate.py
|
|
@ -11,10 +11,9 @@ from tkinter import ttk
|
|||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
import keyboard
|
||||
import pystray
|
||||
from PIL import Image, ImageDraw
|
||||
from pynput.keyboard import Controller as KeyboardController
|
||||
from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
# Shared data dir: script directory (= git repo root, synced via git pull).
|
||||
|
|
@ -69,6 +68,87 @@ config = {}
|
|||
tray_icon = None
|
||||
overlay_window = None
|
||||
overlay_tk = None
|
||||
hotkey_listener = None
|
||||
|
||||
|
||||
# ── Hotkey via pynput ────────────────────────────────────────────────────────
|
||||
|
||||
# Modifier names accepted in a hotkey string, mapped to the set of pynput keys
# that satisfy them. A generic name ("ctrl") is satisfied by either side; the
# "_l" / "_r" names pin the modifier to one side of the keyboard.
_MODIFIER_MAP = {
    "ctrl": {Key.ctrl_l, Key.ctrl_r},
    "ctrl_l": {Key.ctrl_l},
    "ctrl_r": {Key.ctrl_r},
    "shift": {Key.shift_l, Key.shift_r},
    "shift_l": {Key.shift_l},
    "shift_r": {Key.shift_r},
    "alt": {Key.alt_l, Key.alt_r},
    "alt_l": {Key.alt_l},
    "alt_r": {Key.alt_r},
}

# Names of non-character keys accepted in a hotkey string, mapped to the
# corresponding pynput special key. "esc" and "escape" are synonyms.
_KEY_MAP = {
    "space": Key.space,
    "tab": Key.tab,
    "enter": Key.enter,
    "esc": Key.esc,
    "escape": Key.esc,
    "up": Key.up,
    "down": Key.down,
    "left": Key.left,
    "right": Key.right,
    "home": Key.home,
    "end": Key.end,
    "page_up": Key.page_up,
    "page_down": Key.page_down,
    "insert": Key.insert,
    "delete": Key.delete,
    "backspace": Key.backspace,
}
# Register the function keys "f1" .. "f12" by looking them up on Key by name.
for i in range(1, 13):
    _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}")
|
||||
|
||||
|
||||
def _parse_hotkey(hotkey_str):
    """Parse a hotkey string such as ``"ctrl+shift+space"``.

    The string is split on ``+`` and every part is stripped and lower-cased.
    All parts except the last are treated as modifiers; the last part is the
    trigger key. Names are resolved through ``_MODIFIER_MAP`` and ``_KEY_MAP``;
    anything else is turned into a character key with ``KeyCode.from_char``.

    Args:
        hotkey_str: Hotkey description with parts separated by ``+``.

    Returns:
        A ``(modifiers, trigger)`` tuple. ``modifiers`` is a list holding one
        set of pynput keys per modifier part (any key of a set satisfies that
        modifier). ``trigger`` is a single pynput key.
    """
    # str.split always yields at least one element, so the unpacking is safe.
    *modifier_parts, trigger_part = (
        part.strip().lower() for part in hotkey_str.split("+")
    )

    modifiers = []
    for part in modifier_parts:
        if part in _MODIFIER_MAP:
            modifiers.append(_MODIFIER_MAP[part])
        elif part in _KEY_MAP:
            modifiers.append({_KEY_MAP[part]})
        else:
            modifiers.append({KeyCode.from_char(part)})

    if trigger_part in _KEY_MAP:
        trigger = _KEY_MAP[trigger_part]
    elif trigger_part in _MODIFIER_MAP:
        # A modifier name used as the trigger must collapse to ONE key. Taking
        # "the first element" of the set is not stable: enum members hash by
        # name and string hashing is randomized per process, so the left/right
        # choice could differ between runs. Pick the lowest member name
        # instead, which is always the left-hand (or generic) key.
        trigger = min(_MODIFIER_MAP[trigger_part], key=lambda key: key.name)
    else:
        trigger = KeyCode.from_char(trigger_part)
    return modifiers, trigger
|
||||
|
||||
|
||||
class HotkeyListener:
    """Hold-to-record hotkey using pynput. No root required on X11.

    A pynput keyboard listener is started as a daemon thread on construction.
    ``on_press`` is called once when the trigger key goes down while every
    configured modifier is held; ``on_release`` is called when the trigger key
    is released afterwards. Both callbacks are invoked without arguments.

    Args:
        hotkey_str: Hotkey description, parsed by ``_parse_hotkey``.
        on_press: Zero-argument callable run when the hotkey becomes active.
        on_release: Zero-argument callable run when the trigger is released.
    """

    def __init__(self, hotkey_str, on_press, on_release):
        self._modifiers, self._trigger = _parse_hotkey(hotkey_str)
        self._on_press = on_press
        self._on_release = on_release
        self._pressed = set()   # normalized keys currently held down
        self._active = False    # True between on_press and on_release
        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
        self._listener.daemon = True
        self._listener.start()

    @staticmethod
    def _normalize(key):
        """Return ``key`` with a character key folded to lower case.

        pynput reports the character a key produces, so with Shift or Caps Lock
        active a letter arrives upper-cased and would never equal the
        lower-cased key built from the hotkey string. Keys without a ``char``
        (special keys, ``None`` for unknown keys) are returned unchanged.
        """
        char = getattr(key, "char", None)
        if isinstance(char, str) and char != char.lower():
            # Rebuild through the key's own type so no extra name is needed.
            return type(key).from_char(char.lower())
        return key

    def _matches_trigger(self, key):
        """Return True if the (normalized) ``key`` is the trigger key."""
        return key == self._trigger

    def _modifiers_held(self):
        """Return True if at least one key of every modifier set is held."""
        return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers)

    def _key_down(self, key):
        """Listener callback: track the key and fire ``on_press`` if armed."""
        key = self._normalize(key)
        self._pressed.add(key)
        # The _active guard keeps keyboard auto-repeat from firing repeatedly.
        if not self._active and self._matches_trigger(key) and self._modifiers_held():
            self._active = True
            self._on_press()

    def _key_up(self, key):
        """Listener callback: untrack the key and fire ``on_release``."""
        # Normalizing here too means a key pressed as 'D' and released as 'd'
        # (Shift let go first) is still removed from the held set.
        key = self._normalize(key)
        self._pressed.discard(key)
        if self._active and self._matches_trigger(key):
            self._active = False
            self._on_release()

    def stop(self):
        """Stop the underlying pynput listener."""
        self._listener.stop()
|
||||
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -184,8 +264,9 @@ def create_overlay(root):
|
|||
dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
|
||||
dot.pack(side="left", padx=(0, 8))
|
||||
|
||||
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||
tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
|
||||
font=("Segoe UI", 11)).pack(side="left")
|
||||
font=(_sans, 11)).pack(side="left")
|
||||
|
||||
overlay_window = win
|
||||
|
||||
|
|
@ -231,11 +312,17 @@ def stop_and_transcribe():
|
|||
rms = float(np.sqrt(np.mean(audio ** 2)))
|
||||
print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
|
||||
|
||||
if duration < 0.3 or rms < 0.0005:
|
||||
if duration < 0.3 or rms < 0.0001:
|
||||
print("Too short or silent — skipped.", flush=True)
|
||||
set_state(AppState.IDLE)
|
||||
return
|
||||
|
||||
# Normalize to target RMS so Whisper gets consistent signal level
|
||||
target_rms = 0.05
|
||||
if rms > 0:
|
||||
audio = audio * (target_rms / rms)
|
||||
audio = np.clip(audio, -1.0, 1.0)
|
||||
|
||||
lang = config["language"] if config["language"] else None
|
||||
prompt = get_initial_prompt()
|
||||
segments, _ = model.transcribe(
|
||||
|
|
@ -251,9 +338,6 @@ def stop_and_transcribe():
|
|||
time.sleep(0.15)
|
||||
typer.type(text)
|
||||
|
||||
def on_space_release(e):
|
||||
if state == AppState.RECORDING:
|
||||
threading.Thread(target=stop_and_transcribe, daemon=True).start()
|
||||
|
||||
|
||||
# ── Model loading ─────────────────────────────────────────────────────────────
|
||||
|
|
@ -287,23 +371,20 @@ def _open_settings_main():
|
|||
AMBER = "#f5a623"
|
||||
AMBER2 = "#c8831a"
|
||||
GREEN = "#4ade80"
|
||||
FONT = ("Consolas", 11)
|
||||
FONT_UI = ("Segoe UI", 11)
|
||||
FONT_B = ("Segoe UI", 11, "bold")
|
||||
FONT_S = ("Segoe UI", 9)
|
||||
FONT_H = ("Segoe UI Semibold", 16)
|
||||
_mono = "Consolas" if os.name == "nt" else "monospace"
|
||||
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||
FONT = (_mono, 11)
|
||||
FONT_UI = (_sans, 11)
|
||||
FONT_B = (_sans, 11, "bold")
|
||||
FONT_S = (_sans, 9)
|
||||
FONT_H = (_sans, 16, "bold")
|
||||
|
||||
win = tk.Toplevel(overlay_tk)
|
||||
win.title("Whisper Dictation")
|
||||
win.configure(bg=BG)
|
||||
win.attributes("-topmost", True)
|
||||
win.resizable(False, False)
|
||||
|
||||
# Center
|
||||
W, H = 680, 660
|
||||
win.update_idletasks()
|
||||
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
|
||||
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
|
||||
win.minsize(700, 0)
|
||||
|
||||
# Global option for OptionMenu dropdowns (dark listbox)
|
||||
win.option_add("*Menu.background", BG3)
|
||||
|
|
@ -322,11 +403,9 @@ def _open_settings_main():
|
|||
tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
|
||||
font=FONT_S, bg=BG2, fg=FG2).pack()
|
||||
|
||||
# ── Scrollable content ──
|
||||
canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
|
||||
canvas.pack(fill="both", expand=True)
|
||||
content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
|
||||
canvas.create_window((0, 0), window=content, anchor="nw")
|
||||
# ── Content ──
|
||||
content = tk.Frame(win, bg=BG, padx=36, pady=16)
|
||||
content.pack(fill="both", expand=True)
|
||||
|
||||
def section(label):
|
||||
f = tk.Frame(content, bg=BG)
|
||||
|
|
@ -443,6 +522,14 @@ def _open_settings_main():
|
|||
cancel_btn.pack(side="right", padx=(0, 10))
|
||||
btn_hover(cancel_btn, BORDER, BG3)
|
||||
|
||||
# Center on screen after layout
|
||||
win.update_idletasks()
|
||||
sw = win.winfo_screenwidth()
|
||||
sh = win.winfo_screenheight()
|
||||
w = win.winfo_reqwidth()
|
||||
h = win.winfo_reqheight()
|
||||
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||||
|
||||
|
||||
def open_vocab():
|
||||
if overlay_tk is None:
|
||||
|
|
@ -459,21 +546,20 @@ def _open_vocab_main():
|
|||
AMBER = "#f5a623"
|
||||
AMBER2 = "#c8831a"
|
||||
RED = "#f87171"
|
||||
FONT = ("Segoe UI", 11)
|
||||
FONT_B = ("Segoe UI", 11, "bold")
|
||||
FONT_S = ("Segoe UI", 9)
|
||||
FONT_H = ("Segoe UI Semibold", 14)
|
||||
FONT_M = ("Consolas", 10)
|
||||
_mono = "Consolas" if os.name == "nt" else "monospace"
|
||||
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||
FONT = (_sans, 11)
|
||||
FONT_B = (_sans, 11, "bold")
|
||||
FONT_S = (_sans, 9)
|
||||
FONT_H = (_sans, 14, "bold")
|
||||
FONT_M = (_mono, 10)
|
||||
|
||||
win = tk.Toplevel(overlay_tk)
|
||||
win.title("Vokabular")
|
||||
win.configure(bg=BG)
|
||||
win.attributes("-topmost", True)
|
||||
win.resizable(False, False)
|
||||
W, H = 600, 620
|
||||
win.update_idletasks()
|
||||
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
|
||||
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
|
||||
win.minsize(600, 0)
|
||||
win.option_add("*Menu.background", BG3)
|
||||
win.option_add("*Menu.foreground", FG)
|
||||
win.option_add("*Menu.activeBackground", AMBER)
|
||||
|
|
@ -629,12 +715,25 @@ def _open_vocab_main():
|
|||
tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
|
||||
font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
|
||||
|
||||
# Center on screen after layout
|
||||
win.update_idletasks()
|
||||
sw = win.winfo_screenwidth()
|
||||
sh = win.winfo_screenheight()
|
||||
w = win.winfo_reqwidth()
|
||||
h = win.winfo_reqheight()
|
||||
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||||
|
||||
|
||||
def reload_model_and_hotkey():
|
||||
keyboard.unhook_all()
|
||||
global hotkey_listener
|
||||
if hotkey_listener:
|
||||
hotkey_listener.stop()
|
||||
load_model()
|
||||
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
|
||||
keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release)
|
||||
hotkey_listener = HotkeyListener(
|
||||
config["hotkey"],
|
||||
on_press=start_recording,
|
||||
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
|
||||
)
|
||||
print(f"Hotkey updated: {config['hotkey']}", flush=True)
|
||||
|
||||
|
||||
|
|
@ -658,9 +757,12 @@ def main():
|
|||
stream.start()
|
||||
|
||||
# Hotkey
|
||||
last_key = config["hotkey"].split("+")[-1]
|
||||
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
|
||||
keyboard.on_release_key(last_key, on_space_release)
|
||||
global hotkey_listener
|
||||
hotkey_listener = HotkeyListener(
|
||||
config["hotkey"],
|
||||
on_press=start_recording,
|
||||
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
|
||||
)
|
||||
|
||||
# Tray
|
||||
menu = pystray.Menu(
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ py -3.13 -m venv .venv-windows
|
|||
|
||||
set "VENV=%~dp0.venv-windows"
|
||||
echo Installing dependencies...
|
||||
"%VENV%\Scripts\pip" install --upgrade pip
|
||||
"%VENV%\Scripts\pip" install -r requirements.txt
|
||||
"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
|
||||
"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
|
||||
|
||||
echo Installing CUDA 12 DLLs (required for GPU acceleration)...
|
||||
"%VENV%\Scripts\pip" install -r requirements-cuda.txt
|
||||
"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
|
||||
|
||||
echo.
|
||||
echo Done. Run start.bat to launch.
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ set -e
|
|||
cd "$(dirname "$0")"
|
||||
|
||||
echo "Creating Linux venv (.venv-linux)..."
|
||||
python3 -m venv .venv-linux
|
||||
python3 -m venv --system-site-packages .venv-linux
|
||||
|
||||
echo "Installing dependencies..."
|
||||
.venv-linux/bin/pip install --upgrade pip
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
faster-whisper>=1.0.2
|
||||
sounddevice>=0.4.6
|
||||
numpy>=1.24
|
||||
keyboard>=0.13
|
||||
pystray>=0.19
|
||||
Pillow>=9.5
|
||||
pynput>=1.7.6
|
||||
|
|
|
|||
Loading…
Reference in New Issue