feat: add Linux support, replace keyboard lib with pynput
- Replace `keyboard` package (requires root on Linux) with pynput Listener for hotkey handling — works without root on X11/Wayland
- Enable system-site-packages in venv for PyGObject/AppIndicator so pystray uses StatusNotifierItem backend on KDE Wayland
- Use platform-appropriate fonts (sans-serif/monospace on Linux)
- Auto-size settings and vocabulary windows instead of hardcoded dims
- Update install.sh with --system-site-packages flag

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e1a3eba05a
commit
4efa66fc79
178
dictate.py
178
dictate.py
|
|
@ -11,10 +11,9 @@ from tkinter import ttk
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
import keyboard
|
|
||||||
import pystray
|
import pystray
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pynput.keyboard import Controller as KeyboardController
|
from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
# Shared data dir: script directory (= git repo root, synced via git pull).
|
# Shared data dir: script directory (= git repo root, synced via git pull).
|
||||||
|
|
@ -69,6 +68,87 @@ config = {}
|
||||||
tray_icon = None
|
tray_icon = None
|
||||||
overlay_window = None
|
overlay_window = None
|
||||||
overlay_tk = None
|
overlay_tk = None
|
||||||
|
hotkey_listener = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Hotkey via pynput ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Modifier names accepted in a hotkey string, each mapped to the set of pynput
# keys that satisfy it. The generic names ("ctrl", "shift", "alt") accept
# either the left or the right physical key.
_MODIFIER_MAP = {
    "ctrl": {Key.ctrl_l, Key.ctrl_r},
    "ctrl_l": {Key.ctrl_l},
    "ctrl_r": {Key.ctrl_r},
    "shift": {Key.shift_l, Key.shift_r},
    "shift_l": {Key.shift_l},
    "shift_r": {Key.shift_r},
    "alt": {Key.alt_l, Key.alt_r},
    "alt_l": {Key.alt_l},
    "alt_r": {Key.alt_r},
}

# Names of non-character keys accepted in a hotkey string, mapped to the
# corresponding pynput Key member.
_KEY_MAP = {
    "space": Key.space,
    "tab": Key.tab,
    "enter": Key.enter,
    "esc": Key.esc,
    "escape": Key.esc,
    "up": Key.up,
    "down": Key.down,
    "left": Key.left,
    "right": Key.right,
    "home": Key.home,
    "end": Key.end,
    "page_up": Key.page_up,
    "page_down": Key.page_down,
    "insert": Key.insert,
    "delete": Key.delete,
    "backspace": Key.backspace,
    # "f1" .. "f12": built inside the literal so that no loop variable is left
    # behind in the module namespace.
    **{f"f{n}": getattr(Key, f"f{n}") for n in range(1, 13)},
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hotkey(hotkey_str):
    """Parse a hotkey string such as ``"ctrl+shift+space"``.

    The string is split on ``+``; each part is stripped and lower-cased. The
    last part is the trigger key, every part before it is a modifier.

    Name resolution:
      * modifiers: ``_MODIFIER_MAP``, then ``_KEY_MAP``, then pynput's own
        ``Key`` member names (e.g. ``"cmd"``, ``"caps_lock"``), and finally a
        character key via ``KeyCode.from_char``.
      * trigger: ``_KEY_MAP``, then ``_MODIFIER_MAP``, then ``Key`` member
        names, and finally ``KeyCode.from_char``.

    Args:
        hotkey_str: Hotkey description, parts separated by ``+``.

    Returns:
        ``(modifiers, trigger)`` where ``modifiers`` is a list of sets of
        pynput keys (one set per modifier; any key of a set satisfies that
        modifier) and ``trigger`` is a single pynput key.
    """
    # str.split always yields at least one part, so the star-unpack is safe.
    *modifier_names, trigger_name = (
        part.strip().lower() for part in hotkey_str.split("+")
    )

    def named_key(name):
        """Return the special key called *name*, or None if there is none."""
        if name in _KEY_MAP:
            return _KEY_MAP[name]
        # Key is an Enum; __members__ maps member names to members and, unlike
        # getattr(), cannot return unrelated class attributes.
        return Key.__members__.get(name)

    modifiers = []
    for name in modifier_names:
        if name in _MODIFIER_MAP:
            modifiers.append(_MODIFIER_MAP[name])
            continue
        key = named_key(name)
        modifiers.append({KeyCode.from_char(name) if key is None else key})

    if trigger_name in _KEY_MAP:
        trigger = _KEY_MAP[trigger_name]
    elif trigger_name in _MODIFIER_MAP:
        # A generic modifier maps to several keys but the trigger must be one
        # key. Pick by member name so the choice is stable between runs
        # (set iteration order is not); for the entries in _MODIFIER_MAP the
        # name that sorts first is the left-hand variant.
        trigger = min(_MODIFIER_MAP[trigger_name], key=lambda k: k.name)
    else:
        trigger = Key.__members__.get(trigger_name)
        if trigger is None:
            trigger = KeyCode.from_char(trigger_name)

    return modifiers, trigger
|
||||||
|
|
||||||
|
|
||||||
|
class HotkeyListener:
    """Hold-to-record hotkey using pynput. No root required on X11.

    ``on_press`` is called once when the trigger key goes down while every
    modifier has at least one of its keys held. ``on_release`` is called when
    the trigger key is released after such an activation. Both callbacks take
    no arguments and are invoked from the listener's callbacks.

    Character keys are compared case-insensitively: key events can carry
    modifier state (e.g. "D" rather than "d" while Shift is held), which would
    otherwise keep a hotkey like "shift+d" from ever matching and would leave
    stale entries in the pressed-key set when press and release differ in case.
    """

    def __init__(self, hotkey_str, on_press, on_release):
        """Parse *hotkey_str* and start listening immediately.

        Args:
            hotkey_str: Hotkey description understood by ``_parse_hotkey``.
            on_press: Zero-argument callable run when the hotkey activates.
            on_release: Zero-argument callable run when the trigger is released.
        """
        modifiers, trigger = _parse_hotkey(hotkey_str)
        # Build fresh sets: the parsed ones may be the shared module-level
        # sets and must not be modified.
        self._modifiers = [{self._normalize(k) for k in group} for group in modifiers]
        self._trigger = self._normalize(trigger)
        self._on_press = on_press
        self._on_release = on_release
        self._pressed = set()   # normalized keys currently held down
        self._active = False    # True between on_press and on_release
        self._listener = KeyboardListener(on_press=self._key_down, on_release=self._key_up)
        self._listener.daemon = True
        self._listener.start()

    @staticmethod
    def _normalize(key):
        """Return a case-insensitive comparison token for *key*.

        Keys exposing a string ``char`` (and not flagged ``is_dead``) become
        that character lower-cased; anything else is returned unchanged.
        Applying this twice gives the same result as applying it once.
        """
        char = getattr(key, "char", None)
        if isinstance(char, str) and not getattr(key, "is_dead", False):
            return char.lower()
        return key

    def _matches_trigger(self, key):
        """Return True if *key* is the trigger key (ignoring letter case)."""
        return self._normalize(key) == self._trigger

    def _modifiers_held(self):
        """Return True if every modifier has at least one of its keys held."""
        return all(any(k in self._pressed for k in mod_set) for mod_set in self._modifiers)

    def _key_down(self, key):
        """Record the pressed key and fire ``on_press`` on activation."""
        key = self._normalize(key)
        self._pressed.add(key)
        # `not self._active` makes repeated key-down events for a held trigger
        # fire on_press only once.
        if not self._active and self._matches_trigger(key) and self._modifiers_held():
            self._active = True
            self._on_press()

    def _key_up(self, key):
        """Forget the released key and fire ``on_release`` if it ends a hold."""
        key = self._normalize(key)
        self._pressed.discard(key)
        # Only the trigger ends the hold; releasing a modifier first does not.
        if self._active and self._matches_trigger(key):
            self._active = False
            self._on_release()

    def stop(self):
        """Stop the underlying keyboard listener."""
        self._listener.stop()
|
||||||
|
|
||||||
|
|
||||||
# ── Config ────────────────────────────────────────────────────────────────────
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
|
|
@ -184,8 +264,9 @@ def create_overlay(root):
|
||||||
dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
|
dot.create_oval(2, 2, 12, 12, fill="#e03030", outline="")
|
||||||
dot.pack(side="left", padx=(0, 8))
|
dot.pack(side="left", padx=(0, 8))
|
||||||
|
|
||||||
|
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||||
tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
|
tk.Label(frame, text="Aufnahme läuft …", fg="white", bg="#1a1a1a",
|
||||||
font=("Segoe UI", 11)).pack(side="left")
|
font=(_sans, 11)).pack(side="left")
|
||||||
|
|
||||||
overlay_window = win
|
overlay_window = win
|
||||||
|
|
||||||
|
|
@ -231,11 +312,17 @@ def stop_and_transcribe():
|
||||||
rms = float(np.sqrt(np.mean(audio ** 2)))
|
rms = float(np.sqrt(np.mean(audio ** 2)))
|
||||||
print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
|
print(f"Audio: {duration:.1f}s RMS: {rms:.5f}", flush=True)
|
||||||
|
|
||||||
if duration < 0.3 or rms < 0.0005:
|
if duration < 0.3 or rms < 0.0001:
|
||||||
print("Too short or silent — skipped.", flush=True)
|
print("Too short or silent — skipped.", flush=True)
|
||||||
set_state(AppState.IDLE)
|
set_state(AppState.IDLE)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Normalize to target RMS so Whisper gets consistent signal level
|
||||||
|
target_rms = 0.05
|
||||||
|
if rms > 0:
|
||||||
|
audio = audio * (target_rms / rms)
|
||||||
|
audio = np.clip(audio, -1.0, 1.0)
|
||||||
|
|
||||||
lang = config["language"] if config["language"] else None
|
lang = config["language"] if config["language"] else None
|
||||||
prompt = get_initial_prompt()
|
prompt = get_initial_prompt()
|
||||||
segments, _ = model.transcribe(
|
segments, _ = model.transcribe(
|
||||||
|
|
@ -251,9 +338,6 @@ def stop_and_transcribe():
|
||||||
time.sleep(0.15)
|
time.sleep(0.15)
|
||||||
typer.type(text)
|
typer.type(text)
|
||||||
|
|
||||||
def on_space_release(e):
|
|
||||||
if state == AppState.RECORDING:
|
|
||||||
threading.Thread(target=stop_and_transcribe, daemon=True).start()
|
|
||||||
|
|
||||||
|
|
||||||
# ── Model loading ─────────────────────────────────────────────────────────────
|
# ── Model loading ─────────────────────────────────────────────────────────────
|
||||||
|
|
@ -287,23 +371,20 @@ def _open_settings_main():
|
||||||
AMBER = "#f5a623"
|
AMBER = "#f5a623"
|
||||||
AMBER2 = "#c8831a"
|
AMBER2 = "#c8831a"
|
||||||
GREEN = "#4ade80"
|
GREEN = "#4ade80"
|
||||||
FONT = ("Consolas", 11)
|
_mono = "Consolas" if os.name == "nt" else "monospace"
|
||||||
FONT_UI = ("Segoe UI", 11)
|
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||||
FONT_B = ("Segoe UI", 11, "bold")
|
FONT = (_mono, 11)
|
||||||
FONT_S = ("Segoe UI", 9)
|
FONT_UI = (_sans, 11)
|
||||||
FONT_H = ("Segoe UI Semibold", 16)
|
FONT_B = (_sans, 11, "bold")
|
||||||
|
FONT_S = (_sans, 9)
|
||||||
|
FONT_H = (_sans, 16, "bold")
|
||||||
|
|
||||||
win = tk.Toplevel(overlay_tk)
|
win = tk.Toplevel(overlay_tk)
|
||||||
win.title("Whisper Dictation")
|
win.title("Whisper Dictation")
|
||||||
win.configure(bg=BG)
|
win.configure(bg=BG)
|
||||||
win.attributes("-topmost", True)
|
win.attributes("-topmost", True)
|
||||||
win.resizable(False, False)
|
win.resizable(False, False)
|
||||||
|
win.minsize(700, 0)
|
||||||
# Center
|
|
||||||
W, H = 680, 660
|
|
||||||
win.update_idletasks()
|
|
||||||
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
|
|
||||||
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
|
|
||||||
|
|
||||||
# Global option for OptionMenu dropdowns (dark listbox)
|
# Global option for OptionMenu dropdowns (dark listbox)
|
||||||
win.option_add("*Menu.background", BG3)
|
win.option_add("*Menu.background", BG3)
|
||||||
|
|
@ -322,11 +403,9 @@ def _open_settings_main():
|
||||||
tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
|
tk.Label(hdr, text="Lokale GPU-Transkription · offline · privat",
|
||||||
font=FONT_S, bg=BG2, fg=FG2).pack()
|
font=FONT_S, bg=BG2, fg=FG2).pack()
|
||||||
|
|
||||||
# ── Scrollable content ──
|
# ── Content ──
|
||||||
canvas = tk.Canvas(win, bg=BG, highlightthickness=0)
|
content = tk.Frame(win, bg=BG, padx=36, pady=16)
|
||||||
canvas.pack(fill="both", expand=True)
|
content.pack(fill="both", expand=True)
|
||||||
content = tk.Frame(canvas, bg=BG, padx=36, pady=16)
|
|
||||||
canvas.create_window((0, 0), window=content, anchor="nw")
|
|
||||||
|
|
||||||
def section(label):
|
def section(label):
|
||||||
f = tk.Frame(content, bg=BG)
|
f = tk.Frame(content, bg=BG)
|
||||||
|
|
@ -443,6 +522,14 @@ def _open_settings_main():
|
||||||
cancel_btn.pack(side="right", padx=(0, 10))
|
cancel_btn.pack(side="right", padx=(0, 10))
|
||||||
btn_hover(cancel_btn, BORDER, BG3)
|
btn_hover(cancel_btn, BORDER, BG3)
|
||||||
|
|
||||||
|
# Center on screen after layout
|
||||||
|
win.update_idletasks()
|
||||||
|
sw = win.winfo_screenwidth()
|
||||||
|
sh = win.winfo_screenheight()
|
||||||
|
w = win.winfo_reqwidth()
|
||||||
|
h = win.winfo_reqheight()
|
||||||
|
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||||||
|
|
||||||
|
|
||||||
def open_vocab():
|
def open_vocab():
|
||||||
if overlay_tk is None:
|
if overlay_tk is None:
|
||||||
|
|
@ -459,21 +546,20 @@ def _open_vocab_main():
|
||||||
AMBER = "#f5a623"
|
AMBER = "#f5a623"
|
||||||
AMBER2 = "#c8831a"
|
AMBER2 = "#c8831a"
|
||||||
RED = "#f87171"
|
RED = "#f87171"
|
||||||
FONT = ("Segoe UI", 11)
|
_mono = "Consolas" if os.name == "nt" else "monospace"
|
||||||
FONT_B = ("Segoe UI", 11, "bold")
|
_sans = "Segoe UI" if os.name == "nt" else "sans-serif"
|
||||||
FONT_S = ("Segoe UI", 9)
|
FONT = (_sans, 11)
|
||||||
FONT_H = ("Segoe UI Semibold", 14)
|
FONT_B = (_sans, 11, "bold")
|
||||||
FONT_M = ("Consolas", 10)
|
FONT_S = (_sans, 9)
|
||||||
|
FONT_H = (_sans, 14, "bold")
|
||||||
|
FONT_M = (_mono, 10)
|
||||||
|
|
||||||
win = tk.Toplevel(overlay_tk)
|
win = tk.Toplevel(overlay_tk)
|
||||||
win.title("Vokabular")
|
win.title("Vokabular")
|
||||||
win.configure(bg=BG)
|
win.configure(bg=BG)
|
||||||
win.attributes("-topmost", True)
|
win.attributes("-topmost", True)
|
||||||
win.resizable(False, False)
|
win.resizable(False, False)
|
||||||
W, H = 600, 620
|
win.minsize(600, 0)
|
||||||
win.update_idletasks()
|
|
||||||
sw, sh = win.winfo_screenwidth(), win.winfo_screenheight()
|
|
||||||
win.geometry(f"{W}x{H}+{(sw-W)//2}+{(sh-H)//2}")
|
|
||||||
win.option_add("*Menu.background", BG3)
|
win.option_add("*Menu.background", BG3)
|
||||||
win.option_add("*Menu.foreground", FG)
|
win.option_add("*Menu.foreground", FG)
|
||||||
win.option_add("*Menu.activeBackground", AMBER)
|
win.option_add("*Menu.activeBackground", AMBER)
|
||||||
|
|
@ -629,12 +715,25 @@ def _open_vocab_main():
|
||||||
tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
|
tk.Label(win, text="Wörter fließen als Kontext in Whisper ein · Korrekturen werden nach der Transkription angewendet",
|
||||||
font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
|
font=FONT_S, bg=BG2, fg=FG2, pady=8).pack()
|
||||||
|
|
||||||
|
# Center on screen after layout
|
||||||
|
win.update_idletasks()
|
||||||
|
sw = win.winfo_screenwidth()
|
||||||
|
sh = win.winfo_screenheight()
|
||||||
|
w = win.winfo_reqwidth()
|
||||||
|
h = win.winfo_reqheight()
|
||||||
|
win.geometry(f"+{(sw-w)//2}+{(sh-h)//2}")
|
||||||
|
|
||||||
|
|
||||||
def reload_model_and_hotkey():
|
def reload_model_and_hotkey():
|
||||||
keyboard.unhook_all()
|
global hotkey_listener
|
||||||
|
if hotkey_listener:
|
||||||
|
hotkey_listener.stop()
|
||||||
load_model()
|
load_model()
|
||||||
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
|
hotkey_listener = HotkeyListener(
|
||||||
keyboard.on_release_key(config["hotkey"].split("+")[-1], on_space_release)
|
config["hotkey"],
|
||||||
|
on_press=start_recording,
|
||||||
|
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
|
||||||
|
)
|
||||||
print(f"Hotkey updated: {config['hotkey']}", flush=True)
|
print(f"Hotkey updated: {config['hotkey']}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -658,9 +757,12 @@ def main():
|
||||||
stream.start()
|
stream.start()
|
||||||
|
|
||||||
# Hotkey
|
# Hotkey
|
||||||
last_key = config["hotkey"].split("+")[-1]
|
global hotkey_listener
|
||||||
keyboard.add_hotkey(config["hotkey"], start_recording, suppress=True)
|
hotkey_listener = HotkeyListener(
|
||||||
keyboard.on_release_key(last_key, on_space_release)
|
config["hotkey"],
|
||||||
|
on_press=start_recording,
|
||||||
|
on_release=lambda: threading.Thread(target=stop_and_transcribe, daemon=True).start(),
|
||||||
|
)
|
||||||
|
|
||||||
# Tray
|
# Tray
|
||||||
menu = pystray.Menu(
|
menu = pystray.Menu(
|
||||||
|
|
|
||||||
|
|
@ -6,11 +6,11 @@ py -3.13 -m venv .venv-windows
|
||||||
|
|
||||||
set "VENV=%~dp0.venv-windows"
|
set "VENV=%~dp0.venv-windows"
|
||||||
echo Installing dependencies...
|
echo Installing dependencies...
|
||||||
"%VENV%\Scripts\pip" install --upgrade pip
|
"%VENV%\Scripts\python.exe" -m pip install --upgrade pip
|
||||||
"%VENV%\Scripts\pip" install -r requirements.txt
|
"%VENV%\Scripts\python.exe" -m pip install -r requirements.txt
|
||||||
|
|
||||||
echo Installing CUDA 12 DLLs (required for GPU acceleration)...
|
echo Installing CUDA 12 DLLs (required for GPU acceleration)...
|
||||||
"%VENV%\Scripts\pip" install -r requirements-cuda.txt
|
"%VENV%\Scripts\python.exe" -m pip install -r requirements-cuda.txt
|
||||||
|
|
||||||
echo.
|
echo.
|
||||||
echo Done. Run start.bat to launch.
|
echo Done. Run start.bat to launch.
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ set -e
|
||||||
cd "$(dirname "$0")"
|
cd "$(dirname "$0")"
|
||||||
|
|
||||||
echo "Creating Linux venv (.venv-linux)..."
|
echo "Creating Linux venv (.venv-linux)..."
|
||||||
python3 -m venv .venv-linux
|
python3 -m venv --system-site-packages .venv-linux
|
||||||
|
|
||||||
echo "Installing dependencies..."
|
echo "Installing dependencies..."
|
||||||
.venv-linux/bin/pip install --upgrade pip
|
.venv-linux/bin/pip install --upgrade pip
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
faster-whisper>=1.0.2
|
faster-whisper>=1.0.2
|
||||||
sounddevice>=0.4.6
|
sounddevice>=0.4.6
|
||||||
numpy>=1.24
|
numpy>=1.24
|
||||||
keyboard>=0.13
|
|
||||||
pystray>=0.19
|
pystray>=0.19
|
||||||
Pillow>=9.5
|
Pillow>=9.5
|
||||||
pynput>=1.7.6
|
pynput>=1.7.6
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue