From 5aaf8b59ceeaf1856ef91543abd9a4b54988e8f8 Mon Sep 17 00:00:00 2001 From: Christian Kauer Date: Sun, 22 Mar 2026 11:01:14 +0100 Subject: [PATCH] fix linux version --- .claude/settings.local.json | 29 +++++++++++- .gitignore | 1 + README.md | 85 +++++++++++++++++++++++++++++----- config.json | 5 +- main.py | 2 + shared_data/.directory | 2 + shared_data/vocabulary.json | 63 +++++++++++++++++++++++++ whisper_app/config.py | 16 ++++++- whisper_app/hotkey.py | 49 +++++++++++++------- whisper_app/settings_window.py | 42 +++++++++++++++++ whisper_app/transcriber.py | 2 + whisper_app/typer.py | 9 ++-- 12 files changed, 268 insertions(+), 37 deletions(-) create mode 100644 shared_data/.directory create mode 100644 shared_data/vocabulary.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 21b9c57..8ab1f28 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -36,7 +36,34 @@ "Bash(bash build-linux.sh)", "Bash(.venv-linux/bin/python -c \"import tkinter; print\\(''tkinter OK''\\)\")", "Bash(pacman -Q tk)", - "Bash(sudo pacman:*)" + "Bash(sudo pacman:*)", + "Bash(grep -r \"WHISPER_DATA_DIR\\\\|WHISPER_LOCAL_DIR\" /run/media/chk/Ventoy/projects/chrka/whisper-dictation --include=*.py)", + "Bash(grep -l \"config.load_config\\\\|config.load_vocab\" /run/media/chk/Ventoy/projects/chrka/whisper-dictation/whisper_app/*.py)", + "Bash(.venv-linux/bin/python -m pytest tests/ -v)", + "Bash(.venv-linux/bin/python -m unittest discover -s tests -v)", + "Bash(head -5 tests/*.py)", + "Bash(.venv-linux/bin/pip install:*)", + "Bash(./whisper-dictation)", + "Bash(pacman -Ss appindicator)", + "Bash(pacman -Q libayatana-appindicator)", + "Bash(echo \"$XDG_SESSION_TYPE\")", + "Bash(echo \"Session: $XDG_SESSION_TYPE\")", + "Bash(mount)", + "Bash(desktop-file-validate ~/.local/share/applications/whisper-dictation.desktop)", + "Bash(update-desktop-database ~/.local/share/applications/)", + "Bash(echo \"DISPLAY=$DISPLAY\")", + "Bash(xlsclients)", + 
"Bash(DISPLAY=:0 xdpyinfo)", + "Bash(pkill -f \"whisper-dictation.*resource_tracker\")", + "Bash(pkill -f \"dist/whisper-dictation-linux/whisper-dictation\")", + "Bash(pkill -9 -f whisper-dictation)", + "Bash(pkill -f whisper-dictation)", + "Bash(gtk-launch whisper-dictation:*)", + "Bash(pkill -9 -f resource_tracker)", + "Bash(echo \"Desktop: $XDG_CURRENT_DESKTOP\")", + "Bash(nvidia-smi)", + "Bash(lspci)", + "Bash(pacman -Q)" ] } } diff --git a/.gitignore b/.gitignore index ada5b23..611a755 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ models/ *.log build/ dist/ +shared_data/models--Systran--faster-whisper-medium/ icon.ico .claude/settings.local.json .superpowers/ diff --git a/README.md b/README.md index d77775b..7f7aed3 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,10 @@ Local GPU speech-to-text dictation tool. Hold a hotkey to record, release to tra - System tray icon with settings GUI (tkinter) - Configurable hotkey, model, language, audio device +- Cross-platform: Windows and Linux builds from a single codebase - Shared config via git (`config.json`, `vocabulary.json`) -- Machine-specific settings stored locally (audio device, GPU settings) -- Windows: GPU acceleration via CUDA; Linux: CPU +- Machine-specific settings stored locally (audio device, GPU settings, model) +- Configurable shared paths for vocabulary and model cache (useful for dual-boot setups) ## Requirements @@ -20,8 +21,38 @@ Local GPU speech-to-text dictation tool. 
Hold a hotkey to record, release to tra - `pyinstaller` (for building a standalone executable) ### Linux + +**System packages (install via package manager):** + +Arch/CachyOS: +```bash +sudo pacman -S tk libayatana-appindicator wl-clipboard xdotool +``` + +Debian/Ubuntu: +```bash +sudo apt install python3-tk libayatana-appindicator3-1 wl-clipboard xdotool +``` + +| Package | Purpose | +|---------|---------| +| `tk` | tkinter GUI (settings, log, vocabulary windows) | +| `libayatana-appindicator` | System tray icon (required for KDE/GNOME on Wayland) | +| `wl-clipboard` | Text injection on Wayland (`wl-copy`) | +| `xdotool` | Simulates Ctrl+V paste on Wayland, text typing on X11 | + +**Optional (for GPU acceleration):** + +Arch/CachyOS: +```bash +sudo pacman -S nvidia cuda +``` + +Without CUDA, the app runs on CPU. Use `int8` compute type and a smaller model (`small` or `base`) for acceptable speed on CPU. + +**Python:** - Python 3.10+ -- PortAudio: `sudo apt install portaudio19-dev` +- PortAudio system library (`sounddevice` wheels bundle it only on Windows/macOS): `sudo pacman -S portaudio` or `sudo apt install libportaudio2` ## Installation @@ -36,11 +67,11 @@ This creates a `.venv-windows` virtual environment, installs all dependencies an ### Linux ```bash -chmod +x install.sh start.sh +chmod +x install.sh start.sh build-linux.sh ./install.sh ``` -Creates a `.venv-linux` virtual environment. GPU support on Linux requires a manually installed CUDA environment; by default runs on CPU. +Creates a `.venv-linux` virtual environment with all dependencies and PyInstaller. ## Usage @@ -58,33 +89,63 @@ The app starts in the system tray. Hold the hotkey (default: `Ctrl+Shift+Space`) ## Build -To produce a standalone Windows executable: +Builds are platform-specific and output to separate directories: +- Windows: `dist/whisper-dictation-windows/` +- Linux: `dist/whisper-dictation-linux/` +### Windows ```bat .venv-windows\Scripts\python.exe build.py ``` -This uses PyInstaller to bundle the app and all dependencies into a single folder under `dist/`. 
The resulting executable can be run without a Python installation. +### Linux +```bash +./build-linux.sh +``` + +Both use PyInstaller to bundle the app into a standalone folder. The resulting executable can be run without a Python installation. ## Configuration -`config.json` (shared, stored in the repo): +### Shared config (`config.json`, in app directory) | Key | Default | Description | |-----|---------|-------------| | `hotkey` | `ctrl+shift+space` | Recording trigger | -| `model` | `medium` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v2`, `large-v3`) | | `language` | `de` | Transcription language (`de`, `en`, `fr`, `es`, `it`, `null` = auto) | | `sample_rate` | `16000` | Audio sample rate in Hz | +| `vocab_path` | `""` | Path to vocabulary file (empty = local `vocabulary.json`) | +| `model_dir` | `""` | Path to shared model cache directory (empty = default HuggingFace cache) | -Machine-specific settings (GPU device, compute type, audio device) are stored separately and not tracked by git: +### Local config (`config_local.json`, per machine) + +Stored outside the app directory to keep machine-specific settings separate: - **Windows:** `%LOCALAPPDATA%\WhisperDictation\config_local.json` - **Linux:** `~/.local/share/WhisperDictation/config_local.json` +| Key | Default | Description | +|-----|---------|-------------| +| `model` | `medium` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v2`, `large-v3`) | +| `device` | `cuda` | Inference device (`cuda` or `cpu`) | +| `compute_type` | `float16` | Precision (`float16` for GPU, `int8` for CPU, `float32`) | +| `audio_device` | `null` | Microphone (null = system default) | + +### Sharing data between Windows and Linux + +On a shared drive (e.g. Ventoy USB), both builds can use the same vocabulary and model files. 
Set `vocab_path` and `model_dir` in the Settings UI to point to a common directory: + +``` +shared_data/ + vocabulary.json <- shared vocabulary + models/ <- shared Whisper model cache +``` + +Audio settings, model selection, and compute type remain per-platform in `config_local.json`. + ## Vocabulary -Custom vocabulary/replacements can be added to `vocabulary.json`. These are passed as initial prompts to improve recognition of domain-specific terms. +Custom vocabulary/replacements can be edited via the Settings UI or directly in `vocabulary.json`. Words are passed as initial prompts to improve recognition of domain-specific terms. Replacements are applied as find/replace after transcription. ## Model Download -On first start the selected Whisper model is downloaded automatically from HuggingFace (~500 MB for `medium`). Subsequent starts use the cached model. +On first start the selected Whisper model is downloaded automatically from HuggingFace (~1.5 GB for `medium`). Subsequent starts use the cached model. Set `model_dir` to share the cache between builds. 
diff --git a/config.json b/config.json index 3168cd1..571aec1 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,7 @@ { "hotkey": "ctrl+shift+space", - "model": "medium", "language": "de", - "sample_rate": 16000 + "sample_rate": 16000, + "vocab_path": "/run/media/chk/Ventoy/projects/chrka/whisper-dictation/shared_data/vocabulary.json", + "model_dir": "/run/media/chk/Ventoy/projects/chrka/whisper-dictation/shared_data/" } \ No newline at end of file diff --git a/main.py b/main.py index 435fe24..1d991e9 100644 --- a/main.py +++ b/main.py @@ -103,4 +103,6 @@ def _quit(stream, icon): app.overlay_tk.after(0, app.overlay_tk.quit) if __name__ == "__main__": + import multiprocessing + multiprocessing.freeze_support() main() diff --git a/shared_data/.directory b/shared_data/.directory new file mode 100644 index 0000000..e4c3043 --- /dev/null +++ b/shared_data/.directory @@ -0,0 +1,2 @@ +[Desktop Entry] +Icon=folder-yellow diff --git a/shared_data/vocabulary.json b/shared_data/vocabulary.json new file mode 100644 index 0000000..c23e13d --- /dev/null +++ b/shared_data/vocabulary.json @@ -0,0 +1,63 @@ +{ + "words": [ + "test" + ], + "replacements": [ + { + "from": "KRA", + "to": "KRAH" + }, + { + "from": "Atos", + "to": "ATHOS" + }, + { + "from": "Resistec", + "to": "RESISTEC" + }, + { + "from": "Resistek", + "to": "RESISTEC" + }, + { + "from": "HES", + "to": "HEES" + }, + { + "from": "Ackerschot", + "to": "Ackerschott" + }, + { + "from": "Carrois", + "to": "Kauer" + }, + { + "from": "Jouer fixe", + "to": "Jour-Fixe" + }, + { + "from": "Docuware", + "to": "DocuWare" + }, + { + "from": "Nates", + "to": "Nejc" + }, + { + "from": "Bittzeit", + "to": "BitSight" + }, + { + "from": "Kalmikow", + "to": "Kalmykov" + }, + { + "from": "Leifert", + "to": "Leifer" + }, + { + "from": "Kiyosa", + "to": "Key-User" + } + ] +} \ No newline at end of file diff --git a/whisper_app/config.py b/whisper_app/config.py index 1f1ee54..4173da9 100644 --- a/whisper_app/config.py +++ 
b/whisper_app/config.py @@ -33,6 +33,8 @@ DEFAULT_CONFIG = { "language": "de", "audio_device": None, "sample_rate": 16000, + "vocab_path": "", + "model_dir": "", } MODELS = ["tiny", "base", "small", "medium", "large-v2", "large-v3"] @@ -40,12 +42,22 @@ LANGUAGES = {"Deutsch": "de", "English": "en", "Français": "fr", "Español": "e "Italiano": "it", "Auto": None} DEVICES = ["cuda", "cpu"] COMPUTE_TYPES = {"float16 (GPU)": "float16", "int8 (CPU/GPU)": "int8", "float32": "float32"} -LOCAL_KEYS = {"audio_device", "device", "compute_type"} +LOCAL_KEYS = {"audio_device", "device", "compute_type", "model"} config: dict = {} vocab: dict = {"words": [], "replacements": []} +def _resolve_vocab_file() -> None: + """Set VOCAB_FILE from config['vocab_path'], falling back to DATA_DIR.""" + global VOCAB_FILE + vp = config.get("vocab_path", "") + if vp: + VOCAB_FILE = vp if os.path.isabs(vp) else os.path.join(DATA_DIR, vp) + else: + VOCAB_FILE = os.path.join(DATA_DIR, "vocabulary.json") + + def load_config() -> None: global config os.makedirs(_local_dir, exist_ok=True) @@ -63,6 +75,7 @@ def load_config() -> None: config.update(json.load(f)) except json.JSONDecodeError: print(f"Warning: could not parse {CONFIG_LOCAL_FILE}; ignoring") + _resolve_vocab_file() def save_config() -> None: @@ -74,6 +87,7 @@ def save_config() -> None: json.dump(shared, f, indent=2) with open(CONFIG_LOCAL_FILE, "w", encoding="utf-8") as f: json.dump(local, f, indent=2) + _resolve_vocab_file() def load_vocab() -> None: diff --git a/whisper_app/hotkey.py b/whisper_app/hotkey.py index 452d50c..2ed9ef5 100644 --- a/whisper_app/hotkey.py +++ b/whisper_app/hotkey.py @@ -1,29 +1,42 @@ -from pynput.keyboard import Controller as KeyboardController, Listener as KeyboardListener, Key, KeyCode +_pynput_loaded = False +Key = KeyCode = KeyboardListener = None -_MODIFIER_MAP = { - "ctrl": {Key.ctrl_l, Key.ctrl_r}, - "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r}, - "shift": {Key.shift_l, Key.shift_r}, - "shift_l": 
{Key.shift_l}, "shift_r": {Key.shift_r}, - "alt": {Key.alt_l, Key.alt_r}, - "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r}, -} +def _ensure_pynput(): + global _pynput_loaded, Key, KeyCode, KeyboardListener, _MODIFIER_MAP, _KEY_MAP + if _pynput_loaded: + return + from pynput.keyboard import Listener as _Listener, Key as _Key, KeyCode as _KeyCode + Key = _Key + KeyCode = _KeyCode + KeyboardListener = _Listener + _MODIFIER_MAP.update({ + "ctrl": {Key.ctrl_l, Key.ctrl_r}, + "ctrl_l": {Key.ctrl_l}, "ctrl_r": {Key.ctrl_r}, + "shift": {Key.shift_l, Key.shift_r}, + "shift_l": {Key.shift_l}, "shift_r": {Key.shift_r}, + "alt": {Key.alt_l, Key.alt_r}, + "alt_l": {Key.alt_l}, "alt_r": {Key.alt_r}, + }) + _KEY_MAP.update({ + "space": Key.space, "tab": Key.tab, "enter": Key.enter, + "esc": Key.esc, "escape": Key.esc, + "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right, + "home": Key.home, "end": Key.end, "page_up": Key.page_up, "page_down": Key.page_down, + "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace, + }) + for i in range(1, 13): + _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}") + _pynput_loaded = True -_KEY_MAP = { - "space": Key.space, "tab": Key.tab, "enter": Key.enter, - "esc": Key.esc, "escape": Key.esc, - "up": Key.up, "down": Key.down, "left": Key.left, "right": Key.right, - "home": Key.home, "end": Key.end, "page_up": Key.page_up, "page_down": Key.page_down, - "insert": Key.insert, "delete": Key.delete, "backspace": Key.backspace, -} -for i in range(1, 13): - _KEY_MAP[f"f{i}"] = getattr(Key, f"f{i}") +_MODIFIER_MAP = {} +_KEY_MAP = {} def _parse_hotkey(hotkey_str): """Parse hotkey string into (modifier_sets, trigger_key). 
Returns: (list of sets-of-pynput-keys for each modifier, pynput key for trigger) """ + _ensure_pynput() parts = [p.strip().lower() for p in hotkey_str.split("+")] modifiers = [] for p in parts[:-1]: diff --git a/whisper_app/settings_window.py b/whisper_app/settings_window.py index bf9e5f3..c58db58 100644 --- a/whisper_app/settings_window.py +++ b/whisper_app/settings_window.py @@ -1,6 +1,7 @@ import os import threading import tkinter as tk +from tkinter import filedialog from whisper_app import config as cfg @@ -177,6 +178,45 @@ def _open_main(root: tk.Tk, on_reload) -> None: relief="flat", bd=6, highlightbackground=BORDER, highlightthickness=1).pack(side="left") + # ── PFADE ── + section("PFADE") + + vocab_path_var = tk.StringVar(value=cfg.config.get("vocab_path", "")) + f_vp = row("Vocabulary-Datei", hint="leer = lokal im App-Ordner") + vp_entry = tk.Entry(f_vp, textvariable=vocab_path_var, font=FONT, width=30, + bg=BG3, fg=FG, insertbackground=AMBER, + relief="flat", bd=6, + highlightbackground=BORDER, highlightthickness=1) + vp_entry.pack(side="left") + + def browse_vocab(): + path = filedialog.askopenfilename( + parent=win, title="Vocabulary-Datei wählen", + filetypes=[("JSON", "*.json"), ("Alle", "*.*")]) + if path: + vocab_path_var.set(path) + + tk.Button(f_vp, text="...", command=browse_vocab, + bg=BG3, fg=FG, font=FONT_S, relief="flat", + padx=8, pady=3, cursor="hand2", bd=0).pack(side="left", padx=(6, 0)) + + model_dir_var = tk.StringVar(value=cfg.config.get("model_dir", "")) + f_md = row("Modell-Verzeichnis", hint="leer = Standard-Cache") + md_entry = tk.Entry(f_md, textvariable=model_dir_var, font=FONT, width=30, + bg=BG3, fg=FG, insertbackground=AMBER, + relief="flat", bd=6, + highlightbackground=BORDER, highlightthickness=1) + md_entry.pack(side="left") + + def browse_model_dir(): + path = filedialog.askdirectory(parent=win, title="Modell-Verzeichnis wählen") + if path: + model_dir_var.set(path) + + tk.Button(f_md, text="...", 
command=browse_model_dir, + bg=BG3, fg=FG, font=FONT_S, relief="flat", + padx=8, pady=3, cursor="hand2", bd=0).pack(side="left", padx=(6, 0)) + # ── Buttons ── tk.Frame(win, bg=BORDER, height=1).pack(fill="x") btn_bar = tk.Frame(win, bg=BG2, pady=16, padx=32) @@ -190,6 +230,8 @@ def _open_main(root: tk.Tk, on_reload) -> None: cfg.config["device"] = device_var.get() cfg.config["compute_type"] = cfg.COMPUTE_TYPES[ct_var.get()] cfg.config["hotkey"] = hotkey_var.get() + cfg.config["vocab_path"] = vocab_path_var.get() + cfg.config["model_dir"] = model_dir_var.get() cfg.save_config() win.destroy() threading.Thread(target=on_reload, daemon=True).start() diff --git a/whisper_app/transcriber.py b/whisper_app/transcriber.py index 16ef2b9..a2dda20 100644 --- a/whisper_app/transcriber.py +++ b/whisper_app/transcriber.py @@ -8,10 +8,12 @@ from whisper_app import app, config, typer def load_model() -> None: app.log(f"Loading {config.config['model']} on {config.config['device']}...") + model_dir = config.config.get("model_dir") or None app.model = WhisperModel( config.config["model"], device=config.config["device"], compute_type=config.config["compute_type"], + download_root=model_dir, ) app.log("Model ready.") diff --git a/whisper_app/typer.py b/whisper_app/typer.py index 3318565..ad1ff58 100644 --- a/whisper_app/typer.py +++ b/whisper_app/typer.py @@ -3,13 +3,16 @@ import shutil import subprocess import time -from pynput.keyboard import Controller as KeyboardController + +def _pynput_type(text): + from pynput.keyboard import Controller as KeyboardController + KeyboardController().type(text) def type_text(text): """Type text into the active window, cross-platform.""" if os.name == "nt": - KeyboardController().type(text) + _pynput_type(text) return session = os.environ.get("XDG_SESSION_TYPE", "") if session == "wayland" and shutil.which("wl-copy"): @@ -19,4 +22,4 @@ def type_text(text): elif shutil.which("xdotool"): subprocess.run(["xdotool", "type", "--clearmodifiers", "--", 
text], check=False) else: - KeyboardController().type(text) + _pynput_type(text)