From 6d20f210789ea2d64b440d389ba91676d19334f9 Mon Sep 17 00:00:00 2001 From: beo3000 Date: Fri, 20 Mar 2026 15:28:50 +0100 Subject: [PATCH] fix: audio device handling, CUDA/VAD bundling, and transcription errors - Resolve audio devices by name instead of unstable PortAudio index - Filter device list to default host API (hides loopback/output devices) - Add mic test button with live level meter in settings - Bundle silero_vad.onnx and CUDA DLLs in PyInstaller spec - Wrap transcription in try/finally so tray icon always resets to idle - Add build artifacts to .gitignore Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 5 +++ build.py | 2 +- whisper-dictation.spec | 23 +++++++++-- whisper_app/audio.py | 71 ++++++++++++++++++++++++++++++---- whisper_app/settings_window.py | 53 +++++++++++++++++++++---- whisper_app/transcriber.py | 16 ++++++-- 6 files changed, 148 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index f78af01..ada5b23 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,10 @@ __pycache__/ config_local.json models/ *.log +build/ +dist/ +icon.ico +.claude/settings.local.json +.superpowers/ .DS_Store Thumbs.db diff --git a/build.py b/build.py index 761ceb2..70b7477 100644 --- a/build.py +++ b/build.py @@ -30,7 +30,7 @@ def build(): dest = os.path.join(dist_dir, fname) if not os.path.exists(dest): shutil.copy(fname, dest) - print(f"Copied {fname} → {dist_dir}/") + print(f"Copied {fname} -> {dist_dir}/") else: print(f"Skipped {fname} (already exists in dist — preserving user edits)") diff --git a/whisper-dictation.spec b/whisper-dictation.spec index e0ee2d4..8ec3692 100644 --- a/whisper-dictation.spec +++ b/whisper-dictation.spec @@ -1,12 +1,29 @@ # whisper-dictation.spec # -*- mode: python ; coding: utf-8 -*- -import sys +import os, sys + +import importlib, site +def _pkg_path(pkg): + mod = importlib.import_module(pkg) + if mod.__file__: + return os.path.dirname(mod.__file__) + # namespace package — resolve via site-packages + sp = site.getsitepackages()[0] + return os.path.join(sp, *pkg.split('.')) + +_sp = next(p for p in site.getsitepackages() if p.endswith('site-packages')) +_nvidia = os.path.join(_sp, 'nvidia') a = Analysis( ['main.py'], pathex=[], - binaries=[], - datas=[], + binaries=[ + (os.path.join(_nvidia, 'cublas', 'bin', '*.dll'), '.'), + (os.path.join(_nvidia, 'cudnn', 'bin', '*.dll'), '.'), + ], + datas=[ + (os.path.join(_pkg_path('faster_whisper'), 'assets', '*.onnx'), 'faster_whisper/assets'), + ], hiddenimports=[ 'ctranslate2', 'faster_whisper', diff --git a/whisper_app/audio.py b/whisper_app/audio.py index 940ba60..9d002df 100644 --- a/whisper_app/audio.py +++ b/whisper_app/audio.py @@ -1,18 +1,75 @@ +import logging +import threading + +import numpy as np import sounddevice as sd from whisper_app import app, config +log = logging.getLogger(__name__) + def audio_callback(indata, frames, time_info, status): if app.state == app.AppState.RECORDING: app.audio_chunks.append(indata.copy()) +def resolve_device(name: str | None) -> int | None: + """Resolve a device name to its current PortAudio index, or None for default.""" + if not name: + return None + for i, d in enumerate(sd.query_devices()): + if d["max_input_channels"] > 0 and d["name"] == name: + return i + log.warning("Audio device '%s' not found, using default", name) + return None + + +def get_input_devices() -> list[tuple[int, str]]: + """Return list of (index, name) for input devices on the default host API.""" + default_api = sd.query_hostapis(sd.default.hostapi)["name"] + return [(i, d["name"]) for i, d in enumerate(sd.query_devices()) + if d["max_input_channels"] > 0 + and sd.query_hostapis(d["hostapi"])["name"] == default_api] + + +def test_device(device_name: str | None, duration: float, + on_level: callable, on_done: callable) -> None: + """Record from device for *duration* seconds, calling on_level(float 0..1) periodically.""" + device = resolve_device(device_name) + + def _run(): + try: + sr = config.config["sample_rate"] + block = int(sr * 0.05) # 50 ms blocks + peak = 0.0 + + def _cb(indata, frames, time_info, status): + nonlocal peak + level = float(np.abs(indata).max()) + peak = max(peak, level) + on_level(min(level / 0.1, 1.0)) # normalize: 0.1 amplitude = 100% + + with sd.InputStream(samplerate=sr, channels=1, device=device, + callback=_cb, blocksize=block): + sd.sleep(int(duration * 1000)) + on_done(peak > 0.001) + except Exception as e: + log.error("Mic test failed: %s", e) + on_done(False) + + threading.Thread(target=_run, daemon=True).start() + + def get_audio_stream(): - device = config.config.get("audio_device") - return sd.InputStream( - samplerate=config.config["sample_rate"], - channels=1, - device=device, - callback=audio_callback, - ) + device = resolve_device(config.config.get("audio_device")) + sr = config.config["sample_rate"] + try: + return sd.InputStream( + samplerate=sr, channels=1, device=device, callback=audio_callback, + ) + except sd.PortAudioError: + log.warning("Audio device %s failed, falling back to default", device) + return sd.InputStream( + samplerate=sr, channels=1, device=None, callback=audio_callback, + ) diff --git a/whisper_app/settings_window.py b/whisper_app/settings_window.py index d6db5ae..f12e9ef 100644 --- a/whisper_app/settings_window.py +++ b/whisper_app/settings_window.py @@ -2,8 +2,6 @@ import os import threading import tkinter as tk -import sounddevice as sd - from whisper_app import config as cfg @@ -87,16 +85,55 @@ def _open_main(root: tk.Tk, on_reload) -> None: # ── AUDIO ── section("AUDIO") - devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices()) - if d["max_input_channels"] > 0] - dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices] + from whisper_app.audio import get_input_devices, test_device + devices = get_input_devices() + dev_names = ["Standard"] + [name for _, name in devices] dev_var = tk.StringVar() cur_dev = cfg.config.get("audio_device") - dev_var.set("Standard" if cur_dev is None else - next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard")) + dev_var.set(cur_dev if cur_dev and cur_dev in dev_names else "Standard") f = row("Mikrofon") dd(f, dev_var, dev_names, width=44).pack(side="left") + # ── Mic test ── + f_test = tk.Frame(content, bg=BG) + f_test.pack(fill="x", pady=(2, 8)) + tk.Label(f_test, text="", width=17, bg=BG).pack(side="left") # spacer + + level_canvas = tk.Canvas(f_test, width=200, height=14, bg=BG3, + highlightbackground=BORDER, highlightthickness=1, bd=0) + level_canvas.pack(side="left") + level_bar = level_canvas.create_rectangle(0, 0, 0, 14, fill=GREEN, width=0) + + test_label = tk.Label(f_test, text="", font=FONT_S, bg=BG, fg=FG2) + test_label.pack(side="left", padx=(8, 0)) + + def run_mic_test(): + test_btn.config(state="disabled", text="Aufnahme...") + test_label.config(text="Sprich jetzt...", fg=FG2) + level_canvas.coords(level_bar, 0, 0, 0, 14) + + dev_name = dev_var.get() + device = None if dev_name == "Standard" else dev_name + + def on_level(lvl): + win.after(0, lambda: level_canvas.coords(level_bar, 0, 0, int(lvl * 200), 14)) + + def on_done(ok): + def _update(): + test_btn.config(state="normal", text="Test") + if ok: + test_label.config(text="Signal erkannt", fg=GREEN) + else: + test_label.config(text="Kein Signal!", fg="#f05050") + win.after(0, _update) + + test_device(device, 2.0, on_level, on_done) + + test_btn = tk.Button(f_test, text="Test", command=run_mic_test, + bg=BG3, fg=FG, font=FONT_S, relief="flat", + padx=10, pady=3, cursor="hand2", bd=0) + test_btn.pack(side="left", padx=(8, 0)) + # ── MODELL ── section("MODELL") model_hints = { @@ -147,7 +184,7 @@ def _open_main(root: tk.Tk, on_reload) -> None: def save(): sel = dev_var.get() - cfg.config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0]) + cfg.config["audio_device"] = None if sel == "Standard" else sel cfg.config["model"] = model_var.get() cfg.config["language"] = cfg.LANGUAGES[lang_var.get()] cfg.config["device"] = device_var.get() diff --git a/whisper_app/transcriber.py b/whisper_app/transcriber.py index d5cb987..88fc831 100644 --- a/whisper_app/transcriber.py +++ b/whisper_app/transcriber.py @@ -20,10 +20,22 @@ def stop_and_transcribe() -> None: if app.state != app.AppState.RECORDING: return set_state(app.AppState.TRANSCRIBING) + try: + _do_transcribe() + except Exception as e: + app.log(f"Transcription error: {e}") + finally: + set_state(app.AppState.IDLE) + + +def _do_transcribe() -> None: chunks = list(app.audio_chunks) if not chunks: - set_state(app.AppState.IDLE) + return + + if app.model is None: + app.log("Model not loaded yet — skipped.") return audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32) @@ -33,7 +45,6 @@ def stop_and_transcribe() -> None: if duration < 0.3 or rms < 0.0001: app.log("Too short or silent — skipped.") - set_state(app.AppState.IDLE) return target_rms = 0.05 @@ -51,7 +62,6 @@ def stop_and_transcribe() -> None: text = config.apply_vocab(text) app.log(f"Result: {repr(text)}") - set_state(app.AppState.IDLE) if text: time.sleep(0.15) typer.type_text(text)