fix: audio device handling, CUDA/VAD bundling, and transcription errors

- Resolve audio devices by name instead of unstable PortAudio index
- Filter device list to default host API (hides loopback/output devices)
- Add mic test button with live level meter in settings
- Bundle silero_vad.onnx and CUDA DLLs in PyInstaller spec
- Wrap transcription in try/finally so tray icon always resets to idle
- Add build artifacts to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 15:28:50 +01:00 · 2026-03-20 15:28:50 +01:00 · 6d20f21078
parent db88df6368
commit 6d20f21078
6 changed files with 148 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,10 @@ __pycache__/
 config_local.json
 models/
 *.log
+build/
+dist/
+icon.ico
+.claude/settings.local.json
+.superpowers/
 .DS_Store
 Thumbs.db
--- a/build.py
+++ b/build.py
@@ -30,7 +30,7 @@ def build():
        dest = os.path.join(dist_dir, fname)
        if not os.path.exists(dest):
            shutil.copy(fname, dest)
-            print(f"Copied {fname} → {dist_dir}/")
+            print(f"Copied {fname} -> {dist_dir}/")
        else:
            print(f"Skipped {fname} (already exists in dist — preserving user edits)")

--- a/whisper-dictation.spec
+++ b/whisper-dictation.spec
@@ -1,12 +1,29 @@
 # whisper-dictation.spec
 # -*- mode: python ; coding: utf-8 -*-
-import sys
+import os, sys
+
+import importlib, site
+def _pkg_path(pkg):
+    mod = importlib.import_module(pkg)
+    if mod.__file__:
+        return os.path.dirname(mod.__file__)
+    # namespace package — resolve via site-packages
+    sp = site.getsitepackages()[0]
+    return os.path.join(sp, *pkg.split('.'))
+
+_sp = next(p for p in site.getsitepackages() if p.endswith('site-packages'))
+_nvidia = os.path.join(_sp, 'nvidia')

 a = Analysis(
    ['main.py'],
    pathex=[],
-    binaries=[],
-    datas=[],
+    binaries=[
+        (os.path.join(_nvidia, 'cublas', 'bin', '*.dll'), '.'),
+        (os.path.join(_nvidia, 'cudnn', 'bin', '*.dll'), '.'),
+    ],
+    datas=[
+        (os.path.join(_pkg_path('faster_whisper'), 'assets', '*.onnx'), 'faster_whisper/assets'),
+    ],
    hiddenimports=[
        'ctranslate2',
        'faster_whisper',
--- a/whisper_app/audio.py
+++ b/whisper_app/audio.py
@@ -1,18 +1,75 @@
+import logging
+import threading
+
+import numpy as np
 import sounddevice as sd

 from whisper_app import app, config

+log = logging.getLogger(__name__)
+

 def audio_callback(indata, frames, time_info, status):
    if app.state == app.AppState.RECORDING:
        app.audio_chunks.append(indata.copy())


+def resolve_device(name: str | None) -> int | None:
+    """Resolve a device name to its current PortAudio index, or None for default."""
+    if not name:
+        return None
+    for i, d in enumerate(sd.query_devices()):
+        if d["max_input_channels"] > 0 and d["name"] == name:
+            return i
+    log.warning("Audio device '%s' not found, using default", name)
+    return None
+
+
+def get_input_devices() -> list[tuple[int, str]]:
+    """Return list of (index, name) for input devices on the default host API."""
+    default_api = sd.query_hostapis(sd.default.hostapi)["name"]
+    return [(i, d["name"]) for i, d in enumerate(sd.query_devices())
+            if d["max_input_channels"] > 0
+            and sd.query_hostapis(d["hostapi"])["name"] == default_api]
+
+
+def test_device(device_name: str | None, duration: float,
+                on_level: callable, on_done: callable) -> None:
+    """Record from device for *duration* seconds, calling on_level(float 0..1) periodically."""
+    device = resolve_device(device_name)
+
+    def _run():
+        try:
+            sr = config.config["sample_rate"]
+            block = int(sr * 0.05)  # 50 ms blocks
+            peak = 0.0
+
+            def _cb(indata, frames, time_info, status):
+                nonlocal peak
+                level = float(np.abs(indata).max())
+                peak = max(peak, level)
+                on_level(min(level / 0.1, 1.0))  # normalize: 0.1 amplitude = 100%
+
+            with sd.InputStream(samplerate=sr, channels=1, device=device,
+                                callback=_cb, blocksize=block):
+                sd.sleep(int(duration * 1000))
+            on_done(peak > 0.001)
+        except Exception as e:
+            log.error("Mic test failed: %s", e)
+            on_done(False)
+
+    threading.Thread(target=_run, daemon=True).start()
+
+
 def get_audio_stream():
-    device = config.config.get("audio_device")
-    return sd.InputStream(
-        samplerate=config.config["sample_rate"],
-        channels=1,
-        device=device,
-        callback=audio_callback,
-    )
+    device = resolve_device(config.config.get("audio_device"))
+    sr = config.config["sample_rate"]
+    try:
+        return sd.InputStream(
+            samplerate=sr, channels=1, device=device, callback=audio_callback,
+        )
+    except sd.PortAudioError:
+        log.warning("Audio device %s failed, falling back to default", device)
+        return sd.InputStream(
+            samplerate=sr, channels=1, device=None, callback=audio_callback,
+        )
--- a/whisper_app/settings_window.py
+++ b/whisper_app/settings_window.py
@@ -2,8 +2,6 @@ import os
 import threading
 import tkinter as tk

-import sounddevice as sd
-
 from whisper_app import config as cfg


@@ -87,16 +85,55 @@ def _open_main(root: tk.Tk, on_reload) -> None:

    # ── AUDIO ──
    section("AUDIO")
-    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
-               if d["max_input_channels"] > 0]
-    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
+    from whisper_app.audio import get_input_devices, test_device
+    devices = get_input_devices()
+    dev_names = ["Standard"] + [name for _, name in devices]
    dev_var = tk.StringVar()
    cur_dev = cfg.config.get("audio_device")
-    dev_var.set("Standard" if cur_dev is None else
-                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
+    dev_var.set(cur_dev if cur_dev and cur_dev in dev_names else "Standard")
    f = row("Mikrofon")
    dd(f, dev_var, dev_names, width=44).pack(side="left")

+    # ── Mic test ──
+    f_test = tk.Frame(content, bg=BG)
+    f_test.pack(fill="x", pady=(2, 8))
+    tk.Label(f_test, text="", width=17, bg=BG).pack(side="left")  # spacer
+
+    level_canvas = tk.Canvas(f_test, width=200, height=14, bg=BG3,
+                             highlightbackground=BORDER, highlightthickness=1, bd=0)
+    level_canvas.pack(side="left")
+    level_bar = level_canvas.create_rectangle(0, 0, 0, 14, fill=GREEN, width=0)
+
+    test_label = tk.Label(f_test, text="", font=FONT_S, bg=BG, fg=FG2)
+    test_label.pack(side="left", padx=(8, 0))
+
+    def run_mic_test():
+        test_btn.config(state="disabled", text="Aufnahme...")
+        test_label.config(text="Sprich jetzt...", fg=FG2)
+        level_canvas.coords(level_bar, 0, 0, 0, 14)
+
+        dev_name = dev_var.get()
+        device = None if dev_name == "Standard" else dev_name
+
+        def on_level(lvl):
+            win.after(0, lambda: level_canvas.coords(level_bar, 0, 0, int(lvl * 200), 14))
+
+        def on_done(ok):
+            def _update():
+                test_btn.config(state="normal", text="Test")
+                if ok:
+                    test_label.config(text="Signal erkannt", fg=GREEN)
+                else:
+                    test_label.config(text="Kein Signal!", fg="#f05050")
+            win.after(0, _update)
+
+        test_device(device, 2.0, on_level, on_done)
+
+    test_btn = tk.Button(f_test, text="Test", command=run_mic_test,
+                         bg=BG3, fg=FG, font=FONT_S, relief="flat",
+                         padx=10, pady=3, cursor="hand2", bd=0)
+    test_btn.pack(side="left", padx=(8, 0))
+
    # ── MODELL ──
    section("MODELL")
    model_hints = {
@@ -147,7 +184,7 @@ def _open_main(root: tk.Tk, on_reload) -> None:

    def save():
        sel = dev_var.get()
-        cfg.config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
+        cfg.config["audio_device"] = None if sel == "Standard" else sel
        cfg.config["model"] = model_var.get()
        cfg.config["language"] = cfg.LANGUAGES[lang_var.get()]
        cfg.config["device"] = device_var.get()
--- a/whisper_app/transcriber.py
+++ b/whisper_app/transcriber.py
@@ -20,10 +20,22 @@ def stop_and_transcribe() -> None:
    if app.state != app.AppState.RECORDING:
        return
    set_state(app.AppState.TRANSCRIBING)
+    try:
+        _do_transcribe()
+    except Exception as e:
+        app.log(f"Transcription error: {e}")
+    finally:
+        set_state(app.AppState.IDLE)
+
+
+def _do_transcribe() -> None:
    chunks = list(app.audio_chunks)

    if not chunks:
-        set_state(app.AppState.IDLE)
+        return
+
+    if app.model is None:
+        app.log("Model not loaded yet — skipped.")
        return

    audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
@@ -33,7 +45,6 @@ def stop_and_transcribe() -> None:

    if duration < 0.3 or rms < 0.0001:
        app.log("Too short or silent — skipped.")
-        set_state(app.AppState.IDLE)
        return

    target_rms = 0.05
@@ -51,7 +62,6 @@ def stop_and_transcribe() -> None:
    text = config.apply_vocab(text)
    app.log(f"Result: {repr(text)}")

-    set_state(app.AppState.IDLE)
    if text:
        time.sleep(0.15)
        typer.type_text(text)