From 6d20f210789ea2d64b440d389ba91676d19334f9 Mon Sep 17 00:00:00 2001
From: beo3000 <pakabu@web.de>
Date: Fri, 20 Mar 2026 15:28:50 +0100
Subject: [PATCH] fix: audio device handling, CUDA/VAD bundling, and
 transcription errors

- Resolve audio devices by name instead of unstable PortAudio index;
  fall back to the default device when the saved name is not found or
  the stream fails to open
- Filter device list to default host API (hides loopback/output devices)
- Add mic test button with live level meter in settings
- Bundle silero_vad.onnx and CUDA DLLs in PyInstaller spec
- Wrap transcription in try/finally so tray icon always resets to idle;
  log transcription errors and skip when the model is not loaded yet
- Add build artifacts to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .gitignore                     |  5 +++
 build.py                       |  2 +-
 whisper-dictation.spec         | 23 +++++++++--
 whisper_app/audio.py           | 71 ++++++++++++++++++++++++++++++----
 whisper_app/settings_window.py | 53 +++++++++++++++++++++----
 whisper_app/transcriber.py     | 16 ++++++--
 6 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index f78af01..ada5b23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,10 @@ __pycache__/
 config_local.json
 models/
 *.log
+build/
+dist/
+icon.ico
+.claude/settings.local.json
+.superpowers/
 .DS_Store
 Thumbs.db
diff --git a/build.py b/build.py
index 761ceb2..70b7477 100644
--- a/build.py
+++ b/build.py
@@ -30,7 +30,7 @@ def build():
         dest = os.path.join(dist_dir, fname)
         if not os.path.exists(dest):
             shutil.copy(fname, dest)
-            print(f"Copied {fname} → {dist_dir}/")
+            print(f"Copied {fname} -> {dist_dir}/")
         else:
             print(f"Skipped {fname} (already exists in dist — preserving user edits)")
 
diff --git a/whisper-dictation.spec b/whisper-dictation.spec
index e0ee2d4..8ec3692 100644
--- a/whisper-dictation.spec
+++ b/whisper-dictation.spec
@@ -1,12 +1,29 @@
 # whisper-dictation.spec
 # -*- mode: python ; coding: utf-8 -*-
-import sys
+import os, sys
+
+import importlib, site
+def _pkg_path(pkg):
+    mod = importlib.import_module(pkg)
+    if mod.__file__:
+        return os.path.dirname(mod.__file__)
+    # namespace package — resolve via site-packages
+    sp = site.getsitepackages()[0]
+    return os.path.join(sp, *pkg.split('.'))
+
+_sp = next(p for p in site.getsitepackages() if p.endswith('site-packages'))
+_nvidia = os.path.join(_sp, 'nvidia')
 
 a = Analysis(
     ['main.py'],
     pathex=[],
-    binaries=[],
-    datas=[],
+    binaries=[
+        (os.path.join(_nvidia, 'cublas', 'bin', '*.dll'), '.'),
+        (os.path.join(_nvidia, 'cudnn', 'bin', '*.dll'), '.'),
+    ],
+    datas=[
+        (os.path.join(_pkg_path('faster_whisper'), 'assets', '*.onnx'), 'faster_whisper/assets'),
+    ],
     hiddenimports=[
         'ctranslate2',
         'faster_whisper',
diff --git a/whisper_app/audio.py b/whisper_app/audio.py
index 940ba60..9d002df 100644
--- a/whisper_app/audio.py
+++ b/whisper_app/audio.py
@@ -1,18 +1,75 @@
+import logging
+import threading
+
+import numpy as np
 import sounddevice as sd
 
 from whisper_app import app, config
 
+log = logging.getLogger(__name__)
+
 
 def audio_callback(indata, frames, time_info, status):
     if app.state == app.AppState.RECORDING:
         app.audio_chunks.append(indata.copy())
 
 
+def resolve_device(name: str | None) -> int | None:
+    """Resolve a device name to its current PortAudio index, or None for default."""
+    if not name:
+        return None
+    for i, d in enumerate(sd.query_devices()):
+        if d["max_input_channels"] > 0 and d["name"] == name:
+            return i
+    log.warning("Audio device '%s' not found, using default", name)
+    return None
+
+
+def get_input_devices() -> list[tuple[int, str]]:
+    """Return list of (index, name) for input devices on the default host API."""
+    default_api = sd.query_hostapis(sd.default.hostapi)["name"]
+    return [(i, d["name"]) for i, d in enumerate(sd.query_devices())
+            if d["max_input_channels"] > 0
+            and sd.query_hostapis(d["hostapi"])["name"] == default_api]
+
+
+def test_device(device_name: str | None, duration: float,
+                on_level: callable, on_done: callable) -> None:
+    """Record from device for *duration* seconds, calling on_level(float 0..1) periodically."""
+    device = resolve_device(device_name)
+
+    def _run():
+        try:
+            sr = config.config["sample_rate"]
+            block = int(sr * 0.05)  # 50 ms blocks
+            peak = 0.0
+
+            def _cb(indata, frames, time_info, status):
+                nonlocal peak
+                level = float(np.abs(indata).max())
+                peak = max(peak, level)
+                on_level(min(level / 0.1, 1.0))  # normalize: 0.1 amplitude = 100%
+
+            with sd.InputStream(samplerate=sr, channels=1, device=device,
+                                callback=_cb, blocksize=block):
+                sd.sleep(int(duration * 1000))
+            on_done(peak > 0.001)
+        except Exception as e:
+            log.error("Mic test failed: %s", e)
+            on_done(False)
+
+    threading.Thread(target=_run, daemon=True).start()
+
+
 def get_audio_stream():
-    device = config.config.get("audio_device")
-    return sd.InputStream(
-        samplerate=config.config["sample_rate"],
-        channels=1,
-        device=device,
-        callback=audio_callback,
-    )
+    device = resolve_device(config.config.get("audio_device"))
+    sr = config.config["sample_rate"]
+    try:
+        return sd.InputStream(
+            samplerate=sr, channels=1, device=device, callback=audio_callback,
+        )
+    except sd.PortAudioError:
+        log.warning("Audio device %s failed, falling back to default", device)
+        return sd.InputStream(
+            samplerate=sr, channels=1, device=None, callback=audio_callback,
+        )
diff --git a/whisper_app/settings_window.py b/whisper_app/settings_window.py
index d6db5ae..f12e9ef 100644
--- a/whisper_app/settings_window.py
+++ b/whisper_app/settings_window.py
@@ -2,8 +2,6 @@ import os
 import threading
 import tkinter as tk
 
-import sounddevice as sd
-
 from whisper_app import config as cfg
 
 
@@ -87,16 +85,55 @@ def _open_main(root: tk.Tk, on_reload) -> None:
 
     # ── AUDIO ──
     section("AUDIO")
-    devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
-               if d["max_input_channels"] > 0]
-    dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
+    from whisper_app.audio import get_input_devices, test_device
+    devices = get_input_devices()
+    dev_names = ["Standard"] + [name for _, name in devices]
     dev_var = tk.StringVar()
     cur_dev = cfg.config.get("audio_device")
-    dev_var.set("Standard" if cur_dev is None else
-                next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
+    dev_var.set(cur_dev if cur_dev and cur_dev in dev_names else "Standard")
     f = row("Mikrofon")
     dd(f, dev_var, dev_names, width=44).pack(side="left")
 
+    # ── Mic test ──
+    f_test = tk.Frame(content, bg=BG)
+    f_test.pack(fill="x", pady=(2, 8))
+    tk.Label(f_test, text="", width=17, bg=BG).pack(side="left")  # spacer
+
+    level_canvas = tk.Canvas(f_test, width=200, height=14, bg=BG3,
+                             highlightbackground=BORDER, highlightthickness=1, bd=0)
+    level_canvas.pack(side="left")
+    level_bar = level_canvas.create_rectangle(0, 0, 0, 14, fill=GREEN, width=0)
+
+    test_label = tk.Label(f_test, text="", font=FONT_S, bg=BG, fg=FG2)
+    test_label.pack(side="left", padx=(8, 0))
+
+    def run_mic_test():
+        test_btn.config(state="disabled", text="Aufnahme...")
+        test_label.config(text="Sprich jetzt...", fg=FG2)
+        level_canvas.coords(level_bar, 0, 0, 0, 14)
+
+        dev_name = dev_var.get()
+        device = None if dev_name == "Standard" else dev_name
+
+        def on_level(lvl):
+            win.after(0, lambda: level_canvas.coords(level_bar, 0, 0, int(lvl * 200), 14))
+
+        def on_done(ok):
+            def _update():
+                test_btn.config(state="normal", text="Test")
+                if ok:
+                    test_label.config(text="Signal erkannt", fg=GREEN)
+                else:
+                    test_label.config(text="Kein Signal!", fg="#f05050")
+            win.after(0, _update)
+
+        test_device(device, 2.0, on_level, on_done)
+
+    test_btn = tk.Button(f_test, text="Test", command=run_mic_test,
+                         bg=BG3, fg=FG, font=FONT_S, relief="flat",
+                         padx=10, pady=3, cursor="hand2", bd=0)
+    test_btn.pack(side="left", padx=(8, 0))
+
     # ── MODELL ──
     section("MODELL")
     model_hints = {
@@ -147,7 +184,7 @@ def _open_main(root: tk.Tk, on_reload) -> None:
 
     def save():
         sel = dev_var.get()
-        cfg.config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
+        cfg.config["audio_device"] = None if sel == "Standard" else sel
         cfg.config["model"] = model_var.get()
         cfg.config["language"] = cfg.LANGUAGES[lang_var.get()]
         cfg.config["device"] = device_var.get()
diff --git a/whisper_app/transcriber.py b/whisper_app/transcriber.py
index d5cb987..88fc831 100644
--- a/whisper_app/transcriber.py
+++ b/whisper_app/transcriber.py
@@ -20,10 +20,22 @@ def stop_and_transcribe() -> None:
     if app.state != app.AppState.RECORDING:
         return
     set_state(app.AppState.TRANSCRIBING)
+    try:
+        _do_transcribe()
+    except Exception as e:
+        app.log(f"Transcription error: {e}")
+    finally:
+        set_state(app.AppState.IDLE)
+
+
+def _do_transcribe() -> None:
     chunks = list(app.audio_chunks)
 
     if not chunks:
-        set_state(app.AppState.IDLE)
+        return
+
+    if app.model is None:
+        app.log("Model not loaded yet — skipped.")
         return
 
     audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
@@ -33,7 +45,6 @@ def stop_and_transcribe() -> None:
 
     if duration < 0.3 or rms < 0.0001:
         app.log("Too short or silent — skipped.")
-        set_state(app.AppState.IDLE)
         return
 
     target_rms = 0.05
@@ -51,7 +62,6 @@ def stop_and_transcribe() -> None:
     text = config.apply_vocab(text)
     app.log(f"Result: {repr(text)}")
 
-    set_state(app.AppState.IDLE)
     if text:
         time.sleep(0.15)
         typer.type_text(text)