fix: audio device handling, CUDA/VAD bundling, and transcription errors

- Resolve audio devices by name instead of unstable PortAudio index
- Filter device list to default host API (hides loopback/output devices)
- Add mic test button with live level meter in settings
- Bundle silero_vad.onnx and CUDA DLLs in PyInstaller spec
- Wrap transcription in try/finally so tray icon always resets to idle
- Add build artifacts to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
beo3000 2026-03-20 15:28:50 +01:00
parent db88df6368
commit 6d20f21078
6 changed files with 148 additions and 22 deletions

5
.gitignore vendored
View File

@ -7,5 +7,10 @@ __pycache__/
config_local.json
models/
*.log
build/
dist/
icon.ico
.claude/settings.local.json
.superpowers/
.DS_Store
Thumbs.db

View File

@ -30,7 +30,7 @@ def build():
dest = os.path.join(dist_dir, fname)
if not os.path.exists(dest):
shutil.copy(fname, dest)
print(f"Copied {fname} {dist_dir}/")
print(f"Copied {fname} -> {dist_dir}/")
else:
print(f"Skipped {fname} (already exists in dist — preserving user edits)")

View File

@ -1,12 +1,29 @@
# whisper-dictation.spec
# -*- mode: python ; coding: utf-8 -*-
import sys
import os, sys
import importlib, site
def _pkg_path(pkg):
mod = importlib.import_module(pkg)
if mod.__file__:
return os.path.dirname(mod.__file__)
# namespace package — resolve via site-packages
sp = site.getsitepackages()[0]
return os.path.join(sp, *pkg.split('.'))
_sp = next(p for p in site.getsitepackages() if p.endswith('site-packages'))
_nvidia = os.path.join(_sp, 'nvidia')
a = Analysis(
['main.py'],
pathex=[],
binaries=[],
datas=[],
binaries=[
(os.path.join(_nvidia, 'cublas', 'bin', '*.dll'), '.'),
(os.path.join(_nvidia, 'cudnn', 'bin', '*.dll'), '.'),
],
datas=[
(os.path.join(_pkg_path('faster_whisper'), 'assets', '*.onnx'), 'faster_whisper/assets'),
],
hiddenimports=[
'ctranslate2',
'faster_whisper',

View File

@ -1,18 +1,75 @@
import logging
import threading
from collections.abc import Callable

import numpy as np
import sounddevice as sd

from whisper_app import app, config
log = logging.getLogger(__name__)
def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: buffer the incoming block while recording.

    Blocks that arrive in any state other than ``RECORDING`` are dropped.
    ``frames``, ``time_info`` and ``status`` are accepted because the stream
    passes them, but they are not used here.
    """
    if app.state != app.AppState.RECORDING:
        return
    # keep a snapshot of the block rather than a reference to the callback's array
    app.audio_chunks.append(indata.copy())
def resolve_device(name: str | None) -> int | None:
    """Resolve a device name to its current PortAudio index, or None for default.

    The device list can contain the same name more than once (one entry per
    host API). A match on the default host API is preferred, because that is
    the set of devices ``get_input_devices`` offers for selection; a match on
    any other host API is used only when the default one has none.

    Returns None when *name* is empty or no input device carries that name
    (a warning is logged in the latter case).
    """
    if not name:
        return None
    try:
        default_api = sd.default.hostapi
    except sd.PortAudioError:
        # no usable default host API: accept the first name match instead
        default_api = None
    fallback = None
    for i, d in enumerate(sd.query_devices()):
        if d["max_input_channels"] <= 0 or d["name"] != name:
            continue
        if default_api is None or d["hostapi"] == default_api:
            return i
        if fallback is None:
            # remember the first match on a non-default host API
            fallback = i
    if fallback is not None:
        return fallback
    log.warning("Audio device '%s' not found, using default", name)
    return None
def get_input_devices() -> list[tuple[int, str]]:
    """List the selectable microphones as ``(index, name)`` pairs.

    Only devices with at least one input channel that belong to the default
    host API (compared by host API name) are included, in enumeration order.
    """
    wanted_api = sd.query_hostapis(sd.default.hostapi)["name"]
    inputs = []
    for idx, info in enumerate(sd.query_devices()):
        if info["max_input_channels"] <= 0:
            continue  # output-only device
        api_name = sd.query_hostapis(info["hostapi"])["name"]
        if api_name == wanted_api:
            inputs.append((idx, info["name"]))
    return inputs
def test_device(device_name: str | None, duration: float,
                on_level: Callable[[float], None],
                on_done: Callable[[bool], None]) -> None:
    """Record from device for *duration* seconds, calling on_level(float 0..1) periodically.

    The recording runs on a daemon thread; this function returns immediately.

    Args:
        device_name: Device name as accepted by ``resolve_device``; None or
            empty selects the default device.
        duration: Length of the test recording in seconds.
        on_level: Called from the stream callback for every 50 ms block with
            the block's peak amplitude scaled so that 0.1 maps to 1.0
            (clamped at 1.0).
        on_done: Called once from the worker thread when the test ends:
            True if the overall peak exceeded 0.001, False otherwise or when
            anything in the test failed.
    """
    def _run():
        ok = False
        try:
            # resolved inside the try so a device-lookup failure still reaches on_done
            device = resolve_device(device_name)
            sr = config.config["sample_rate"]
            block = int(sr * 0.05)  # 50 ms blocks
            peak = 0.0

            def _cb(indata, frames, time_info, status):
                nonlocal peak
                level = float(np.abs(indata).max())
                peak = max(peak, level)
                on_level(min(level / 0.1, 1.0))  # normalize: 0.1 amplitude = 100%

            with sd.InputStream(samplerate=sr, channels=1, device=device,
                                callback=_cb, blocksize=block):
                sd.sleep(int(duration * 1000))
            ok = peak > 0.001
        except Exception as e:
            log.error("Mic test failed: %s", e)
        # reported outside the try so on_done cannot be invoked a second time
        on_done(ok)

    threading.Thread(target=_run, daemon=True).start()
def get_audio_stream():
    """Create the recording input stream for the configured microphone.

    The configured ``audio_device`` name is resolved to a device index with
    ``resolve_device``; the stream is mono, uses the configured
    ``sample_rate`` and feeds ``audio_callback``.

    If opening the resolved device raises ``sd.PortAudioError``, a warning is
    logged and the default device is tried instead. When the default device
    was already the one being opened, the error is re-raised rather than
    repeating the identical call.

    Returns:
        The ``sd.InputStream`` as constructed (starting it is left to the caller).

    Raises:
        sd.PortAudioError: if no stream could be opened.
    """
    device = resolve_device(config.config.get("audio_device"))
    sr = config.config["sample_rate"]
    try:
        return sd.InputStream(
            samplerate=sr, channels=1, device=device, callback=audio_callback,
        )
    except sd.PortAudioError:
        if device is None:
            # already on the default device — there is nothing to fall back to
            raise
        log.warning("Audio device %s failed, falling back to default", device)
        return sd.InputStream(
            samplerate=sr, channels=1, device=None, callback=audio_callback,
        )

View File

@ -2,8 +2,6 @@ import os
import threading
import tkinter as tk
import sounddevice as sd
from whisper_app import config as cfg
@ -87,16 +85,55 @@ def _open_main(root: tk.Tk, on_reload) -> None:
# ── AUDIO ──
section("AUDIO")
devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
if d["max_input_channels"] > 0]
dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
from whisper_app.audio import get_input_devices, test_device
devices = get_input_devices()
dev_names = ["Standard"] + [name for _, name in devices]
dev_var = tk.StringVar()
cur_dev = cfg.config.get("audio_device")
dev_var.set("Standard" if cur_dev is None else
next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
dev_var.set(cur_dev if cur_dev and cur_dev in dev_names else "Standard")
f = row("Mikrofon")
dd(f, dev_var, dev_names, width=44).pack(side="left")
# ── Mic test ──
f_test = tk.Frame(content, bg=BG)
f_test.pack(fill="x", pady=(2, 8))
tk.Label(f_test, text="", width=17, bg=BG).pack(side="left") # spacer
level_canvas = tk.Canvas(f_test, width=200, height=14, bg=BG3,
highlightbackground=BORDER, highlightthickness=1, bd=0)
level_canvas.pack(side="left")
level_bar = level_canvas.create_rectangle(0, 0, 0, 14, fill=GREEN, width=0)
test_label = tk.Label(f_test, text="", font=FONT_S, bg=BG, fg=FG2)
test_label.pack(side="left", padx=(8, 0))
def run_mic_test():
test_btn.config(state="disabled", text="Aufnahme...")
test_label.config(text="Sprich jetzt...", fg=FG2)
level_canvas.coords(level_bar, 0, 0, 0, 14)
dev_name = dev_var.get()
device = None if dev_name == "Standard" else dev_name
def on_level(lvl):
win.after(0, lambda: level_canvas.coords(level_bar, 0, 0, int(lvl * 200), 14))
def on_done(ok):
def _update():
test_btn.config(state="normal", text="Test")
if ok:
test_label.config(text="Signal erkannt", fg=GREEN)
else:
test_label.config(text="Kein Signal!", fg="#f05050")
win.after(0, _update)
test_device(device, 2.0, on_level, on_done)
test_btn = tk.Button(f_test, text="Test", command=run_mic_test,
bg=BG3, fg=FG, font=FONT_S, relief="flat",
padx=10, pady=3, cursor="hand2", bd=0)
test_btn.pack(side="left", padx=(8, 0))
# ── MODELL ──
section("MODELL")
model_hints = {
@ -147,7 +184,7 @@ def _open_main(root: tk.Tk, on_reload) -> None:
def save():
sel = dev_var.get()
cfg.config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
cfg.config["audio_device"] = None if sel == "Standard" else sel
cfg.config["model"] = model_var.get()
cfg.config["language"] = cfg.LANGUAGES[lang_var.get()]
cfg.config["device"] = device_var.get()

View File

@ -20,10 +20,22 @@ def stop_and_transcribe() -> None:
if app.state != app.AppState.RECORDING:
return
set_state(app.AppState.TRANSCRIBING)
try:
_do_transcribe()
except Exception as e:
app.log(f"Transcription error: {e}")
finally:
set_state(app.AppState.IDLE)
def _do_transcribe() -> None:
chunks = list(app.audio_chunks)
if not chunks:
set_state(app.AppState.IDLE)
return
if app.model is None:
app.log("Model not loaded yet — skipped.")
return
audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
@ -33,7 +45,6 @@ def stop_and_transcribe() -> None:
if duration < 0.3 or rms < 0.0001:
app.log("Too short or silent — skipped.")
set_state(app.AppState.IDLE)
return
target_rms = 0.05
@ -51,7 +62,6 @@ def stop_and_transcribe() -> None:
text = config.apply_vocab(text)
app.log(f"Result: {repr(text)}")
set_state(app.AppState.IDLE)
if text:
time.sleep(0.15)
typer.type_text(text)