fix: audio device handling, CUDA/VAD bundling, and transcription errors
- Resolve audio devices by name instead of unstable PortAudio index
- Filter device list to default host API (hides loopback/output devices)
- Add mic test button with live level meter in settings
- Bundle silero_vad.onnx and CUDA DLLs in PyInstaller spec
- Wrap transcription in try/finally so tray icon always resets to idle
- Add build artifacts to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
db88df6368
commit
6d20f21078
|
|
@ -7,5 +7,10 @@ __pycache__/
|
|||
config_local.json
|
||||
models/
|
||||
*.log
|
||||
build/
|
||||
dist/
|
||||
icon.ico
|
||||
.claude/settings.local.json
|
||||
.superpowers/
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
|
|
|||
2
build.py
2
build.py
|
|
@ -30,7 +30,7 @@ def build():
|
|||
dest = os.path.join(dist_dir, fname)
|
||||
if not os.path.exists(dest):
|
||||
shutil.copy(fname, dest)
|
||||
print(f"Copied {fname} → {dist_dir}/")
|
||||
print(f"Copied {fname} -> {dist_dir}/")
|
||||
else:
|
||||
print(f"Skipped {fname} (already exists in dist — preserving user edits)")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,29 @@
|
|||
# whisper-dictation.spec
|
||||
# -*- mode: python ; coding: utf-8 -*-
|
||||
import sys
|
||||
import os, sys
|
||||
|
||||
import importlib, site
|
||||
def _pkg_path(pkg):
    """Return the directory that holds the importable package *pkg*.

    Args:
        pkg: Dotted import name, e.g. ``'faster_whisper'``.

    Returns:
        The package directory as a string.

    Raises:
        ImportError: If *pkg* cannot be imported.
    """
    mod = importlib.import_module(pkg)

    # Regular packages and plain modules: the directory of their source file.
    mod_file = getattr(mod, '__file__', None)
    if mod_file:
        return os.path.dirname(mod_file)

    # Namespace packages have no __file__; the import system records where
    # they actually live in __path__, so use that instead of guessing.
    search_path = list(getattr(mod, '__path__', ()))
    if search_path:
        return search_path[0]

    # Last resort (no __file__ and no __path__): build the path below a
    # site-packages directory. getsitepackages()[0] is not guaranteed to be a
    # site-packages directory, so prefer an entry that is one.
    candidates = site.getsitepackages()
    base = next((p for p in candidates if p.endswith('site-packages')),
                candidates[0])
    return os.path.join(base, *pkg.split('.'))
|
||||
|
||||
_sp = next(p for p in site.getsitepackages() if p.endswith('site-packages'))
|
||||
_nvidia = os.path.join(_sp, 'nvidia')
|
||||
|
||||
a = Analysis(
|
||||
['main.py'],
|
||||
pathex=[],
|
||||
binaries=[],
|
||||
datas=[],
|
||||
binaries=[
|
||||
(os.path.join(_nvidia, 'cublas', 'bin', '*.dll'), '.'),
|
||||
(os.path.join(_nvidia, 'cudnn', 'bin', '*.dll'), '.'),
|
||||
],
|
||||
datas=[
|
||||
(os.path.join(_pkg_path('faster_whisper'), 'assets', '*.onnx'), 'faster_whisper/assets'),
|
||||
],
|
||||
hiddenimports=[
|
||||
'ctranslate2',
|
||||
'faster_whisper',
|
||||
|
|
|
|||
|
|
@ -1,18 +1,75 @@
|
|||
import logging
|
||||
import threading
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
from whisper_app import app, config
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def audio_callback(indata, frames, time_info, status):
    """Stream callback: keep a copy of each input block while recording.

    Blocks that arrive while the app is in any state other than RECORDING
    are dropped. ``frames``, ``time_info`` and ``status`` are accepted to
    satisfy the callback signature but are not used.
    """
    is_recording = app.state == app.AppState.RECORDING
    if not is_recording:
        return
    # Store a copy so the saved chunk does not alias the callback's buffer.
    chunk = indata.copy()
    app.audio_chunks.append(chunk)
|
||||
|
||||
|
||||
def resolve_device(name: str | None) -> int | None:
|
||||
"""Resolve a device name to its current PortAudio index, or None for default."""
|
||||
if not name:
|
||||
return None
|
||||
for i, d in enumerate(sd.query_devices()):
|
||||
if d["max_input_channels"] > 0 and d["name"] == name:
|
||||
return i
|
||||
log.warning("Audio device '%s' not found, using default", name)
|
||||
return None
|
||||
|
||||
|
||||
def get_input_devices() -> list[tuple[int, str]]:
    """List the input-capable devices that belong to the default host API.

    Returns:
        ``(index, name)`` pairs in the order reported by
        ``sd.query_devices()``; devices without input channels or on another
        host API are left out.
    """
    default_api = sd.query_hostapis(sd.default.hostapi)["name"]

    found = []
    for idx, info in enumerate(sd.query_devices()):
        if not info["max_input_channels"] > 0:
            continue  # output-only device
        api_name = sd.query_hostapis(info["hostapi"])["name"]
        if api_name == default_api:
            found.append((idx, info["name"]))
    return found
|
||||
|
||||
|
||||
def test_device(device_name: str | None, duration: float,
                on_level: callable, on_done: callable) -> None:
    """Record from a device for *duration* seconds and report input levels.

    The recording runs on a daemon thread; this function returns immediately.

    Args:
        device_name: Name of the input device, or None for the default device.
        duration: Recording time in seconds.
        on_level: Called with a float in 0..1 for every 50 ms audio block,
            from inside the stream callback.
        on_done: Called exactly once from the worker thread with True when
            the peak amplitude exceeded 0.001, otherwise False. Also called
            with False when the device cannot be resolved or opened.
    """

    def _run():
        peak = 0.0

        def _cb(indata, frames, time_info, status):
            nonlocal peak
            level = float(np.abs(indata).max())
            peak = max(peak, level)
            on_level(min(level / 0.1, 1.0))  # normalize: 0.1 amplitude = 100%

        try:
            # Resolve inside the worker so a device-query failure is reported
            # through on_done instead of propagating into the caller.
            device = resolve_device(device_name)
            sr = config.config["sample_rate"]
            block = int(sr * 0.05)  # 50 ms blocks
            with sd.InputStream(samplerate=sr, channels=1, device=device,
                                callback=_cb, blocksize=block):
                sd.sleep(int(duration * 1000))
        except Exception as e:
            log.exception("Mic test failed: %s", e)
            on_done(False)
        else:
            # Kept outside the try so an error raised by on_done itself does
            # not trigger a second on_done(False) call.
            on_done(peak > 0.001)

    threading.Thread(target=_run, daemon=True).start()
|
||||
|
||||
|
||||
def get_audio_stream():
|
||||
device = config.config.get("audio_device")
|
||||
return sd.InputStream(
|
||||
samplerate=config.config["sample_rate"],
|
||||
channels=1,
|
||||
device=device,
|
||||
callback=audio_callback,
|
||||
)
|
||||
device = resolve_device(config.config.get("audio_device"))
|
||||
sr = config.config["sample_rate"]
|
||||
try:
|
||||
return sd.InputStream(
|
||||
samplerate=sr, channels=1, device=device, callback=audio_callback,
|
||||
)
|
||||
except sd.PortAudioError:
|
||||
log.warning("Audio device %s failed, falling back to default", device)
|
||||
return sd.InputStream(
|
||||
samplerate=sr, channels=1, device=None, callback=audio_callback,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ import os
|
|||
import threading
|
||||
import tkinter as tk
|
||||
|
||||
import sounddevice as sd
|
||||
|
||||
from whisper_app import config as cfg
|
||||
|
||||
|
||||
|
|
@ -87,16 +85,55 @@ def _open_main(root: tk.Tk, on_reload) -> None:
|
|||
|
||||
# ── AUDIO ──
|
||||
section("AUDIO")
|
||||
devices = [(i, d["name"]) for i, d in enumerate(sd.query_devices())
|
||||
if d["max_input_channels"] > 0]
|
||||
dev_names = ["Standard"] + [f"{i}: {n}" for i, n in devices]
|
||||
from whisper_app.audio import get_input_devices, test_device
|
||||
devices = get_input_devices()
|
||||
dev_names = ["Standard"] + [name for _, name in devices]
|
||||
dev_var = tk.StringVar()
|
||||
cur_dev = cfg.config.get("audio_device")
|
||||
dev_var.set("Standard" if cur_dev is None else
|
||||
next((f"{i}: {n}" for i, n in devices if i == cur_dev), "Standard"))
|
||||
dev_var.set(cur_dev if cur_dev and cur_dev in dev_names else "Standard")
|
||||
f = row("Mikrofon")
|
||||
dd(f, dev_var, dev_names, width=44).pack(side="left")
|
||||
|
||||
# ── Mic test ──
|
||||
f_test = tk.Frame(content, bg=BG)
|
||||
f_test.pack(fill="x", pady=(2, 8))
|
||||
tk.Label(f_test, text="", width=17, bg=BG).pack(side="left") # spacer
|
||||
|
||||
level_canvas = tk.Canvas(f_test, width=200, height=14, bg=BG3,
|
||||
highlightbackground=BORDER, highlightthickness=1, bd=0)
|
||||
level_canvas.pack(side="left")
|
||||
level_bar = level_canvas.create_rectangle(0, 0, 0, 14, fill=GREEN, width=0)
|
||||
|
||||
test_label = tk.Label(f_test, text="", font=FONT_S, bg=BG, fg=FG2)
|
||||
test_label.pack(side="left", padx=(8, 0))
|
||||
|
||||
def run_mic_test():
|
||||
test_btn.config(state="disabled", text="Aufnahme...")
|
||||
test_label.config(text="Sprich jetzt...", fg=FG2)
|
||||
level_canvas.coords(level_bar, 0, 0, 0, 14)
|
||||
|
||||
dev_name = dev_var.get()
|
||||
device = None if dev_name == "Standard" else dev_name
|
||||
|
||||
def on_level(lvl):
|
||||
win.after(0, lambda: level_canvas.coords(level_bar, 0, 0, int(lvl * 200), 14))
|
||||
|
||||
def on_done(ok):
|
||||
def _update():
|
||||
test_btn.config(state="normal", text="Test")
|
||||
if ok:
|
||||
test_label.config(text="Signal erkannt", fg=GREEN)
|
||||
else:
|
||||
test_label.config(text="Kein Signal!", fg="#f05050")
|
||||
win.after(0, _update)
|
||||
|
||||
test_device(device, 2.0, on_level, on_done)
|
||||
|
||||
test_btn = tk.Button(f_test, text="Test", command=run_mic_test,
|
||||
bg=BG3, fg=FG, font=FONT_S, relief="flat",
|
||||
padx=10, pady=3, cursor="hand2", bd=0)
|
||||
test_btn.pack(side="left", padx=(8, 0))
|
||||
|
||||
# ── MODELL ──
|
||||
section("MODELL")
|
||||
model_hints = {
|
||||
|
|
@ -147,7 +184,7 @@ def _open_main(root: tk.Tk, on_reload) -> None:
|
|||
|
||||
def save():
|
||||
sel = dev_var.get()
|
||||
cfg.config["audio_device"] = None if sel == "Standard" else int(sel.split(":")[0])
|
||||
cfg.config["audio_device"] = None if sel == "Standard" else sel
|
||||
cfg.config["model"] = model_var.get()
|
||||
cfg.config["language"] = cfg.LANGUAGES[lang_var.get()]
|
||||
cfg.config["device"] = device_var.get()
|
||||
|
|
|
|||
|
|
@ -20,10 +20,22 @@ def stop_and_transcribe() -> None:
|
|||
if app.state != app.AppState.RECORDING:
|
||||
return
|
||||
set_state(app.AppState.TRANSCRIBING)
|
||||
try:
|
||||
_do_transcribe()
|
||||
except Exception as e:
|
||||
app.log(f"Transcription error: {e}")
|
||||
finally:
|
||||
set_state(app.AppState.IDLE)
|
||||
|
||||
|
||||
def _do_transcribe() -> None:
|
||||
chunks = list(app.audio_chunks)
|
||||
|
||||
if not chunks:
|
||||
set_state(app.AppState.IDLE)
|
||||
return
|
||||
|
||||
if app.model is None:
|
||||
app.log("Model not loaded yet — skipped.")
|
||||
return
|
||||
|
||||
audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
|
||||
|
|
@ -33,7 +45,6 @@ def stop_and_transcribe() -> None:
|
|||
|
||||
if duration < 0.3 or rms < 0.0001:
|
||||
app.log("Too short or silent — skipped.")
|
||||
set_state(app.AppState.IDLE)
|
||||
return
|
||||
|
||||
target_rms = 0.05
|
||||
|
|
@ -51,7 +62,6 @@ def stop_and_transcribe() -> None:
|
|||
text = config.apply_vocab(text)
|
||||
app.log(f"Result: {repr(text)}")
|
||||
|
||||
set_state(app.AppState.IDLE)
|
||||
if text:
|
||||
time.sleep(0.15)
|
||||
typer.type_text(text)
|
||||
|
|
|
|||
Loading…
Reference in New Issue