90 lines
2.7 KiB
Python
90 lines
2.7 KiB
Python
import time
|
|
|
|
import numpy as np
|
|
from faster_whisper import WhisperModel
|
|
|
|
from whisper_app import app, config, grammar, media_duck, typer
|
|
|
|
|
|
def load_model() -> None:
    """Create the Whisper model from the configuration and store it in ``app.model``.

    Reads ``model``, ``device``, ``compute_type`` and the optional
    ``model_dir`` from ``config.config``, logging progress through
    ``app.log``. When ``grammar_check`` is enabled, the grammar module is
    initialised afterwards.
    """
    app.log(f"Loading {config.config['model']} on {config.config['device']}...")

    settings = config.config
    # A missing or empty ``model_dir`` is passed to the model as ``None``.
    download_root = settings.get("model_dir") or None
    app.model = WhisperModel(
        settings["model"],
        device=settings["device"],
        compute_type=settings["compute_type"],
        download_root=download_root,
    )
    app.log("Model ready.")

    if not settings.get("grammar_check"):
        return
    # The grammar language falls back to "de" when none is configured.
    grammar_language = settings.get("language") or "de"
    grammar.init(grammar_language, log=app.log)
|
|
|
|
|
|
def stop_and_transcribe() -> None:
    """Transcribe the current recording, then return the app to IDLE.

    Does nothing unless ``app.state`` is RECORDING. Exceptions raised while
    transcribing are logged through ``app.log`` instead of propagating, and
    the state is reset to IDLE whether or not transcription succeeded.
    """
    was_recording = app.state == app.AppState.RECORDING
    if was_recording:
        set_state(app.AppState.TRANSCRIBING)
        try:
            _do_transcribe()
        except Exception as e:
            # Boundary handler: report the failure, never let it escape.
            app.log(f"Transcription error: {e}")
        finally:
            set_state(app.AppState.IDLE)
|
|
|
|
|
|
def _do_transcribe() -> None:
    """Transcribe the buffered audio and type the resulting text.

    Steps:
      1. Copy ``app.audio_chunks`` and bail out if there is nothing to do or
         the model has not been loaded.
      2. Join the chunks into one flat float32 array; skip recordings that
         are shorter than 0.3 s or whose RMS is below 0.0001.
      3. Scale the audio to an RMS of 0.05 and clip it to [-1.0, 1.0].
      4. Run the model, join the segment texts, apply the vocabulary
         replacements and, if enabled, the grammar correction.
      5. Type the text via ``typer.type_text`` when it is non-empty.

    Raises whatever the model, config helpers or typer raise; the caller is
    expected to handle that.
    """
    min_duration_s = 0.3
    min_rms = 0.0001
    target_rms = 0.05

    # Work on a copy so this run uses a list independent of app.audio_chunks.
    chunks = list(app.audio_chunks)
    if not chunks:
        return

    if app.model is None:
        app.log("Model not loaded yet — skipped.")
        return

    audio = np.concatenate(chunks, axis=0).flatten().astype(np.float32)
    duration = len(audio) / config.config["sample_rate"]
    rms = float(np.sqrt(np.mean(audio ** 2)))
    app.log(f"Audio: {duration:.1f}s RMS: {rms:.5f}")

    if duration < min_duration_s or rms < min_rms:
        app.log("Too short or silent — skipped.")
        return

    # Normalise loudness. For finite input rms is already >= min_rms here;
    # the guard is kept as a division-by-zero safeguard.
    if rms > 0:
        audio = audio * (target_rms / rms)
    audio = np.clip(audio, -1.0, 1.0)

    # Use .get so a missing "language" key behaves like an empty one
    # (language=None) instead of raising KeyError; load_model reads the same
    # setting with .get as well.
    lang = config.config.get("language") or None
    prompt = config.get_initial_prompt()
    segments, _ = app.model.transcribe(
        audio,
        language=lang,
        beam_size=5,
        vad_filter=True,
        initial_prompt=prompt or None,
    )
    text = " ".join(s.text for s in segments).strip()
    text = config.apply_vocab(text)
    if config.config.get("grammar_check"):
        text = grammar.correct(text)
    app.log(f"Result: {repr(text)}")

    if text:
        # NOTE(review): short pause before typing — presumably lets the
        # hotkey release / target window settle; confirm the intent.
        time.sleep(0.15)
        typer.type_text(text)
|
|
|
|
|
|
def set_state(new_state: app.AppState) -> None:  # semi-public, used by main.py
    """Store *new_state* in ``app.state`` and update the UI and media ducking.

    If a tray icon exists, it is refreshed for the new state. Entering
    RECORDING shows the overlay and, when the ``media_duck`` setting is
    enabled, ducks media using ``duck_percent`` (default 20). Any other
    state hides the overlay and calls ``media_duck.unduck()``.
    """
    app.state = new_state

    if app.tray_icon:
        # Imported here rather than at module level, as in the original.
        from whisper_app import tray

        tray.update_icon(new_state)

    # Both branches below need the overlay, so import it once.
    from whisper_app import overlay

    is_recording = new_state == app.AppState.RECORDING
    if not is_recording:
        overlay.hide()
        media_duck.unduck()
        return

    overlay.show()
    if config.config.get("media_duck"):
        duck_level = config.config.get("duck_percent", 20)
        media_duck.duck(duck_level)
|