import json

import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model


def load_vosk(model_id: str) -> Model:
    """Download a Vosk model from the Hugging Face Hub and load it."""
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Let configs/models.yaml instantiate models via ${load_vosk:<model_id>}.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

models_config = OmegaConf.load("configs/models.yaml")
# Use the first entry in the config as the default model bundle.
DEFAULT_MODEL = OmegaConf.to_object(models_config[list(models_config.keys())[0]])


def automatic_speech_recognition(
    dialect_id: str, audio_data: tuple[int, np.ndarray]
) -> str:
    # "model" is either a single Vosk model or a per-dialect mapping.
    if isinstance(DEFAULT_MODEL["model"], dict):
        model = DEFAULT_MODEL["model"][dialect_id]
    else:
        model = DEFAULT_MODEL["model"]

    # Gradio delivers numpy audio as (sample_rate, samples).
    sample_rate, audio_array = audio_data
    if audio_array.ndim == 2:
        # Downmix stereo to mono by keeping the first channel.
        audio_array = audio_array[:, 0]
    # Vosk expects raw 16-bit mono PCM bytes.
    audio_bytes = audio_array.tobytes()

    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)

    # Feed the recognizer in 4000-byte chunks, collecting each segment
    # the recognizer finalizes along the way.
    results = []
    for start in range(0, len(audio_bytes), 4000):
        data = audio_bytes[start : start + 4000]
        if rec.AcceptWaveform(data):
            results.append(json.loads(rec.Result()))
    # Flush whatever is still buffered in the recognizer.
    results.append(json.loads(rec.FinalResult()))

    filtered_lines = []
    for result in results:
        if len(result["text"]) > 0:
            if dialect_id == "formosan_ami":
                # Post-process Amis output: the model emits "u" where the
                # written orthography uses "o".
                result["text"] = result["text"].replace("u", "o")
            filtered_lines.append(result["text"])

    # Join the segments into a single sentence-like string.
    return (", ".join(filtered_lines) + ".").capitalize()


def get_title() -> str:
    # The first line of DEMO.md is a level-1 heading; strip the "# " prefix
    # and the trailing newline to get a plain title.
    with open("DEMO.md") as tong:
        return tong.readline().lstrip("# ").strip()


demo = gr.Blocks(
    title=get_title(),
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    with open("DEMO.md") as tong:
        gr.Markdown(tong.read())
    with gr.Row():
        with gr.Column():
            dialect_drop_down = gr.Radio(
                choices=[(k, v) for k, v in DEFAULT_MODEL["dialect_mapping"].items()],
                value=list(DEFAULT_MODEL["dialect_mapping"].values())[0],
                label="步驟一:選擇族別",  # Step 1: select the language group
            )
            audio_source = gr.Audio(
                # Step 2: upload an audio file, or click the mic to record
                label="步驟二:上傳待辨識音檔或點擊🎙️自行錄音",
                type="numpy",
                format="wav",
                waveform_options=gr.WaveformOptions(
                    sample_rate=16000,
                ),
                sources=["microphone", "upload"],
            )
            # Step 3: start recognition
            submit_button = gr.Button("步驟三:開始辨識", variant="primary")
        with gr.Column():
            # Recognition result
            output_textbox = gr.TextArea(interactive=True, label="辨識結果")

    submit_button.click(
        automatic_speech_recognition,
        inputs=[dialect_drop_down, audio_source],
        outputs=[output_textbox],
    )

demo.launch()
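
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the configs/models.yaml shape this
# script expects: "dialect_mapping" drives the Radio choices, and "model" is
# built through the ${load_vosk:...} resolver registered above, either as a
# single model or keyed per dialect. The group name and Hugging Face model
# ids below are hypothetical placeholders, not the repository's actual ids.
#
#   formosan:
#     dialect_mapping:
#       阿美語: formosan_ami      # Amis
#       賽德克語: formosan_sdq    # Seediq
#     model:
#       formosan_ami: ${load_vosk:example-org/vosk-model-ami}
#       formosan_sdq: ${load_vosk:example-org/vosk-model-sdq}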
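
# ---------------------------------------------------------------------------
# A hedged usage sketch: calling automatic_speech_recognition directly,
# without the UI (e.g. from a REPL, since demo.launch() blocks this script).
# It assumes a 16-bit mono WAV file at the hypothetical path "sample.wav"
# and the "formosan_ami" dialect id from the config sketch above.
#
#   import wave
#
#   with wave.open("sample.wav", "rb") as wav:
#       sample_rate = wav.getframerate()
#       samples = np.frombuffer(wav.readframes(wav.getnframes()), dtype=np.int16)
#   print(automatic_speech_recognition("formosan_ami", (sample_rate, samples)))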