Skip to content

Commit bb994c5

Browse files
authored
support test long audio with streaming-model & vad (#2405)
1 parent 91707ec commit bb994c5

File tree

3 files changed

+209
-0
lines changed

3 files changed

+209
-0
lines changed

cmake/cmake_extension.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def get_binaries():
6868
"sherpa-onnx-vad-microphone",
6969
"sherpa-onnx-vad-microphone-offline-asr",
7070
"sherpa-onnx-vad-with-offline-asr",
71+
"sherpa-onnx-vad-with-online-asr",
7172
"sherpa-onnx-version",
7273
]
7374

sherpa-onnx/csrc/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,10 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
505505
sherpa-onnx-vad-with-offline-asr.cc
506506
)
507507

508+
add_executable(sherpa-onnx-vad-with-online-asr
509+
sherpa-onnx-vad-with-online-asr.cc
510+
)
511+
508512
add_executable(sherpa-onnx-vad-microphone-offline-asr
509513
sherpa-onnx-vad-microphone-offline-asr.cc
510514
microphone.cc
@@ -529,6 +533,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
529533
sherpa-onnx-vad-microphone
530534
sherpa-onnx-vad-microphone-offline-asr
531535
sherpa-onnx-vad-with-offline-asr
536+
sherpa-onnx-vad-with-online-asr
532537
)
533538
if(SHERPA_ONNX_ENABLE_TTS)
534539
list(APPEND exes
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
// sherpa-onnx/csrc/sherpa-onnx-vad-with-online-asr.cc
2+
//
3+
// Copyright (c) 2025 Xiaomi Corporation
4+
// Copyright (c) 2025 Pingfeng Luo
5+
//
6+
// This file demonstrates how to use vad in streaming speech recognition
7+
//
8+
9+
#include <stdio.h>

#include <algorithm>
#include <chrono>  // NOLINT
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/online-stream.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/symbol-table.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-reader.h"
22+
23+
int32_t main(int32_t argc, char *argv[]) {
24+
const char *kUsageMessage = R"usage(
25+
Speech recognition using VAD + streaming models with sherpa-onnx-vad-with-online-asr.
26+
This is useful when testing long audio.
27+
28+
Usage:
29+
30+
Note you can download silero_vad.onnx using
31+
32+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
33+
34+
(1) Streaming transducer
35+
36+
./bin/sherpa-onnx-vad-with-online-asr \
37+
--silero-vad-model=/path/to/silero_vad.onnx \
38+
--tokens=/path/to/tokens.txt \
39+
--encoder=/path/to/encoder.onnx \
40+
--decoder=/path/to/decoder.onnx \
41+
--joiner=/path/to/joiner.onnx \
42+
--provider=cpu \
43+
--num-threads=2 \
44+
--decoding-method=greedy_search \
45+
/path/to/long_duration.wav
46+
47+
(2) Streaming zipformer2 CTC
48+
49+
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
50+
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
51+
52+
./bin/sherpa-onnx-vad-with-online-asr \
53+
--debug=1 \
54+
--silero-vad-model=/path/to/silero_vad.onnx \
55+
--zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
56+
--tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
57+
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav
58+
59+
(3) Streaming paraformer
60+
61+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
62+
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
63+
64+
./bin/sherpa-onnx-vad-with-online-asr \
65+
--silero-vad-model=/path/to/silero_vad.onnx \
66+
--tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \
67+
--paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx \
68+
--paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx \
69+
/path/to/long_duration.wav
70+
71+
72+
The input wav should be of single channel, 16-bit PCM encoded wave file; its
73+
sampling rate can be arbitrary and does not need to be 16kHz.
74+
75+
Please refer to
76+
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
77+
for a list of pre-trained models to download.
78+
)usage";
79+
80+
sherpa_onnx::ParseOptions po(kUsageMessage);
81+
sherpa_onnx::OnlineRecognizerConfig asr_config;
82+
asr_config.Register(&po);
83+
84+
sherpa_onnx::VadModelConfig vad_config;
85+
vad_config.Register(&po);
86+
87+
po.Read(argc, argv);
88+
if (po.NumArgs() != 1) {
89+
fprintf(stderr, "Error: Please provide exactly 1 wave file. Given: %d\n\n",
90+
po.NumArgs());
91+
po.PrintUsage();
92+
exit(EXIT_FAILURE);
93+
}
94+
95+
fprintf(stderr, "%s\n", vad_config.ToString().c_str());
96+
fprintf(stderr, "%s\n", asr_config.ToString().c_str());
97+
98+
if (!vad_config.Validate()) {
99+
fprintf(stderr, "Errors in vad_config!\n");
100+
return -1;
101+
}
102+
103+
if (!asr_config.Validate()) {
104+
fprintf(stderr, "Errors in ASR config!\n");
105+
return -1;
106+
}
107+
108+
fprintf(stderr, "Creating recognizer ...\n");
109+
sherpa_onnx::OnlineRecognizer recognizer(asr_config);
110+
fprintf(stderr, "Recognizer created!\n");
111+
112+
auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);
113+
114+
fprintf(stderr, "Started\n");
115+
const auto begin = std::chrono::steady_clock::now();
116+
117+
std::string wave_filename = po.GetArg(1);
118+
fprintf(stderr, "Reading: %s\n", wave_filename.c_str());
119+
int32_t sampling_rate = -1;
120+
bool is_ok = false;
121+
auto samples = sherpa_onnx::ReadWave(wave_filename, &sampling_rate, &is_ok);
122+
if (!is_ok) {
123+
fprintf(stderr, "Failed to read '%s'\n", wave_filename.c_str());
124+
return -1;
125+
}
126+
127+
if (sampling_rate != 16000) {
128+
fprintf(stderr, "Resampling from %d Hz to 16000 Hz\n", sampling_rate);
129+
float min_freq = std::min(sampling_rate, 16000)
130+
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
131+
132+
int32_t lowpass_filter_width = 6;
133+
auto resampler = std::make_unique<sherpa_onnx::LinearResample>(
134+
sampling_rate, 16000, lowpass_cutoff, lowpass_filter_width);
135+
std::vector<float> out_samples;
136+
resampler->Resample(samples.data(), samples.size(), true, &out_samples);
137+
samples = std::move(out_samples);
138+
fprintf(stderr, "Resampling done\n");
139+
}
140+
141+
fprintf(stderr, "Started!\n");
142+
int32_t window_size = vad_config.ten_vad.model.empty()
143+
? vad_config.silero_vad.window_size : vad_config.ten_vad.window_size;
144+
int32_t offset = 0;
145+
int32_t segment_id = 0;
146+
bool speech_started = false;
147+
while (offset < samples.size()) {
148+
if (offset + window_size <= samples.size()) {
149+
vad->AcceptWaveform(samples.data() + offset, window_size);
150+
} else {
151+
vad->Flush();
152+
}
153+
offset += window_size;
154+
if (vad->IsSpeechDetected() && !speech_started) {
155+
// new voice activity
156+
speech_started = true;
157+
segment_id++;
158+
} else if (!vad->IsSpeechDetected() && speech_started) {
159+
// end voice activity
160+
speech_started = false;
161+
}
162+
163+
while (!vad->Empty()) {
164+
const auto &segment = vad->Front();
165+
float duration = segment.samples.size() / 16000.;
166+
float start_time = segment.start / 16000.;
167+
float end_time = start_time + duration;
168+
auto s = recognizer.CreateStream();
169+
s->AcceptWaveform(16000, segment.samples.data(), segment.samples.size());
170+
s->InputFinished();
171+
while (recognizer.IsReady(s.get())) {
172+
recognizer.DecodeStream(s.get());
173+
}
174+
auto text = recognizer.GetResult(s.get()).text;
175+
if (!text.empty()) {
176+
fprintf(stderr, "vad segment(%d:%.3f-%.3f) results: %s\n",
177+
segment_id, start_time, end_time, text.c_str());
178+
}
179+
vad->Pop();
180+
}
181+
}
182+
183+
const auto end = std::chrono::steady_clock::now();
184+
185+
float elapsed_seconds =
186+
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
187+
.count() /
188+
1000.;
189+
190+
fprintf(stderr, "num threads: %d\n", asr_config.model_config.num_threads);
191+
fprintf(stderr, "decoding method: %s\n", asr_config.decoding_method.c_str());
192+
if (asr_config.decoding_method == "modified_beam_search") {
193+
fprintf(stderr, "max active paths: %d\n", asr_config.max_active_paths);
194+
}
195+
196+
float duration = samples.size() / 16000.;
197+
fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
198+
float rtf = elapsed_seconds / duration;
199+
fprintf(stderr, "Real time factor (RTF): %.3f / %.3f = %.3f\n",
200+
elapsed_seconds, duration, rtf);
201+
202+
return 0;
203+
}

0 commit comments

Comments
 (0)