Skip to content

Commit e2b2d5e

Browse files
authored
Add CXX examples for NeMo TDT ASR. (#2363)
# New Features
- Added new example programs demonstrating streaming speech recognition from a microphone using Parakeet-TDT CTC and Zipformer Transducer models with voice activity detection.
- These examples support microphone input via PortAudio and display recognized text incrementally.

# Bug Fixes
- Improved error handling and logic when opening microphone devices in several example programs for more reliable device initialization.

# Chores
- Updated build configuration to include new executable examples when PortAudio support is enabled.
1 parent f096034 commit e2b2d5e

6 files changed

+512
-9
lines changed

cxx-api-examples/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
4949
portaudio_static
5050
)
5151

52+
add_executable(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
53+
./parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
54+
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
55+
)
56+
target_link_libraries(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
57+
sherpa-onnx-cxx-api
58+
portaudio_static
59+
)
60+
5261
add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
5362
./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
5463
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
@@ -57,6 +66,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
5766
sherpa-onnx-cxx-api
5867
portaudio_static
5968
)
69+
70+
add_executable(zipformer-transducer-simulate-streaming-microphone-cxx-api
71+
./zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
72+
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
73+
)
74+
target_link_libraries(zipformer-transducer-simulate-streaming-microphone-cxx-api
75+
sherpa-onnx-cxx-api
76+
portaudio_static
77+
)
6078
endif()
6179

6280
if(SHERPA_ONNX_HAS_ALSA)
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
// cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
2+
// Copyright (c) 2025 Xiaomi Corporation
3+
4+
//
5+
// This file demonstrates how to use parakeet-tdt with sherpa-onnx's C++ API
6+
// for streaming speech recognition from a microphone.
7+
//
8+
// clang-format off
9+
//
10+
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
11+
//
12+
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
13+
// tar xvf sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
14+
// rm sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
15+
//
16+
// clang-format on
17+
18+
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <atomic>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <vector>

#include "portaudio.h"       // NOLINT
#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/microphone.h"
33+
34+
// State shared between the PortAudio callback (RecordCallback), the SIGINT
// handler (Handler) and the processing loop in main().

// Audio chunks queued by RecordCallback and consumed by main().
// Accessed only while holding `mutex`.
std::queue<std::vector<float>> samples_queue;

// Notified whenever a chunk is queued and when a stop is requested.
std::condition_variable condition_variable;

// Guards `samples_queue`.
std::mutex mutex;

// Set to true on Ctrl + C. It is written from the signal handler and read by
// the audio callback and by main() without holding `mutex`, so it has to be
// atomic to avoid a data race.
std::atomic<bool> stop{false};
38+
39+
// SIGINT handler: raises the global `stop` flag, wakes one waiter on
// `condition_variable` and reports the interrupt on stderr.
// NOTE(review): notify_one() and stdio calls are not guaranteed to be
// async-signal-safe - confirm this is acceptable for an example program.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fputs("\nCaught Ctrl + C. Exiting...\n", stderr);
}
44+
45+
static int32_t RecordCallback(const void *input_buffer,
46+
void * /*output_buffer*/,
47+
unsigned long frames_per_buffer, // NOLINT
48+
const PaStreamCallbackTimeInfo * /*time_info*/,
49+
PaStreamCallbackFlags /*status_flags*/,
50+
void * /*user_data*/) {
51+
std::lock_guard<std::mutex> lock(mutex);
52+
samples_queue.emplace(
53+
reinterpret_cast<const float *>(input_buffer),
54+
reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
55+
condition_variable.notify_one();
56+
57+
return stop ? paComplete : paContinue;
58+
}
59+
60+
// Builds the silero-vad based voice activity detector used by main().
//
// The model is loaded from ./silero_vad.onnx and configured for 16 kHz
// input. The process exits with -1 if the detector cannot be created.
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  namespace sx = sherpa_onnx::cxx;

  sx::VadModelConfig vad_config;
  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.25;
  silero.min_silence_duration = 0.25;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 5;

  // The second argument is presumably the internal buffer size in seconds -
  // confirm against VoiceActivityDetector::Create.
  sx::VoiceActivityDetector detector =
      sx::VoiceActivityDetector::Create(vad_config, 60);
  if (!detector.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    exit(-1);
  }

  return detector;
}
79+
80+
// Builds the offline (non-streaming) recognizer backed by the int8
// Parakeet-TDT CTC model, configured through `nemo_ctc`.
//
// Progress is printed on stdout. The process exits with -1 if the
// recognizer cannot be created.
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  namespace sx = sherpa_onnx::cxx;

  sx::OfflineRecognizerConfig asr_config;
  auto &model = asr_config.model_config;

  model.nemo_ctc.model =
      "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/model.int8.onnx";
  model.tokens =
      "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/tokens.txt";
  model.num_threads = 2;
  model.debug = false;

  std::cout << "Loading model\n";
  sx::OfflineRecognizer asr = sx::OfflineRecognizer::Create(asr_config);
  if (!asr.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return asr;
}
101+
102+
int32_t main() {
103+
signal(SIGINT, Handler);
104+
105+
using namespace sherpa_onnx::cxx; // NOLINT
106+
107+
auto vad = CreateVad();
108+
auto recognizer = CreateOfflineRecognizer();
109+
110+
sherpa_onnx::Microphone mic;
111+
112+
PaDeviceIndex num_devices = Pa_GetDeviceCount();
113+
if (num_devices == 0) {
114+
std::cerr << " If you are using Linux, please try to modify "
115+
"./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
116+
return -1;
117+
}
118+
119+
int32_t device_index = Pa_GetDefaultInputDevice();
120+
const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
121+
if (pDeviceIndex) {
122+
fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
123+
device_index = atoi(pDeviceIndex);
124+
}
125+
mic.PrintDevices(device_index);
126+
127+
float mic_sample_rate = 16000;
128+
const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
129+
if (sample_rate_str) {
130+
mic_sample_rate = atof(sample_rate_str);
131+
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
132+
}
133+
134+
float sample_rate = 16000;
135+
LinearResampler resampler;
136+
if (mic_sample_rate != sample_rate) {
137+
float min_freq = std::min(mic_sample_rate, sample_rate);
138+
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
139+
140+
int32_t lowpass_filter_width = 6;
141+
resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
142+
lowpass_cutoff, lowpass_filter_width);
143+
}
144+
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
145+
nullptr)) {
146+
std::cerr << "Failed to open microphone device\n";
147+
return -1;
148+
}
149+
150+
int32_t window_size = 512; // samples, please don't change
151+
152+
int32_t offset = 0;
153+
std::vector<float> buffer;
154+
bool speech_started = false;
155+
156+
auto started_time = std::chrono::steady_clock::now();
157+
158+
SherpaDisplay display;
159+
160+
std::cout << "Started! Please speak\n";
161+
162+
while (!stop) {
163+
{
164+
std::unique_lock<std::mutex> lock(mutex);
165+
while (samples_queue.empty() && !stop) {
166+
condition_variable.wait(lock);
167+
}
168+
169+
const auto &s = samples_queue.front();
170+
if (!resampler.Get()) {
171+
buffer.insert(buffer.end(), s.begin(), s.end());
172+
} else {
173+
auto resampled = resampler.Resample(s.data(), s.size(), false);
174+
buffer.insert(buffer.end(), resampled.begin(), resampled.end());
175+
}
176+
177+
samples_queue.pop();
178+
}
179+
180+
for (; offset + window_size < buffer.size(); offset += window_size) {
181+
vad.AcceptWaveform(buffer.data() + offset, window_size);
182+
if (!speech_started && vad.IsDetected()) {
183+
speech_started = true;
184+
started_time = std::chrono::steady_clock::now();
185+
}
186+
}
187+
if (!speech_started) {
188+
if (buffer.size() > 10 * window_size) {
189+
offset -= buffer.size() - 10 * window_size;
190+
buffer = {buffer.end() - 10 * window_size, buffer.end()};
191+
}
192+
}
193+
194+
auto current_time = std::chrono::steady_clock::now();
195+
const float elapsed_seconds =
196+
std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
197+
started_time)
198+
.count() /
199+
1000.;
200+
201+
if (speech_started && elapsed_seconds > 0.2) {
202+
OfflineStream stream = recognizer.CreateStream();
203+
stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
204+
205+
recognizer.Decode(&stream);
206+
207+
OfflineRecognizerResult result = recognizer.GetResult(&stream);
208+
display.UpdateText(result.text);
209+
display.Display();
210+
211+
started_time = std::chrono::steady_clock::now();
212+
}
213+
214+
while (!vad.IsEmpty()) {
215+
auto segment = vad.Front();
216+
217+
vad.Pop();
218+
219+
OfflineStream stream = recognizer.CreateStream();
220+
stream.AcceptWaveform(sample_rate, segment.samples.data(),
221+
segment.samples.size());
222+
223+
recognizer.Decode(&stream);
224+
225+
OfflineRecognizerResult result = recognizer.GetResult(&stream);
226+
227+
display.UpdateText(result.text);
228+
display.FinalizeCurrentSentence();
229+
display.Display();
230+
231+
buffer.clear();
232+
offset = 0;
233+
speech_started = false;
234+
}
235+
}
236+
237+
return 0;
238+
}

cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,7 @@ int32_t main() {
136136
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
137137
mic_sample_rate = atof(sample_rate_str);
138138
}
139-
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
140-
nullptr) == false) {
141-
std::cerr << "Failed to open microphone device\n";
142-
return -1;
143-
}
139+
144140
float sample_rate = 16000;
145141
LinearResampler resampler;
146142
if (mic_sample_rate != sample_rate) {
@@ -152,6 +148,12 @@ int32_t main() {
152148
lowpass_cutoff, lowpass_filter_width);
153149
}
154150

151+
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
152+
nullptr)) {
153+
std::cerr << "Failed to open microphone device\n";
154+
return -1;
155+
}
156+
155157
int32_t window_size = 512; // samples, please don't change
156158

157159
int32_t offset = 0;

cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,8 @@ int32_t main() {
142142
resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
143143
lowpass_cutoff, lowpass_filter_width);
144144
}
145-
if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
146-
nullptr) == false) {
145+
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
146+
nullptr)) {
147147
std::cerr << "Failed to open microphone device\n";
148148
return -1;
149149
}

cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ int32_t main() {
140140
resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
141141
lowpass_cutoff, lowpass_filter_width);
142142
}
143-
if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
144-
nullptr) == false) {
143+
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
144+
nullptr)) {
145145
std::cerr << "Failed to open microphone device\n";
146146
return -1;
147147
}

0 commit comments

Comments
 (0)