Skip to content

Commit 19ac7d5

Browse files
authored
Add C++ and Python support for ten-vad (#2377)
This PR adds support for the TEN VAD model alongside the existing Silero VAD in both the C++ and Python interfaces.
- Introduces TenVadModelConfig with Python bindings and integrates it into VadModelConfig.
- Implements TenVadModel in C++ and extends the factory (VadModel::Create) and detector logic to choose between Silero and TEN VAD.
- Updates build files (CMake), fixes a spelling typo, and extends the Python example script to demonstrate --ten-vad-model.
1 parent bebd996 commit 19ac7d5

20 files changed

+902
-49
lines changed

cmake/kaldi-native-fbank.cmake

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
function(download_kaldi_native_fbank)
22
include(FetchContent)
33

4-
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz")
5-
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz")
6-
set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d")
4+
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.3.tar.gz")
5+
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.3.tar.gz")
6+
set(kaldi_native_fbank_HASH "SHA256=d409eddae5a46dc796f0841880f489ff0728b96ae26218702cd438c28667c70e")
77

88
set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
99
set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
1212
# If you don't have access to the Internet,
1313
# please pre-download kaldi-native-fbank
1414
set(possible_file_locations
15-
$ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz
16-
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz
17-
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz
18-
/tmp/kaldi-native-fbank-1.21.2.tar.gz
19-
/star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz
15+
$ENV{HOME}/Downloads/kaldi-native-fbank-1.21.3.tar.gz
16+
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.3.tar.gz
17+
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.3.tar.gz
18+
/tmp/kaldi-native-fbank-1.21.3.tar.gz
19+
/star-fj/fangjun/download/github/kaldi-native-fbank-1.21.3.tar.gz
2020
)
2121

2222
foreach(f IN LISTS possible_file_locations)

cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
22
// Copyright (c) 2025 Xiaomi Corporation
33
//
4-
// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++ API
5-
// for streaming speech recognition from a microphone.
4+
// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++
5+
// API for streaming speech recognition from a microphone.
66
//
77
// clang-format off
88
//

python-api-examples/generate-subtitles.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@
1919
2020
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
2121
22+
or download ten-vad.onnx, for instance
23+
24+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
25+
26+
Please replace --silero-vad-model with --ten-vad-model below to use ten-vad.
27+
2228
(1) For paraformer
2329
2430
./python-api-examples/generate-subtitles.py \
@@ -124,8 +130,13 @@ def get_args():
124130
parser.add_argument(
125131
"--silero-vad-model",
126132
type=str,
127-
required=True,
128-
help="Path to silero_vad.onnx",
133+
help="Path to silero_vad.onnx.",
134+
)
135+
136+
parser.add_argument(
137+
"--ten-vad-model",
138+
type=str,
139+
help="Path to ten-vad.onnx",
129140
)
130141

131142
parser.add_argument(
@@ -499,7 +510,12 @@ def __str__(self):
499510
def main():
500511
args = get_args()
501512
assert_file_exists(args.tokens)
502-
assert_file_exists(args.silero_vad_model)
513+
if args.silero_vad_model:
514+
assert_file_exists(args.silero_vad_model)
515+
elif args.ten_vad_model:
516+
assert_file_exists(args.ten_vad_model)
517+
else:
518+
raise ValueError("You need to supply one vad model")
503519

504520
assert args.num_threads > 0, args.num_threads
505521

@@ -536,18 +552,34 @@ def main():
536552
stream = recognizer.create_stream()
537553

538554
config = sherpa_onnx.VadModelConfig()
539-
config.silero_vad.model = args.silero_vad_model
540-
config.silero_vad.threshold = 0.5
541-
config.silero_vad.min_silence_duration = 0.25 # seconds
542-
config.silero_vad.min_speech_duration = 0.25 # seconds
543-
544-
# If the current segment is larger than this value, then it increases
545-
# the threshold to 0.9 internally. After detecting this segment,
546-
# it resets the threshold to its original value.
547-
config.silero_vad.max_speech_duration = 5 # seconds
548-
config.sample_rate = args.sample_rate
549-
550-
window_size = config.silero_vad.window_size
555+
if args.silero_vad_model:
556+
config.silero_vad.model = args.silero_vad_model
557+
config.silero_vad.threshold = 0.2
558+
config.silero_vad.min_silence_duration = 0.25 # seconds
559+
config.silero_vad.min_speech_duration = 0.25 # seconds
560+
561+
# If the current segment is larger than this value, then it increases
562+
# the threshold to 0.9 internally. After detecting this segment,
563+
# it resets the threshold to its original value.
564+
config.silero_vad.max_speech_duration = 5 # seconds
565+
config.sample_rate = args.sample_rate
566+
567+
window_size = config.silero_vad.window_size
568+
print("use silero-vad")
569+
else:
570+
config.ten_vad.model = args.ten_vad_model
571+
config.ten_vad.threshold = 0.2
572+
config.ten_vad.min_silence_duration = 0.25 # seconds
573+
config.ten_vad.min_speech_duration = 0.25 # seconds
574+
575+
# If the current segment is larger than this value, then it increases
576+
# the threshold to 0.9 internally. After detecting this segment,
577+
# it resets the threshold to its original value.
578+
config.ten_vad.max_speech_duration = 5 # seconds
579+
config.sample_rate = args.sample_rate
580+
581+
window_size = config.ten_vad.window_size
582+
print("use ten-vad")
551583

552584
buffer = []
553585
vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

sherpa-onnx/csrc/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ set(sources
123123
spoken-language-identification.cc
124124
stack.cc
125125
symbol-table.cc
126+
ten-vad-model-config.cc
127+
ten-vad-model.cc
126128
text-utils.cc
127129
transducer-keyword-decoder.cc
128130
transpose.cc

sherpa-onnx/csrc/silero-vad-model-config.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
4040
"to the silero VAD model. WARNING! Silero VAD models were trained using "
4141
"512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples "
4242
"for 8000 sample rate. Values other than these may affect model "
43-
"perfomance!");
43+
"performance!");
4444
}
4545

4646
bool SileroVadModelConfig::Validate() const {

sherpa-onnx/csrc/silero-vad-model-config.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ struct SileroVadModelConfig {
2424
float min_speech_duration = 0.25; // in seconds
2525

2626
// 512, 1024, 1536 samples for 16000 Hz
27-
// 256, 512, 768 samples for 800 Hz
2827
int32_t window_size = 512; // in samples
2928

3029
// If a speech segment is longer than this value, then we increase
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// sherpa-onnx/csrc/ten-vad-model-config.cc
2+
//
3+
// Copyright (c) 2025 Xiaomi Corporation
4+
5+
#include "sherpa-onnx/csrc/ten-vad-model-config.h"

#include <sstream>
#include <string>

#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
9+
10+
namespace sherpa_onnx {
11+
12+
// Registers all TEN VAD command-line options with the given option parser.
//
// Every option is registered together with the address of the config member
// that backs it: model, threshold, min_silence_duration, min_speech_duration,
// max_speech_duration and window_size.
//
// @param po Option parser receiving the registrations. It is dereferenced
//           unconditionally, so it must not be null.
void TenVadModelConfig::Register(ParseOptions *po) {
  po->Register("ten-vad-model", &model, "Path to TEN VAD ONNX model.");

  po->Register("ten-vad-threshold", &threshold,
               "Speech threshold. TEN VAD outputs speech probabilities for "
               "each audio chunk, probabilities ABOVE this value are "
               "considered as SPEECH. It is better to tune this parameter for "
               "each dataset separately, but lazy "
               "0.5 is pretty good for most datasets.");

  po->Register("ten-vad-min-silence-duration", &min_silence_duration,
               "In seconds.  In the end of each speech chunk wait for "
               "--ten-vad-min-silence-duration seconds before separating it");

  po->Register("ten-vad-min-speech-duration", &min_speech_duration,
               "In seconds.  In the end of each silence chunk wait for "
               "--ten-vad-min-speech-duration seconds before separating it");

  po->Register(
      "ten-vad-max-speech-duration", &max_speech_duration,
      "In seconds. If a speech segment is longer than this value, then we "
      "increase the threshold to 0.9. After finishing detecting the segment, "
      "the threshold value is reset to its original value.");

  // The recommended window sizes are written as a complete sentence so the
  // help text does not end in a dangling fragment with trailing whitespace.
  po->Register(
      "ten-vad-window-size", &window_size,
      "In samples. Audio chunks of --ten-vad-window-size samples are fed "
      "to the TEN VAD model. WARNING! Please use 160 or 256.");
}
41+
42+
bool TenVadModelConfig::Validate() const {
43+
if (model.empty()) {
44+
SHERPA_ONNX_LOGE("Please provide --ten-vad-model");
45+
return false;
46+
}
47+
48+
if (!FileExists(model)) {
49+
SHERPA_ONNX_LOGE("TEN vad model file '%s' does not exist", model.c_str());
50+
return false;
51+
}
52+
53+
if (threshold < 0.01) {
54+
SHERPA_ONNX_LOGE(
55+
"Please use a larger value for --ten-vad-threshold. Given: %f",
56+
threshold);
57+
return false;
58+
}
59+
60+
if (threshold >= 1) {
61+
SHERPA_ONNX_LOGE(
62+
"Please use a smaller value for --ten-vad-threshold. Given: %f",
63+
threshold);
64+
return false;
65+
}
66+
67+
if (min_silence_duration <= 0) {
68+
SHERPA_ONNX_LOGE(
69+
"Please use a larger value for --ten-vad-min-silence-duration. "
70+
"Given: "
71+
"%f",
72+
min_silence_duration);
73+
return false;
74+
}
75+
76+
if (min_speech_duration <= 0) {
77+
SHERPA_ONNX_LOGE(
78+
"Please use a larger value for --ten-vad-min-speech-duration. "
79+
"Given: "
80+
"%f",
81+
min_speech_duration);
82+
return false;
83+
}
84+
85+
if (max_speech_duration <= 0) {
86+
SHERPA_ONNX_LOGE(
87+
"Please use a larger value for --ten-vad-max-speech-duration. "
88+
"Given: "
89+
"%f",
90+
max_speech_duration);
91+
return false;
92+
}
93+
94+
return true;
95+
}
96+
97+
// Renders this config as a single-line, human-readable string of the form
//   TenVadModelConfig(model="...", threshold=..., ..., window_size=...)
//
// @return The formatted description; the model path is wrapped in double
//         quotes, all other fields are streamed as-is.
std::string TenVadModelConfig::ToString() const {
  std::ostringstream out;

  out << "TenVadModelConfig("
      << "model=\"" << model << "\", "
      << "threshold=" << threshold << ", "
      << "min_silence_duration=" << min_silence_duration << ", "
      << "min_speech_duration=" << min_speech_duration << ", "
      << "max_speech_duration=" << max_speech_duration << ", "
      << "window_size=" << window_size << ")";

  return out.str();
}
110+
111+
} // namespace sherpa_onnx
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// sherpa-onnx/csrc/ten-vad-model-config.h
2+
//
3+
// Copyright (c) 2025 Xiaomi Corporation
4+
#ifndef SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_
5+
#define SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_
6+
7+
#include <string>
8+
9+
#include "sherpa-onnx/csrc/parse-options.h"
10+
11+
namespace sherpa_onnx {
12+
13+
struct TenVadModelConfig {
14+
std::string model;
15+
16+
// threshold to classify a segment as speech
17+
//
18+
// If the predicted probability of a segment is larger than this
19+
// value, then it is classified as speech.
20+
float threshold = 0.5;
21+
22+
float min_silence_duration = 0.5; // in seconds
23+
24+
float min_speech_duration = 0.25; // in seconds
25+
26+
// 160 or 256
27+
int32_t window_size = 256; // in samples
28+
29+
// If a speech segment is longer than this value, then we increase
30+
// the threshold to 0.9. After finishing detecting the segment,
31+
// the threshold value is reset to its original value.
32+
float max_speech_duration = 20; // in seconds
33+
34+
TenVadModelConfig() = default;
35+
36+
void Register(ParseOptions *po);
37+
38+
bool Validate() const;
39+
40+
std::string ToString() const;
41+
};
42+
43+
} // namespace sherpa_onnx
44+
45+
#endif // SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_

0 commit comments

Comments
 (0)