Skip to content

Commit a77ba48

Browse files
authored
Support returning the current speech segment for VAD. (#2397)
A new method and property were introduced in the VoiceActivityDetector C++ and Python APIs to provide access to the current speech segment as soon as speech is detected, rather than only after the segment completes.
1 parent 8693b1e commit a77ba48

File tree

4 files changed

+53
-2
lines changed

4 files changed

+53
-2
lines changed

python-api-examples/generate-subtitles.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,16 @@ def main():
609609
vad.accept_waveform(buffer[:window_size])
610610
buffer = buffer[window_size:]
611611

612+
if False:
613+
# If you want to process the speech segment as soon as
614+
# speech is detected, you can use
615+
current_segment = vad.current_segment
616+
if len(current_segment.samples) > 0:
617+
print(
618+
f"speech starts at {current_segment.start/16000} seconds: ",
619+
f"duration {len(current_segment.samples)/16000} seconds",
620+
)
621+
612622
streams = []
613623
segments = []
614624
while not vad.empty():

sherpa-onnx/csrc/voice-activity-detector.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl {
9191
start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
9292
model_->MinSpeechDurationSamples(),
9393
buffer_.Head());
94+
cur_segment_.start = start_;
9495
}
96+
int32_t num_samples = buffer_.Tail() - start_ - 1;
97+
cur_segment_.samples = buffer_.Get(start_, num_samples);
9598
} else {
9699
// non-speech
100+
101+
cur_segment_.start = -1;
102+
cur_segment_.samples.clear();
103+
97104
if (start_ != -1 && buffer_.Size()) {
98105
// end of speech, save the speech segment
99106
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
@@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl {
138145
last_.clear();
139146

140147
start_ = -1;
148+
149+
cur_segment_.start = -1;
150+
cur_segment_.samples.clear();
141151
}
142152

143153
void Flush() {
@@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl {
161171

162172
buffer_.Pop(end - buffer_.Head());
163173
start_ = -1;
174+
175+
cur_segment_.start = -1;
176+
cur_segment_.samples.clear();
164177
}
165178

166179
bool IsSpeechDetected() const { return start_ != -1; }
167180

181+
SpeechSegment CurrentSpeechSegment() const { return cur_segment_; }
182+
168183
const VadModelConfig &GetConfig() const { return config_; }
169184

170185
private:
@@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl {
184199
private:
185200
std::queue<SpeechSegment> segments_;
186201

202+
// it is empty if no speech is detected
203+
SpeechSegment cur_segment_;
204+
187205
std::unique_ptr<VadModel> model_;
188206
VadModelConfig config_;
189207
CircularBuffer buffer_;
@@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const {
230248
return impl_->IsSpeechDetected();
231249
}
232250

251+
SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const {
252+
return impl_->CurrentSpeechSegment();
253+
}
254+
233255
const VadModelConfig &VoiceActivityDetector::GetConfig() const {
234256
return impl_->GetConfig();
235257
}

sherpa-onnx/csrc/voice-activity-detector.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,18 @@ class VoiceActivityDetector {
3131
bool Empty() const;
3232
void Pop();
3333
void Clear();
34+
35+
// It is an error to call Front() if Empty() returns true.
36+
//
37+
// The returned reference is valid until the next call to any
38+
// method of VoiceActivityDetector.
3439
const SpeechSegment &Front() const;
3540

3641
bool IsSpeechDetected() const;
3742

43+
// It is empty if IsSpeechDetected() returns false
44+
SpeechSegment CurrentSpeechSegment() const;
45+
3846
void Reset() const;
3947

4048
// At the end of the utterance, you can invoke this method so that

sherpa-onnx/python/csrc/voice-activity-detector.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) {
2222
void PybindVoiceActivityDetector(py::module *m) {
2323
PybindSpeechSegment(m);
2424
using PyClass = VoiceActivityDetector;
25-
py::class_<PyClass>(*m, "VoiceActivityDetector")
25+
py::class_<PyClass>(*m, "VoiceActivityDetector",
26+
R"(
27+
1. It is an error to call the front property when the method empty() returns True
28+
2. The property front returns a reference, which is valid until the next call of any
29+
method of this class
30+
3. When speech is detected, the method is_speech_detected() returns True, and you can
31+
use the property current_segment to get the speech samples since
32+
is_speech_detected() started returning True
33+
4. When is_speech_detected() changes from True to False, the method
34+
empty() returns False.
35+
)")
2636
.def(py::init<const VadModelConfig &, float>(), py::arg("config"),
2737
py::arg("buffer_size_in_seconds") = 60,
2838
py::call_guard<py::gil_scoped_release>())
@@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) {
3949
py::call_guard<py::gil_scoped_release>())
4050
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
4151
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
42-
.def_property_readonly("front", &PyClass::Front);
52+
.def_property_readonly("front", &PyClass::Front)
53+
.def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment);
4354
}
4455

4556
} // namespace sherpa_onnx

0 commit comments

Comments
 (0)