Support returning the current speech segment for VAD. (#2397)

csukuangfj · web-flow · commit a77ba48d6cd1 · 2025-07-15T12:05:33.000+08:00
This change adds the `CurrentSpeechSegment()` method to the C++ VoiceActivityDetector API and the
matching read-only `current_segment` property to the Python API. Both provide access to the current
speech segment as soon as speech is detected, rather than only after the segment completes.
diff --git a/python-api-examples/generate-subtitles.py b/python-api-examples/generate-subtitles.py
@@ -609,6 +609,16 @@ def main():
                 vad.accept_waveform(buffer[:window_size])
                 buffer = buffer[window_size:]
 
+                if False:
+                    # If you want to process the speech segment as soon as
+                    # speech is detected, you can use
+                    current_segment = vad.current_segment
+                    if len(current_segment.samples) > 0:
+                        print(
+                            f"speech starts at {current_segment.start/16000} seconds: ",
+                            f"duration {len(current_segment.samples)/16000} seconds",
+                        )
+
         streams = []
         segments = []
         while not vad.empty():
diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc
@@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl {
         start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
                               model_->MinSpeechDurationSamples(),
                           buffer_.Head());
+        cur_segment_.start = start_;
       }
+      int32_t num_samples = buffer_.Tail() - start_ - 1;
+      cur_segment_.samples = buffer_.Get(start_, num_samples);
     } else {
       // non-speech
+
+      cur_segment_.start = -1;
+      cur_segment_.samples.clear();
+
       if (start_ != -1 && buffer_.Size()) {
         // end of speech, save the speech segment
         int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
@@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl {
     last_.clear();
 
     start_ = -1;
+
+    cur_segment_.start = -1;
+    cur_segment_.samples.clear();
   }
 
   void Flush() {
@@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl {
 
     buffer_.Pop(end - buffer_.Head());
     start_ = -1;
+
+    cur_segment_.start = -1;
+    cur_segment_.samples.clear();
   }
 
   bool IsSpeechDetected() const { return start_ != -1; }
 
+  SpeechSegment CurrentSpeechSegment() const { return cur_segment_; }
+
   const VadModelConfig &GetConfig() const { return config_; }
 
  private:
@@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl {
  private:
   std::queue<SpeechSegment> segments_;
 
+  // it is empty if no speech is detected
+  SpeechSegment cur_segment_;
+
   std::unique_ptr<VadModel> model_;
   VadModelConfig config_;
   CircularBuffer buffer_;
@@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const {
   return impl_->IsSpeechDetected();
 }
 
+SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const {
+  return impl_->CurrentSpeechSegment();
+}
+
 const VadModelConfig &VoiceActivityDetector::GetConfig() const {
   return impl_->GetConfig();
 }
diff --git a/sherpa-onnx/csrc/voice-activity-detector.h b/sherpa-onnx/csrc/voice-activity-detector.h
@@ -31,10 +31,18 @@ class VoiceActivityDetector {
   bool Empty() const;
   void Pop();
   void Clear();
+
+  // It is an error to call Front() if Empty() returns true.
+  //
+  // The returned reference is valid until the next call to any
+  // methods of VoiceActivityDetector.
   const SpeechSegment &Front() const;
 
   bool IsSpeechDetected() const;
 
+  // It is empty if IsSpeechDetected() returns false
+  SpeechSegment CurrentSpeechSegment() const;
+
   void Reset() const;
 
   // At the end of the utterance, you can invoke this method so that
diff --git a/sherpa-onnx/python/csrc/voice-activity-detector.cc b/sherpa-onnx/python/csrc/voice-activity-detector.cc
@@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) {
 void PybindVoiceActivityDetector(py::module *m) {
   PybindSpeechSegment(m);
   using PyClass = VoiceActivityDetector;
-  py::class_<PyClass>(*m, "VoiceActivityDetector")
+  py::class_<PyClass>(*m, "VoiceActivityDetector",
+                      R"(
+1. It is an error to call the front property when the method empty() returns True
+2. The property front returns a reference, which is valid until the next call of any
+   methods of this class
+3. When speech is detected, the method is_speech_detected() return True, you can
+   use the property current_segment to get the speech samples since
+   is_speech_detected() returns true
+4. When is_speech_detected() is changed from True to False, the method
+   empty() returns False.
+      )")
       .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
            py::arg("buffer_size_in_seconds") = 60,
            py::call_guard<py::gil_scoped_release>())
@@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) {
            py::call_guard<py::gil_scoped_release>())
       .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
       .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
-      .def_property_readonly("front", &PyClass::Front);
+      .def_property_readonly("front", &PyClass::Front)
+      .def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment);
 }
 
 }  // namespace sherpa_onnx