Skip to content

Commit fc2fc3d

Browse files
authored
Add Pascal API for ten-vad (#2388)
1 parent 0514aee commit fc2fc3d

File tree

6 files changed

+238
-21
lines changed

6 files changed

+238
-21
lines changed

.github/workflows/pascal.yaml

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,27 @@ jobs:
136136
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
137137
fi
138138
139+
- name: Run Pascal test (VAD test)
140+
shell: bash
141+
run: |
142+
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
143+
144+
cd ./pascal-api-examples
145+
146+
pushd vad
147+
./run-circular-buffer.sh
148+
echo "---"
149+
150+
time ./run-remove-silence-ten-vad.sh
151+
echo "---"
152+
153+
time ./run-remove-silence.sh
154+
echo "---"
155+
156+
ls -lh
157+
158+
popd
159+
139160
- name: Run Speech Enhancement test (GTCRN)
140161
shell: bash
141162
run: |
@@ -298,24 +319,6 @@ jobs:
298319
299320
popd
300321
301-
- name: Run Pascal test (VAD test)
302-
shell: bash
303-
run: |
304-
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
305-
306-
cd ./pascal-api-examples
307-
308-
pushd vad
309-
./run-circular-buffer.sh
310-
echo "---"
311-
312-
time ./run-remove-silence.sh
313-
echo "---"
314-
315-
ls -lh
316-
317-
popd
318-
319322
- name: Run Pascal test (Read wav test)
320323
shell: bash
321324
run: |

pascal-api-examples/vad/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
!run*.sh
22
circular_buffer
33
remove_silence
4+
remove_silence_ten_vad

pascal-api-examples/vad/remove_silence.pas

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{ Copyright (c) 2024 Xiaomi Corporation }
22
{
33
This file shows how to use the VAD API from sherpa-onnx
4-
to remove silences from a wave file.
4+
to remove silences from a wave file with silero-vad.
55
}
66
program main;
77

pascal-api-examples/vad/remove_silence_ten_vad.pas

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file with ten-vad.

It reads ./lei-jun-test.wav, feeds it to the detector in windows of
WindowSize samples, collects every speech segment the detector reports,
and writes the concatenated speech samples to
./lei-jun-test-no-silence-ten-vad.wav.
}
program main;

{$mode delphi}

uses
  sherpa_onnx,
  SysUtils;

var
  Wave: TSherpaOnnxWave;

  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;

  SampleRate: Integer;

  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{ Pops every segment currently queued in Vad, appends it to AllSpeechSegment
  and prints its start and end position (sample index divided by SampleRate).
  Used both while streaming the wave and once more after the final flush. }
procedure DrainSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);

    Segment := Vad.Front();
    Vad.Pop();
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    Start := Segment.Start / SampleRate;
    Duration := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  end;
end;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));

    Exit;
  end;

  WindowSize := 256; {Please don't change it unless you know the details}
  Initialize(Config);

  Config.TenVad.Model := './ten-vad.onnx';
  Config.TenVad.MinSpeechDuration := 0.25;
  Config.TenVad.MinSilenceDuration := 0.5;
  Config.TenVad.Threshold := 0.25;
  Config.TenVad.WindowSize := WindowSize;
  Config.NumThreads:= 1;
  Config.Debug:= True;
  Config.Provider:= 'cpu';
  Config.SampleRate := SampleRate;

  { NOTE(review): the second argument is presumably the detector's internal
    buffer size in seconds - confirm against the API documentation. }
  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  try
    AllSpeechSegment := nil;
    AllSamples := nil;
    Offset := 0;

    { Only whole windows are fed; a trailing partial window is not passed in. }
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);

      DrainSegments;
    end;

    { Flush, then collect whatever segments the detector reports afterwards. }
    Vad.Flush;
    DrainSegments;

    { First pass: total number of speech samples, so the output buffer is
      allocated exactly once. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      Inc(N, Length(SpeechSegment.Samples));

    SetLength(AllSamples, N);

    { Second pass: concatenate the samples of all segments in order. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
    begin
      for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
      begin
        AllSamples[N] := SpeechSegment.Samples[I];
        Inc(N);
      end;
    end;

    SherpaOnnxWriteWave('./lei-jun-test-no-silence-ten-vad.wav', AllSamples, SampleRate);
    WriteLn('Saved to ./lei-jun-test-no-silence-ten-vad.wav');
  finally
    { Release the detector even if reading, detection or writing raised. }
    FreeAndNil(Vad);
  end;
end.
pascal-api-examples/vad/run-remove-silence-ten-vad.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/env bash
#
# Builds sherpa-onnx as a shared library (only if no installed C API library
# is found), downloads the ten-vad model and a test wave file if they are
# missing, compiles remove_silence_ten_vad.pas with fpc and runs it.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The Pascal source, the model and the wave file are all referenced relative
# to this script's directory, so do not depend on the caller's working dir.
cd -- "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build and install the C API only when none of the platform-specific shared
# libraries (macOS / Linux / Windows) is already installed.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# -f makes curl fail on an HTTP error instead of saving the error page under
# the target name, which the existence checks would then accept on later runs.
if [[ ! -f ./ten-vad.onnx ]]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./remove_silence_ten_vad.pas

# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./remove_silence_ten_vad

sherpa-onnx/pascal-api/sherpa_onnx.pas

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -426,12 +426,24 @@ TSherpaOnnxSileroVadModelConfig = record
426426
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
427427
end;
428428

429+
TSherpaOnnxTenVadModelConfig = record  { Pascal-side settings for the ten-vad model; copied field by field into the C record by TSherpaOnnxVoiceActivityDetector.Create }
430+
Model: AnsiString;  { model file name; not assigned by Initialize }
431+
Threshold: Single;  { Initialize sets 0.5 }
432+
MinSilenceDuration: Single;  { Initialize sets 0.5; presumably seconds - TODO confirm }
433+
MinSpeechDuration: Single;  { Initialize sets 0.25; presumably seconds - TODO confirm }
434+
WindowSize: Integer;  { Initialize sets 256; presumably a sample count - TODO confirm }
435+
MaxSpeechDuration: Single;  { Initialize sets 5.0; presumably seconds - TODO confirm }
436+
function ToString: AnsiString;  { returns all fields formatted as one line of text }
437+
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);  { assigns the default values noted above }
438+
end;
439+
429440
TSherpaOnnxVadModelConfig = record
430441
SileroVad: TSherpaOnnxSileroVadModelConfig;
431442
SampleRate: Integer;
432443
NumThreads: Integer;
433444
Provider: AnsiString;
434445
Debug: Boolean;
446+
TenVad: TSherpaOnnxTenVadModelConfig;
435447
function ToString: AnsiString;
436448
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
437449
end;
@@ -829,12 +841,23 @@ SherpaOnnxSileroVadModelConfig = record
829841
WindowSize: cint32;
830842
MaxSpeechDuration: cfloat;
831843
end;
844+
845+
SherpaOnnxTenVadModelConfig = record  { C-typed counterpart of TSherpaOnnxTenVadModelConfig; embedded as the TenVad field of SherpaOnnxVadModelConfig }
846+
Model: PAnsiChar;  { NOTE(review): field order and types presumably must match the struct in the C API header - confirm before changing }
847+
Threshold: cfloat;
848+
MinSilenceDuration: cfloat;
849+
MinSpeechDuration: cfloat;
850+
WindowSize: cint32;
851+
MaxSpeechDuration: cfloat;
852+
end;
853+
832854
SherpaOnnxVadModelConfig = record
833855
SileroVad: SherpaOnnxSileroVadModelConfig;
834856
SampleRate: cint32;
835857
NumThreads: cint32;
836858
Provider: PAnsiChar;
837859
Debug: cint32;
860+
TenVad: SherpaOnnxTenVadModelConfig;
838861
end;
839862
PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
840863

@@ -1907,6 +1930,21 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
19071930
]);
19081931
end;
19091932

1933+
{ Returns a one-line textual dump of every field of the ten-vad configuration.
  Floating-point fields are printed with two decimals. }
function TSherpaOnnxTenVadModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxTenVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d, ' +
    'MaxSpeechDuration := %.2f' +
    ')';
begin
  { Argument order must follow the placeholders in Layout. }
  Result := Format(Layout, [
    Self.Model,
    Self.Threshold,
    Self.MinSilenceDuration,
    Self.MinSpeechDuration,
    Self.WindowSize,
    Self.MaxSpeechDuration]);
end;
1947+
19101948
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
19111949
begin
19121950
Dest.Threshold := 0.5;
@@ -1916,17 +1954,27 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
19161954
Dest.MaxSpeechDuration := 5.0;
19171955
end;
19181956

1957+
{ Assigns the default ten-vad settings to Dest. Model is not touched here. }
class operator TSherpaOnnxTenVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
begin
  with Dest do
  begin
    WindowSize := 256;
    Threshold := 0.5;
    MinSpeechDuration := 0.25;
    MinSilenceDuration := 0.5;
    MaxSpeechDuration := 5.0;
  end;
end;
1965+
19191966
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
19201967
begin
19211968
Result := Format('TSherpaOnnxVadModelConfig(' +
19221969
'SileroVad := %s, ' +
19231970
'SampleRate := %d, ' +
19241971
'NumThreads := %d, ' +
19251972
'Provider := %s, ' +
1926-
'Debug := %s' +
1973+
'Debug := %s, ' +
1974+
'TenVad := %s' +
19271975
')',
19281976
[Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider,
1929-
Self.Debug.ToString
1977+
Self.Debug.ToString, Self.TenVad.ToString
19301978
]);
19311979
end;
19321980

@@ -2077,6 +2125,13 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC
20772125
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
20782126
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
20792127

2128+
C.TenVad.Model := PAnsiChar(Config.TenVad.Model);
2129+
C.TenVad.Threshold := Config.TenVad.Threshold;
2130+
C.TenVad.MinSilenceDuration := Config.TenVad.MinSilenceDuration;
2131+
C.TenVad.MinSpeechDuration := Config.TenVad.MinSpeechDuration;
2132+
C.TenVad.WindowSize := Config.TenVad.WindowSize;
2133+
C.TenVad.MaxSpeechDuration := Config.TenVad.MaxSpeechDuration;
2134+
20802135
C.SampleRate := Config.SampleRate;
20812136
C.NumThreads := Config.NumThreads;
20822137
C.Provider := PAnsiChar(Config.Provider);

0 commit comments

Comments
 (0)