Skip to content

Commit 0514aee

Browse files
authored
Add Swift API for ten-vad (#2387)
1 parent 7f1d71f commit 0514aee

File tree

5 files changed

+127
-13
lines changed

5 files changed

+127
-13
lines changed

.github/scripts/test-swift.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,11 @@ curl -SL -O https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav
7171
ls -lh
7272
popd
7373

74+
./run-generate-subtitles-ten-vad.sh
75+
rm -rf *.onnx
76+
7477
./run-generate-subtitles.sh
78+
rm -rf *.onnx
7579

7680
ls -lh /Users/fangjun/Desktop
7781
cat /Users/fangjun/Desktop/Obama.srt

swift-api-examples/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
decode-file
22
decode-file-non-streaming
33
generate-subtitles
4+
generate-subtitles-ten-vad
45
spoken-language-identification
56
tts-vits
67
vits-vctk

swift-api-examples/SherpaOnnx.swift

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,22 @@ func sherpaOnnxOfflineWhisperModelConfig(
386386
)
387387
}
388388

389+
/// Builds a `SherpaOnnxOfflineCanaryModelConfig` from Swift values.
///
/// Every string argument is passed through `toCPointer` before being stored,
/// and `usePnc` is stored as an integer flag (1 when true, 0 when false).
///
/// - Parameters:
///   - encoder: Stored in the `encoder` field; presumably the path to the
///     encoder model file (TODO confirm against the C API). Defaults to "".
///   - decoder: Stored in the `decoder` field; presumably the path to the
///     decoder model file (TODO confirm). Defaults to "".
///   - srcLang: Stored in `src_lang`. Defaults to "en".
///   - tgtLang: Stored in `tgt_lang`. Defaults to "en".
///   - usePnc: Stored in `use_pnc` as 1 or 0. Defaults to true.
/// - Returns: The populated `SherpaOnnxOfflineCanaryModelConfig`.
func sherpaOnnxOfflineCanaryModelConfig(
390+
encoder: String = "",
391+
decoder: String = "",
392+
srcLang: String = "en",
393+
tgtLang: String = "en",
394+
usePnc: Bool = true
395+
) -> SherpaOnnxOfflineCanaryModelConfig {
396+
return SherpaOnnxOfflineCanaryModelConfig(
397+
encoder: toCPointer(encoder),
398+
decoder: toCPointer(decoder),
399+
src_lang: toCPointer(srcLang),
400+
tgt_lang: toCPointer(tgtLang),
401+
use_pnc: usePnc ? 1 : 0
402+
)
403+
}
404+
389405
func sherpaOnnxOfflineFireRedAsrModelConfig(
390406
encoder: String = "",
391407
decoder: String = ""
@@ -459,7 +475,8 @@ func sherpaOnnxOfflineModelConfig(
459475
fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
460476
dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
461477
zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
462-
sherpaOnnxOfflineZipformerCtcModelConfig()
478+
sherpaOnnxOfflineZipformerCtcModelConfig(),
479+
canary: SherpaOnnxOfflineCanaryModelConfig = sherpaOnnxOfflineCanaryModelConfig()
463480
) -> SherpaOnnxOfflineModelConfig {
464481
return SherpaOnnxOfflineModelConfig(
465482
transducer: transducer,
@@ -479,7 +496,8 @@ func sherpaOnnxOfflineModelConfig(
479496
moonshine: moonshine,
480497
fire_red_asr: fireRedAsr,
481498
dolphin: dolphin,
482-
zipformer_ctc: zipformerCtc
499+
zipformer_ctc: zipformerCtc,
500+
canary: canary
483501
)
484502
}
485503

@@ -607,10 +625,14 @@ class SherpaOnnxOfflineRecognizer {
607625

608626
return SherpaOnnxOfflineRecongitionResult(result: result)
609627
}
628+
629+
/// Forwards `config` to `SherpaOnnxOfflineRecognizerSetConfig`, together with
/// the `recognizer` value that is in scope for this method.
///
/// - Parameter config: Pointer to the `SherpaOnnxOfflineRecognizerConfig` to
///   hand to the C function. It is declared implicitly unwrapped, so the
///   signature does not prevent a nil pointer from being passed through.
func setConfig(config: UnsafePointer<SherpaOnnxOfflineRecognizerConfig>!) {
630+
SherpaOnnxOfflineRecognizerSetConfig(recognizer, config)
631+
}
610632
}
611633

612634
func sherpaOnnxSileroVadModelConfig(
613-
model: String,
635+
model: String = "",
614636
threshold: Float = 0.5,
615637
minSilenceDuration: Float = 0.25,
616638
minSpeechDuration: Float = 0.5,
@@ -627,19 +649,39 @@ func sherpaOnnxSileroVadModelConfig(
627649
)
628650
}
629651

652+
/// Builds a `SherpaOnnxTenVadModelConfig` from Swift values.
///
/// `model` is passed through `toCPointer`, `windowSize` is converted to
/// `Int32`, and the `Float` arguments are stored unchanged.
///
/// - Parameters:
///   - model: Stored in the `model` field; presumably the path to the ten-vad
///     ONNX file (TODO confirm). Defaults to "".
///   - threshold: Stored in `threshold`. Defaults to 0.5.
///   - minSilenceDuration: Stored in `min_silence_duration`. Defaults to 0.25.
///   - minSpeechDuration: Stored in `min_speech_duration`. Defaults to 0.5.
///   - windowSize: Stored in `window_size` as `Int32`. Defaults to 256.
///   - maxSpeechDuration: Stored in `max_speech_duration`. Defaults to 5.0.
/// - Returns: The populated `SherpaOnnxTenVadModelConfig`.
func sherpaOnnxTenVadModelConfig(
653+
model: String = "",
654+
threshold: Float = 0.5,
655+
minSilenceDuration: Float = 0.25,
656+
minSpeechDuration: Float = 0.5,
657+
windowSize: Int = 256,
658+
maxSpeechDuration: Float = 5.0
659+
) -> SherpaOnnxTenVadModelConfig {
660+
return SherpaOnnxTenVadModelConfig(
661+
model: toCPointer(model),
662+
threshold: threshold,
663+
min_silence_duration: minSilenceDuration,
664+
min_speech_duration: minSpeechDuration,
665+
window_size: Int32(windowSize),
666+
max_speech_duration: maxSpeechDuration
667+
)
668+
}
669+
630670
/// Assembles a `SherpaOnnxVadModelConfig` from a silero-vad config, a ten-vad
/// config, and shared runtime settings.
///
/// `numThreads` and `debug` are converted to `Int32`, and `provider` is passed
/// through `toCPointer`. In the added lines, `sileroVad` defaults to
/// `sherpaOnnxSileroVadModelConfig()` and the new trailing `tenVad` parameter
/// defaults to `sherpaOnnxTenVadModelConfig()`, so a caller may supply either
/// one, both, or neither.
///
/// - Parameters:
///   - sileroVad: Stored in `silero_vad`.
///   - sampleRate: Stored in `sample_rate`. Defaults to 16000.
///   - numThreads: Stored in `num_threads` as `Int32`. Defaults to 1.
///   - provider: Stored in `provider`. Defaults to "cpu".
///   - debug: Stored in `debug` as `Int32`. Defaults to 0.
///   - tenVad: Stored in `ten_vad`.
/// - Returns: The populated `SherpaOnnxVadModelConfig`.
func sherpaOnnxVadModelConfig(
631-
sileroVad: SherpaOnnxSileroVadModelConfig,
671+
sileroVad: SherpaOnnxSileroVadModelConfig = sherpaOnnxSileroVadModelConfig(),
632672
sampleRate: Int32 = 16000,
633673
numThreads: Int = 1,
634674
provider: String = "cpu",
635-
debug: Int = 0
675+
debug: Int = 0,
676+
tenVad: SherpaOnnxTenVadModelConfig = sherpaOnnxTenVadModelConfig()
636677
) -> SherpaOnnxVadModelConfig {
637678
return SherpaOnnxVadModelConfig(
638679
silero_vad: sileroVad,
639680
sample_rate: sampleRate,
640681
num_threads: Int32(numThreads),
641682
provider: toCPointer(provider),
642-
debug: Int32(debug)
683+
debug: Int32(debug),
684+
ten_vad: tenVad
643685
)
644686
}
645687

swift-api-examples/generate-subtitles.swift

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -156,11 +156,35 @@ func run() {
156156
assert(audioFormat.channelCount == 1)
157157
assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
158158

159-
let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
160-
model: "./silero_vad.onnx"
161-
)
159+
var sileroVadConfig = sherpaOnnxSileroVadModelConfig()
160+
var tenVadConfig = sherpaOnnxTenVadModelConfig()
161+
162+
var windowSize = 0
163+
164+
if FileManager.default.fileExists(atPath: "./silero_vad.onnx") {
165+
sileroVadConfig = sherpaOnnxSileroVadModelConfig(
166+
model: "./silero_vad.onnx",
167+
threshold: 0.25,
168+
windowSize: 512
169+
)
170+
windowSize = 512
171+
print("Use silero-vad")
172+
} else if FileManager.default.fileExists(atPath: "./ten-vad.onnx") {
173+
tenVadConfig = sherpaOnnxTenVadModelConfig(
174+
model: "./ten-vad.onnx",
175+
threshold: 0.25,
176+
windowSize: 256
177+
)
178+
windowSize = 256
179+
print("Use ten-vad")
180+
} else {
181+
print("Please provide ./silero_vad.onnx or ./ten-vad.onnx")
182+
return
183+
}
184+
185+
var vadModelConfig = sherpaOnnxVadModelConfig(
186+
sileroVad: sileroVadConfig, tenVad: tenVadConfig)
162187

163-
var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
164188
let vad = SherpaOnnxVoiceActivityDetectorWrapper(
165189
config: &vadModelConfig, buffer_size_in_seconds: 120)
166190

@@ -170,8 +194,6 @@ func run() {
170194
try! audioFile.read(into: audioFileBuffer!)
171195
var array: [Float]! = audioFileBuffer?.array()
172196

173-
let windowSize = Int(vadModelConfig.silero_vad.window_size)
174-
175197
var segments: [SpeechSegment] = []
176198

177199
for offset in stride(from: 0, to: array.count, by: windowSize) {
@@ -180,7 +202,6 @@ func run() {
180202
}
181203

182204
vad.flush()
183-
var index: Int = 0
184205
while !vad.isEmpty() {
185206
let s = vad.front()
186207
vad.pop()
swift-api-examples/run-generate-subtitles-ten-vad.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env bash
# Builds (if needed) and runs the generate-subtitles example as the
# `generate-subtitles-ten-vad` binary, downloading the whisper tiny.en model
# and ten-vad.onnx first when they are not already present.
2+
3+
# -e: stop at the first failing command; -x: echo each command as it runs.
set -ex
4+
5+
# The Swift build output directory must already exist; this script does not
# build it.
if [ ! -d ../build-swift-macos ]; then
6+
echo "Please run ../build-swift-macos.sh first!"
7+
exit 1
8+
fi
9+
10+
# Fetch the whisper tiny.en model when its directory is missing. The help
# text is printed first, then the archive is downloaded, unpacked, and the
# tarball removed.
if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
11+
echo "Please download the pre-trained model for testing."
12+
echo "You can refer to"
13+
echo ""
14+
echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
15+
echo ""
16+
echo "for help"
17+
18+
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
19+
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
20+
rm sherpa-onnx-whisper-tiny.en.tar.bz2
21+
ls -lh sherpa-onnx-whisper-tiny.en
22+
fi
23+
# Fetch the ten-vad model when it is missing.
# NOTE: generate-subtitles.swift checks for ./silero_vad.onnx before
# ./ten-vad.onnx, so ten-vad is only used when silero_vad.onnx is absent from
# this directory.
if [ ! -f ./ten-vad.onnx ]; then
24+
echo "downloading ten-vad"
25+
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
26+
fi
27+
28+
# Compile only when the binary does not exist yet; an existing binary is
# reused as-is, even if the Swift sources have changed since it was built.
if [ ! -e ./generate-subtitles-ten-vad ]; then
29+
# Note: We use -lc++ to link against libc++ instead of libstdc++
30+
swiftc \
31+
-lc++ \
32+
-I ../build-swift-macos/install/include \
33+
-import-objc-header ./SherpaOnnx-Bridging-Header.h \
34+
./generate-subtitles.swift ./SherpaOnnx.swift \
35+
-L ../build-swift-macos/install/lib/ \
36+
-l sherpa-onnx \
37+
-l onnxruntime \
38+
-o generate-subtitles-ten-vad
39+
40+
strip generate-subtitles-ten-vad
41+
else
42+
echo "./generate-subtitles-ten-vad exists - skip building"
43+
fi
44+
45+
# Prepend the install lib directory to the dynamic library search path before
# running; presumably this is where the libraries linked above are found at
# run time (TODO confirm they are built as dylibs).
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
46+
./generate-subtitles-ten-vad

0 commit comments

Comments
 (0)