Skip to content

Commit 8693b1e

Browse files
authored
Support Portuguese and German ASR models from NeMo (#2394)
1 parent 27098a0 commit 8693b1e

9 files changed

+299
-4
lines changed

.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ jobs:
3939
shell: bash
4040
run: |
4141
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
42+
./run-ctc-non-streaming-2.sh
4243
./run-ctc-non-streaming.sh
4344
4445
mv -v sherpa-onnx-nemo* ../../..
@@ -66,6 +67,10 @@ jobs:
6667
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
6768
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
6869
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
70+
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
71+
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
72+
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
73+
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
6974
)
7075
7176
for m in ${models[@]}; do
@@ -75,7 +80,7 @@ jobs:
7580
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
7681
cp -av $m/* huggingface
7782
cd huggingface
78-
git lfs track "*.onnx"
83+
git lfs track "*.onnx" "*.wav"
7984
git status
8085
git add .
8186
git status
@@ -99,6 +104,10 @@ jobs:
99104
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
100105
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
101106
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
107+
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
108+
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
109+
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
110+
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
102111
)
103112
for d in ${dirs[@]}; do
104113
tar cjvf ${d}.tar.bz2 ./$d

.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ jobs:
3939
shell: bash
4040
run: |
4141
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
42+
./run-transducer-non-streaming-2.sh
4243
./run-transducer-non-streaming.sh
4344
4445
mv -v sherpa-onnx-nemo* ../../..
@@ -66,6 +67,10 @@ jobs:
6667
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
6768
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
6869
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
70+
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
71+
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
72+
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
73+
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
6974
)
7075
7176
for m in ${models[@]}; do
@@ -75,7 +80,7 @@ jobs:
7580
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
7681
cp -av $m/* huggingface
7782
cd huggingface
78-
git lfs track "*.onnx"
83+
git lfs track "*.onnx" "*.wav"
7984
git status
8085
git add .
8186
git status
@@ -98,6 +103,10 @@ jobs:
98103
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
99104
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
100105
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
106+
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
107+
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
108+
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
109+
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
101110
)
102111
for d in ${dirs[@]}; do
103112
tar cjvf ${d}.tar.bz2 ./$d

scripts/apk/generate-vad-asr-apk-script.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,70 @@ def get_models():
597597
598598
ls -lh
599599
600+
popd
601+
""",
602+
),
603+
Model(
604+
model_name="sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8",
605+
idx=35,
606+
lang="pt",
607+
lang2="Portuguese",
608+
short_name="stt_pt_fastconformer_hybrid_large_pc_transducer_int8",
609+
cmd="""
610+
pushd $model_name
611+
612+
rm -rfv test_wavs
613+
614+
ls -lh
615+
616+
popd
617+
""",
618+
),
619+
Model(
620+
model_name="sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8",
621+
idx=36,
622+
lang="pt",
623+
lang2="Portuguese",
624+
short_name="stt_pt_fastconformer_hybrid_large_pc_ctc-int8",
625+
cmd="""
626+
pushd $model_name
627+
628+
rm -rfv test_wavs
629+
630+
ls -lh
631+
632+
popd
633+
""",
634+
),
635+
Model(
636+
model_name="sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8",
637+
idx=37,
638+
lang="de",
639+
lang2="German",
640+
short_name="stt_de_fastconformer_hybrid_large_pc_transducer_int8",
641+
cmd="""
642+
pushd $model_name
643+
644+
rm -rfv test_wavs
645+
646+
ls -lh
647+
648+
popd
649+
""",
650+
),
651+
Model(
652+
model_name="sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8",
653+
idx=38,
654+
lang="de",
655+
lang2="German",
656+
short_name="stt_de_fastconformer_hybrid_large_pc_ctc-int8",
657+
cmd="""
658+
pushd $model_name
659+
660+
rm -rfv test_wavs
661+
662+
ls -lh
663+
600664
popd
601665
""",
602666
),

scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,7 @@ This folder contains scripts for exporting models from
2424

2525
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
2626
- https://huggingface.co/nvidia/parakeet-tdt_ctc-0.6b-ja
27+
- https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
28+
- https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
2729

2830
to `sherpa-onnx`.

scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc-non-streaming.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ def main():
8181
"model_type": "EncDecHybridRNNTCTCBPEModel",
8282
"version": "1",
8383
"model_author": "NeMo",
84-
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
84+
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
85+
if "/" not in model_name
86+
else f"https://huggingface.co/{model_name}",
8587
"comment": "Only the CTC branch is exported",
8688
"doc": args.doc,
8789
}

scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-transducer-non-streaming.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,9 @@ def main():
8585
"model_type": "EncDecHybridRNNTCTCBPEModel",
8686
"version": "1",
8787
"model_author": "NeMo",
88-
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
88+
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
89+
if "/" not in model_name
90+
else f"https://huggingface.co/{model_name}",
8991
"comment": "Only the transducer branch is exported",
9092
"doc": args.doc,
9193
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env bash
2+
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3+
4+
set -ex
5+
6+
log() {
7+
# This function is from espnet
8+
local fname=${BASH_SOURCE[1]##*/}
9+
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
10+
}
11+
12+
# 2200 hours of Portuguese speech
13+
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
14+
name=$(basename $url)
15+
name="nvidia/$name"
16+
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200 hours of Portuguese speech."
17+
18+
log "Process $name at $url"
19+
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
20+
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
21+
mkdir -p $d
22+
mv -v model.onnx $d/
23+
cp -v tokens.txt $d/
24+
ls -lh $d
25+
26+
mkdir test_wavs
27+
pushd test_wavs
28+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
29+
popd
30+
cp -a test_wavs $d
31+
32+
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
33+
mkdir -p $d
34+
mv -v model.int8.onnx $d/
35+
mv -v tokens.txt $d/
36+
ls -lh $d
37+
mv test_wavs $d
38+
39+
python3 ./test-onnx-ctc-non-streaming.py \
40+
--model $d/model.int8.onnx \
41+
--tokens $d/tokens.txt \
42+
--wav $d/test_wavs/pt_br.wav
43+
44+
45+
# 2500 hours of German speech
46+
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
47+
name=$(basename $url)
48+
name="nvidia/$name"
49+
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
50+
51+
log "Process $name at $url"
52+
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
53+
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
54+
mkdir -p $d
55+
mv -v model.onnx $d/
56+
cp -v tokens.txt $d/
57+
ls -lh $d
58+
59+
mkdir test_wavs
60+
pushd test_wavs
61+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
62+
popd
63+
cp -a test_wavs $d
64+
65+
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
66+
mkdir -p $d
67+
mv -v model.int8.onnx $d/
68+
mv -v tokens.txt $d/
69+
ls -lh $d
70+
mv test_wavs $d
71+
72+
python3 ./test-onnx-ctc-non-streaming.py \
73+
--model $d/model.int8.onnx \
74+
--tokens $d/tokens.txt \
75+
--wav $d/test_wavs/de.wav
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/usr/bin/env bash
2+
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3+
4+
set -ex
5+
6+
log() {
7+
# This function is from espnet
8+
local fname=${BASH_SOURCE[1]##*/}
9+
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
10+
}
11+
12+
# 2200 hours of Portuguese speech
13+
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
14+
name=$(basename $url)
15+
name="nvidia/$name"
16+
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200 hours of Portuguese speech."
17+
18+
log "Process $name at $url"
19+
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
20+
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
21+
mkdir -p $d
22+
mv -v encoder.onnx $d/
23+
mv -v decoder.onnx $d/
24+
mv -v joiner.onnx $d/
25+
cp -v tokens.txt $d/
26+
ls -lh $d
27+
28+
mkdir test_wavs
29+
pushd test_wavs
30+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
31+
popd
32+
cp -a test_wavs $d
33+
34+
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
35+
mkdir -p $d
36+
mv -v encoder.int8.onnx $d/
37+
mv -v decoder.int8.onnx $d/
38+
mv -v joiner.int8.onnx $d/
39+
mv -v tokens.txt $d/
40+
ls -lh $d
41+
mv test_wavs $d
42+
43+
python3 ./test-onnx-transducer-non-streaming.py \
44+
--encoder $d/encoder.int8.onnx \
45+
--decoder $d/decoder.int8.onnx \
46+
--joiner $d/joiner.int8.onnx \
47+
--tokens $d/tokens.txt \
48+
--wav $d/test_wavs/pt_br.wav
49+
50+
# 2500 hours of German speech
51+
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
52+
name=$(basename $url)
53+
name="nvidia/$name"
54+
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
55+
56+
log "Process $name at $url"
57+
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
58+
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
59+
mkdir -p $d
60+
mv -v encoder.onnx $d/
61+
mv -v decoder.onnx $d/
62+
mv -v joiner.onnx $d/
63+
cp -v tokens.txt $d/
64+
ls -lh $d
65+
66+
mkdir test_wavs
67+
pushd test_wavs
68+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
69+
popd
70+
cp -a test_wavs $d
71+
72+
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
73+
mkdir -p $d
74+
mv -v encoder.int8.onnx $d/
75+
mv -v decoder.int8.onnx $d/
76+
mv -v joiner.int8.onnx $d/
77+
mv -v tokens.txt $d/
78+
ls -lh $d
79+
mv test_wavs $d
80+
81+
python3 ./test-onnx-transducer-non-streaming.py \
82+
--encoder $d/encoder.int8.onnx \
83+
--decoder $d/decoder.int8.onnx \
84+
--joiner $d/joiner.int8.onnx \
85+
--tokens $d/tokens.txt \
86+
--wav $d/test_wavs/de.wav

sherpa-onnx/kotlin-api/OfflineRecognizer.kt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,52 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
621621
tokens = "$modelDir/tokens.txt",
622622
)
623623
}
624+
625+
35 -> {
626+
val modelDir = "sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8"
627+
return OfflineModelConfig(
628+
transducer = OfflineTransducerModelConfig(
629+
encoder = "$modelDir/encoder.int8.onnx",
630+
decoder = "$modelDir/decoder.int8.onnx",
631+
joiner = "$modelDir/joiner.int8.onnx",
632+
),
633+
tokens = "$modelDir/tokens.txt",
634+
modelType = "nemo_transducer",
635+
)
636+
}
637+
638+
36 -> {
639+
val modelDir = "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8"
640+
return OfflineModelConfig(
641+
nemo = OfflineNemoEncDecCtcModelConfig(
642+
model = "$modelDir/model.int8.onnx",
643+
),
644+
tokens = "$modelDir/tokens.txt",
645+
)
646+
}
647+
648+
37 -> {
649+
val modelDir = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"
650+
return OfflineModelConfig(
651+
transducer = OfflineTransducerModelConfig(
652+
encoder = "$modelDir/encoder.int8.onnx",
653+
decoder = "$modelDir/decoder.int8.onnx",
654+
joiner = "$modelDir/joiner.int8.onnx",
655+
),
656+
tokens = "$modelDir/tokens.txt",
657+
modelType = "nemo_transducer",
658+
)
659+
}
660+
661+
38 -> {
662+
val modelDir = "sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8"
663+
return OfflineModelConfig(
664+
nemo = OfflineNemoEncDecCtcModelConfig(
665+
model = "$modelDir/model.int8.onnx",
666+
),
667+
tokens = "$modelDir/tokens.txt",
668+
)
669+
}
624670
}
625671
return null
626672
}

0 commit comments

Comments
 (0)