1
- import os
2
1
import hashlib
3
- import numpy as np
4
- import regex as re
2
+ import math
3
+ import os
5
4
import shutil
6
- import soundfile as sf
7
- import stanza
8
5
import subprocess
9
6
import tempfile
10
- import torch
11
- import torchaudio
12
7
import threading
13
8
import uuid
14
9
10
+ import numpy as np
11
+ import regex as re
12
+ import soundfile as sf
13
+ import torch
14
+ import torchaudio
15
+
15
16
from huggingface_hub import hf_hub_download
16
17
from pathlib import Path
17
18
from pprint import pprint
18
19
19
20
from lib import *
20
- from lib .classes .tts_engines .common .utils import detect_date_entities , year_to_words , unload_tts , append_sentence2vtt
21
+ from lib .classes .tts_engines .common .utils import unload_tts , append_sentence2vtt
21
22
from lib .classes .tts_engines .common .audio_filters import detect_gender , trim_audio , normalize_audio , is_audio_data_valid
22
23
23
24
#import logging
27
28
xtts_builtin_speakers_list = None
28
29
29
30
class Coqui :
31
+
30
32
def __init__ (self , session ):
31
33
try :
32
- if session ['language' ] in year_to_decades_languages :
33
- stanza .download (session ['language_iso1' ])
34
- self .stanza_nlp = stanza .Pipeline (session ['language_iso1' ], processors = 'tokenize,ner' )
35
34
self .session = session
36
35
self .cache_dir = tts_dir
37
36
self .speakers_path = None
@@ -45,11 +44,38 @@ def __init__(self, session):
45
44
self .params = {TTS_ENGINES ['XTTSv2' ]: {"latent_embedding" :{}}, TTS_ENGINES ['BARK' ]: {},TTS_ENGINES ['VITS' ]: {"semitones" : {}}, TTS_ENGINES ['FAIRSEQ' ]: {"semitones" : {}}, TTS_ENGINES ['TACOTRON2' ]: {"semitones" : {}}, TTS_ENGINES ['YOURTTS' ]: {}}
46
45
self .params [self .session ['tts_engine' ]]['samplerate' ] = models [self .session ['tts_engine' ]][self .session ['fine_tuned' ]]['samplerate' ]
47
46
self .vtt_path = os .path .join (self .session ['process_dir' ], os .path .splitext (self .session ['final_name' ])[0 ] + '.vtt' )
47
+ self .max_chars = language_mapping .get (self .session ['language' ], {}).get ("max_chars" ) + 2
48
+ list_split = [
49
+ # Western
50
+ '.' , ',' ,
51
+ # Arabic-Persian
52
+ '،' ,
53
+ # CJK
54
+ '。' , ',' , '、' , '·' , '…' ,
55
+ # Indic
56
+ '।' , '॥' ,
57
+ # Thai
58
+ 'ฯ' ,
59
+ # Ethiopic
60
+ '፡' , '።' , '፣' , '፤' , '፥' , '፦' , '፧' ,
61
+ # Hebrew
62
+ '״' ,
63
+ # Tibetan
64
+ '།' , '༎' ,
65
+ # Khmer
66
+ '។' , '៕' ,
67
+ # Lao
68
+ '໌' , 'ໍ' ,
69
+ # Misc (global)
70
+ '—' , '!' , '?' , ':' , ';'
71
+ ]
72
+ punctuation_class = "[" + "" .join (re .escape (ch ) for ch in list_split ) + "]"
73
+ self .punc_re = re .compile (punctuation_class )
48
74
self ._build ()
49
75
except Exception as e :
50
76
error = f'__init__() error: { e } '
51
77
print (error )
52
- return False
78
+ return None
53
79
54
80
def _build (self ):
55
81
try :
@@ -410,13 +436,10 @@ def convert(self, sentence_number, sentence):
410
436
try :
411
437
speaker = None
412
438
audio_data = False
413
- audio2trim = False
414
439
trim_audio_buffer = 0.004
415
440
settings = self .params [self .session ['tts_engine' ]]
416
441
final_sentence_file = os .path .join (self .session ['chapters_dir_sentences' ], f'{ sentence_number } .{ default_audio_proc_format } ' )
417
442
sentence = sentence .rstrip ()
418
- if sentence .endswith ('-' ) or sentence [- 1 ].isalnum ():
419
- audio2trim = True
420
443
settings ['voice_path' ] = (
421
444
self .session ['voice' ] if self .session ['voice' ] is not None
422
445
else os .path .join (self .session ['custom_model_dir' ], self .session ['tts_engine' ], self .session ['custom_model' ], 'ref.wav' ) if self .session ['custom_model' ] is not None
@@ -432,38 +455,19 @@ def convert(self, sentence_number, sentence):
432
455
return False
433
456
tts = (loaded_tts .get (self .tts_key ) or {}).get ('engine' , False )
434
457
if tts :
435
- # Check if the language requires to split the year in decades
436
- if self .session ['language' ] in year_to_decades_languages :
437
- # Check if numbers exists in the sentence
438
- if bool (re .search (r'[-+]?\b\d+(\.\d+)?\b' , sentence )):
439
- # Check if there are positive integers so possible date to convert
440
- if bool (re .search (r'\b\d+\b' , sentence )):
441
- date_spans = detect_date_entities (sentence , self .stanza_nlp )
442
- if date_spans :
443
- result = []
444
- last_pos = 0
445
- for start , end , date_text in date_spans :
446
- # Append sentence before this date
447
- result .append (sentence [last_pos :start ])
448
- processed = re .sub (r"\b\d{4}\b" , lambda m : year_to_words (m .group (), self .session ['language_iso1' ]), date_text )
449
- if not processed :
450
- break
451
- result .append (processed )
452
- last_pos = end
453
- # Append remaining sentence
454
- result .append (sentence [last_pos :])
455
- sentence = '' .join (result )
456
458
sentence_parts = sentence .split ('‡pause‡' )
457
459
if self .session ['tts_engine' ] == TTS_ENGINES ['XTTSv2' ] or self .session ['tts_engine' ] == TTS_ENGINES ['FAIRSEQ' ]:
458
- sentence_parts = [p .replace ('. ' , '— ' ) for p in sentence_parts ]
460
+ sentence_parts = [p .replace ('.' , '— ' ) for p in sentence_parts ]
459
461
silence_tensor = torch .zeros (1 , int (settings ['samplerate' ] * 1.4 )) # 1.4 seconds
460
462
audio_segments = []
461
463
for text_part in sentence_parts :
462
464
text_part = text_part .strip ()
463
465
if not text_part :
464
466
audio_segments .append (silence_tensor .clone ())
465
467
continue
466
- audio_part = None
468
+ audio2trim = False
469
+ if text_part .endswith ('-' ) or text_part [- 1 ].isalnum ():
470
+ audio2trim = True
467
471
if self .session ['tts_engine' ] == TTS_ENGINES ['XTTSv2' ]:
468
472
trim_audio_buffer = 0.06
469
473
if settings ['voice_path' ] is not None and settings ['voice_path' ] in settings ['latent_embedding' ].keys ():
0 commit comments