
Commit 1ada727

Merge pull request #916 from DrewThomasson/v25
V25.7.12
2 parents: d7842f8 + 84026be

File tree: 17 files changed, +613 −475 lines

VERSION.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-25.7.7
+25.7.12

ebook2audiobook.egg-info/requires.txt

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ pythainlp
 mutagen
 nvidia-ml-py
 pydub
+pyannote-audio
 PyOpenGL
 pypinyin
 ray

lib/.pbtk.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+pbtk = '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'

lib/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from .models import (
     TTS_ENGINES, default_fine_tuned, default_tts_engine,
-    default_engine_settings, default_vc_model,
+    default_engine_settings, default_vc_model, default_voice_detection_model,
     loaded_tts, max_custom_model, max_custom_voices,
     max_tts_in_memory, max_upload_size, models, os, voices_dir
 )
@@ -28,7 +28,7 @@
 __all__ = [
     # from models
     "TTS_ENGINES", "default_fine_tuned", "default_tts_engine",
-    "default_engine_settings", "default_vc_model",
+    "default_engine_settings", "default_vc_model", "default_voice_detection_model",
     "loaded_tts", "max_custom_model",
     "max_custom_voices", "max_tts_in_memory", "max_upload_size",
     "models", "os", "voices_dir",

lib/classes/background_detector.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+import os
+import numpy as np
+import librosa
+
+from pyannote.audio import Model
+from pyannote.audio.pipelines import VoiceActivityDetection
+from lib.conf import tts_dir
+from lib.models import default_voice_detection_model
+
+class BackgroundDetector:
+
+    def __init__(self, wav_file: str):
+        self.wav_file = wav_file
+        model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
+        self.pipeline = VoiceActivityDetection(segmentation=model)
+        hyper_params = {
+            # onset/offset activation thresholds
+            "onset": 0.5, "offset": 0.5,
+            # remove speech regions shorter than that many seconds.
+            "min_duration_on": 0.0,
+            # fill non-speech regions shorter than that many seconds.
+            "min_duration_off": 0.0
+        }
+        self.pipeline.instantiate(hyper_params)
+
+    def detect(self, vad_ratio_thresh: float=0.05):
+        diarization = self.pipeline(self.wav_file)
+        speech_segments = [(s.start, s.end) for s in diarization.get_timeline()]
+        total_duration = librosa.get_duration(path=self.wav_file)
+        speech_time = sum(end - start for start, end in speech_segments)
+        non_speech_ratio = 1 - (speech_time / total_duration)
+        status = non_speech_ratio > vad_ratio_thresh
+        report = {
+            'non_speech_ratio': non_speech_ratio,
+            'background_detected': status
+        }
+        return status, report
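
The new BackgroundDetector wraps a pyannote voice-activity-detection pipeline: it measures how much of a WAV file is non-speech and flags the file when that ratio exceeds vad_ratio_thresh. A minimal usage sketch under assumed inputs (the file path and the printed report values are illustrative only):

from lib.classes.background_detector import BackgroundDetector

# Hypothetical reference clip; any audio file readable by pyannote/librosa works.
detector = BackgroundDetector(wav_file='voices/sample_ref.wav')
noisy, report = detector.detect(vad_ratio_thresh=0.05)
if noisy:
    # e.g. report == {'non_speech_ratio': 0.12, 'background_detected': True}
    print(f"background detected: {report['non_speech_ratio']:.2%} non-speech")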

lib/classes/tts_engines/common/utils.py

Lines changed: 1 addition & 29 deletions
@@ -3,35 +3,7 @@
 import regex as re
 import stanza
 
-from num2words import num2words
-from lib.models import loaded_tts, max_tts_in_memory
-
-def detect_date_entities(text, stanza_nlp):
-    try:
-        doc = stanza_nlp(text)
-        date_spans = []
-        for ent in doc.ents:
-            if ent.type == 'DATE':
-                date_spans.append((ent.start_char, ent.end_char, ent.text))
-        return date_spans
-    except Exception as e:
-        error = f'detect_date_entities() error: {e}'
-        print(error)
-        return False
-
-def year_to_words(year_str, lang_iso1):
-    try:
-        year = int(year_str)
-        if len(year_str) != 4 or not year_str.isdigit():
-            return num2words(year, lang=lang_iso1)
-        first_two = int(year_str[:2])
-        last_two = int(year_str[2:])
-        return f"{num2words(first_two, lang=lang_iso1)} {num2words(last_two, lang=lang_iso1)}"
-    except Exception as e:
-        error = f'year_to_words() error: {e}'
-        print(error)
-        raise
-        return False
+from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
 
 def unload_tts(device, reserved_keys=None, tts_key=None):
     try:

lib/classes/tts_engines/coqui.py

Lines changed: 42 additions & 38 deletions
@@ -1,23 +1,24 @@
-import os
 import hashlib
-import numpy as np
-import regex as re
+import math
+import os
 import shutil
-import soundfile as sf
-import stanza
 import subprocess
 import tempfile
-import torch
-import torchaudio
 import threading
 import uuid
 
+import numpy as np
+import regex as re
+import soundfile as sf
+import torch
+import torchaudio
+
 from huggingface_hub import hf_hub_download
 from pathlib import Path
 from pprint import pprint
 
 from lib import *
-from lib.classes.tts_engines.common.utils import detect_date_entities, year_to_words, unload_tts, append_sentence2vtt
+from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
 from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
 
 #import logging
@@ -27,11 +28,9 @@
 xtts_builtin_speakers_list = None
 
 class Coqui:
+
     def __init__(self, session):
         try:
-            if session['language'] in year_to_decades_languages:
-                stanza.download(session['language_iso1'])
-                self.stanza_nlp = stanza.Pipeline(session['language_iso1'], processors='tokenize,ner')
             self.session = session
             self.cache_dir = tts_dir
             self.speakers_path = None
@@ -45,11 +44,38 @@ def __init__(self, session):
             self.params = {TTS_ENGINES['XTTSv2']: {"latent_embedding":{}}, TTS_ENGINES['BARK']: {},TTS_ENGINES['VITS']: {"semitones": {}}, TTS_ENGINES['FAIRSEQ']: {"semitones": {}}, TTS_ENGINES['TACOTRON2']: {"semitones": {}}, TTS_ENGINES['YOURTTS']: {}}
             self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
             self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
+            self.max_chars = language_mapping.get(self.session['language'], {}).get("max_chars") + 2
+            list_split = [
+                # Western
+                '.', ',',
+                # Arabic-Persian
+                '،',
+                # CJK
+                '。', ',', '、', '·', '…',
+                # Indic
+                '।', '॥',
+                # Thai
+                'ฯ',
+                # Ethiopic
+                '፡', '።', '፣', '፤', '፥', '፦', '፧',
+                # Hebrew
+                '״',
+                # Tibetan
+                '།', '༎',
+                # Khmer
+                '។', '៕',
+                # Lao
+                '໌', 'ໍ',
+                # Misc (global)
+                '—', '!', '?', ':', ';'
+            ]
+            punctuation_class = "[" + "".join(re.escape(ch) for ch in list_split) + "]"
+            self.punc_re = re.compile(punctuation_class)
             self._build()
         except Exception as e:
             error = f'__init__() error: {e}'
             print(error)
-            return False
+            return None
 
     def _build(self):
         try:
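
The new list_split table and the self.punc_re character class compiled from it give the engine one regex that matches sentence punctuation across Western, CJK, Indic and other scripts, while self.max_chars bounds how much text a model sees at once. A minimal sketch of how such a class can drive splitting at natural break points (split_on_punctuation and the sample string are illustrative helpers, not code from this commit):

import regex as re

list_split = ['.', ',', '。', '،', '।', '—', '!', '?', ':', ';']  # abridged set
punc_re = re.compile("[" + "".join(re.escape(ch) for ch in list_split) + "]")

def split_on_punctuation(text, max_chars):
    # Hypothetical helper: cut at the last punctuation mark before max_chars.
    if len(text) <= max_chars:
        return [text]
    ends = [m.end() for m in punc_re.finditer(text[:max_chars])]
    cut = ends[-1] if ends else max_chars
    return [text[:cut].strip()] + split_on_punctuation(text[cut:].strip(), max_chars)

print(split_on_punctuation("One sentence. Another sentence, with a clause; and more text.", 25))
# ['One sentence.', 'Another sentence,', 'with a clause;', 'and more text.']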
@@ -410,13 +436,10 @@ def convert(self, sentence_number, sentence):
         try:
             speaker = None
             audio_data = False
-            audio2trim = False
             trim_audio_buffer = 0.004
             settings = self.params[self.session['tts_engine']]
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
             sentence = sentence.rstrip()
-            if sentence.endswith('-') or sentence[-1].isalnum():
-                audio2trim = True
             settings['voice_path'] = (
                 self.session['voice'] if self.session['voice'] is not None
                 else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
@@ -432,38 +455,19 @@ def convert(self, sentence_number, sentence):
                 return False
             tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
             if tts:
-                # Check if the language requires to split the year in decades
-                if self.session['language'] in year_to_decades_languages:
-                    # Check if numbers exists in the sentence
-                    if bool(re.search(r'[-+]?\b\d+(\.\d+)?\b', sentence)):
-                        # Check if there are positive integers so possible date to convert
-                        if bool(re.search(r'\b\d+\b', sentence)):
-                            date_spans = detect_date_entities(sentence, self.stanza_nlp)
-                            if date_spans:
-                                result = []
-                                last_pos = 0
-                                for start, end, date_text in date_spans:
-                                    # Append sentence before this date
-                                    result.append(sentence[last_pos:start])
-                                    processed = re.sub(r"\b\d{4}\b", lambda m: year_to_words(m.group(), self.session['language_iso1']), date_text)
-                                    if not processed:
-                                        break
-                                    result.append(processed)
-                                    last_pos = end
-                                # Append remaining sentence
-                                result.append(sentence[last_pos:])
-                                sentence = ''.join(result)
                 sentence_parts = sentence.split('‡pause‡')
                 if self.session['tts_engine'] == TTS_ENGINES['XTTSv2'] or self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
-                    sentence_parts = [p.replace('. ', '— ') for p in sentence_parts]
+                    sentence_parts = [p.replace('.', '— ') for p in sentence_parts]
                 silence_tensor = torch.zeros(1, int(settings['samplerate'] * 1.4)) # 1.4 seconds
                 audio_segments = []
                 for text_part in sentence_parts:
                     text_part = text_part.strip()
                     if not text_part:
                         audio_segments.append(silence_tensor.clone())
                         continue
-                    audio_part = None
+                    audio2trim = False
+                    if text_part.endswith('-') or text_part[-1].isalnum():
+                        audio2trim = True
                     if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
                         trim_audio_buffer = 0.06
                         if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
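
convert() now splits each sentence on the literal '‡pause‡' marker, substitutes a 1.4-second silence tensor for empty parts, and decides trimming per part rather than once per sentence. A minimal sketch of that assembly idea under assumed values (the 24000 Hz samplerate and the synth stub are illustrative, not taken from the engine config):

import torch

samplerate = 24000  # assumed; the real value comes from the selected model's settings
silence = torch.zeros(1, int(samplerate * 1.4))  # 1.4 seconds of silence, as in the diff

def assemble(sentence, synth):
    # synth(text) -> 1 x N waveform tensor (stand-in for the real TTS call)
    segments = []
    for part in sentence.split('‡pause‡'):
        part = part.strip()
        if not part:
            segments.append(silence.clone())  # an empty part marks an explicit pause
            continue
        segments.append(synth(part))
    return torch.cat(segments, dim=1)

# Dummy synthesizer returning 0.5 s of noise per part, just to show the shapes
audio = assemble("Hello there.‡pause‡Welcome back.", lambda t: torch.rand(1, samplerate // 2))
print(audio.shape)  # torch.Size([1, 24000])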

Comments (0)