Description
Describe the bug
Emilia's `write_mp3` saves audio at a low bitrate. @khursani8 found that the audio's frequency content was being truncated at 8 kHz, so we dug into it.
How To Reproduce
-
Take any audio with a 24 kHz sample rate, for example https://github.com/mesolitica/malaya-speech/blob/master/speech/podcast/sg-chunk.mp3
-
Plot the graph,
import librosa
import matplotlib.pyplot as plt
import numpy as np
def plot_waveform_and_mel_spectrogram(
    waveform, sample_rate, n_mels=80, n_fft=1024, hop_length=512
):
    """Plot a waveform and its mel spectrogram in two stacked panels.

    Args:
        waveform: Audio samples, passed to librosa as ``y``.
        sample_rate: Sample rate of ``waveform`` in Hz.
        n_mels: Number of mel bands in the spectrogram.
        n_fft: FFT window size used for the spectrogram.
        hop_length: Hop between spectrogram frames. The same value is used
            to compute and to draw the spectrogram so that the time axis
            of the plot matches the data.
    """
    # Import the submodule explicitly: older librosa releases do not expose
    # ``librosa.display`` after a plain ``import librosa``.
    import librosa.display

    mel_spec = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    # Convert power to decibels relative to the loudest bin.
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    plt.figure(figsize=(12, 6))

    # Top panel: time-domain waveform.
    plt.subplot(2, 1, 1)
    librosa.display.waveshow(waveform, sr=sample_rate)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")

    # Bottom panel: mel spectrogram in dB.
    plt.subplot(2, 1, 2)
    librosa.display.specshow(
        mel_spec_db,
        sr=sample_rate,
        hop_length=hop_length,
        x_axis="time",
        y_axis="mel",
    )
    plt.colorbar(format="%+2.0f dB")
    plt.title("Mel Spectrogram")
    plt.xlabel("Time (s)")
    plt.ylabel("Mel Frequency")

    plt.tight_layout()
    plt.show()
You will get,
- But if we use `write_mp3` from https://github.com/open-mmlab/Amphion/blob/main/preprocessors/Emilia/utils/tool.py#L52,
from pydub import AudioSegment
def write_mp3(path, sr, x):
    """Encode the samples in ``x`` as a mono MP3 file at ``path``.

    Input that is not already int16 is peak-normalised and converted to
    int16 first. No bitrate is passed to ``export``, so the encoder's
    default is used. Any failure is printed instead of being raised.
    """
    try:
        samples = x
        if samples.dtype != np.int16:
            # Scale so the largest absolute sample maps to the int16 maximum.
            peak = np.max(np.abs(samples))
            samples = np.int16(samples / peak * 32767)

        # Wrap the raw PCM bytes in a single-channel pydub segment.
        segment = AudioSegment(
            samples.tobytes(),
            frame_rate=sr,
            sample_width=samples.dtype.itemsize,
            channels=1,
        )
        segment.export(path, format="mp3")
    except Exception as err:
        print(err)
        print("Error: Failed to write MP3 file.")
# Round-trip the waveform through write_mp3, then reload the MP3 and plot it.
# NOTE(review): `y` is not defined in this snippet -- presumably the 24 kHz
# waveform loaded from the sample file mentioned above; confirm.
write_mp3('out.mp3', 24000, y)
y, sr = librosa.load('out.mp3', sr = 24000)
plot_waveform_and_mel_spectrogram(y, sr)
You will get,
If you enable debug log level,
DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-f', 'wav', '-i', '/tmp/tmplhbftkl6', '-f', 'mp3', '/tmp/tmpm3ttndt6'])
DEBUG:pydub.converter:subprocess output: b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers'
DEBUG:pydub.converter:subprocess output: b' built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)'
DEBUG:pydub.converter:subprocess output: b' configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared'
DEBUG:pydub.converter:subprocess output: b' libavutil 56. 31.100 / 56. 31.100'
DEBUG:pydub.converter:subprocess output: b' libavcodec 58. 54.100 / 58. 54.100'
DEBUG:pydub.converter:subprocess output: b' libavformat 58. 29.100 / 58. 29.100'
DEBUG:pydub.converter:subprocess output: b' libavdevice 58. 8.100 / 58. 8.100'
DEBUG:pydub.converter:subprocess output: b' libavfilter 7. 57.100 / 7. 57.100'
DEBUG:pydub.converter:subprocess output: b' libavresample 4. 0. 0 / 4. 0. 0'
DEBUG:pydub.converter:subprocess output: b' libswscale 5. 5.100 / 5. 5.100'
DEBUG:pydub.converter:subprocess output: b' libswresample 3. 5.100 / 3. 5.100'
DEBUG:pydub.converter:subprocess output: b' libpostproc 55. 5.100 / 55. 5.100'
DEBUG:pydub.converter:subprocess output: b'Guessed Channel Layout for Input Stream #0.0 : mono'
DEBUG:pydub.converter:subprocess output: b"Input #0, wav, from '/tmp/tmplhbftkl6':"
DEBUG:pydub.converter:subprocess output: b' Duration: 00:00:12.05, bitrate: 384 kb/s'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s'
DEBUG:pydub.converter:subprocess output: b'Stream mapping:'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0 -> #0:0 (pcm_s16le (native) -> mp3 (libmp3lame))'
DEBUG:pydub.converter:subprocess output: b'Press [q] to stop, [?] for help'
DEBUG:pydub.converter:subprocess output: b"Output #0, mp3, to '/tmp/tmpm3ttndt6':"
DEBUG:pydub.converter:subprocess output: b' Metadata:'
DEBUG:pydub.converter:subprocess output: b' TSSE : Lavf58.29.100'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0: Audio: mp3 (libmp3lame), 24000 Hz, mono, s16p'
DEBUG:pydub.converter:subprocess output: b' Metadata:'
DEBUG:pydub.converter:subprocess output: b' encoder : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size= 48kB time=00:00:12.07 bitrate= 32.3kbits/s speed= 371x'
DEBUG:pydub.converter:subprocess output: b'video:0kB audio:47kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.488861%'
DEBUG:matplotlib.colorbar:locator: <matplotlib.ticker.AutoLocator object at 0x7fed11242bc0>
You can see that libmp3lame encoded at a bitrate of about 32 kbit/s:
DEBUG:pydub.converter:subprocess output: b' encoder : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size= 48kB time=00:00:12.07 bitrate= 32.3kbits/s speed= 371x'
With the spectrum truncated at 8 kHz, you lose the quality that a 24 kHz sample rate is meant to provide; the audio should retain content up to 12 kHz (the Nyquist frequency).
If we set the bitrate to 55k or higher,
from pydub import AudioSegment
def write_mp3(path, sr, x, bitrate="55k"):
    """Encode the samples in ``x`` as a mono MP3 file at ``path``.

    Args:
        path: Destination passed to ``AudioSegment.export``.
        sr: Sample rate of ``x`` in Hz.
        x: NumPy array of samples. Anything that is not already int16 is
            peak-normalised and converted to int16.
        bitrate: Target bitrate handed to the encoder (for example
            ``"55k"``). It is set explicitly because the encoder default
            seen in the debug log (about 32 kbit/s for 24 kHz mono input)
            cut off content above 8 kHz.

    Failures are printed instead of being raised, so the call never
    propagates an ``Exception``.
    """
    try:
        if x.dtype != np.int16:
            peak = np.max(np.abs(x))
            # A silent (all-zero) signal has no peak to normalise against;
            # dividing by zero would fill the array with NaNs before the
            # int16 cast, so only scale when there is a non-zero peak.
            if peak > 0:
                x = x / peak * 32767
            x = np.int16(x)

        # Wrap the raw PCM bytes in a single-channel pydub segment.
        audio = AudioSegment(
            x.tobytes(), frame_rate=sr, sample_width=x.dtype.itemsize, channels=1
        )
        audio.export(path, format="mp3", bitrate=bitrate)
    except Exception as e:
        print(e)
        print("Error: Failed to write MP3 file.")
We can now see that the frequency content reaches the intended 12 kHz.
To be safe, we should use `soundfile.write`.
Environment Information
I tested with two different ffmpeg versions, `ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers` and `ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers`; both encode with libmp3lame at the same 32 kbit/s.