Skip to content

Commit 1cca34d

Browse files
jesseengel and Magenta Team
authored and committed
Train models with adjustable sample rate. Update dataset creation, data provider, gin configs, and inference models / export to support 32kHz and 48kHz models.
PiperOrigin-RevId: 443692298
1 parent def2c6b commit 1cca34d

File tree

9 files changed

+383
-64
lines changed

9 files changed

+383
-64
lines changed

ddsp/training/data.py

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import os
1717

1818
from absl import logging
19+
from ddsp.spectral_ops import CREPE_FRAME_SIZE
20+
from ddsp.spectral_ops import CREPE_SAMPLE_RATE
1921
from ddsp.spectral_ops import get_framed_lengths
2022
import gin
2123
import tensorflow.compat.v2 as tf
@@ -199,31 +201,30 @@ def preprocess_ex(ex):
199201
return dataset
200202

201203

202-
class RecordProvider(DataProvider):
203-
"""Class for reading records and returning a dataset."""
204+
@gin.register
205+
class TFRecordProvider(DataProvider):
206+
"""Class for reading TFRecords and returning a dataset."""
204207

205208
def __init__(self,
206-
file_pattern,
207-
example_secs,
208-
sample_rate,
209-
frame_rate,
210-
data_format_map_fn,
209+
file_pattern=None,
210+
example_secs=4,
211+
sample_rate=16000,
212+
frame_rate=250,
211213
centered=False):
212214
"""RecordProvider constructor."""
215+
super().__init__(sample_rate, frame_rate)
213216
self._file_pattern = file_pattern or self.default_file_pattern
214217
self._audio_length = example_secs * sample_rate
215-
super().__init__(sample_rate, frame_rate)
218+
self._audio_16k_length = example_secs * CREPE_SAMPLE_RATE
216219
self._feature_length = self.get_feature_length(centered)
217-
self._data_format_map_fn = data_format_map_fn
218220

219221
def get_feature_length(self, centered):
220222
"""Take into account center padding to get number of frames."""
221223
# Number of frames is independent of frame size for "center/same" padding.
222-
frame_size = 1024
223-
hop_size = self.sample_rate / self.frame_rate
224+
hop_size = CREPE_SAMPLE_RATE / self.frame_rate
224225
padding = 'center' if centered else 'same'
225226
return get_framed_lengths(
226-
self._audio_length, frame_size, hop_size, padding)[0]
227+
self._audio_16k_length, CREPE_FRAME_SIZE, hop_size, padding)[0]
227228

228229
@property
229230
def default_file_pattern(self):
@@ -246,7 +247,7 @@ def parse_tfexample(record):
246247

247248
filenames = tf.data.Dataset.list_files(self._file_pattern, shuffle=shuffle)
248249
dataset = filenames.interleave(
249-
map_func=self._data_format_map_fn,
250+
map_func=tf.data.TFRecordDataset,
250251
cycle_length=40,
251252
num_parallel_calls=_AUTOTUNE)
252253
dataset = dataset.map(parse_tfexample, num_parallel_calls=_AUTOTUNE)
@@ -258,6 +259,8 @@ def features_dict(self):
258259
return {
259260
'audio':
260261
tf.io.FixedLenFeature([self._audio_length], dtype=tf.float32),
262+
'audio_16k':
263+
tf.io.FixedLenFeature([self._audio_16k_length], dtype=tf.float32),
261264
'f0_hz':
262265
tf.io.FixedLenFeature([self._feature_length], dtype=tf.float32),
263266
'f0_confidence':
@@ -268,18 +271,22 @@ def features_dict(self):
268271

269272

270273
@gin.register
271-
class TFRecordProvider(RecordProvider):
274+
class LegacyTFRecordProvider(TFRecordProvider):
272275
"""Class for reading TFRecords and returning a dataset."""
273276

274-
def __init__(self,
275-
file_pattern=None,
276-
example_secs=4,
277-
sample_rate=16000,
278-
frame_rate=250,
279-
centered=False):
280-
"""TFRecordProvider constructor."""
281-
super().__init__(file_pattern, example_secs, sample_rate,
282-
frame_rate, tf.data.TFRecordDataset, centered=centered)
277+
@property
278+
def features_dict(self):
279+
"""Dictionary of features to read from dataset."""
280+
return {
281+
'audio':
282+
tf.io.FixedLenFeature([self._audio_length], dtype=tf.float32),
283+
'f0_hz':
284+
tf.io.FixedLenFeature([self._feature_length], dtype=tf.float32),
285+
'f0_confidence':
286+
tf.io.FixedLenFeature([self._feature_length], dtype=tf.float32),
287+
'loudness_db':
288+
tf.io.FixedLenFeature([self._feature_length], dtype=tf.float32),
289+
}
283290

284291

285292
# ------------------------------------------------------------------------------
@@ -397,7 +404,7 @@ def get_dataset(self, shuffle=True):
397404
# Synthetic Data for InverseSynthesis
398405
# ------------------------------------------------------------------------------
399406
@gin.register
400-
class SyntheticNotes(TFRecordProvider):
407+
class SyntheticNotes(LegacyTFRecordProvider):
401408
"""Create self-supervised control signal.
402409
403410
EXPERIMENTAL
@@ -440,7 +447,7 @@ def features_dict(self):
440447

441448

442449
@gin.register
443-
class Urmp(TFRecordProvider):
450+
class Urmp(LegacyTFRecordProvider):
444451
"""Urmp training set."""
445452

446453
def __init__(self,

ddsp/training/data_preparation/prepare_tfrecord_lib.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import pydub
2222
import tensorflow.compat.v2 as tf
2323

24+
CREPE_SAMPLE_RATE = spectral_ops.CREPE_SAMPLE_RATE # 16kHz.
2425

2526

2627
def _load_audio_as_array(audio_path, sample_rate):
@@ -57,24 +58,32 @@ def _load_audio(audio_path, sample_rate):
5758
logging.info("Loading '%s'.", audio_path)
5859
beam.metrics.Metrics.counter('prepare-tfrecord', 'load-audio').inc()
5960
audio = _load_audio_as_array(audio_path, sample_rate)
60-
return {'audio': audio}
61+
if sample_rate != CREPE_SAMPLE_RATE:
62+
audio_16k = _load_audio_as_array(audio_path, CREPE_SAMPLE_RATE)
63+
else:
64+
audio_16k = audio
65+
return {'audio': audio, 'audio_16k': audio_16k}
6166

6267

6368
def _chunk_audio(ex, sample_rate, chunk_secs):
6469
"""Pad audio and split into chunks."""
6570
beam.metrics.Metrics.counter('prepare-tfrecord', 'load-audio').inc()
66-
audio = ex['audio']
67-
chunk_size = int(chunk_secs * sample_rate)
68-
chunks = tf.signal.frame(audio, chunk_size, chunk_size, pad_end=True)
71+
def get_chunks(audio, sample_rate):
72+
chunk_size = int(chunk_secs * sample_rate)
73+
return tf.signal.frame(audio, chunk_size, chunk_size, pad_end=True).numpy()
74+
75+
chunks = get_chunks(ex['audio'], sample_rate)
76+
chunks_16k = get_chunks(ex['audio_16k'], CREPE_SAMPLE_RATE)
77+
assert chunks.shape[0] == chunks_16k.shape[0]
6978
n_chunks = chunks.shape[0]
7079
for i in range(n_chunks):
71-
yield {'audio': chunks[i].numpy()}
80+
yield {'audio': chunks[i], 'audio_16k': chunks_16k[i]}
7281

7382

7483
def _add_f0_estimate(ex, frame_rate, center, viterbi):
7584
"""Add fundamental frequency (f0) estimate using CREPE."""
7685
beam.metrics.Metrics.counter('prepare-tfrecord', 'estimate-f0').inc()
77-
audio = ex['audio']
86+
audio = ex['audio_16k']
7887
padding = 'center' if center else 'same'
7988
f0_hz, f0_confidence = spectral_ops.compute_f0(
8089
audio, frame_rate, viterbi=viterbi, padding=padding)
@@ -86,13 +95,13 @@ def _add_f0_estimate(ex, frame_rate, center, viterbi):
8695
return ex
8796

8897

89-
def _add_loudness(ex, sample_rate, frame_rate, n_fft, center):
98+
def _add_loudness(ex, frame_rate, n_fft, center):
9099
"""Add loudness in dB."""
91100
beam.metrics.Metrics.counter('prepare-tfrecord', 'compute-loudness').inc()
92-
audio = ex['audio']
101+
audio = ex['audio_16k']
93102
padding = 'center' if center else 'same'
94103
loudness_db = spectral_ops.compute_loudness(
95-
audio, sample_rate, frame_rate, n_fft, padding=padding)
104+
audio, CREPE_SAMPLE_RATE, frame_rate, n_fft, padding=padding)
96105
ex = dict(ex)
97106
ex['loudness_db'] = loudness_db.numpy().astype(np.float32)
98107
return ex
@@ -113,14 +122,16 @@ def get_windows(sequence, rate, center):
113122
end = start + window_size
114123
yield sequence[start:end]
115124

116-
for audio, loudness_db, f0_hz, f0_confidence in zip(
125+
for audio, audio_16k, loudness_db, f0_hz, f0_confidence in zip(
117126
get_windows(ex['audio'], sample_rate, center=False),
127+
get_windows(ex['audio_16k'], CREPE_SAMPLE_RATE, center=False),
118128
get_windows(ex['loudness_db'], frame_rate, center),
119129
get_windows(ex['f0_hz'], frame_rate, center),
120130
get_windows(ex['f0_confidence'], frame_rate, center)):
121131
beam.metrics.Metrics.counter('prepare-tfrecord', 'split-example').inc()
122132
yield {
123133
'audio': audio,
134+
'audio_16k': audio_16k,
124135
'loudness_db': loudness_db,
125136
'f0_hz': f0_hz,
126137
'f0_confidence': f0_confidence
@@ -238,7 +249,6 @@ def postprocess_pipeline(examples, output_path, stage_name=''):
238249
center=center,
239250
viterbi=viterbi)
240251
| beam.Map(_add_loudness,
241-
sample_rate=sample_rate,
242252
frame_rate=frame_rate,
243253
n_fft=512,
244254
center=center))

ddsp/training/data_preparation/prepare_tfrecord_lib_test.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import scipy.io.wavfile
2727
import tensorflow.compat.v2 as tf
2828

29+
CREPE_SAMPLE_RATE = spectral_ops.CREPE_SAMPLE_RATE
30+
2931

3032
class PrepareTFRecordBeamTest(parameterized.TestCase):
3133

@@ -70,7 +72,7 @@ def validate_outputs(self, expected_num_examples, expected_feature_lengths):
7072
try:
7173
self.assertLen(arr, expected_len)
7274
except AssertionError as e:
73-
raise AssertionError('%s feature: %s' % (e, feat))
75+
raise AssertionError('feature: %s' % feat) from e
7476
self.assertFalse(any(np.isinf(arr)))
7577

7678
def get_expected_length(self, input_length, frame_rate, center=False):
@@ -139,6 +141,7 @@ def test_prepare_tfrecord(self, chunk_secs, example_secs):
139141
expected_n_batch,
140142
{
141143
'audio': expected_n_t,
144+
'audio_16k': expected_n_t,
142145
'f0_hz': expected_n_frames,
143146
'f0_confidence': expected_n_frames,
144147
'loudness_db': expected_n_frames,
@@ -169,13 +172,49 @@ def test_centering(self, center):
169172
self.validate_outputs(
170173
n_batch, {
171174
'audio': n_t,
175+
'audio_16k': n_t,
176+
'f0_hz': n_frames,
177+
'f0_confidence': n_frames,
178+
'loudness_db': n_frames,
179+
})
180+
181+
@parameterized.named_parameters(
182+
('16kHz', 16000),
183+
('32kHz', 32000),
184+
('48kHz', 48000))
185+
def test_sample_rate(self, sample_rate):
186+
frame_rate = 250
187+
example_secs = 0.3
188+
hop_secs = 0.1
189+
center = True
190+
n_batch = self.get_n_per_chunk(self.wav_secs, example_secs, hop_secs)
191+
prepare_tfrecord_lib.prepare_tfrecord(
192+
[self.wav_path],
193+
os.path.join(self.test_dir, 'output.tfrecord'),
194+
num_shards=2,
195+
sample_rate=sample_rate,
196+
frame_rate=frame_rate,
197+
example_secs=example_secs,
198+
hop_secs=hop_secs,
199+
center=center,
200+
chunk_secs=None)
201+
202+
n_t = int(example_secs * sample_rate)
203+
n_t_16k = int(example_secs * CREPE_SAMPLE_RATE)
204+
n_frames = self.get_expected_length(n_t_16k, frame_rate, center)
205+
n_expected_frames = 76 # (250 * 0.3) + 1.
206+
self.assertEqual(n_frames, n_expected_frames)
207+
self.validate_outputs(
208+
n_batch, {
209+
'audio': n_t,
210+
'audio_16k': n_t_16k,
172211
'f0_hz': n_frames,
173212
'f0_confidence': n_frames,
174213
'loudness_db': n_frames,
175214
})
176215

177-
@parameterized.named_parameters(('16k', 16000), ('24k', 24000),
178-
('48k', 48000))
216+
@parameterized.named_parameters(('16kHz', 16000), ('44.1kHz', 44100),
217+
('48kHz', 48000))
179218
def test_audio_only(self, sample_rate):
180219
prepare_tfrecord_lib.prepare_tfrecord(
181220
[self.wav_path],
@@ -189,6 +228,7 @@ def test_audio_only(self, sample_rate):
189228
self.validate_outputs(
190229
1, {
191230
'audio': int(self.wav_secs * sample_rate),
231+
'audio_16k': int(self.wav_secs * CREPE_SAMPLE_RATE),
192232
})
193233

194234

0 commit comments

Comments (0)