Fix model pickling (sdv-dev#271)

pvk-developer · web-flow · commit 350b771c18b5 · 2023-02-23T18:01:50.000+01:00
* Fix model pickling
diff --git a/ctgan/synthesizers/base.py b/ctgan/synthesizers/base.py
@@ -57,13 +57,60 @@ def wrapper(self, *args, **kwargs):
 
 
 class BaseSynthesizer:
-    """Base class for all default synthesizers of ``CTGAN``.
-
-    This should contain the save/load methods.
-    """
+    """Base class for all default synthesizers of ``CTGAN``."""
 
     random_states = None
 
+    def __getstate__(self):
+        """Build the picklable state dict of a ``BaseSynthesizer``.
+
+        Switch to the ``cpu`` device while copying ``__dict__`` in order to be able to load
+        the model even when used from an external tool such as ``SDV``. Also, if
+        ``random_states`` is a ``(RandomState, Generator)`` tuple, store their ``get_state()``.
+
+        Returns:
+            dict:
+                Copy of ``__dict__`` prepared for pickling.
+        """
+        device_backup = self._device
+        self.set_device(torch.device('cpu'))
+        state = self.__dict__.copy()
+        self.set_device(device_backup)  # NOTE(review): shallow copy; may move state's modules too
+        if (
+            isinstance(self.random_states, tuple) and
+            isinstance(self.random_states[0], np.random.RandomState) and
+            isinstance(self.random_states[1], torch.Generator)
+        ):
+            state['_numpy_random_state'] = self.random_states[0].get_state()
+            state['_torch_random_state'] = self.random_states[1].get_state()
+            state.pop('random_states')  # replaced by the two ``get_state()`` entries above
+
+        return state
+
+    def __setstate__(self, state):
+        """Restore the state of a ``BaseSynthesizer``.
+
+        Rebuild ``random_states`` from the stored NumPy and torch states when both are present,
+        then set the device to ``cuda:0`` if CUDA is available and to ``cpu`` otherwise.
+        """
+        if '_numpy_random_state' in state and '_torch_random_state' in state:
+            np_state = state.pop('_numpy_random_state')
+            torch_state = state.pop('_torch_random_state')
+
+            current_torch_state = torch.Generator()  # a generator object, despite the name
+            current_torch_state.set_state(torch_state)
+
+            current_numpy_state = np.random.RandomState()  # likewise a generator object
+            current_numpy_state.set_state(np_state)
+            state['random_states'] = (
+                current_numpy_state,
+                current_torch_state
+            )
+
+        self.__dict__ = state
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.set_device(device)
+
     def save(self, path):
         """Save the model in the passed `path`."""
         device_backup = self._device
diff --git a/tests/integration/synthesizer/test_ctgan.py b/tests/integration/synthesizer/test_ctgan.py
@@ -210,7 +210,7 @@ def test_fixed_random_seed():
     })
     discrete_columns = ['discrete']
 
-    ctgan = CTGAN(epochs=1)
+    ctgan = CTGAN(epochs=1, cuda=False)
 
     # Run
     ctgan.fit(data, discrete_columns)
@@ -273,3 +273,27 @@ def test_conditional():
 
 def test_batch_size_pack_size():
     """Test that if batch size is not a multiple of pack size, it raises a sane error."""
+
+
+def test_ctgan_save_and_load(tmpdir):
+    """Test that the ``CTGAN`` model can be saved and loaded."""
+    # Setup
+    data = pd.DataFrame({
+        'continuous': np.random.random(100),
+        'discrete': np.random.choice(['a', 'b', 'c'], 100)
+    })
+    discrete_columns = [1]  # positional index of 'discrete'; ``fit`` gets a NumPy array below
+
+    ctgan = CTGAN(epochs=1)
+    ctgan.fit(data.to_numpy(), discrete_columns)
+    ctgan.set_random_state(0)
+
+    ctgan.sample(100)
+    model_path = tmpdir / 'model.pkl'
+
+    # Save
+    ctgan.save(str(model_path))
+
+    # Load (smoke check: only verifies that sampling runs; the output is not asserted)
+    loaded_instance = CTGAN.load(str(model_path))
+    loaded_instance.sample(100)
diff --git a/tests/integration/synthesizer/test_tvae.py b/tests/integration/synthesizer/test_tvae.py
@@ -129,3 +129,27 @@ def test_fixed_random_seed():
     assert not np.array_equal(sampled_random, sampled_0_1)
     np.testing.assert_array_equal(sampled_0_0, sampled_1_0)
     np.testing.assert_array_equal(sampled_0_1, sampled_1_1)
+
+
+def test_tvae_save(tmpdir):
+    """Test that the ``TVAE`` model can be saved and loaded."""
+    # Setup
+    data = pd.DataFrame({
+        'continuous': np.random.random(100),
+        'discrete': np.random.choice(['a', 'b', 'c'], 100)
+    })
+    discrete_columns = [1]  # positional index of 'discrete'; ``fit`` gets a NumPy array below
+
+    tvae = TVAE(epochs=1)
+    tvae.fit(data.to_numpy(), discrete_columns)
+    tvae.set_random_state(0)
+
+    tvae.sample(100)
+    model_path = tmpdir / 'model.pkl'
+
+    # Save
+    tvae.save(str(model_path))
+
+    # Load (smoke check: only verifies that sampling runs; the output is not asserted)
+    loaded_instance = TVAE.load(str(model_path))
+    loaded_instance.sample(100)