Merge pull request #16 from alessiamarcolini/iterable-dataset

lantiga · web-flow · commit 3d590e8f2518 · 2022-08-26T10:15:45.000+02:00
Allow iterable dataset for detector fit
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -4,34 +4,48 @@
 import torch.utils.data
 
 
+class DummyIterableDataset(torch.utils.data.IterableDataset):
+    def __init__(self, *args) -> None:
+        super().__init__()
+        self.args = args
+
+    def __iter__(self):
+        return iter(*self.args)
+
+
 class TensorDataModule:
-    def __init__(self, *args):
-        self.ds = torch.utils.data.TensorDataset(*args)
+    def __init__(self, *args, ds_type="map"):
+        self.ds_type = ds_type
+
+        if ds_type == "map":
+            self.ds = torch.utils.data.TensorDataset(*args)
+        else:
+            self.ds = DummyIterableDataset(*args)
 
     def default_dataloader(self, batch_size=None, num_samples=None, shuffle=True):
-        dataset = self.ds
         if batch_size is None:
             batch_size = self.val_batch_size
         replacement = num_samples is not None
-        if shuffle:
+        if shuffle and self.ds_type == "map":
             sampler = torch.utils.data.RandomSampler(
-                dataset, replacement=replacement, num_samples=num_samples
+                self.ds, replacement=replacement, num_samples=num_samples
             )
         else:
             sampler = None
         return torch.utils.data.DataLoader(
-            dataset, batch_size=batch_size, sampler=sampler
+            self.ds, batch_size=batch_size, sampler=sampler
         )
 
-
-def test_fit():
-    dm_ref = TensorDataModule(torch.randn(500, 5))
+@pytest.mark.parametrize("num_batches", (3, None))
+@pytest.mark.parametrize("ds_type", ("map", "iterable"))
+def test_fit(ds_type, num_batches):
+    dm_ref = TensorDataModule(torch.randn(500, 5), ds_type=ds_type)
     d = torchdrift.detectors.KernelMMDDriftDetector()
     torchdrift.utils.fit(
         dm_ref.default_dataloader(batch_size=10),
         torch.nn.Identity(),
         [torch.nn.Identity(), d],
-        num_batches=3,
+        num_batches=num_batches,
         device="cpu",
     )
 
diff --git a/torchdrift/utils/fit.py b/torchdrift/utils/fit.py
@@ -33,14 +33,24 @@ def fit(
 
     all_outputs = []
     # dl = torch.utils.data.DataLoader(ref_ds, batch_size=batch_size, shuffle=True)
-    nb = len(dl)
-    if num_batches is not None:
-        nb = min(nb, num_batches)
-    for i, b in tqdm.tqdm(zip(range(nb), dl), total=nb):
+
+    if hasattr(dl.dataset, "__len__"):
+        nb = len(dl)
+        if num_batches is not None:
+            nb = min(nb, num_batches)
+        total = nb
+    else:
+        total = None
+
+    for i, b in enumerate(tqdm.tqdm(dl, total=total)):
+        if num_batches is not None and i >= num_batches:
+            break
+
         if not isinstance(b, torch.Tensor):
             b = b[0]
         with torch.no_grad():
             all_outputs.append(feature_extractor(b.to(device)))
+
     all_outputs = torch.cat(all_outputs, dim=0)
 
     for m in reducers_detectors: