formatting, true divide, and kernels

t-vi · t-vi · commit b8ff1cd0f135 · 2021-02-22T15:40:46.000+01:00
diff --git a/test/test_detectors.py b/test/test_detectors.py
@@ -26,8 +26,24 @@ def _test_detector_class(cls):
 def test_ksdetector():
     _test_detector_class(torchdrift.detectors.KSDriftDetector)
 
+
+def _test_mmd_kernel(kernel):
+    torch.manual_seed(1234)
+    d = torchdrift.detectors.KernelMMDDriftDetector(kernel=kernel)
+    x = torch.randn(5, 5)
+    y = torch.randn(5, 5) + 1.0
+    d.fit(x)
+    assert (d(x).item() < d(y).item())
+    assert d.compute_p_value(x) > 0.80
+    assert d.compute_p_value(y) < 0.05
+
 def test_mmddetector():
     _test_detector_class(torchdrift.detectors.KernelMMDDriftDetector)
+    _test_mmd_kernel(torchdrift.detectors.mmd.GaussianKernel(lengthscale=1.0))
+    _test_mmd_kernel(torchdrift.detectors.mmd.ExpKernel())
+    _test_mmd_kernel(torchdrift.detectors.mmd.ExpKernel(lengthscale=1.0))
+    _test_mmd_kernel(torchdrift.detectors.mmd.RationalQuadraticKernel())
+    _test_mmd_kernel(torchdrift.detectors.mmd.RationalQuadraticKernel(lengthscale=1.0, alpha=2.0))
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/torchdrift/detectors/__init__.py b/torchdrift/detectors/__init__.py
@@ -1,3 +1,4 @@
 from .detector import Detector
 from .mmd import kernel_mmd, KernelMMDDriftDetector
 from .ks import ks_two_sample_multi_dim, KSDriftDetector, ks_p_value
+from . import mmd
diff --git a/torchdrift/detectors/detector.py b/torchdrift/detectors/detector.py
@@ -1,47 +1,60 @@
 import torch
 
+
 class Detector(torch.nn.Module):
     """Detector class.
 
-The detector is is a `nn.Module` subclass that, after fitting, performs a drift test when called and returns a score or p-value.
+    The detector is a `nn.Module` subclass that, after fitting, performs a drift test when called and returns a score or p-value.
+
+    Constructor Args:
+        return_p_value (bool): If set, forward returns a p-value (estimate) instead of the raw test score.
+    """
 
-    Constructor Args:
-            return_p_value (bool): If set, forward returns a p-value (estimate) instead of the raw test score.
-        """
-    def __init__(self, *, return_p_value: bool=False):
+    def __init__(self, *, return_p_value: bool = False):
         super().__init__()
-        self.register_buffer('base_outputs', None)
+        self.register_buffer("base_outputs", None)
         self.return_p_value = return_p_value
 
     def fit(self, x: torch.Tensor):
         """Record a sample as the reference distribution"""
         self.base_outputs = x.detach()
         return x
 
-    def predict_shift_from_features(self, base_outputs: torch.Tensor, outputs: torch.Tensor, compute_score: bool, compute_p_value: bool, individual_samples: bool = False) -> torch.Tensor:
+    def predict_shift_from_features(
+        self,
+        base_outputs: torch.Tensor,
+        outputs: torch.Tensor,
+        compute_score: bool,
+        compute_p_value: bool,
+        individual_samples: bool = False,
+    ) -> torch.Tensor:
         """stub to be overridden by subclasses"""
         raise NotImplementedError("Override predict_shift_from_features in detectors")
 
     def compute_p_value(self, inputs: torch.Tensor) -> torch.Tensor:
         """Performs a statistical test for drift and returns the p-value.
 
-This method calls `predict_shift_from_features` under the hood, so you only need to override that when subclassing.
-"""
+        This method calls `predict_shift_from_features` under the hood, so you only need to override that when subclassing."""
         assert self.base_outputs is not None, "Please call fit before compute_p_value"
-        _, p_value = self.predict_shift_from_features(self.base_outputs, inputs, compute_score=False, compute_p_value=True)
+        _, p_value = self.predict_shift_from_features(
+            self.base_outputs, inputs, compute_score=False, compute_p_value=True
+        )
         return p_value
 
     def forward(
-            self, inputs: torch.Tensor,
-            individual_samples: bool = False
+        self, inputs: torch.Tensor, individual_samples: bool = False
     ) -> torch.Tensor:
         """Performs a statistical test for drift and returns the score or, if `return_p_value` has been set in the constructor, the p-value.
 
-This method calls `predict_shift_from_features` under the hood, so you only need to override that when subclassing.
-"""
+        This method calls `predict_shift_from_features` under the hood, so you only need to override that when subclassing."""
         assert self.base_outputs is not None, "Please call fit before predict_shift"
-        ood_score, p_value = self.predict_shift_from_features(self.base_outputs, inputs, compute_score=not self.return_p_value, compute_p_value=self.return_p_value, individual_samples=individual_samples)
+        ood_score, p_value = self.predict_shift_from_features(
+            self.base_outputs,
+            inputs,
+            compute_score=not self.return_p_value,
+            compute_p_value=self.return_p_value,
+            individual_samples=individual_samples,
+        )
         if self.return_p_value:
             return p_value
         return ood_score
-
diff --git a/torchdrift/detectors/ks.py b/torchdrift/detectors/ks.py
@@ -7,8 +7,9 @@
 
 try:
     import numba
+
     njit = numba.jit(nopython=True, fastmath=True)
-except ImportError: # pragma: no cover
+except ImportError:  # pragma: no cover
     njit = lambda x: x
 
 
@@ -17,22 +18,23 @@
 #              two-sample Kolmogorov-Smirnov test
 # https://arxiv.org/abs/2102.08037
 
+
 @njit
-def ks_p_value(n : int, m : int, d : float) -> float:
+def ks_p_value(n: int, m: int, d: float) -> float:
     """Computes the p-value for the two-sided two-sample KS test from the D-statistic.
 
-This uses the stable recursion from T. Viehmann: Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.
+    This uses the stable recursion from T. Viehmann: Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.
     """
-    size = int(2*m*d+2)
+    size = int(2 * m * d + 2)
     lastrow, row = numpy.zeros((2, size), dtype=numpy.float64)
     last_start_j = 0
     for i in range(n + 1):
-        start_j = max(int(m * (i/n + d)) + 1-size, 0)
+        start_j = max(int(m * (i / n + d)) + 1 - size, 0)
         lastrow, row = row, lastrow
         val = 0.0
         for jj in range(size):
             j = jj + start_j
-            dist = i/n - j/m
+            dist = i / n - j / m
             if dist > d or dist < -d:
                 val = 1.0
             elif i == 0 or j == 0:
@@ -46,6 +48,7 @@ def ks_p_value(n : int, m : int, d : float) -> float:
         last_start_j = start_j
     return row[m - start_j]
 
+
 def ks_two_sample_multi_dim(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     """Computes the two-sample two-sided Kolmorogov-Smirnov statistic.
 
@@ -58,26 +61,37 @@ def ks_two_sample_multi_dim(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     n_x, n_features = x.shape
     n_y, n_features_y = y.shape
     assert n_features == n_features_y
-    
+
     joint_sorted = torch.argsort(torch.cat([x, y], dim=0), dim=0)
-    sign = (joint_sorted < n_x).to(dtype=torch.float) * (1 /(n_x) + 1/(n_y)) - (1/(n_y))
+    sign = (joint_sorted < n_x).to(dtype=torch.float) * (1 / (n_x) + 1 / (n_y)) - (
+        1 / (n_y)
+    )
     ks_scores = sign.cumsum(0).abs().max(0).values
     return ks_scores
 
+
 class KSDriftDetector(Detector):
     """Drift detector based on (multiple) Kolmogorov-Smirnov tests.
 
-This detector uses the Kolmogorov-Smirnov test on the marginals of the features
-for each feature.
+    This detector uses the Kolmogorov-Smirnov test on the marginals of the features
+    for each feature.
 
-For scores, it returns the maximum score. p-values are computed with the
-Bonferroni correction of multiplying the p-value of the maximum score by
-the number of features/tests.
+    For scores, it returns the maximum score. p-values are computed with the
+    Bonferroni correction of multiplying the p-value of the maximum score by
+    the number of features/tests.
 
-This is modelled after the KS drift detection in
-S. Rabanser et al: *Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift* (NeurIPS), 2019.
+    This is modelled after the KS drift detection in
+    S. Rabanser et al: *Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift* (NeurIPS), 2019.
     """
-    def predict_shift_from_features(self, base_outputs: torch.Tensor, outputs: torch.Tensor, compute_score: bool, compute_p_value: bool, individual_samples: bool = False):
+
+    def predict_shift_from_features(
+        self,
+        base_outputs: torch.Tensor,
+        outputs: torch.Tensor,
+        compute_score: bool,
+        compute_p_value: bool,
+        individual_samples: bool = False,
+    ):
         assert (
             not individual_samples
         ), "Individual samples not supported by MMD detector"
diff --git a/torchdrift/detectors/mmd.py b/torchdrift/detectors/mmd.py
@@ -4,13 +4,69 @@
 
 from . import Detector
 
-def kernel_mmd(x, y, n_perm=1000):
+
+class Kernel:
+    pass
+
+
+class GaussianKernel(Kernel):
+    """Unnormalized gaussian kernel"""
+
+    def __init__(self, lengthscale=None):
+        super().__init__()
+        self.lengthscale = lengthscale
+
+    def __call__(self, dists):
+        # note that lengthscale should be squared in the RBF to match the Gretton et al heuristic
+        if self.lengthscale is not None:
+            lengthscale = self.lengthscale
+        else:
+            lengthscale = dists[:100, :100].median()
+        return torch.exp((-1 / lengthscale ** 2) * dists ** 2)
+
+
+class ExpKernel(Kernel):
+    """Unnormalized exponential kernel"""
+
+    def __init__(self, lengthscale=None):
+        super().__init__()
+        self.lengthscale = lengthscale
+
+    def __call__(self, dists):
+        if self.lengthscale is not None:
+            lengthscale = self.lengthscale
+        else:
+            lengthscale = dists[:100, :100].median()
+        return torch.exp((-1 / lengthscale) * dists)
+
+
+class RationalQuadraticKernel(Kernel):
+    """Unnormalized rational quadratic kernel
+
+    k(|x-y|) = (1+|x-y|^2/(2 alpha lengthscale^2))^(-alpha)"""
+
+    def __init__(self, lengthscale=None, alpha=1.0):
+        super().__init__()
+        self.alpha = alpha
+        self.lengthscale = lengthscale
+
+    def __call__(self, dists):
+        if self.lengthscale is not None:
+            lengthscale = self.lengthscale
+        else:
+            lengthscale = dists[:100, :100].median()
+        return torch.pow(
+            1 + (1 / (2 * self.alpha * lengthscale ** 2)) * dists ** 2, -self.alpha
+        )
+
+
+def kernel_mmd(x, y, n_perm=1000, kernel=GaussianKernel()):
     """Implements the kernel MMD two-sample test.
 
     It is modelled after the kernel MMD paper and code:
     A. Gretton et al.: A kernel two-sample test, JMLR 13 (2012)
     http://www.gatsby.ucl.ac.uk/~gretton/mmd/mmd.htm
-    
+
     The arguments `x` and `y` should be two-dimensional tensors.
     The first is the batch dimension (which may differ), the second
     the features (which must be the same on both `x` and `y`).
@@ -24,8 +80,7 @@ def kernel_mmd(x, y, n_perm=1000):
     xy = torch.cat([x.detach(), y.detach()], dim=0)
     dists = torch.cdist(xy, xy, p=2.0)
     # we are a bit sloppy here as we just keep the diagonal and everything twice
-    # note that sigma should be squared in the RBF to match the Gretton et al heuristic
-    k = torch.exp((-1 / dists[:100, :100].median() ** 2) * dists ** 2)
+    k = kernel(dists)
     k_x = k[:n, :n]
     k_y = k[n:, n:]
     k_xy = k[:n, n:]
@@ -55,34 +110,46 @@ def kernel_mmd(x, y, n_perm=1000):
         mmd_0s.append(mmd_0)
         count = count + (mmd_0 > mmd)
     # pyplot.hist(torch.stack(mmd_0s, dim=0).tolist(), bins=50)
-    p_val = count / n_perm
+    # true_divide: kept for torch 1.6 compatibility; replace with "/" after October 2021
+    p_val = torch.true_divide(count, n_perm)
+
     return mmd, p_val
 
 
 class KernelMMDDriftDetector(Detector):
     """Drift detector based on the kernel Maximum Mean Discrepancy (MMD) test.
 
-This is modelled after the MMD drift detection in
-S. Rabanser et al: *Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift* (NeurIPS), 2019.
+    This is modelled after the MMD drift detection in
+    S. Rabanser et al: *Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift* (NeurIPS), 2019.
 
-Note that our heuristic choice of the kernel bandwith is more closely aligned with that of the original MMD paper and code than S. Rabanser's.
+    Note that our heuristic choice of the kernel bandwidth is more closely aligned with that of the original MMD paper and code than S. Rabanser's.
     """
-    
-    def __init__(self, *, return_p_value=False, n_perm: int = 1000):
+
+    def __init__(
+        self, *, return_p_value=False, n_perm: int = 1000, kernel=GaussianKernel()
+    ):
         super().__init__(return_p_value=return_p_value)
         self.n_perm = n_perm
+        self.kernel = kernel
 
-    def predict_shift_from_features(self, base_outputs: torch.Tensor, outputs: torch.Tensor, compute_score: bool, compute_p_value: bool, individual_samples: bool = False):
+    def predict_shift_from_features(
+        self,
+        base_outputs: torch.Tensor,
+        outputs: torch.Tensor,
+        compute_score: bool,
+        compute_p_value: bool,
+        individual_samples: bool = False,
+    ):
         assert (
             not individual_samples
         ), "Individual samples not supported by MMD detector"
         if not compute_p_value:
             ood_score = kernel_mmd(
-                outputs, base_outputs, n_perm=None
+                outputs, base_outputs, n_perm=None, kernel=self.kernel
             )
             p_value = None
         else:
             ood_score, p_value = kernel_mmd(
-                outputs, base_outputs, n_perm=self.n_perm
+                outputs, base_outputs, n_perm=self.n_perm, kernel=self.kernel
             )
         return ood_score, p_value
diff --git a/torchdrift/reducers/pca.py b/torchdrift/reducers/pca.py
@@ -1,29 +1,31 @@
 import torch
 from . import Reducer
 
+
 class PCAReducer(Reducer):
     """Reduce dimensions using PCA.
 
-This nn.Modue subclass reduces the dimensions of the inputs
-specified by `n_components`.
+    This nn.Module subclass reduces the dimensions of the inputs
+    to the number specified by `n_components`.
 
-The input is a 2-dimensional `Tensor` of size `batch` x `features`,
-the output is a `Tensor` of size `batch` x `n_components`.
+    The input is a 2-dimensional `Tensor` of size `batch` x `features`,
+    the output is a `Tensor` of size `batch` x `n_components`.
     """
-    def __init__(self, n_components:int = 2):
+
+    def __init__(self, n_components: int = 2):
         super().__init__()
         self.n_components = n_components
 
     def extra_repr(self) -> str:
-        return f'n_components={self.n_components}'
+        return f"n_components={self.n_components}"
 
     def fit(self, x: torch.Tensor) -> torch.Tensor:
         batch, feat = x.shape
         assert min(batch, feat) >= self.n_components
         self.mean = x.mean(0, keepdim=True)
         x = x - self.mean
         u, s, v = x.svd()
-        self.comp = v[:, :self.n_components]
+        self.comp = v[:, : self.n_components]
         reduced = x @ self.comp
         return reduced
 
diff --git a/torchdrift/reducers/reducer.py b/torchdrift/reducers/reducer.py
diff --git a/torchdrift/utils/experiments.py b/torchdrift/utils/experiments.py
diff --git a/torchdrift/utils/fit.py b/torchdrift/utils/fit.py