Commit 5750d1a
Parent: fdb138f
Add denoising operation with test

Files changed:
- pysr/sr.py   +37 -0
- test/test.py +17 -0
pysr/sr.py CHANGED

@@ -130,6 +130,8 @@ def pysr(
     optimizer_iterations=10,
     tournament_selection_n=10,
     tournament_selection_p=1.0,
+    denoise=False,
+    Xresampled=None,
 ):
     """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
     Note: most default parameters have been tuned over several example
@@ -244,6 +246,8 @@ def pysr(
     :type tournament_selection_n: int
     :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
     :type tournament_selection_p: float
+    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+    :type denoise: bool
     :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
     :type: pd.DataFrame/list
     """
@@ -327,6 +331,24 @@ def pysr(
     else:
         raise NotImplementedError("y shape not supported!")

+    if denoise:
+        if weights is not None:
+            raise NotImplementedError(
+                "No weights for denoising - the weights are learned."
+            )
+        if Xresampled is not None and selection is not None:
+            # Select among only the selected features:
+            Xresampled = Xresampled[:, selection]
+        if multioutput:
+            y = np.stack(
+                [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
+                axis=1,
+            )
+            if Xresampled is not None:
+                X = Xresampled
+        else:
+            X, y = _denoise(X, y, Xresampled=Xresampled)
+
     kwargs = dict(
         X=X,
         y=y,
@@ -387,6 +409,7 @@ def pysr(
         nout=nout,
         tournament_selection_n=tournament_selection_n,
         tournament_selection_p=tournament_selection_p,
+        denoise=denoise,
     )

     kwargs = {**_set_paths(tempdir), **kwargs}
@@ -1082,6 +1105,20 @@ def _yesno(question):
    return False


+def _denoise(X, y, Xresampled=None):
+    """Denoise the dataset using a Gaussian process"""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50)
+    gpr.fit(X, y)
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
 class CallableEquation(object):
     """Simple wrapper for numpy lambda functions built with sympy"""

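For context on the new helper: _denoise fits a Gaussian process to (X, y) and hands the GP's posterior mean to the search in place of the raw targets; if Xresampled is supplied, the posterior mean is instead evaluated at those points and they replace X. Below is a minimal standalone sketch of that idea, not the library's own code: the data is made up, and n_restarts_optimizer is set lower than the 50 used in the diff just to keep the sketch fast.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

# Hypothetical noisy dataset standing in for the (X, y) a user would pass to pysr().
rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(200, 2))
y = X[:, 0] ** 2 + 0.1 * rng.randn(200)

# Same kernel family as _denoise in the diff: an anisotropic RBF (one length
# scale per feature) plus a white-noise term and a constant offset.
kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5)
gpr.fit(X, y)

# The GP posterior mean is the "denoised" target that would replace the raw y
# before the symbolic regression search runs.
y_denoised = gpr.predict(X)
print(np.mean((y_denoised - X[:, 0] ** 2) ** 2))  # typically well below the 0.01 noise variance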
test/test.py CHANGED

@@ -82,6 +82,23 @@ class TestPipeline(unittest.TestCase):

         self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

+    def test_noisy(self):
+
+        np.random.seed(1)
+        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0]) * 0.05
+        equations = pysr(
+            self.X,
+            y,
+            unary_operators=["sq(x) = x^2"],
+            binary_operators=["plus"],
+            extra_sympy_mappings={"sq": lambda x: x ** 2},
+            **self.default_test_kwargs,
+            procs=0,
+            denoise=True,
+        )
+        self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-4)
+        self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-4)
+

 class TestBest(unittest.TestCase):
     def setUp(self):