Commit 5750d1a
Parent: fdb138f
Add denoising operation with test

Files changed:
- pysr/sr.py   +37 -0
- test/test.py +17 -0
pysr/sr.py CHANGED

@@ -130,6 +130,8 @@ def pysr(
     optimizer_iterations=10,
     tournament_selection_n=10,
     tournament_selection_p=1.0,
+    denoise=False,
+    Xresampled=None,
 ):
     """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
     Note: most default parameters have been tuned over several example
@@ -244,6 +246,8 @@ def pysr(
     :type tournament_selection_n: int
     :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
     :type tournament_selection_p: float
+    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+    :type denoise: bool
     :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
     :type: pd.DataFrame/list
     """
@@ -327,6 +331,24 @@ def pysr(
     else:
         raise NotImplementedError("y shape not supported!")

+    if denoise:
+        if weights is not None:
+            raise NotImplementedError(
+                "No weights for denoising - the weights are learned."
+            )
+        if Xresampled is not None and selection is not None:
+            # Select among only the selected features:
+            Xresampled = Xresampled[:, selection]
+        if multioutput:
+            y = np.stack(
+                [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
+                axis=1,
+            )
+            if Xresampled is not None:
+                X = Xresampled
+        else:
+            X, y = _denoise(X, y, Xresampled=Xresampled)
+
     kwargs = dict(
         X=X,
         y=y,
@@ -387,6 +409,7 @@ def pysr(
         nout=nout,
         tournament_selection_n=tournament_selection_n,
         tournament_selection_p=tournament_selection_p,
+        denoise=denoise,
     )

     kwargs = {**_set_paths(tempdir), **kwargs}
@@ -1082,6 +1105,20 @@ def _yesno(question):
    return False


+def _denoise(X, y, Xresampled=None):
+    """Denoise the dataset using a Gaussian process"""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50)
+    gpr.fit(X, y)
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
 class CallableEquation(object):
     """Simple wrapper for numpy lambda functions built with sympy"""

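For context on the new helper: _denoise fits a Gaussian process to (X, y) and hands the GP's posterior mean to the search in place of the raw targets; if Xresampled is supplied, the posterior mean is instead evaluated at those points and they replace X. Below is a minimal standalone sketch of that idea, not the library's own code: the data is made up, and n_restarts_optimizer is set lower than the 50 used in the diff just to keep the sketch fast.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

# Hypothetical noisy dataset standing in for the (X, y) a user would pass to pysr().
rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(200, 2))
y = X[:, 0] ** 2 + 0.1 * rng.randn(200)

# Same kernel family as _denoise in the diff: an anisotropic RBF (one length
# scale per feature) plus a white-noise term and a constant offset.
kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5)
gpr.fit(X, y)

# The GP posterior mean is the "denoised" target that would replace the raw y
# before the symbolic regression search runs.
y_denoised = gpr.predict(X)
print(np.mean((y_denoised - X[:, 0] ** 2) ** 2))  # typically well below the 0.01 noise variance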
test/test.py CHANGED

@@ -82,6 +82,23 @@ class TestPipeline(unittest.TestCase):

         self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

+    def test_noisy(self):
+
+        np.random.seed(1)
+        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0]) * 0.05
+        equations = pysr(
+            self.X,
+            y,
+            unary_operators=["sq(x) = x^2"],
+            binary_operators=["plus"],
+            extra_sympy_mappings={"sq": lambda x: x ** 2},
+            **self.default_test_kwargs,
+            procs=0,
+            denoise=True,
+        )
+        self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-4)
+        self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-4)
+

 class TestBest(unittest.TestCase):
     def setUp(self):