Spaces:
Sleeping
Sleeping
Commit
·
ffd9cd1
1
Parent(s):
acce2c2
Add test for Xresampled as pd.DataFrame
Browse files
- pysr/sr.py +26 -15
- test/test.py +49 -2
pysr/sr.py
CHANGED
|
@@ -323,6 +323,8 @@ def pysr(
|
|
| 323 |
if len(X.shape) == 1:
|
| 324 |
X = X[:, None]
|
| 325 |
|
|
|
|
|
|
|
| 326 |
if len(variable_names) == 0:
|
| 327 |
variable_names = [f"x{i}" for i in range(X.shape[1])]
|
| 328 |
|
|
@@ -364,9 +366,7 @@ def pysr(
|
|
| 364 |
if maxsize < 7:
|
| 365 |
raise NotImplementedError("PySR requires a maxsize of at least 7")
|
| 366 |
|
| 367 |
-
X,
|
| 368 |
-
X, select_k_features, use_custom_variable_names, variable_names, y
|
| 369 |
-
)
|
| 370 |
|
| 371 |
if maxdepth is None:
|
| 372 |
maxdepth = maxsize
|
|
@@ -390,9 +390,18 @@ def pysr(
|
|
| 390 |
raise NotImplementedError(
|
| 391 |
"No weights for denoising - the weights are learned."
|
| 392 |
)
|
| 393 |
-
if Xresampled is not None
|
| 394 |
# Select among only the selected features:
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
if multioutput:
|
| 397 |
y = np.stack(
|
| 398 |
[_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
|
|
@@ -531,7 +540,7 @@ Tried to activate project {julia_project} but failed."""
|
|
| 531 |
Main.y,
|
| 532 |
weights=Main.weights,
|
| 533 |
niterations=int(niterations),
|
| 534 |
-
varMap=variable_names,
|
| 535 |
options=options,
|
| 536 |
numprocs=int(cprocs),
|
| 537 |
multithreading=bool(multithreading),
|
|
@@ -645,19 +654,15 @@ def _create_inline_operators(binary_operators, unary_operators):
|
|
| 645 |
op_list[i] = function_name
|
| 646 |
|
| 647 |
|
| 648 |
-
def _handle_feature_selection(
|
| 649 |
-
X, select_k_features, use_custom_variable_names, variable_names, y
|
| 650 |
-
):
|
| 651 |
if select_k_features is not None:
|
| 652 |
selection = run_feature_selection(X, y, select_k_features)
|
| 653 |
-
print(f"Using features {selection}")
|
| 654 |
X = X[:, selection]
|
| 655 |
|
| 656 |
-
if use_custom_variable_names:
|
| 657 |
-
variable_names = [variable_names[i] for i in selection]
|
| 658 |
else:
|
| 659 |
selection = None
|
| 660 |
-
return X,
|
| 661 |
|
| 662 |
|
| 663 |
def _check_assertions(
|
|
@@ -791,7 +796,9 @@ def get_hof(
|
|
| 791 |
sympy_format.append(eqn)
|
| 792 |
|
| 793 |
# Numpy:
|
| 794 |
-
lambda_format.append(
|
|
|
|
|
|
|
| 795 |
|
| 796 |
# JAX:
|
| 797 |
if output_jax_format:
|
|
@@ -942,16 +949,20 @@ def _denoise(X, y, Xresampled=None):
|
|
| 942 |
class CallableEquation:
|
| 943 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
| 944 |
|
| 945 |
-
def __init__(self, sympy_symbols, eqn, selection=None):
|
| 946 |
self._sympy = eqn
|
| 947 |
self._sympy_symbols = sympy_symbols
|
| 948 |
self._selection = selection
|
|
|
|
| 949 |
self._lambda = lambdify(sympy_symbols, eqn)
|
| 950 |
|
| 951 |
def __repr__(self):
|
| 952 |
return f"PySRFunction(X=>{self._sympy})"
|
| 953 |
|
| 954 |
def __call__(self, X):
|
|
|
|
|
|
|
|
|
|
| 955 |
if self._selection is not None:
|
| 956 |
return self._lambda(*X[:, self._selection].T)
|
| 957 |
return self._lambda(*X.T)
|
|
|
|
| 323 |
if len(X.shape) == 1:
|
| 324 |
X = X[:, None]
|
| 325 |
|
| 326 |
+
assert not isinstance(y, pd.DataFrame)
|
| 327 |
+
|
| 328 |
if len(variable_names) == 0:
|
| 329 |
variable_names = [f"x{i}" for i in range(X.shape[1])]
|
| 330 |
|
|
|
|
| 366 |
if maxsize < 7:
|
| 367 |
raise NotImplementedError("PySR requires a maxsize of at least 7")
|
| 368 |
|
| 369 |
+
X, selection = _handle_feature_selection(X, select_k_features, y, variable_names)
|
|
|
|
|
|
|
| 370 |
|
| 371 |
if maxdepth is None:
|
| 372 |
maxdepth = maxsize
|
|
|
|
| 390 |
raise NotImplementedError(
|
| 391 |
"No weights for denoising - the weights are learned."
|
| 392 |
)
|
| 393 |
+
if Xresampled is not None:
|
| 394 |
# Select among only the selected features:
|
| 395 |
+
if isinstance(Xresampled, pd.DataFrame):
|
| 396 |
+
# Handle Xresampled is pandas dataframe
|
| 397 |
+
if selection is not None:
|
| 398 |
+
Xresampled = Xresampled[[variable_names[i] for i in selection]]
|
| 399 |
+
else:
|
| 400 |
+
Xresampled = Xresampled[variable_names]
|
| 401 |
+
Xresampled = np.array(Xresampled)
|
| 402 |
+
else:
|
| 403 |
+
if selection is not None:
|
| 404 |
+
Xresampled = Xresampled[:, selection]
|
| 405 |
if multioutput:
|
| 406 |
y = np.stack(
|
| 407 |
[_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
|
|
|
|
| 540 |
Main.y,
|
| 541 |
weights=Main.weights,
|
| 542 |
niterations=int(niterations),
|
| 543 |
+
varMap=[variable_names[i] for i in selection],
|
| 544 |
options=options,
|
| 545 |
numprocs=int(cprocs),
|
| 546 |
multithreading=bool(multithreading),
|
|
|
|
| 654 |
op_list[i] = function_name
|
| 655 |
|
| 656 |
|
| 657 |
+
def _handle_feature_selection(X, select_k_features, y, variable_names):
    """Optionally restrict X to the columns picked by feature selection.

    When ``select_k_features`` is ``None`` nothing is selected and X is
    returned untouched together with ``None``. Otherwise the selection
    produced by ``run_feature_selection(X, y, select_k_features)`` is
    reported (by variable name) and used to index the columns of X.

    Returns:
        A ``(X, selection)`` pair; ``selection`` is ``None`` when no
        feature selection was requested.
    """
    if select_k_features is None:
        # No selection requested: hand X back unchanged.
        return X, None

    selection = run_feature_selection(X, y, select_k_features)
    chosen_names = [variable_names[i] for i in selection]
    print(f"Using features {chosen_names}")
    return X[:, selection], selection
|
| 666 |
|
| 667 |
|
| 668 |
def _check_assertions(
|
|
|
|
| 796 |
sympy_format.append(eqn)
|
| 797 |
|
| 798 |
# Numpy:
|
| 799 |
+
lambda_format.append(
|
| 800 |
+
CallableEquation(sympy_symbols, eqn, selection, variable_names)
|
| 801 |
+
)
|
| 802 |
|
| 803 |
# JAX:
|
| 804 |
if output_jax_format:
|
|
|
|
| 949 |
class CallableEquation:
    """Simple wrapper for numpy lambda functions built with sympy.

    The wrapped expression is compiled once with ``lambdify`` and can then
    be called on a 2-D array (one column per argument) or on a pandas
    DataFrame.

    Args:
        sympy_symbols: symbols passed to ``lambdify`` as the argument list.
        eqn: the sympy expression to compile.
        selection: optional column selection applied to the input before
            evaluation; ``None`` means every column is used.
        variable_names: optional column labels used to pick (and order)
            DataFrame columns before they are converted to an array.
    """

    def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None):
        self._sympy = eqn
        self._sympy_symbols = sympy_symbols
        self._selection = selection
        self._variable_names = variable_names
        self._lambda = lambdify(sympy_symbols, eqn)

    def __repr__(self):
        return f"PySRFunction(X=>{self._sympy})"

    def __call__(self, X):
        """Evaluate the compiled expression on X, one column per argument."""
        if isinstance(X, pd.DataFrame):
            # Reorder by name when labels are known. Without labels,
            # indexing with None would raise KeyError, so fall back to
            # the DataFrame's own column order instead.
            if self._variable_names is not None:
                X = X[self._variable_names]
            X = np.array(X)

        if self._selection is not None:
            X = X[:, self._selection]
        return self._lambda(*X.T)
|
test/test.py
CHANGED
|
@@ -98,8 +98,9 @@ class TestPipeline(unittest.TestCase):
|
|
| 98 |
equations = pysr(
|
| 99 |
self.X,
|
| 100 |
y,
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
| 104 |
**self.default_test_kwargs,
|
| 105 |
procs=0,
|
|
@@ -108,6 +109,52 @@ class TestPipeline(unittest.TestCase):
|
|
| 108 |
self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
|
| 109 |
self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
class TestBest(unittest.TestCase):
|
| 113 |
def setUp(self):
|
|
|
|
| 98 |
equations = pysr(
|
| 99 |
self.X,
|
| 100 |
y,
|
| 101 |
+
# Test that passing a single operator works:
|
| 102 |
+
unary_operators="sq(x) = x^2",
|
| 103 |
+
binary_operators="plus",
|
| 104 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
| 105 |
**self.default_test_kwargs,
|
| 106 |
procs=0,
|
|
|
|
| 109 |
self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
|
| 110 |
self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
|
| 111 |
|
| 112 |
+
def test_pandas_resample(self):
    """Fit with denoising when X and Xresampled are DataFrames.

    The resampled frame and the evaluation frame list their columns in a
    different order than the training frame, so the selected features
    must be matched by column name rather than by position.
    """
    np.random.seed(1)
    X = pd.DataFrame(
        {
            "T": np.random.randn(500),
            "x": np.random.randn(500),
            "unused_feature": np.random.randn(500),
        }
    )

    def true_fn(frame):
        return np.array(frame["T"] + frame["x"] ** 2 + 1.323837)

    y = true_fn(X)
    noise = np.random.randn(500) * 0.01
    y = y + noise

    # Resampled array is a different order of features:
    Xresampled = pd.DataFrame(
        {
            "unused_feature": np.random.randn(100),
            "x": np.random.randn(100),
            "T": np.random.randn(100),
        }
    )
    equations = pysr(
        X,
        y,
        unary_operators=[],
        binary_operators=["+", "*", "/", "-"],
        **self.default_test_kwargs,
        Xresampled=Xresampled,
        denoise=True,
        select_k_features=2,
    )

    # Render the best equation once and check which features it mentions.
    latex = best_tex()
    self.assertNotIn("unused_feature", latex)
    self.assertIn("T", latex)
    self.assertIn("x", latex)
    self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)

    fn = best_callable()
    self.assertListEqual(sorted(fn._selection), [0, 1])

    # Evaluation frame uses yet another column order.
    X2 = pd.DataFrame(
        {
            "T": np.random.randn(100),
            "unused_feature": np.random.randn(100),
            "x": np.random.randn(100),
        }
    )
    self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
|
| 157 |
+
|
| 158 |
|
| 159 |
class TestBest(unittest.TestCase):
|
| 160 |
def setUp(self):
|