Spaces:
Running
Running
Merge pull request #177 from MilesCranmer/improved-model-selection
Browse files
Change "best" model_selection to include a loss threshold
- pysr/sr.py +42 -23
- test/test.py +15 -0
pysr/sr.py
CHANGED
|
@@ -205,10 +205,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 205 |
Parameters
|
| 206 |
----------
|
| 207 |
model_selection : str, default="best"
|
| 208 |
-
Model selection criterion
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
| 214 |
List of strings giving the binary operators in Julia's Base.
|
|
@@ -469,7 +477,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 469 |
Whether to use a progress bar instead of printing to stdout.
|
| 470 |
|
| 471 |
equation_file : str, default=None
|
| 472 |
-
Where to save the files (
|
| 473 |
|
| 474 |
temp_equation_file : bool, default=False
|
| 475 |
Whether to put the hall of fame file in the temp directory.
|
|
@@ -943,12 +951,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 943 |
|
| 944 |
for i, equations in enumerate(all_equations):
|
| 945 |
selected = ["" for _ in range(len(equations))]
|
| 946 |
-
|
| 947 |
-
chosen_row = -1
|
| 948 |
-
elif self.model_selection == "best":
|
| 949 |
-
chosen_row = equations["score"].idxmax()
|
| 950 |
-
else:
|
| 951 |
-
raise NotImplementedError
|
| 952 |
selected[chosen_row] = ">>>>"
|
| 953 |
repr_equations = pd.DataFrame(
|
| 954 |
dict(
|
|
@@ -1091,18 +1094,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1091 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1092 |
return self.equations_.iloc[index]
|
| 1093 |
|
| 1094 |
-
if self.
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
else:
|
| 1103 |
-
raise NotImplementedError(
|
| 1104 |
-
f"{self.model_selection} is not a valid model selection strategy."
|
| 1105 |
-
)
|
| 1106 |
|
| 1107 |
def _setup_equation_file(self):
|
| 1108 |
"""
|
|
@@ -2149,6 +2148,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2149 |
return ret_outputs[0]
|
| 2150 |
|
| 2151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2152 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
| 2153 |
"""Denoise the dataset using a Gaussian process"""
|
| 2154 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
|
| 205 |
Parameters
|
| 206 |
----------
|
| 207 |
model_selection : str, default="best"
|
| 208 |
+
Model selection criterion when selecting a final expression from
|
| 209 |
+
the list of best expressions at each complexity.
|
| 210 |
+
Can be 'accuracy', 'best', or 'score'.
|
| 211 |
+
- `"accuracy"` selects the candidate model with the lowest loss
|
| 212 |
+
(highest accuracy).
|
| 213 |
+
- `"score"` selects the candidate model with the highest score.
|
| 214 |
+
Score is defined as the negated derivative of the log-loss with
|
| 215 |
+
respect to complexity - if an expression has a much better
|
| 216 |
+
loss at a slightly higher complexity, it is preferred.
|
| 217 |
+
- `"best"` selects the candidate model with the highest score
|
| 218 |
+
among expressions whose loss is at most 1.5x the loss of the
|
| 219 |
+
most accurate model.
|
| 220 |
|
| 221 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
| 222 |
List of strings giving the binary operators in Julia's Base.
|
|
|
|
| 477 |
Whether to use a progress bar instead of printing to stdout.
|
| 478 |
|
| 479 |
equation_file : str, default=None
|
| 480 |
+
Where to save the files (.csv extension).
|
| 481 |
|
| 482 |
temp_equation_file : bool, default=False
|
| 483 |
Whether to put the hall of fame file in the temp directory.
|
|
|
|
| 951 |
|
| 952 |
for i, equations in enumerate(all_equations):
|
| 953 |
selected = ["" for _ in range(len(equations))]
|
| 954 |
+
chosen_row = idx_model_selection(equations, self.model_selection)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
selected[chosen_row] = ">>>>"
|
| 956 |
repr_equations = pd.DataFrame(
|
| 957 |
dict(
|
|
|
|
| 1094 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1095 |
return self.equations_.iloc[index]
|
| 1096 |
|
| 1097 |
+
if isinstance(self.equations_, list):
|
| 1098 |
+
return [
|
| 1099 |
+
eq.iloc[idx_model_selection(eq, self.model_selection)]
|
| 1100 |
+
for eq in self.equations_
|
| 1101 |
+
]
|
| 1102 |
+
return self.equations_.iloc[
|
| 1103 |
+
idx_model_selection(self.equations_, self.model_selection)
|
| 1104 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1105 |
|
| 1106 |
def _setup_equation_file(self):
|
| 1107 |
"""
|
|
|
|
| 2148 |
return ret_outputs[0]
|
| 2149 |
|
| 2150 |
|
| 2151 |
+
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
    """
    Return the index of the selected expression, given a dataframe of
    equations and a model selection.

    Parameters
    ----------
    equations : pd.DataFrame
        Hall-of-fame table with (at least) ``loss`` and ``score`` columns,
        one row per complexity level.
    model_selection : str
        One of ``"accuracy"`` (lowest loss), ``"score"`` (highest score),
        or ``"best"`` (highest score among rows whose loss is within 1.5x
        of the minimum loss).

    Returns
    -------
    int
        The index label of the chosen row.

    Raises
    ------
    NotImplementedError
        If ``model_selection`` is not one of the recognized strategies.
    """
    if model_selection == "accuracy":
        chosen_idx = equations["loss"].idxmin()
    elif model_selection == "best":
        # Restrict to expressions whose loss is within 1.5x of the best loss,
        # then take the highest score among them. Boolean-mask indexing is
        # used instead of ``equations.query(f"loss <= {threshold}")``: string
        # interpolation of a float is fragile (e.g., a NaN/inf threshold
        # produces an unparseable or wrong expression) and needlessly routes
        # a simple comparison through the query parser.
        threshold = 1.5 * equations["loss"].min()
        filtered_equations = equations[equations["loss"] <= threshold]
        chosen_idx = filtered_equations["score"].idxmax()
    elif model_selection == "score":
        chosen_idx = equations["score"].idxmax()
    else:
        raise NotImplementedError(
            f"{model_selection} is not a valid model selection strategy."
        )
    return chosen_idx
|
| 2169 |
+
|
| 2170 |
+
|
| 2171 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
| 2172 |
"""Denoise the dataset using a Gaussian process"""
|
| 2173 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
test/test.py
CHANGED
|
@@ -9,6 +9,7 @@ from pysr.sr import (
|
|
| 9 |
run_feature_selection,
|
| 10 |
_handle_feature_selection,
|
| 11 |
_csv_filename_to_pkl_filename,
|
|
|
|
| 12 |
)
|
| 13 |
from sklearn.utils.estimator_checks import check_estimator
|
| 14 |
import sympy
|
|
@@ -403,6 +404,20 @@ class TestBest(unittest.TestCase):
|
|
| 403 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
| 404 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
class TestFeatureSelection(unittest.TestCase):
|
| 408 |
def setUp(self):
|
|
|
|
| 9 |
run_feature_selection,
|
| 10 |
_handle_feature_selection,
|
| 11 |
_csv_filename_to_pkl_filename,
|
| 12 |
+
idx_model_selection,
|
| 13 |
)
|
| 14 |
from sklearn.utils.estimator_checks import check_estimator
|
| 15 |
import sympy
|
|
|
|
| 404 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
| 405 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
| 406 |
|
| 407 |
+
def test_all_selection_strategies(self):
    """Each model_selection strategy should pick its expected row index."""
    equations = pd.DataFrame(
        dict(
            loss=[1.0, 0.1, 0.01, 0.001 * 1.4, 0.001],
            score=[0.5, 1.0, 0.5, 0.5, 0.3],
        )
    )
    # strategy -> expected chosen row (row 4 has the lowest loss; row 1 the
    # highest score; row 3 wins "best" as top score within 1.5x of min loss).
    expected_indices = {"accuracy": 4, "best": 3, "score": 1}
    for strategy, expected_idx in expected_indices.items():
        chosen = idx_model_selection(equations, strategy)
        self.assertEqual(chosen, expected_idx)
|
| 420 |
+
|
| 421 |
|
| 422 |
class TestFeatureSelection(unittest.TestCase):
|
| 423 |
def setUp(self):
|