Spaces:
Running
Running
Commit
·
9351408
1
Parent(s):
c41cf33
Change "best" model_selection to apply loss threshold
Browse files- pysr/sr.py +40 -23
pysr/sr.py
CHANGED
|
@@ -205,10 +205,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 205 |
Parameters
|
| 206 |
----------
|
| 207 |
model_selection : str, default="best"
|
| 208 |
-
Model selection criterion. Can be 'accuracy' or 'best'.
|
| 209 |
-
`"accuracy"` selects the candidate model with the lowest loss
|
| 210 |
-
|
| 211 |
-
the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
| 214 |
List of strings giving the binary operators in Julia's Base.
|
|
@@ -469,7 +475,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 469 |
Whether to use a progress bar instead of printing to stdout.
|
| 470 |
|
| 471 |
equation_file : str, default=None
|
| 472 |
-
Where to save the files (
|
| 473 |
|
| 474 |
temp_equation_file : bool, default=False
|
| 475 |
Whether to put the hall of fame file in the temp directory.
|
|
@@ -943,12 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 943 |
|
| 944 |
for i, equations in enumerate(all_equations):
|
| 945 |
selected = ["" for _ in range(len(equations))]
|
| 946 |
-
|
| 947 |
-
chosen_row = -1
|
| 948 |
-
elif self.model_selection == "best":
|
| 949 |
-
chosen_row = equations["score"].idxmax()
|
| 950 |
-
else:
|
| 951 |
-
raise NotImplementedError
|
| 952 |
selected[chosen_row] = ">>>>"
|
| 953 |
repr_equations = pd.DataFrame(
|
| 954 |
dict(
|
|
@@ -1091,18 +1092,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1091 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1092 |
return self.equations_.iloc[index]
|
| 1093 |
|
| 1094 |
-
if self.
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
else:
|
| 1103 |
-
raise NotImplementedError(
|
| 1104 |
-
f"{self.model_selection} is not a valid model selection strategy."
|
| 1105 |
-
)
|
| 1106 |
|
| 1107 |
def _setup_equation_file(self):
|
| 1108 |
"""
|
|
@@ -2149,6 +2146,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2149 |
return ret_outputs[0]
|
| 2150 |
|
| 2151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2152 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
| 2153 |
"""Denoise the dataset using a Gaussian process"""
|
| 2154 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
|
| 205 |
Parameters
|
| 206 |
----------
|
| 207 |
model_selection : str, default="best"
|
| 208 |
+
Model selection criterion. Can be 'accuracy', 'best', or 'score'.
|
| 209 |
+
- `"accuracy"` selects the candidate model with the lowest loss
|
| 210 |
+
(highest accuracy).
|
| 211 |
+
- `"score"` selects the candidate model with the highest score.
|
| 212 |
+
Score is defined as the derivative of the log-loss with
|
| 213 |
+
respect to complexity - if an expression has a much better
|
| 214 |
+
loss at a slightly higher complexity, it is preferred.
|
| 215 |
+
- `"best"` selects the candidate model with the highest score
|
| 216 |
+
among expressions whose loss is within 1.5x that of the
|
| 217 |
+
most accurate model.
|
| 218 |
|
| 219 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
| 220 |
List of strings giving the binary operators in Julia's Base.
|
|
|
|
| 475 |
Whether to use a progress bar instead of printing to stdout.
|
| 476 |
|
| 477 |
equation_file : str, default=None
|
| 478 |
+
Where to save the files (.csv extension).
|
| 479 |
|
| 480 |
temp_equation_file : bool, default=False
|
| 481 |
Whether to put the hall of fame file in the temp directory.
|
|
|
|
| 949 |
|
| 950 |
for i, equations in enumerate(all_equations):
|
| 951 |
selected = ["" for _ in range(len(equations))]
|
| 952 |
+
chosen_row = idx_model_selection(equations, self.model_selection)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 953 |
selected[chosen_row] = ">>>>"
|
| 954 |
repr_equations = pd.DataFrame(
|
| 955 |
dict(
|
|
|
|
| 1092 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1093 |
return self.equations_.iloc[index]
|
| 1094 |
|
| 1095 |
+
if isinstance(self.equations_, list):
|
| 1096 |
+
return [
|
| 1097 |
+
eq.iloc[idx_model_selection(eq, self.model_selection)]
|
| 1098 |
+
for eq in self.equations_
|
| 1099 |
+
]
|
| 1100 |
+
return self.equations_.iloc[
|
| 1101 |
+
idx_model_selection(self.equations_, self.model_selection)
|
| 1102 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1103 |
|
| 1104 |
def _setup_equation_file(self):
|
| 1105 |
"""
|
|
|
|
| 2146 |
return ret_outputs[0]
|
| 2147 |
|
| 2148 |
|
| 2149 |
+
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
    """
    Return the index of the selected expression, given a dataframe of
    equations and a model selection.

    Parameters
    ----------
    equations : pd.DataFrame
        Candidate expressions. Must contain a ``"loss"`` column, and a
        ``"score"`` column when `model_selection` is ``"best"`` or
        ``"score"``.
    model_selection : str
        Selection strategy:

        - ``"accuracy"``: the row with the lowest loss.
        - ``"score"``: the row with the highest score.
        - ``"best"``: the row with the highest score among rows whose
          loss is at most 1.5x the lowest loss.

    Returns
    -------
    int
        Index label (as returned by ``idxmin`` / ``idxmax``) of the
        selected row in `equations`.

    Raises
    ------
    NotImplementedError
        If `model_selection` is not one of the strategies above.
    """
    if model_selection == "accuracy":
        return equations["loss"].idxmin()

    if model_selection == "score":
        return equations["score"].idxmax()

    if model_selection == "best":
        losses = equations["loss"]
        threshold = 1.5 * losses.min()
        # Inclusive comparison via a boolean mask: a strict `<` excludes
        # every row when the lowest loss is exactly 0 (threshold == 0), and
        # a string-built query cannot express an inf/nan threshold.
        candidates = equations.loc[losses <= threshold]
        if candidates.empty:
            # A negative lowest loss puts 1.5x the minimum *below* the
            # minimum itself, so nothing passes the filter; fall back to
            # the most accurate expression instead of failing on an
            # empty selection.
            return losses.idxmin()
        return candidates["score"].idxmax()

    raise NotImplementedError(
        f"{model_selection} is not a valid model selection strategy."
    )
|
| 2167 |
+
|
| 2168 |
+
|
| 2169 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
| 2170 |
"""Denoise the dataset using a Gaussian process"""
|
| 2171 |
from sklearn.gaussian_process import GaussianProcessRegressor
|