Spaces:
Running
Running
| from pysr import pysr, best_row | |
| from sklearn.base import BaseEstimator, RegressorMixin | |
| import inspect | |
| import pandas as pd | |
class PySRRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style estimator that wraps a call to ``pysr.pysr``.

    Keyword arguments given to the constructor (or to ``set_params``) are
    stored in ``self.params`` and forwarded to ``pysr.pysr`` when ``fit`` is
    called. The value returned by ``pysr.pysr`` is kept in ``self.equations``.
    """

    def __init__(self, model_selection="accuracy", **params):
        """Initialize settings for pysr.pysr call.

        :param model_selection: How to select a model. Can be 'accuracy' or 'best'. 'best' will optimize a combination of complexity and accuracy.
        :type model_selection: str
        """
        super().__init__()
        self.model_selection = model_selection
        self.params = params

        # Stored equations:
        self.equations = None

    def __repr__(self):
        """Return a printable table of the stored equations.

        The row that the current ``model_selection`` strategy picks is
        marked with ``>>>>`` in the ``pick`` column.

        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return "PySRRegressor.equations = None"

        equations = self.equations
        selected = [""] * len(equations)
        if self.model_selection == "accuracy":
            # 'accuracy' picks the last row of the table.
            chosen_row = -1
        elif self.model_selection == "best":
            # NOTE(review): idxmax() returns an index *label* which is used
            # as a list position below; this assumes the equations table
            # has a default 0..n-1 index - TODO confirm.
            chosen_row = equations["score"].idxmax()
        else:
            raise NotImplementedError
        selected[chosen_row] = ">>>>"

        repr_equations = pd.DataFrame(
            dict(
                pick=selected,
                score=equations["score"],
                Equation=equations["Equation"],
                MSE=equations["MSE"],
                Complexity=equations["Complexity"],
            )
        )
        return f"PySRRegressor.equations = [\n{repr(repr_equations)}\n]"

    def set_params(self, **params):
        """Set parameters for pysr.pysr call or model_selection strategy.

        ``model_selection`` is a setting of this wrapper and is stored only
        on ``self.model_selection``. It must not be written into
        ``self.params``, because ``fit`` forwards every entry of
        ``self.params`` to ``pysr.pysr`` as a keyword argument.

        :returns: ``self``, so calls can be chained.
        """
        for key, value in params.items():
            if key == "model_selection":
                self.model_selection = value
            else:
                self.params[key] = value

        return self

    def get_params(self, deep=True):
        """Return the stored pysr.pysr parameters plus ``model_selection``.

        :param deep: Accepted for scikit-learn API compatibility; ignored.
        :returns: a new dict of parameter names to values.
        """
        del deep
        return {**self.params, "model_selection": self.model_selection}

    def get_best(self):
        """Return the equation row chosen by the ``model_selection`` strategy.

        :returns: ``0.0`` if no equations are stored yet; otherwise the last
            row of the equations table for 'accuracy', or the result of
            ``best_row`` for 'best'.
        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return 0.0
        if self.model_selection == "accuracy":
            return self.equations.iloc[-1]
        elif self.model_selection == "best":
            return best_row(self.equations)
        else:
            raise NotImplementedError

    def fit(self, X, y, weights=None, variable_names=None):
        """Search for equations to fit the dataset.

        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
        :type X: np.ndarray/pandas.DataFrame
        :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
        :type y: np.ndarray
        :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
        :type weights: np.ndarray
        :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
        :type variable_names: list
        :returns: ``self``.
        """
        if variable_names is None:
            # Fall back to a value supplied through the constructor/set_params.
            variable_names = self.params.get("variable_names")

        # variable_names is passed explicitly, so drop it from the forwarded
        # keyword arguments to avoid supplying it twice.
        forwarded = {k: v for k, v in self.params.items() if k != "variable_names"}
        self.equations = pysr(
            X=X,
            y=y,
            weights=weights,
            variable_names=variable_names,
            **forwarded,
        )
        return self

    def predict(self, X):
        """Evaluate the selected equation on ``X``.

        Looks up the row returned by ``get_best`` and calls its
        ``lambda_format`` entry with ``X``.

        :raises sklearn.exceptions.NotFittedError: if ``fit`` has not stored
            any equations yet.
        """
        if self.equations is None:
            # get_best() returns the float 0.0 in this state, which cannot be
            # indexed; fail with an explicit, scikit-learn style error instead.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This PySRRegressor instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        equation_row = self.get_best()
        np_format = equation_row["lambda_format"]
        return np_format(X)
# Add the docs from pysr() to PySRRegressor():
# Copies the parameter documentation of pysr() (from the line documenting
# ``binary_operators`` up to, but excluding, the ``:returns:`` line) onto the
# end of PySRRegressor.__init__'s docstring.
_pysr_docstring_split = []
_start_recording = False
# inspect.getdoc() returns None when pysr has no docstring (for example when
# docstrings are stripped with ``python -OO``); treat that as empty text.
for line in (inspect.getdoc(pysr) or "").split("\n"):
    # Skip docs on "X" and "y"
    if ":param binary_operators:" in line:
        _start_recording = True
    if ":returns:" in line:
        _start_recording = False
    if _start_recording:
        _pysr_docstring_split.append(line)
_pysr_docstring = "\n\t".join(_pysr_docstring_split)
# __init__.__doc__ is None when docstrings are stripped; there is nothing to
# extend in that case, and ``None += str`` would raise at import time.
if PySRRegressor.__init__.__doc__ is not None:
    PySRRegressor.__init__.__doc__ += _pysr_docstring