bms

BMSRegressor

Bases: BaseEstimator, RegressorMixin

Bayesian Machine Scientist.

BMS finds an optimal function to explain a dataset, given a set of variables, and a pre-defined number of parameters.

This class is intended to be compatible with the Scikit-Learn Estimator API.

Examples:

>>> from autora.theorist.bms import Parallel, utils
>>> import numpy as np
>>> num_samples = 1000
>>> X = np.linspace(start=0, stop=1, num=num_samples).reshape(-1, 1)
>>> y = 15. * np.ones(num_samples)
>>> estimator = BMSRegressor()
>>> estimator = estimator.fit(X, y)
>>> estimator.predict([[15.]])
array([[15.]])


Attributes:

Name Type Description
pms Parallel

the bayesian (parallel) machine scientist model

model_ Tree

represents the best-fit model

loss_ float

represents loss associated with best-fit model

cache_ List

record of loss_ over model fitting epochs

Source code in autora/skl/bms.py
class BMSRegressor(BaseEstimator, RegressorMixin):
    """
    Bayesian Machine Scientist.

    BMS finds an optimal function to explain a dataset, given a set of variables,
    and a pre-defined number of parameters.

    This class is intended to be compatible with the
    [Scikit-Learn Estimator API](https://scikit-learn.org/stable/developers/develop.html).

    Examples:
        >>> from autora.theorist.bms import Parallel, utils
        >>> import numpy as np
        >>> num_samples = 1000
        >>> X = np.linspace(start=0, stop=1, num=num_samples).reshape(-1, 1)
        >>> y = 15. * np.ones(num_samples)
        >>> estimator = BMSRegressor()
        >>> estimator = estimator.fit(X, y)
        >>> estimator.predict([[15.]])
        array([[15.]])

    Attributes:
        pms: the bayesian (parallel) machine scientist model
        model_: represents the best-fit model
        loss_: represents loss associated with best-fit model
        cache_: record of loss_ over model fitting epochs
    """

    def __init__(
        self,
        prior_par: dict = PRIORS,
        ts: List[float] = TEMPERATURES,
        epochs: int = 1500,
    ):
        """
        Arguments:
            prior_par: a dictionary of the prior probabilities of different functions
                based on wikipedia data scraping
            ts: contains a list of the temperatures that the parallel ms works at
            epochs: number of epochs the search is run for when `fit` is called
        """
        self.ts = ts
        self.prior_par = prior_par
        self.epochs = epochs
        self.pms: Parallel = Parallel(Ts=ts)
        # Fitted state below; placeholders are overwritten by `fit`.
        self.X_: Optional[np.ndarray] = None
        self.y_: Optional[np.ndarray] = None
        self.model_: Tree = Tree()
        self.models_: List[Tree] = [Tree()]
        self.loss_: float = np.inf
        self.cache_: List = []
        self.variables: List = []

    def fit(self, X: np.ndarray, y: np.ndarray, num_param: int = 1) -> BMSRegressor:
        """
        Runs the optimization for a given set of `X`s and `y`s.

        Arguments:
            X: independent variables in an n-dimensional array
            y: dependent variables in an n-dimensional array
            num_param: number of parameters

        Returns:
            self (BMS): the fitted estimator
        """
        # firstly, store the column names of X since checking will
        # cast the type of X to np.ndarray
        if hasattr(X, "columns"):
            self.variables = list(X.columns)
        else:
            # create variables X0 ... X{n-1} where n is the number of columns in X
            self.variables = ["X%d" % i for i in range(X.shape[1])]
        X, y = check_X_y(X, y)

        # cast X into pd.Pandas again to fit the need in mcmc.py
        X = pd.DataFrame(X, columns=self.variables)
        y = pd.Series(y)
        _logger.info("BMS fitting started")
        self.pms = Parallel(
            Ts=self.ts,
            variables=self.variables,
            parameters=["a%d" % i for i in range(num_param)],
            x=X,
            y=y,
            prior_par=self.prior_par,
        )
        self.model_, self.loss_, self.cache_ = utils.run(self.pms, self.epochs)
        # Keep the full population of trees (one per temperature), not just the best.
        self.models_ = list(self.pms.trees.values())

        _logger.info("BMS fitting finished")
        self.X_, self.y_ = X, y
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Applies the fitted model to a set of independent variables X,
        to give predictions for the dependent variable y.

        Arguments:
            X: independent variables in an n-dimensional array

        Returns:
            y: predicted dependent variable values
        """
        # this validation step will cast X into np.ndarray format
        X = check_array(X)

        check_is_fitted(self, attributes=["model_"])

        assert self.model_ is not None
        # we need to cast it back into pd.DataFrame with the original
        # column names (generated in fit).
        # in the future, we might need to look into mcmc.py to remove
        # these redundant type castings.
        X = pd.DataFrame(X, columns=self.variables)

        return np.expand_dims(self.model_.predict(X).to_numpy(), axis=1)

    def present_results(self):
        """
        Prints out the best equation, its description length,
        along with a plot of how this has progressed over the course of the search tasks.
        """
        check_is_fitted(self, attributes=["model_", "loss_", "cache_"])

        assert self.model_ is not None
        assert self.loss_ is not None
        assert self.cache_ is not None

        utils.present_results(self.model_, self.loss_, self.cache_)

__init__(prior_par=PRIORS, ts=TEMPERATURES, epochs=1500)

Parameters:

Name Type Description Default
prior_par dict

a dictionary of the prior probabilities of different functions based on wikipedia data scraping

PRIORS
ts List[float]

contains a list of the temperatures that the parallel ms works at

TEMPERATURES
epochs int

the number of epochs to run the optimization for when fit is called

1500
Source code in autora/skl/bms.py
def __init__(
    self,
    prior_par: dict = PRIORS,
    ts: List[float] = TEMPERATURES,
    epochs: int = 1500,
):
    """
    Arguments:
        prior_par: a dictionary of the prior probabilities of different functions
            based on wikipedia data scraping
        ts: contains a list of the temperatures that the parallel ms works at
        epochs: number of epochs the search is run for when `fit` is called
    """
    self.ts = ts
    self.prior_par = prior_par
    self.epochs = epochs
    self.pms: Parallel = Parallel(Ts=ts)
    # Fitted state below; placeholders are overwritten by `fit`.
    self.X_: Optional[np.ndarray] = None
    self.y_: Optional[np.ndarray] = None
    self.model_: Tree = Tree()
    self.models_: List[Tree] = [Tree()]
    self.loss_: float = np.inf
    self.cache_: List = []
    self.variables: List = []

fit(X, y, num_param=1)

Runs the optimization for a given set of Xs and ys.

Parameters:

Name Type Description Default
X np.ndarray

independent variables in an n-dimensional array

required
y np.ndarray

dependent variables in an n-dimensional array

required
num_param int

number of parameters

1

Returns:

Name Type Description
self BMS

the fitted estimator

Source code in autora/skl/bms.py
  78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 def fit(self, X: np.ndarray, y: np.ndarray, num_param: int = 1) -> BMSRegressor: """ Runs the optimization for a given set of Xs and ys. Arguments: X: independent variables in an n-dimensional array y: dependent variables in an n-dimensional array num_param: number of parameters Returns: self (BMS): the fitted estimator """ # firstly, store the column names of X since checking will # cast the type of X to np.ndarray if hasattr(X, "columns"): self.variables = list(X.columns) else: # create variables X_1 to X_n where n is the number of columns in X self.variables = ["X%d" % i for i in range(X.shape[1])] X, y = check_X_y(X, y) # cast X into pd.Pandas again to fit the need in mcmc.py X = pd.DataFrame(X, columns=self.variables) y = pd.Series(y) _logger.info("BMS fitting started") self.pms = Parallel( Ts=self.ts, variables=self.variables, parameters=["a%d" % i for i in range(num_param)], x=X, y=y, prior_par=self.prior_par, ) self.model_, self.loss_, self.cache_ = utils.run(self.pms, self.epochs) self.models_ = list(self.pms.trees.values()) _logger.info("BMS fitting finished") self.X_, self.y_ = X, y return self 

predict(X)

Applies the fitted model to a set of independent variables X, to give predictions for the dependent variable y.

Parameters:

Name Type Description Default
X np.ndarray

independent variables in an n-dimensional array

required

Returns:

Name Type Description
y np.ndarray

predicted dependent variable values

Source code in autora/skl/bms.py
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 def predict(self, X: np.ndarray) -> np.ndarray: """ Applies the fitted model to a set of independent variables X, to give predictions for the dependent variable y. Arguments: X: independent variables in an n-dimensional array Returns: y: predicted dependent variable values """ # this validation step will cast X into np.ndarray format X = check_array(X) check_is_fitted(self, attributes=["model_"]) assert self.model_ is not None # we need to cast it back into pd.DataFrame with the original # column names (generated in fit). # in the future, we might need to look into mcmc.py to remove # these redundant type castings. X = pd.DataFrame(X, columns=self.variables) return np.expand_dims(self.model_.predict(X).to_numpy(), axis=1) 

present_results()

Prints out the best equation, its description length, along with a plot of how this has progressed over the course of the search tasks.

Source code in autora/skl/bms.py
 145 146 147 148 149 150 151 152 153 154 155 def present_results(self): """ Prints out the best equation, its description length, along with a plot of how this has progressed over the course of the search tasks. """ check_is_fitted(self, attributes=["model_", "loss_", "cache_"]) assert self.model_ is not None assert self.loss_ is not None assert self.cache_ is not None utils.present_results(self.model_, self.loss_, self.cache_)