autora.theorist.bms.regressor

BMSRegressor

Bases: BaseEstimator, RegressorMixin

Bayesian Machine Scientist.

BMS finds an optimal function to explain a dataset, given a set of variables and a pre-defined number of parameters.

This class is intended to be compatible with the Scikit-Learn Estimator API.

Examples:

>>> from autora.theorist.bms import BMSRegressor
>>> import numpy as np
>>> num_samples = 1000
>>> X = np.linspace(start=0, stop=1, num=num_samples).reshape(-1, 1)
>>> y = 15. * np.ones(num_samples)
>>> estimator = BMSRegressor()
>>> estimator = estimator.fit(X, y)
>>> estimator.predict([[15.]])
array([[15.]])

Attributes:

    pms (Parallel): the Bayesian (parallel) machine scientist model
    model_ (Tree): the best-fit model
    loss_ (float): the loss associated with the best-fit model
    cache_ (List): record of loss_ over the model-fitting epochs
    temp_ (float): temperature of model_
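
A brief usage sketch for these attributes, assuming `estimator` has been fitted as in the example above:

>>> tree = estimator.model_    # best-fit expression tree (a Tree)
>>> dl = estimator.loss_       # loss (description length) of the best-fit model
>>> trace = estimator.cache_   # per-epoch record of the loss during fitting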

Source code in temp_dir/bms/src/autora/theorist/bms/regressor.py
class BMSRegressor(BaseEstimator, RegressorMixin):
    """
    Bayesian Machine Scientist.

    BMS finds an optimal function to explain a dataset, given a set of variables
    and a pre-defined number of parameters.

    This class is intended to be compatible with the
    [Scikit-Learn Estimator API](https://scikit-learn.org/stable/developers/develop.html).

    Examples:

        >>> from autora.theorist.bms import BMSRegressor
        >>> import numpy as np
        >>> num_samples = 1000
        >>> X = np.linspace(start=0, stop=1, num=num_samples).reshape(-1, 1)
        >>> y = 15. * np.ones(num_samples)
        >>> estimator = BMSRegressor()
        >>> estimator = estimator.fit(X, y)
        >>> estimator.predict([[15.]])
        array([[15.]])


    Attributes:
        pms: the Bayesian (parallel) machine scientist model
        model_: represents the best-fit model
        loss_: represents loss associated with best-fit model
        cache_: record of loss_ over model fitting epochs
        temp_: temperature of model_
    """

    def __init__(
        self,
        prior_par: dict = PRIORS,
        ts: List[float] = TEMPERATURES,
        epochs: int = 1500,
    ):
        """
        Arguments:
            prior_par: a dictionary of the prior probabilities of different functions,
                based on Wikipedia data scraping
            ts: a list of the temperatures at which the parallel machine scientist runs
        """
        self.ts = ts
        self.prior_par = prior_par
        self.epochs = epochs
        self.pms: Parallel = Parallel(Ts=ts)
        self.ops = get_priors()[1]
        self.custom_ops: Dict[str, Callable] = dict()
        self.X_: Optional[np.ndarray] = None
        self.y_: Optional[np.ndarray] = None
        self.model_: Tree = Tree()
        self.temp_: float = 0.0
        self.models_: List[Tree] = [Tree()]
        self.loss_: float = np.inf
        self.cache_: List = []
        self.variables: List = []

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        num_param: int = 1,
        root=None,
        custom_ops=None,
        random_state=None,
    ) -> BMSRegressor:
        """
        Runs the optimization for a given set of `X`s and `y`s.

        Arguments:
            X: independent variables in an n-dimensional array
            y: dependent variables in an n-dimensional array
            num_param: number of parameters
            root: fixed root of the tree
            custom_ops: user-defined functions to be treated as additional primitives

        Returns:
            self (BMS): the fitted estimator
        """
        # firstly, store the column names of X since checking will
        # cast the type of X to np.ndarray
        if hasattr(X, "columns"):
            self.variables = list(X.columns)
        else:
            # create variable names X0 ... X{n-1}, one per column of X
            self.variables = ["X%d" % i for i in range(X.shape[1])]

        X, y = check_X_y(X, y)

        # cast X back into a pd.DataFrame, as required by mcmc.py
        X = pd.DataFrame(X, columns=self.variables)
        y = pd.Series(y)
        _logger.info("BMS fitting started")
        if custom_ops is not None:
            for op in custom_ops:
                self.add_primitive(op)
        if (root is not None) and (root not in self.ops.keys()):
            self.add_primitive(root)
        self.pms = Parallel(
            Ts=self.ts,
            variables=self.variables,
            parameters=["a%d" % i for i in range(num_param)],
            x=X,
            y=y,
            prior_par=self.prior_par,
            ops=self.ops,
            custom_ops=self.custom_ops,
            root=root,
            random_state=random_state,
        )
        self.model_, self.loss_, self.cache_ = utils.run(self.pms, self.epochs)
        self.models_ = list(self.pms.trees.values())

        _logger.info("BMS fitting finished")
        self.X_, self.y_ = X, y
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Applies the fitted model to a set of independent variables `X`,
        to give predictions for the dependent variable `y`.

        Arguments:
            X: independent variables in an n-dimensional array

        Returns:
            y: predicted dependent variable values
        """
        # this validation step will cast X into np.ndarray format
        X = check_array(X)

        check_is_fitted(self, attributes=["model_"])

        assert self.model_ is not None
        # we need to cast it back into pd.DataFrame with the original
        # column names (generated in `fit`).
        # in the future, we might need to look into mcmc.py to remove
        # these redundant type castings.
        X = pd.DataFrame(X, columns=self.variables)

        return np.expand_dims(self.model_.predict(X).to_numpy(), axis=1)

    def present_results(self):
        """
        Prints the best equation and its description length,
        along with a plot of how the description length progressed over the course of the search.
        """
        check_is_fitted(self, attributes=["model_", "loss_", "cache_"])
        assert self.model_ is not None
        assert self.loss_ is not None
        assert self.cache_ is not None

        utils.present_results(self.model_, self.loss_, self.cache_)

    def __repr__(self):
        return self.repr()

    def repr(self, decimals=2):
        model_str = self.model_.__repr__()
        parameter_names = self.model_.parameters
        parameter_values = self.model_.par_values
        for name in parameter_names:
            value = parameter_values["d0"][name]
            model_str = model_str.replace(name, str(np.round(value, decimals=decimals)))
        return model_str

    def get_models(self):
        model_list = []
        for idx, tree in enumerate(self.models_):
            bms_model = BMSRegressor()
            bms_model.model_ = tree
            bms_model.temp_ = self.ts[idx]
            bms_model.variables = (
                list(self.X_.columns)
                if hasattr(self.X_, "columns")
                else ["X%d" % i for i in range(self.X_.shape[1])]
            )
            model_list.append(bms_model)
        return model_list

    def latex(self):
        return self.model_.latex()

    def add_primitive(self, op: Callable):
        self.custom_ops.update({op.__name__: op})
        self.ops.update({op.__name__: len(signature(op).parameters)})
        self.prior_par.update({"Nopi_" + op.__name__: 1})
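
The helper methods repr, latex, and get_models shown above can be used to inspect a fitted estimator. A brief sketch, assuming `estimator` was fitted as in the Examples:

>>> eq_str = repr(estimator)           # equation string, parameters rounded to 2 decimals
>>> eq_tex = estimator.latex()         # LaTeX string for the best-fit expression
>>> ensemble = estimator.get_models()  # one BMSRegressor per parallel temperature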

__init__(prior_par=PRIORS, ts=TEMPERATURES, epochs=1500)

Parameters:

    prior_par (dict): a dictionary of the prior probabilities of different functions, based on Wikipedia data scraping (default: PRIORS)
    ts (List[float]): a list of the temperatures at which the parallel machine scientist runs (default: TEMPERATURES)
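
A construction sketch; the value of epochs is illustrative, and BMSRegressor is assumed to be importable from autora.theorist.bms alongside Parallel:

>>> from autora.theorist.bms import BMSRegressor
>>> est = BMSRegressor(epochs=500)  # run fewer MCMC epochs than the default 1500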
Source code in temp_dir/bms/src/autora/theorist/bms/regressor.py
def __init__(
    self,
    prior_par: dict = PRIORS,
    ts: List[float] = TEMPERATURES,
    epochs: int = 1500,
):
    """
    Arguments:
        prior_par: a dictionary of the prior probabilities of different functions,
            based on Wikipedia data scraping
        ts: a list of the temperatures at which the parallel machine scientist runs
    """
    self.ts = ts
    self.prior_par = prior_par
    self.epochs = epochs
    self.pms: Parallel = Parallel(Ts=ts)
    self.ops = get_priors()[1]
    self.custom_ops: Dict[str, Callable] = dict()
    self.X_: Optional[np.ndarray] = None
    self.y_: Optional[np.ndarray] = None
    self.model_: Tree = Tree()
    self.temp_: float = 0.0
    self.models_: List[Tree] = [Tree()]
    self.loss_: float = np.inf
    self.cache_: List = []
    self.variables: List = []

fit(X, y, num_param=1, root=None, custom_ops=None, random_state=None)

Runs the optimization for a given set of Xs and ys.

Parameters:

    X (ndarray): independent variables in an n-dimensional array [required]
    y (ndarray): dependent variables in an n-dimensional array [required]
    num_param (int): number of parameters (default: 1)
    root: fixed root of the tree (default: None)
    custom_ops: user-defined functions to be treated as additional primitives (default: None)

Returns:

    self (BMS): the fitted estimator
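
A hedged fitting sketch: custom_ops entries are registered under their __name__, with arity taken from their signature, so any named function can serve as an extra primitive. The data and the sqr function here are illustrative only:

>>> import numpy as np
>>> def sqr(x):
...     return x ** 2
>>> X = np.linspace(-1., 1., num=100).reshape(-1, 1)
>>> y = X[:, 0] ** 2
>>> est = BMSRegressor(epochs=100)
>>> est = est.fit(X, y, custom_ops=[sqr], random_state=42)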

Source code in temp_dir/bms/src/autora/theorist/bms/regressor.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    num_param: int = 1,
    root=None,
    custom_ops=None,
    random_state=None,
) -> BMSRegressor:
    """
    Runs the optimization for a given set of `X`s and `y`s.

    Arguments:
        X: independent variables in an n-dimensional array
        y: dependent variables in an n-dimensional array
        num_param: number of parameters
        root: fixed root of the tree
        custom_ops: user-defined functions to be treated as additional primitives

    Returns:
        self (BMS): the fitted estimator
    """
    # firstly, store the column names of X since checking will
    # cast the type of X to np.ndarray
    if hasattr(X, "columns"):
        self.variables = list(X.columns)
    else:
        # create variable names X0 ... X{n-1}, one per column of X
        self.variables = ["X%d" % i for i in range(X.shape[1])]

    X, y = check_X_y(X, y)

    # cast X back into a pd.DataFrame, as required by mcmc.py
    X = pd.DataFrame(X, columns=self.variables)
    y = pd.Series(y)
    _logger.info("BMS fitting started")
    if custom_ops is not None:
        for op in custom_ops:
            self.add_primitive(op)
    if (root is not None) and (root not in self.ops.keys()):
        self.add_primitive(root)
    self.pms = Parallel(
        Ts=self.ts,
        variables=self.variables,
        parameters=["a%d" % i for i in range(num_param)],
        x=X,
        y=y,
        prior_par=self.prior_par,
        ops=self.ops,
        custom_ops=self.custom_ops,
        root=root,
        random_state=random_state,
    )
    self.model_, self.loss_, self.cache_ = utils.run(self.pms, self.epochs)
    self.models_ = list(self.pms.trees.values())

    _logger.info("BMS fitting finished")
    self.X_, self.y_ = X, y
    return self

predict(X)

Applies the fitted model to a set of independent variables X, to give predictions for the dependent variable y.

Parameters:

    X (ndarray): independent variables in an n-dimensional array [required]

Returns:

    y (ndarray): predicted dependent variable values
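
Continuing the class example above, predict returns a column vector, since the model output is expanded along axis 1:

>>> y_pred = estimator.predict(X)
>>> y_pred.shape
(1000, 1)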

Source code in temp_dir/bms/src/autora/theorist/bms/regressor.py
def predict(self, X: np.ndarray) -> np.ndarray:
    """
    Applies the fitted model to a set of independent variables `X`,
    to give predictions for the dependent variable `y`.

    Arguments:
        X: independent variables in an n-dimensional array

    Returns:
        y: predicted dependent variable values
    """
    # this validation step will cast X into np.ndarray format
    X = check_array(X)

    check_is_fitted(self, attributes=["model_"])

    assert self.model_ is not None
    # we need to cast it back into pd.DataFrame with the original
    # column names (generated in `fit`).
    # in the future, we might need to look into mcmc.py to remove
    # these redundant type castings.
    X = pd.DataFrame(X, columns=self.variables)

    return np.expand_dims(self.model_.predict(X).to_numpy(), axis=1)

present_results()

Prints the best equation and its description length, along with a plot of how the description length progressed over the course of the search.
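
Usage is a single call on a fitted estimator; since it draws a plot, it is best run in an interactive session:

>>> estimator.present_results()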

Source code in temp_dir/bms/src/autora/theorist/bms/regressor.py
def present_results(self):
    """
    Prints the best equation and its description length,
    along with a plot of how the description length progressed over the course of the search.
    """
    check_is_fitted(self, attributes=["model_", "loss_", "cache_"])
    assert self.model_ is not None
    assert self.loss_ is not None
    assert self.cache_ is not None

    utils.present_results(self.model_, self.loss_, self.cache_)