autora.experiment_runner.synthetic.psychology.q_learning

AgentQ

An agent that runs simple Q-learning for an n-armed bandit task.

Attributes:

Name    Description
alpha   The agent's learning rate
beta    The agent's softmax temperature
q       The agent's current estimate of the reward probability on each arm
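
A minimal usage sketch (assuming the package is installed; the reward probabilities below are made up for illustration):

import numpy as np

from autora.experiment_runner.synthetic.psychology.q_learning import AgentQ

agent = AgentQ(alpha=0.2, beta=3.0, n_actions=2)
reward_probs = np.array([0.8, 0.2])  # hypothetical reward probability per arm

for _ in range(10):
    choice = agent.get_choice()                              # sample an action from softmax(beta * q)
    reward = float(np.random.rand() < reward_probs[choice])  # Bernoulli reward from the chosen arm
    agent.update(choice, reward)                             # Q-learning update for the chosen arm

print(agent.q)  # current (perseveration-adjusted) value estimates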

Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
class AgentQ:
    """An agent that runs simple Q-learning for an n-armed bandits tasks.

    Attributes:
      alpha: The agent's learning rate
      beta: The agent's softmax temperature
      q: The agent's current estimate of the reward probability on each arm
    """

    def __init__(
        self,
        alpha: float = 0.2,
        beta: float = 3.0,
        n_actions: int = 2,
        forget_rate: float = 0.0,
        perseverance_bias: float = 0.0,
        correlated_reward: bool = False,
    ):
        """Update the agent after one step of the task.

        Args:
          alpha: scalar learning rate
          beta: scalar softmax inverse temperature parameter.
          n_actions: number of actions (default=2)
          forgetting_rate: rate at which q values decay toward the initial values (default=0)
          perseveration_bias: rate at which q values move toward previous action (default=0)
        """
        self._prev_choice = -1
        self._alpha = alpha
        self._beta = beta
        self._n_actions = n_actions
        self._forget_rate = forget_rate
        self._perseverance_bias = perseverance_bias
        self._correlated_reward = correlated_reward
        self._q_init = 0.5
        self.new_sess()

        _check_in_0_1_range(alpha, "alpha")
        _check_in_0_1_range(forget_rate, "forget_rate")

    def new_sess(self):
        """Reset the agent for the beginning of a new session."""
        self._q = self._q_init * np.ones(self._n_actions)
        self._prev_choice = -1

    def get_choice_probs(self) -> np.ndarray:
        """Compute the choice probabilities as softmax over q."""
        decision_variable = np.exp(self.q * self._beta)
        choice_probs = decision_variable / np.sum(decision_variable)
        return choice_probs

    def get_choice(self) -> int:
        """Sample a choice, given the agent's current internal state."""
        choice_probs = self.get_choice_probs()
        choice = np.random.choice(self._n_actions, p=choice_probs)
        return choice

    def update(self, choice: int, reward: float):
        """Update the agent after one step of the task.

        Args:
          choice: The choice made by the agent. 0 or 1
          reward: The reward received by the agent. 0 or 1
        """

        # Forgetting - restore q-values of non-chosen actions towards the initial value
        non_chosen_action = np.arange(self._n_actions) != choice
        self._q[non_chosen_action] = (1 - self._forget_rate) * self._q[
            non_chosen_action
        ] + self._forget_rate * self._q_init

        # Reward-based update - Update chosen q for chosen action with observed reward
        q_reward_update = -self._alpha * self._q[choice] + self._alpha * reward

        # Correlated update - Update non-chosen q for non-chosen action with observed reward
        if self._correlated_reward:
            # index_correlated_update = self._n_actions - choice - 1
            # self._q[index_correlated_update] =
            # (1 - self._alpha) * self._q[index_correlated_update] + self._alpha * (1 - reward)
            # alternative implementation - not dependent on reward but on reward-based update
            index_correlated_update = self._n_actions - 1 - choice
            self._q[index_correlated_update] -= 0.5 * q_reward_update

        # Memorize current choice for perseveration
        self._prev_choice = choice

        self._q[choice] += q_reward_update

    @property
    def q(self):
        q = self._q.copy()
        if self._prev_choice != -1:
            q[self._prev_choice] += self._perseverance_bias
        return q
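
The q property returns a copy of the value estimates with the perseveration bonus added to the previously chosen arm; a small sketch of that adjustment, with hypothetical numbers:

import numpy as np

q_raw = np.array([0.5, 0.5])   # internal value estimates
prev_choice = 1                # arm chosen on the previous trial
perseverance_bias = 0.3

q_effective = q_raw.copy()
q_effective[prev_choice] += perseverance_bias
# q_effective is [0.5, 0.8]: with beta > 0 the softmax now favors repeating arm 1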

__init__(alpha=0.2, beta=3.0, n_actions=2, forget_rate=0.0, perseverance_bias=0.0, correlated_reward=False)

Initialize the agent.

Parameters:

Name               Type   Description                                              Default
alpha              float  scalar learning rate                                     0.2
beta               float  scalar softmax inverse temperature parameter             3.0
n_actions          int    number of actions                                        2
forget_rate        float  rate at which q-values decay toward the initial value    0.0
perseverance_bias  float  rate at which q-values move toward the previous action   0.0
correlated_reward  bool   whether rewards are correlated                           False
Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def __init__(
    self,
    alpha: float = 0.2,
    beta: float = 3.0,
    n_actions: int = 2,
    forget_rate: float = 0.0,
    perseverance_bias: float = 0.0,
    correlated_reward: bool = False,
):
    """Update the agent after one step of the task.

    Args:
      alpha: scalar learning rate
      beta: scalar softmax inverse temperature parameter.
      n_actions: number of actions (default=2)
      forgetting_rate: rate at which q values decay toward the initial values (default=0)
      perseveration_bias: rate at which q values move toward previous action (default=0)
    """
    self._prev_choice = -1
    self._alpha = alpha
    self._beta = beta
    self._n_actions = n_actions
    self._forget_rate = forget_rate
    self._perseverance_bias = perseverance_bias
    self._correlated_reward = correlated_reward
    self._q_init = 0.5
    self.new_sess()

    _check_in_0_1_range(alpha, "alpha")
    _check_in_0_1_range(forget_rate, "forget_rate")

get_choice()

Sample a choice, given the agent's current internal state.

Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def get_choice(self) -> int:
    """Sample a choice, given the agent's current internal state."""
    choice_probs = self.get_choice_probs()
    choice = np.random.choice(self._n_actions, p=choice_probs)
    return choice

get_choice_probs()

Compute the choice probabilities as softmax over q.

Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def get_choice_probs(self) -> np.ndarray:
    """Compute the choice probabilities as softmax over q."""
    decision_variable = np.exp(self.q * self._beta)
    choice_probs = decision_variable / np.sum(decision_variable)
    return choice_probs
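
For intuition, the same probabilities can be computed by hand from q and beta; a short sketch with illustrative values:

import numpy as np

q = np.array([0.6, 0.5])   # illustrative value estimates
beta = 3.0                 # inverse temperature

choice_probs = np.exp(beta * q) / np.sum(np.exp(beta * q))
# Larger beta concentrates probability on the higher-valued arm; beta = 0 gives a uniform choice.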

new_sess()

Reset the agent for the beginning of a new session.

Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def new_sess(self):
    """Reset the agent for the beginning of a new session."""
    self._q = self._q_init * np.ones(self._n_actions)
    self._prev_choice = -1

update(choice, reward)

Update the agent after one step of the task.

Parameters:

Name    Type   Description                                  Default
choice  int    The choice made by the agent (0 or 1)        required
reward  float  The reward received by the agent (0 or 1)    required
Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def update(self, choice: int, reward: float):
    """Update the agent after one step of the task.

    Args:
      choice: The choice made by the agent. 0 or 1
      reward: The reward received by the agent. 0 or 1
    """

    # Forgetting - restore q-values of non-chosen actions towards the initial value
    non_chosen_action = np.arange(self._n_actions) != choice
    self._q[non_chosen_action] = (1 - self._forget_rate) * self._q[
        non_chosen_action
    ] + self._forget_rate * self._q_init

    # Reward-based update - Update chosen q for chosen action with observed reward
    q_reward_update = -self._alpha * self._q[choice] + self._alpha * reward

    # Correlated update - Update non-chosen q for non-chosen action with observed reward
    if self._correlated_reward:
        # index_correlated_update = self._n_actions - choice - 1
        # self._q[index_correlated_update] =
        # (1 - self._alpha) * self._q[index_correlated_update] + self._alpha * (1 - reward)
        # alternative implementation - not dependent on reward but on reward-based update
        index_correlated_update = self._n_actions - 1 - choice
        self._q[index_correlated_update] -= 0.5 * q_reward_update

    # Memorize current choice for perseveration
    self._prev_choice = choice

    self._q[choice] += q_reward_update
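
A worked single step of this update with hypothetical values (alpha = 0.2, forget_rate = 0, two arms, arm 0 chosen and rewarded):

import numpy as np

alpha, forget_rate, q_init = 0.2, 0.0, 0.5
q = np.array([0.5, 0.5])
choice, reward = 0, 1.0

# Forgetting: the non-chosen arm decays toward q_init (a no-op here, since forget_rate = 0).
q[1] = (1 - forget_rate) * q[1] + forget_rate * q_init

# Reward-based update: equivalent to q[choice] += alpha * (reward - q[choice]).
q[choice] += -alpha * q[choice] + alpha * reward

print(q)  # [0.6 0.5]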

q_learning(name='Q-Learning', learning_rate=0.2, decision_noise=3.0, n_actions=2, forget_rate=0.0, perseverance_bias=0.0, correlated_reward=False)

A synthetic experiment in which an agent runs simple Q-learning on an n-armed bandit task.

Parameters:

Name               Type   Description                                           Default
name                      name of the experiment                                'Q-Learning'
learning_rate      float  learning rate for Q-learning                          0.2
decision_noise     float  softmax parameter for decision noise                  3.0
n_actions          int    number of actions                                     2
forget_rate        float  rate of forgetting                                    0.0
perseverance_bias  float  bias towards choosing the previously chosen action    0.0
correlated_reward  bool   whether rewards are correlated                        False

Examples:

>>> experiment = q_learning()

The runner can accept numpy arrays or pandas DataFrames, but the return value will always be a list of numpy arrays. Each array corresponds to the choices made by the agent for each trial in the input; thus, arrays have shape (n_trials, n_actions).

>>> experiment.run(np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]),
...                random_state=42)
[array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])]

The runner can also accept pandas DataFrames. Each cell of the DataFrame should contain a numpy array with shape (n_trials, n_actions). The return value will be a list of numpy arrays, each corresponding to the choices made by the agent for each trial in the input.

>>> experiment.run(
...     pd.DataFrame(
...         {'reward array': [np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]])]}),
...     random_state = 42)
[array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])]
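
The run function also accepts a return_choice_probabilities flag (see the source below). A sketch of how it might be used, continuing the example above; the probabilities are not shown because they depend on the sampled choices:

choices, choice_probs = experiment.run(
    np.array([[0, 1], [0, 1], [1, 0], [1, 0]]),
    random_state=42,
    return_choice_probabilities=True,
)
# choices[0] has shape (n_trials, n_actions) with a one-hot row per trial;
# choice_probs[0] holds the softmax probabilities used on each trial.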
Source code in autora/experiment_runner/synthetic/psychology/q_learning.py
def q_learning(
    name="Q-Learning",
    learning_rate: float = 0.2,
    decision_noise: float = 3.0,
    n_actions: int = 2,
    forget_rate: float = 0.0,
    perseverance_bias: float = 0.0,
    correlated_reward: bool = False,
):
    """
    A synthetic experiment in which an agent runs simple Q-learning on an n-armed bandit task.

    Args:
        name: name of the experiment
        learning_rate: learning rate for Q-learning
        decision_noise: softmax parameter for decision noise
        n_actions: number of actions
        forget_rate: rate of forgetting
        perseverance_bias: bias towards choosing the previously chosen action
        correlated_reward: whether rewards are correlated

    Examples:
        >>> experiment = q_learning()

        # The runner can accept numpy arrays or pandas DataFrames, but the return value will
        # always be a list of numpy arrays. Each array corresponds to the choices made by the agent
        # for each trial in the input. Thus, arrays have shape (n_trials, n_actions).
        >>> experiment.run(np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]]),
        ...                random_state=42)
        [array([[1., 0.],
               [0., 1.],
               [0., 1.],
               [0., 1.],
               [1., 0.],
               [1., 0.]])]

        # The runner can accept pandas DataFrames. Each cell of the DataFrame should contain a
        # numpy array with shape (n_trials, n_actions). The return value will be a list of numpy
        # arrays, each corresponding to the choices made by the agent for each trial in the input.
        >>> experiment.run(
        ...     pd.DataFrame(
        ...         {'reward array': [np.array([[0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [1, 0]])]}),
        ...     random_state = 42)
        [array([[1., 0.],
               [0., 1.],
               [0., 1.],
               [0., 1.],
               [1., 0.],
               [1., 0.]])]
    """

    params = dict(
        name=name,
        trials=100,
        learning_rate=learning_rate,
        decision_noise=decision_noise,
        n_actions=n_actions,
        forget_rate=forget_rate,
        perseverance_bias=perseverance_bias,
        correlated_reward=correlated_reward,
    )

    iv1 = IV(
        name="reward array",
        units="reward",
        variable_label="Reward Sequence",
        type=ValueType.BOOLEAN,
    )

    dv1 = DV(
        name="choice array",
        units="actions",
        variable_label="Action Sequence",
        type=ValueType.REAL,
    )

    variables = VariableCollection(
        independent_variables=[iv1],
        dependent_variables=[dv1],
    )

    def run_AgentQ(rewards):
        if rewards.shape[1] != n_actions:
            import warnings  # local import; module-level imports are not shown in this excerpt

            warnings.warn(
                "Number of actions in rewards does not match n_actions. Will use "
                + str(rewards.shape[1])
                + " actions."
            )
        num_trials = rewards.shape[0]

        y = np.zeros(rewards.shape)
        choice_proba = np.zeros(rewards.shape)

        agent = AgentQ(
            alpha=learning_rate,
            beta=decision_noise,
            n_actions=rewards.shape[1],
            forget_rate=forget_rate,
            perseverance_bias=perseverance_bias,
            correlated_reward=correlated_reward,
        )

        for i in range(num_trials):
            proba = agent.get_choice_probs()
            choice = agent.get_choice()
            y[i, choice] = 1
            choice_proba[i] = proba
            reward = rewards[i, choice]
            agent.update(choice, reward)
        return y, choice_proba

    def run(
        conditions: Union[pd.DataFrame, np.ndarray, np.recarray],
        random_state: Optional[int] = None,
        return_choice_probabilities=False,
    ):
        if random_state is not None:
            np.random.seed(random_state)

        Y = list()
        Y_proba = list()
        if isinstance(conditions, pd.DataFrame):
            for index, session in conditions.iterrows():
                rewards = session.iloc[0]  # the single "reward array" cell of this row
                choice, choice_proba = run_AgentQ(rewards)
                Y.append(choice)
                Y_proba.append(choice_proba)
        elif isinstance(conditions, np.ndarray):
            choice, choice_proba = run_AgentQ(conditions)
            Y.append(choice)
            Y_proba.append(choice_proba)

        if return_choice_probabilities:
            return Y, Y_proba
        else:
            return Y

    ground_truth = partial(run)

    def domain():
        return None

    def plotter():
        raise NotImplementedError

    collection = SyntheticExperimentCollection(
        name=name,
        description=q_learning.__doc__,
        variables=variables,
        run=run,
        ground_truth=ground_truth,
        domain=domain,
        plotter=plotter,
        params=params,
        factory_function=q_learning,
    )
    return collection