R codes
suppressMessages(library(tidyverse))
## number of arms
k <- 10
## true values of the arms
q <- rnorm(k, 0, 1); names(q) <- 1:k
## rate of exploration
eps <- 0.1
## number of steps
M <- 1000
## number of replications
nrep <- 2000
## save the estimated values
Qsave <- c()
for(i in 1:nrep){
## simulate the bandits/rewards
bandit <- lapply(1:k, function(i) rnorm(1, q[i], 1)) %>% unlist()
## save the number of selection and estimated values for ith replication
N <- Q <- rep(0, k)
for (ii in 1:M){
if(runif(1) < (1 - eps)){
A <- sample(which(Q == max(Q)), 1)
}else{
A <- sample(1:k, 1)
}
R <- bandit[A]
N[A] <- N[A] + 1
Q[A] <- Q[A] + (R - Q[A])/N[A]
}
Qsave <- rbind(Qsave, Q)
}
## averaged estimated values vs true values
apply(Qsave, 1, which.max) %>% table() %>% "/"(nrep) %>% round(2)
## .
## 1 2 3 4 5 6 7 9 10
## 0.00 0.31 0.14 0.02 0.14 0.05 0.33 0.02 0.00
q %>% round(2)
## 1 2 3 4 5 6 7 8 9 10
## -1.38 1.00 0.45 -0.82 0.39 -0.09 1.05 -2.76 -0.82 -1.86
Python codes
import numpy as np
import random
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
k = 10
eps = 0.1
M = 1000
nrep = 2000
q = np.random.normal(loc = 0.0, scale = 1.0, size = k)
Qsave = pd.DataFrame()
for i in range(nrep):
bandit = np.random.normal(loc = q, scale = 1.0, size = k).tolist()
N = np.zeros(k)
Q = np.zeros(k)
for ii in range(M):
if np.random.uniform(size = 1) < (1 - eps):
A = random.sample([k for k, j in enumerate(Q) if j == max(Q)], 1)
else:
A = random.sample(range(k), 1)
R = bandit[A[0]]
N[A] = N[A] + 1
Q[A] = Q[A] + (R - Q[A])/N[A]
Qsave = pd.concat([pd.DataFrame(Qsave), pd.DataFrame(Q)], axis = 1)
pd.crosstab(Qsave.idxmax(axis = 0), columns = "Prop", normalize = True)
## col_0 Prop
## row_0
## 0 0.0005
## 1 0.0065
## 2 0.0355
## 3 0.0380
## 4 0.1655
## 5 0.0100
## 6 0.0300
## 7 0.0855
## 8 0.0025
## 9 0.6260
q
## array([-1.17012562, -0.62955718, 0.34365919, 0.36373688, 1.21680322,
## -0.18000371, 0.3253811 , 0.84924658, -1.03595 , 2.28262604])