Multi-arm Bandit

R codes

 suppressMessages(library(tidyverse))

 ## number of arms 
 k <- 10
 
 ## true values of the arms
 q <- rnorm(k, 0, 1); names(q) <- 1:k
 
 ## rate of exploration
 eps <- 0.1
 
 ## number of steps
 M <- 1000
 
 ## number of replications
 nrep <- 2000

 ## save the estimated values 
 Qsave <- c()
 
 for(i in 1:nrep){
   
   ## simulate the bandits/rewards
   bandit <- lapply(1:k, function(i) rnorm(1, q[i], 1)) %>% unlist()
   
   ## save the number of selection and estimated values for ith replication
   N <- Q <- rep(0, k)
   
   for (ii in 1:M){
     
     if(runif(1) < (1 - eps)){
       A <- sample(which(Q == max(Q)), 1)
     }else{
       A <- sample(1:k, 1)
     }
     
     R <- bandit[A]
     
     N[A] <- N[A] + 1
     Q[A] <- Q[A] + (R - Q[A])/N[A]
     
   }
   
   Qsave <- rbind(Qsave, Q)
   
 }
 
 ## averaged estimated values vs true values
 apply(Qsave, 1, which.max) %>% table() %>% "/"(nrep) %>% round(2)
## .
##    1    2    3    4    5    6    7    9   10 
## 0.00 0.31 0.14 0.02 0.14 0.05 0.33 0.02 0.00
 q %>% round(2)
##     1     2     3     4     5     6     7     8     9    10 
## -1.38  1.00  0.45 -0.82  0.39 -0.09  1.05 -2.76 -0.82 -1.86

Python codes

import numpy as np
import random
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

k = 10
eps = 0.1
M = 1000
nrep = 2000

q = np.random.normal(loc = 0.0, scale = 1.0, size = k)

Qsave = pd.DataFrame()

for i in range(nrep):
    
    bandit = np.random.normal(loc = q, scale = 1.0, size = k).tolist()
        
    N = np.zeros(k)
    Q = np.zeros(k)
    
    for ii in range(M):
        if np.random.uniform(size = 1) < (1 - eps):
            A = random.sample([k for k, j in enumerate(Q) if j == max(Q)], 1)
        else:
            A = random.sample(range(k), 1)
            
        R = bandit[A[0]]
    
        N[A] = N[A] + 1
        Q[A] = Q[A] + (R - Q[A])/N[A]
    
    Qsave = pd.concat([pd.DataFrame(Qsave), pd.DataFrame(Q)], axis = 1)

pd.crosstab(Qsave.idxmax(axis = 0), columns = "Prop", normalize = True)
## col_0    Prop
## row_0        
## 0      0.0005
## 1      0.0065
## 2      0.0355
## 3      0.0380
## 4      0.1655
## 5      0.0100
## 6      0.0300
## 7      0.0855
## 8      0.0025
## 9      0.6260
q
## array([-1.17012562, -0.62955718,  0.34365919,  0.36373688,  1.21680322,
##        -0.18000371,  0.3253811 ,  0.84924658, -1.03595   ,  2.28262604])