import numpy as np
import matplotlib
import matplotlib.pyplot as plt

class KArmedBandit:
    """Drifting k-armed bandit bundled with an epsilon-greedy value learner.

    All arms start at the same true reward. Every environment step adds
    Gaussian noise to each arm's true reward, so the true values drift over
    time. The learner keeps one estimate per arm and refines it using either
    a constant step size or the sample-average rule.
    """

    def __init__(self, num_arms=10, epsilon=0.1, step_size=0.1, initial_reward=0, constant_step_size=False):
        """Create the environment state and the learner's bookkeeping.

        num_arms -- number of arms (actions)
        epsilon -- probability of choosing an arm uniformly at random
        step_size -- step size applied when constant_step_size is true
        initial_reward -- starting true reward shared by every arm
        constant_step_size -- True for the constant step-size update,
            False for the sample-average update
        """
        # Learner configuration
        self.num_arms = num_arms
        self.epsilon = epsilon
        self.step_size = step_size
        self.constant_step_size = constant_step_size

        # Environment state
        self.actions = np.arange(num_arms)
        self.true_reward = np.full(shape=num_arms, fill_value=initial_reward)
        # Every arm starts with the same reward, so every arm is optimal
        self.optimal_actions = np.arange(num_arms)
        self.current_step = 0

        # Learner statistics
        self.times_chosen = np.zeros(shape=num_arms)
        self.estimated_reward = np.zeros(shape=num_arms)
        self.avg_reward = 0

    def step(self, mu, sigma):
        """Advance time by one tick and let every true reward drift.

        Each arm receives its own draw from a normal distribution with mean
        mu and standard deviation sigma; the set of optimal arms is then
        recomputed from the new true rewards.
        """
        self.current_step += 1
        drift = np.random.normal(mu, sigma, self.num_arms)
        self.true_reward = self.true_reward + drift
        best_true_value = self.true_reward.max()
        self.optimal_actions = np.flatnonzero(self.true_reward == best_true_value)

    def get_next_action(self):
        """Choose an arm epsilon-greedily; greedy ties are broken at random."""
        exploring = np.random.rand() < self.epsilon
        if exploring:
            return np.random.choice(self.actions)
        best_estimate = self.estimated_reward.max()
        greedy_arms = np.flatnonzero(self.estimated_reward == best_estimate)
        return np.random.choice(greedy_arms)

    def do_action(self, action):
        """Pull the given arm and then advance the environment.

        Returns a pair (reward, optimal_choice): the arm's true reward as it
        was before the environment advanced, and 1 if the arm was among the
        optimal arms at that moment, otherwise 0.
        """
        reward = self.true_reward[action]
        self.times_chosen[action] += 1
        optimal_choice = 1 if action in self.optimal_actions else 0
        # The rewards drift only after this pull has been scored
        self.step(0, 0.01)
        return reward, optimal_choice

    def update_estimates(self, action, R_n):
        """Fold the observed reward R_n into the estimate for the given arm.

        Also maintains avg_reward as an incremental mean over current_step.
        """
        previous_estimate = self.estimated_reward[action]
        self.avg_reward += (R_n - self.avg_reward) / self.current_step
        error = R_n - previous_estimate
        if self.constant_step_size:
            # Constant step size
            increment = self.step_size * error
        else:
            # Sample average: step size shrinks with the arm's pull count
            increment = error / self.times_chosen[action]
        self.estimated_reward[action] = previous_estimate + increment

    def run_episode(self, episode_length):
        """Run episode_length choose/act/learn iterations.

        Returns two arrays of length episode_length: the reward obtained at
        each iteration and the 0/1 optimal-choice flag at each iteration.
        """
        rewards = np.empty(episode_length)
        optimal_choices = np.empty(episode_length)
        for t in range(episode_length):
            arm = self.get_next_action()
            reward, was_optimal = self.do_action(arm)
            self.update_estimates(arm, reward)
            rewards[t] = reward
            optimal_choices[t] = was_optimal

        return rewards, optimal_choices
            

def run_testbed(episode_length, num_episodes, constant_step_size):
    """Average bandit performance over many independent episodes.

    A fresh 10-armed bandit (epsilon 0.1, step size 0.1, initial reward 0)
    is created for every episode and run for episode_length steps.

    episode_length -- number of steps in each episode
    num_episodes -- number of independent episodes to average over
    constant_step_size -- True for the constant step-size update rule,
        False for the sample-average rule

    Returns (avg_rewards, avg_optimal_choices), each of length
    episode_length: the per-step reward averaged across episodes, and the
    per-step percentage of episodes in which an optimal arm was chosen.
    """
    rewards = np.zeros(shape=(num_episodes, episode_length))
    optimal_choices = np.zeros(shape=(num_episodes, episode_length))
    print(rewards.shape)
    # Run every episode: stopping at num_episodes - 1 would leave the last
    # row at zero and drag both averages down.
    for i in range(num_episodes):
        bandit = KArmedBandit(num_arms=10, epsilon=0.1, step_size=0.1, initial_reward=0, constant_step_size=constant_step_size)
        # Use the requested length so each result row matches the array width
        rewards[i], optimal_choices[i] = bandit.run_episode(episode_length=episode_length)

    avg_rewards = np.average(rewards, axis=0)
    # Flags are 0/1, so the mean times 100 is a percentage
    avg_optimal_choices = np.average(optimal_choices, axis=0) * 100.
    return avg_rewards, avg_optimal_choices

# Same experiment under both update rules: 2000 episodes of 10000 steps each.
r1, o1 = run_testbed(episode_length=10000, num_episodes=2000, constant_step_size=True)
r2, o2 = run_testbed(episode_length=10000, num_episodes=2000, constant_step_size=False)

(2000, 10000)
(2000, 10000)


# Top panel: per-step reward averaged across episodes.
# r1/o1 come from the constant step-size run, r2/o2 from the sample-average
# run, so the first series is labelled "step size" (not "sample size").
plt.subplot(2,1,1)
plt.plot(r1,label='Constant step size')
plt.plot(r2,label='Sample average')
plt.legend()
plt.xlabel('Step')
plt.ylabel('Average Reward')
# Bottom panel: percentage of episodes choosing an optimal arm at each step.
plt.subplot(2,1,2)
plt.plot(o1,label='Constant step size')
plt.plot(o2,label='Sample average')
plt.legend()
plt.xlabel('Step')
plt.ylabel('% optimal action')
plt.show()


# Fixed reward received on every selection of the action.
R = 2

# In each array below, index 0 holds the initial estimate and index i the
# estimate after i updates.

# Optimistic initial value w/ sample average
# The initial estimate is counted as the first sample, hence the 1/(i+1) step.
Q_oiv = np.zeros(100)
Q_oiv[0] = 10 # Optimistic initial value
for i in range(1,100):
    Q_oiv[i] = Q_oiv[i-1] + (R - Q_oiv[i-1])/(i+1)

# Optimistic initial value w/ constant step size
Q_css = np.zeros(100)
Q_css[0] = 10 # Optimistic initial value
for i in range(1,100):
    Q_css[i] = Q_css[i-1] + 0.1 * (R - Q_css[i-1])

# Zero initial value w/ constant step size
Q_ziv = np.zeros(100) # Zero initial value
# Start at 1: starting at 0 would read Q_ziv[-1] (the last element) and
# overwrite the initial estimate at index 0.
for i in range(1,100):
    Q_ziv[i] = Q_ziv[i-1] + 0.1 * (R - Q_ziv[i-1])

# Draw the three estimate trajectories on shared axes, in a fixed order.
labelled_series = (
    (Q_oiv, 'Optimistic initial estimate, sample-average'),
    (Q_css, 'Optimistic initial estimate, constant step size'),
    (Q_ziv, 'Zero-initialised estimate'),
)
for series, series_label in labelled_series:
    plt.plot(series, label=series_label)

plt.xlabel('Number of times action is selected')
plt.ylabel('Q Estimate')
plt.title('Convergence of different initial estimate methods with an actual stationary reward of 2')
plt.legend()
plt.show()

$t$	$A_1$	$A_2$
1	-1	0
2	-1	1
3	-1	-0.5
4	-1	0.333
5	-1	0.333

Chapter 2 - Multi-armed Bandits

Exercise 2.1

Exercise 2.2

Exercise 2.3

Exercise 2.4

Exercise 2.5 (Programming)

Exercise 2.6

Exercise 2.7

Exercise 2.8

Exercise 2.9

Exercise 2.10

Exercise 2.11