import numpy as np
import matplotlib.pyplot as plt
def run_bandit(n_arms=10, n_steps=1000, epsilon=0.1, rng=None, show_plots=True):
    """Simulate an ε-greedy agent on an ``n_arms``-armed bandit.

    Each arm's true mean reward is drawn uniformly from [0, 1) and is
    unknown to the agent.  Pulling an arm yields its true mean plus
    Gaussian noise (sigma = 0.1).  Value estimates are maintained with
    the incremental sample-average update R += (r - R) / N.

    Args:
        n_arms: number of slot-machine arms.
        n_steps: time horizon (number of pulls).
        epsilon: exploration probability of the ε-greedy policy.
        rng: optional ``numpy.random.Generator`` for reproducibility;
            a fresh default generator is created when None.
        show_plots: when True, display the cumulative-reward curve and
            the per-arm selection counts with matplotlib.

    Returns:
        Tuple ``(estimates, counts, reward_history, action_history)``:
        final sample-average value per arm, pull count per arm, the
        reward received at each step, and the arm chosen at each step.
    """
    rng = np.random.default_rng() if rng is None else rng

    # True reward means for each arm (hidden from the agent).
    true_rewards = rng.random(n_arms)

    estimates = np.zeros(n_arms)  # R_t: running sample-average per arm
    counts = np.zeros(n_arms)     # N_t: number of times each arm was pulled
    reward_history = []
    action_history = []

    for _ in range(n_steps):
        if rng.random() < epsilon:
            # Exploration: choose a uniformly random arm.
            action = int(rng.integers(n_arms))
        else:
            # Exploitation: choose the arm with the highest estimate.
            action = int(np.argmax(estimates))

        # Stochastic reward: true mean plus Gaussian noise.
        reward = rng.normal(true_rewards[action], 0.1)

        # Incremental sample-average update (avoids storing all rewards).
        counts[action] += 1
        estimates[action] += (reward - estimates[action]) / counts[action]

        reward_history.append(reward)
        action_history.append(action)

    if show_plots:
        _plot_results(n_arms, counts, reward_history)

    return estimates, counts, reward_history, action_history


def _plot_results(n_arms, counts, reward_history):
    """Plot cumulative reward over time and per-arm selection frequency."""
    plt.figure(figsize=(10, 5))
    plt.plot(np.cumsum(reward_history), label="Cumulative Reward", color="orange")
    plt.xlabel("Steps")
    plt.ylabel("Total Reward")
    plt.title("Multi-Armed Bandit - ε-Greedy Strategy")
    plt.legend()
    plt.grid()
    plt.show()

    plt.figure(figsize=(10, 5))
    plt.bar(range(n_arms), counts, color="blue", alpha=0.7)
    plt.xlabel("Arm (Action)")
    plt.ylabel("Number of Times Selected")
    plt.title("Action Selection Frequency in Multi-Armed Bandit")
    plt.xticks(range(n_arms))
    plt.grid(axis="y")
    plt.show()


if __name__ == "__main__":
    # The original script ended with a bare expression `R_t, np.argmax(R_t)`
    # (a no-op outside a notebook) — print the final estimates and the
    # greedy-best arm explicitly instead.
    final_estimates, _, _, _ = run_bandit()
    print(final_estimates, int(np.argmax(final_estimates)))