import numpy as np
import matplotlib.pyplot as plt
def run_bandit(n_arms=10, n_steps=1000, epsilon=0.1, rng=None, show_plots=True):
    """Simulate an ε-greedy agent on an ``n_arms``-armed bandit.

    Each arm's true mean reward is drawn uniformly from [0, 1) and is
    unknown to the agent.  Pulling an arm yields its true mean plus
    Gaussian noise (sigma = 0.1).  Value estimates are maintained with
    the incremental sample-average update R += (r - R) / N.

    Args:
        n_arms: number of slot-machine arms.
        n_steps: time horizon (number of pulls).
        epsilon: exploration probability of the ε-greedy policy.
        rng: optional ``numpy.random.Generator`` for reproducibility;
            a fresh default generator is created when None.
        show_plots: when True, display the cumulative-reward curve and
            the per-arm selection counts with matplotlib.

    Returns:
        Tuple ``(estimates, counts, reward_history, action_history)``:
        final sample-average value per arm, pull count per arm, the
        reward received at each step, and the arm chosen at each step.
    """
    rng = np.random.default_rng() if rng is None else rng

    # True reward means for each arm (hidden from the agent).
    true_rewards = rng.random(n_arms)

    estimates = np.zeros(n_arms)  # R_t: running sample-average per arm
    counts = np.zeros(n_arms)     # N_t: number of times each arm was pulled
    reward_history = []
    action_history = []

    for _ in range(n_steps):
        if rng.random() < epsilon:
            # Exploration: choose a uniformly random arm.
            action = int(rng.integers(n_arms))
        else:
            # Exploitation: choose the arm with the highest estimate.
            action = int(np.argmax(estimates))

        # Stochastic reward: true mean plus Gaussian noise.
        reward = rng.normal(true_rewards[action], 0.1)

        # Incremental sample-average update (avoids storing all rewards).
        counts[action] += 1
        estimates[action] += (reward - estimates[action]) / counts[action]

        reward_history.append(reward)
        action_history.append(action)

    if show_plots:
        _plot_results(n_arms, counts, reward_history)

    return estimates, counts, reward_history, action_history


def _plot_results(n_arms, counts, reward_history):
    """Plot cumulative reward over time and per-arm selection frequency."""
    plt.figure(figsize=(10, 5))
    plt.plot(np.cumsum(reward_history), label="Cumulative Reward", color="orange")
    plt.xlabel("Steps")
    plt.ylabel("Total Reward")
    plt.title("Multi-Armed Bandit - ε-Greedy Strategy")
    plt.legend()
    plt.grid()
    plt.show()

    plt.figure(figsize=(10, 5))
    plt.bar(range(n_arms), counts, color="blue", alpha=0.7)
    plt.xlabel("Arm (Action)")
    plt.ylabel("Number of Times Selected")
    plt.title("Action Selection Frequency in Multi-Armed Bandit")
    plt.xticks(range(n_arms))
    plt.grid(axis="y")
    plt.show()


if __name__ == "__main__":
    # The original script ended with a bare expression `R_t, np.argmax(R_t)`
    # (a no-op outside a notebook) — print the final estimates and the
    # greedy-best arm explicitly instead.
    final_estimates, _, _, _ = run_bandit()
    print(final_estimates, int(np.argmax(final_estimates)))