പത്ത് കൈകളുള്ള ടെസ്റ്റ്ബെഡ്

2024, നവംബർ 25

ഇത് മൾട്ടി-ആർമ്ഡ് ബാൻഡിറ്റ് പ്രശ്നത്തിനുള്ള തന്ത്രങ്ങൾ മൂല്യനിർണ്ണയം ചെയ്യുന്ന ഒരു രീതിയാണ് [^1]. ടെസ്റ്റ്ബെഡ് ഇനിപ്പറയുന്ന രീതിയിൽ പ്രവർത്തിക്കുന്നു:

$10$ പ്രവർത്തനങ്ങൾ $a_{i}$ യുമായി ബന്ധപ്പെട്ട $10$ റിവാർഡ് മീൻസ് $μ_{i}$ സൃഷ്ടിക്കുക
ഓരോ ഇറ്ററേഷനിലും ഏജന്റിനെ ഒരു പ്രവർത്തനം $a_{j}$ തിരഞ്ഞെടുക്കാൻ അനുവദിക്കുക; അതിന് പ്രതിഫലമായി ഏജന്റിന് $r_{t} \sim N(μ_{j}, 1)$ എന്ന റിവാർഡ് ലഭിക്കുന്നു.

ഇത് $μ_{i}$ യുടെ $100$ റാൻഡം സാമ്പിളുകൾക്കായി ആവർത്തിക്കുന്നു. ഏജന്റിന്റെ ലക്ഷ്യം ശരാശരി റിവാർഡുകൾ പരമാവധി ആക്കുക എന്നതാണ്. ഏറ്റവും ഉയർന്ന മീൻ ഉള്ള പ്രവർത്തനം ഏതാണെന്ന് അത് മനസ്സിലാക്കി അതിൽ നിന്ന് സാമ്പിൾ എടുക്കണം.

തന്ത്രം 1: $\epsilon$-ഗ്രീഡി ($\epsilon$-greedy)

class GreedyEpsilon:
    """Epsilon-greedy bandit agent using sample-average value estimates.

    Args:
        n_actions: Number of available actions (arms).
        eps: Probability of picking a uniformly random action in ``step``.
        reward_fn: Callable taking an action index and returning its reward.
        bias: Initial value assigned to every action-value estimate in ``Q``.

    Attributes set here: ``Q`` (per-action value estimates), ``n_moves``
    (per-action pull counts) and ``total_reward`` (sum of rewards so far).
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.n_actions = n_actions
        # Force a float dtype: with an integer ``bias`` a plain
        # ``np.array([bias] * n_actions)`` would be an integer array and the
        # incremental updates in ``step`` would be truncated on write-back.
        self.Q = np.full(n_actions, bias, dtype=float)
        self.n_moves = np.zeros((n_actions,))
        self.eps = eps
        self.reward_fn = reward_fn
        self.total_reward = 0.0

    def step(self):
        """Choose one action, observe its reward, update ``Q``; return the reward."""
        if np.random.rand() < self.eps:
            # Explore: uniformly random action index in [0, n_actions).
            action = np.random.randint(0, self.n_actions)
        else:
            # Exploit: action with the highest current estimate.
            action = np.argmax(self.Q)

        reward = self.reward_fn(action)
        self.n_moves[action] += 1

        # Incremental sample average: Q <- Q + (1 / n) * (reward - Q).
        self.Q[action] += 1.0 / self.n_moves[action] * (reward - self.Q[action])
        self.total_reward += reward
        return reward

തന്ത്രം 2: സിസ്റ്റത്തെ ചതിക്കുക

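യഥാർത്ഥത്തിൽ ഒരു ഇറ്ററേഷനിൽ ഒന്നിലധികം തവണ സാമ്പിൾ ചെയ്യാൻ നമുക്ക് അനുവാദമില്ല; എങ്കിലും പ്രകടനത്തിന്റെ ഒരു ഉയർന്ന പരിധി (upper bound) കാണിക്കാൻ വേണ്ടി ഇവിടെ നമ്മൾ അത് ചെയ്യുന്നു: ഓരോ ഘട്ടത്തിലും എല്ലാ പ്രവർത്തനങ്ങളിൽ നിന്നും ഓരോ സാമ്പിൾ എടുത്ത് അവയിൽ ഏറ്റവും വലിയ റിവാർഡ് സ്വീകരിക്കുന്നു.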

class CheatingModel:
    """Reference agent that samples every action each step and keeps the best.

    The constructor mirrors ``GreedyEpsilon``'s signature; ``eps`` and
    ``bias`` are accepted but not used.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.total_reward = 0.0
        self.reward_fn = reward_fn
        self.n_actions = n_actions

    def step(self):
        """Draw one reward per action, add the largest to the total, return it."""
        draws = [self.reward_fn(arm) for arm in range(self.n_actions)]
        best = max(draws)
        self.total_reward += best
        return best

## ഫലങ്ങൾ


    
        Loading...
    

    
    




  പൂർണ്ണ കോഡ് (matplotlib പതിപ്പ്)
  
    
    
      import numpy as np
from tqdm import trange
import matplotlib.pyplot as plt

def normal_reward(action, action_to_reward_mu, reward_std, n_samples):
    """Draw ``n_samples`` normal rewards for ``action``.

    The mean is ``action_to_reward_mu[action]`` and the standard deviation
    is ``reward_std``; ``n_samples`` is passed to NumPy as the output size.
    """
    mean = action_to_reward_mu[action]
    return np.random.normal(loc=mean, scale=reward_std, size=n_samples)

class CheatingModel:
    """Reference agent that samples every action each step and keeps the best.

    The constructor mirrors ``GreedyEpsilon``'s signature; ``eps`` and
    ``bias`` are accepted but not used.
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.total_reward = 0.0
        self.reward_fn = reward_fn
        self.n_actions = n_actions

    def step(self):
        """Draw one reward per action, add the largest to the total, return it."""
        draws = [self.reward_fn(arm) for arm in range(self.n_actions)]
        best = max(draws)
        self.total_reward += best
        return best

class GreedyEpsilon:
    """Epsilon-greedy bandit agent using sample-average value estimates.

    Args:
        n_actions: Number of available actions (arms).
        eps: Probability of picking a uniformly random action in ``step``.
        reward_fn: Callable taking an action index and returning its reward.
        bias: Initial value assigned to every action-value estimate in ``Q``.

    Attributes set here: ``Q`` (per-action value estimates), ``n_moves``
    (per-action pull counts) and ``total_reward`` (sum of rewards so far).
    """

    def __init__(self, n_actions, eps, reward_fn, bias=0.0):
        self.n_actions = n_actions
        # Force a float dtype: with an integer ``bias`` a plain
        # ``np.array([bias] * n_actions)`` would be an integer array and the
        # incremental updates in ``step`` would be truncated on write-back.
        self.Q = np.full(n_actions, bias, dtype=float)
        self.n_moves = np.zeros((n_actions,))
        self.eps = eps
        self.reward_fn = reward_fn
        self.total_reward = 0.0

    def step(self):
        """Choose one action, observe its reward, update ``Q``; return the reward."""
        if np.random.rand() < self.eps:
            # Explore: uniformly random action index in [0, n_actions).
            action = np.random.randint(0, self.n_actions)
        else:
            # Exploit: action with the highest current estimate.
            action = np.argmax(self.Q)

        reward = self.reward_fn(action)
        self.n_moves[action] += 1

        # Incremental sample average: Q <- Q + (1 / n) * (reward - Q).
        self.Q[action] += 1.0 / self.n_moves[action] * (reward - self.Q[action])
        self.total_reward += reward
        return reward

def _accumulate_running_mean(model, n_steps, totals):
    """Step ``model`` ``n_steps`` times, adding its running mean reward to ``totals``.

    After step ``n`` (0-based), ``totals[n]`` is incremented in place by
    ``model.total_reward / (n + 1)``.
    """
    for n in range(n_steps):
        model.step()
        totals[n] += model.total_reward / (n + 1)


def main():
    """Run the 10-armed testbed and plot running mean reward per strategy.

    For each of ``N_DISTRIBUTIONS`` randomly drawn sets of action means, runs
    epsilon-greedy agents (several epsilons, and several initial biases at
    epsilon=0.01) plus the cheating reference model for ``n_steps`` steps.
    The running mean rewards are averaged over all distributions and plotted.
    """
    N_ACTIONS = 10
    N_DISTRIBUTIONS = 100
    reward_std = 1.0

    n_steps = 2000
    epsilon_values = [0.0, 0.01, 0.1, 0.2]
    avg_rewards = {epsilon: np.zeros((n_steps,)) for epsilon in epsilon_values}
    avg_rewards["cheating"] = np.zeros((n_steps,))

    bias_values = [0.0, 0.5, 1.0]
    avg_rewards_bias = {bias: np.zeros((n_steps,)) for bias in bias_values}

    for _ in trange(N_DISTRIBUTIONS):
        action_to_reward_mu = np.random.normal(0, 1, (N_ACTIONS,))

        def reward_fn(action, mu=action_to_reward_mu):
            # Take element 0 so the agents receive a scalar reward instead of
            # a shape-(1,) array; this avoids relying on NumPy's deprecated
            # implicit conversion of 1-element arrays when storing into Q.
            return normal_reward(action, mu, reward_std, 1)[0]

        for epsilon in epsilon_values:
            model = GreedyEpsilon(N_ACTIONS, epsilon, reward_fn)
            _accumulate_running_mean(model, n_steps, avg_rewards[epsilon])

        cheating_model = CheatingModel(N_ACTIONS, 0, reward_fn)
        _accumulate_running_mean(cheating_model, n_steps, avg_rewards["cheating"])

        for bias in bias_values:
            biased_model = GreedyEpsilon(N_ACTIONS, 0.01, reward_fn, bias=bias)
            _accumulate_running_mean(biased_model, n_steps, avg_rewards_bias[bias])

    # Normalise exactly once, after all distributions have been accumulated.
    # Dividing inside the loop above would shrink earlier contributions again
    # on every later iteration and the result would not be a mean.
    for results in (avg_rewards, avg_rewards_bias):
        for key in results:
            results[key] /= N_DISTRIBUTIONS

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    for epsilon in epsilon_values:
        plt.plot(avg_rewards[epsilon], label=f"epsilon={epsilon}")
    plt.plot(avg_rewards["cheating"], label="cheating model", linestyle="--")

    plt.legend()

    # NOTE(review): a log y-axis cannot display non-positive averages, which
    # may occur in early steps; confirm the log scale is still wanted.
    plt.yscale("log")
    plt.title("എപ്സിലോൺ ഫലങ്ങൾ")

    plt.subplot(1, 2, 2)
    for bias in bias_values:
        plt.plot(avg_rewards_bias[bias], label=f"bias={bias}")
    plt.plot(avg_rewards["cheating"], label="cheating model", linestyle="--")

    plt.legend()
    plt.yscale("log")
    plt.title("എപ്സിലോൺ=0.01 ഉള്ള ബയസ് ഫലങ്ങൾ")

    plt.tight_layout()
    plt.show()

# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()

    
  




[^1]: സട്ടൺ, ബാർട്ടോ എന്നിവരുടെ *Reinforcement Learning: An Introduction* (റീൻഫോഴ്സ്മെന്റ് ലേണിംഗ്) എന്ന പുസ്തകം.

✦ ഈ ലേഖനത്തിന്റെ ആശയരൂപീകരണം, ഗവേഷണം, എഴുത്ത്, അല്ലെങ്കിൽ എഡിറ്റിംഗ് എന്നിവയിൽ LLM-കൾ ഉപയോഗിച്ചിട്ടില്ല.