import numpy as np
import virl  # epidemic environment used below via virl.Epidemic

# NNFunctionApproximatorJointKeras and q_learning_nn are assumed to be defined
# elsewhere in this project, as in the original snippet.


def optimize_params(problem_id=0, episodes=350):
    """Grid-search over replay-buffer size, batch size and learning rate."""
    print("Optimizing for problem:", problem_id)
    max_reward = -np.inf
    max_reward_params = None
    max_avg_reward = -np.inf
    avg_reward_params = None
    # These candidate values were chosen based on manual fine-tuning
    BUFFER_SIZES = [100, 1000, 10000]
    BATCH_SIZES = [50, 100, 200]
    alphas = [0.00001, 0.0001, 0.001, 0.01]
    for BUFFER_S in BUFFER_SIZES:
        for BATCH_S in BATCH_SIZES:
            for alphaa in alphas:
                env = virl.Epidemic(problem_id=problem_id)
                d_states = env.observation_space.shape[0]
                n_actions = env.action_space.n
                alpha = alphaa          # learning rate/step size; 0.001 seems to be a good choice
                nn_config = [24, 24]    # sizes of the hidden layers in the MLP; [24, 24] seems to be a good choice
                BATCH_SIZE = BATCH_S    # number of samples in a batch
                BUFFER_SIZE = BUFFER_S  # size of the replay buffer
                # Init the two networks (online and target)
                nn_func_approximator = NNFunctionApproximatorJointKeras(alpha, d_states, n_actions, nn_config)
                nn_func_approximator_target = NNFunctionApproximatorJointKeras(alpha, d_states, n_actions, nn_config)
                # Train the agent and return the stats
                stats = q_learning_nn(env, nn_func_approximator, nn_func_approximator_target, episodes,
                                      max_steps_per_episode=10,
                                      epsilon_init=0.1, epsilon_decay=0.995, epsilon_min=0.001,
                                      fn_model_in=None, fn_model_out="virl_temp.h5")
                reward = max(stats.episode_rewards)
                avg_reward = np.mean(stats.episode_rewards)
                if reward > max_reward:
                    max_reward = reward
                    max_reward_params = {"BUFFER_SIZE": BUFFER_S, "BATCH_SIZE": BATCH_S, "alpha": alphaa}
                if avg_reward > max_avg_reward:
                    max_avg_reward = avg_reward
                    avg_reward_params = {"BUFFER_SIZE": BUFFER_S, "BATCH_SIZE": BATCH_S, "alpha": alphaa}
                print(f"\rmax reward (BUFFER_SIZE: {BUFFER_S}, BATCH_SIZE: {BATCH_S}, alpha: {alphaa}):",
                      reward, "avg:", avg_reward)
    print("\nFinished!")
    print("avg_reward:", max_avg_reward, "params:", avg_reward_params)