Coach: the most modular reinforcement learning framework

Posted by CreateAMind




Adding an agent

http://coach.nervanasys.com/contributing/add_agent/index.html


class Agent(object):
class PolicyOptimizationAgent(Agent):
class ActorCriticAgent(PolicyOptimizationAgent):
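
These three classes show the inheritance chain a new agent hooks into: following the add_agent guide, you subclass the closest existing agent and override its learning and action-selection behaviour. The sketch below is illustrative only; the method names learn_from_batch and choose_action follow the guide, but their exact signatures are assumptions and may differ in the Coach source, and the file is assumed to live alongside the existing agent classes so ActorCriticAgent is importable.

# Illustrative sketch only -- method names follow the add_agent guide,
# exact signatures are assumptions and may differ in the Coach source.
class MyActorCriticVariantAgent(ActorCriticAgent):
    def __init__(self, *args, **kwargs):
        super(MyActorCriticVariantAgent, self).__init__(*args, **kwargs)

    def learn_from_batch(self, batch):
        # Put a custom update rule here; by default defer to the parent agent.
        return super(MyActorCriticVariantAgent, self).learn_from_batch(batch)

    def choose_action(self, curr_state, phase):
        # Put custom action selection here; by default defer to the parent agent.
        return super(MyActorCriticVariantAgent, self).choose_action(curr_state, phase)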


presets.py


class Carla_A3C(Preset):
   def __init__(self):
       Preset.__init__(self, ActorCritic, Carla, EntropyExploration)
       self.agent.embedder_complexity = EmbedderComplexity.Deep
       self.agent.policy_gradient_rescaler = 'GAE'
       self.learning_rate = 0.0001
       self.num_heatup_steps = 0
       # self.env.reward_scaling = 1.0e9
       self.agent.discount = 0.99
       self.agent.apply_gradients_every_x_episodes = 1
       self.agent.num_steps_between_gradient_updates = 30
       self.agent.gae_lambda = 1
       self.agent.beta_entropy = 0.01
       self.clip_gradients = 40
       self.agent.middleware_type = MiddlewareTypes.FC


class Carla_DDPG(Preset):
   def __init__(self):
       Preset.__init__(self, DDPG, Carla, OUExploration)
       self.agent.embedder_complexity = EmbedderComplexity.Deep
       self.learning_rate = 0.0001
       self.num_heatup_steps = 1000
       self.agent.num_consecutive_training_steps = 5


class Carla_BC(Preset):
   def __init__(self):
       Preset.__init__(self, BC, Carla, ExplorationParameters)
       self.agent.embedder_complexity = EmbedderComplexity.Deep
       self.agent.load_memory_from_file_path = 'datasets/carla_town1.p'
       self.learning_rate = 0.0005
       self.num_heatup_steps = 0
       self.evaluation_episodes = 5
       self.batch_size = 120
       self.evaluate_every_x_training_iterations = 5000
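
All three Carla presets follow the same pattern: a Preset subclass binds an agent parameter class, an environment parameter class and an exploration parameter class, then overrides individual defaults (including environment fields, as the commented-out self.env.reward_scaling line hints). Purely as an illustration of that pattern, a hypothetical preset built from classes listed later in this post might look like this (Doom_MyPG does not exist in Coach):

class Doom_MyPG(Preset):
    def __init__(self):
        # Bind an agent block, an environment block and an exploration block.
        Preset.__init__(self, PolicyGradient, Doom, ExplorationParameters)
        # Override agent-level defaults (from AgentParameters / PolicyGradient).
        self.agent.discount = 0.99
        self.agent.apply_gradients_every_x_episodes = 1
        # Override environment-level defaults (from EnvironmentParameters / Doom).
        self.env.frame_skip = 2
        # Override top-level defaults (from GeneralParameters).
        self.learning_rate = 0.0001
        self.num_heatup_steps = 0

Once defined in presets.py, such a preset becomes selectable by name, exactly like the Carla presets above (see the preset listing at the end of this post).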


Configuration parameters:

class Preset(GeneralParameters):
   def __init__(self, agent, env, exploration, visualization=VisualizationParameters):
       """
       :type agent: AgentParameters
       :type env: EnvironmentParameters
       :type exploration: ExplorationParameters
       :type visualization: VisualizationParameters
       """
       self.visualization = visualization
       self.agent = agent
       self.env = env
       self.exploration = exploration

class ActorCritic(AgentParameters):
   type = 'ActorCriticAgent'
   input_types = {'observation': InputTypes.Observation}
   output_types = [OutputTypes.V, OutputTypes.Pi]
   loss_weights = [0.5, 1.0]
   stop_gradients_from_head = [False, False]
   num_episodes_in_experience_replay = 2
   policy_gradient_rescaler = 'A_VALUE'
   hidden_layers_activation_function = 'elu'
   apply_gradients_every_x_episodes = 5
   beta_entropy = 0
   num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
   gae_lambda = 0.96
   shared_optimizer = True
   estimate_value_using_gae = False
   async_training = True
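
gae_lambda and the optional 'GAE' policy_gradient_rescaler (used by the Carla_A3C preset) control generalized advantage estimation. As a reminder of what these parameters compute, here is a generic GAE sketch using the defaults above (discount from AgentParameters, gae_lambda from ActorCritic); this is not Coach's own implementation:

import numpy as np

def generalized_advantage_estimation(rewards, values, discount=0.99, gae_lambda=0.96):
    # Generic sketch: delta_t = r_t + discount * V(s_{t+1}) - V(s_t),
    # A_t = sum_l (discount * gae_lambda)**l * delta_{t+l}.
    # `values` holds one more entry than `rewards` (bootstrap value of the last state).
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    deltas = rewards + discount * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages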


class PolicyGradient(AgentParameters):
   type = 'PolicyGradientsAgent'
   input_types = {'observation': InputTypes.Observation}
   output_types = [OutputTypes.Pi]
   loss_weights = [1.0]
   num_episodes_in_experience_replay = 2
   policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'
   apply_gradients_every_x_episodes = 5
   beta_entropy = 0
   num_steps_between_gradient_updates = 20000  # this is called t_max in all the papers
   async_training = True
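
The PolicyGradient defaults rescale the gradient by the future return from each timestep ('FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'; the exact normalization Coach applies is not shown in this listing). The underlying future discounted return is the standard quantity, computed backwards over an episode:

import numpy as np

def future_discounted_returns(rewards, discount=0.99):
    # Generic sketch: G_t = r_t + discount * G_{t+1}, i.e. the discounted
    # return from timestep t onwards (before any normalization).
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns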


class DDPG(AgentParameters):
   type = 'DDPGAgent'
   input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
   output_types = [OutputTypes.V]  # V is used because we only want a single Q value
   loss_weights = [1.0]
   hidden_layers_activation_function = 'relu'
   num_episodes_in_experience_replay = 10000
   num_steps_between_copying_online_weights_to_target = 1
   rate_for_copying_weights_to_target = 0.001
   shared_optimizer = True
   async_training = True
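
In the DDPG defaults, copying the online weights to the target network every step (num_steps_between_copying_online_weights_to_target = 1) with rate_for_copying_weights_to_target = 0.001 is the usual soft ("Polyak") target update. Schematically (generic sketch, not Coach's code):

def soft_update(online_weights, target_weights, tau=0.001):
    # Generic sketch: target <- tau * online + (1 - tau) * target, applied per tensor.
    # tau plays the role of rate_for_copying_weights_to_target.
    return [tau * w_o + (1.0 - tau) * w_t
            for w_o, w_t in zip(online_weights, target_weights)]

# With rate_for_copying_weights_to_target = 1.0 (the AgentParameters default below),
# this reduces to a hard copy of the online network into the target network.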


class AgentParameters(Parameters):
   agent = ''

   # Architecture parameters
   input_types = {'observation': InputTypes.Observation}
   output_types = [OutputTypes.Q]
   middleware_type = MiddlewareTypes.FC
   loss_weights = [1.0]
   stop_gradients_from_head = [False]
   embedder_complexity = EmbedderComplexity.Shallow
   num_output_head_copies = 1
   use_measurements = False
   use_accumulated_reward_as_measurement = False
   add_a_normalized_timestep_to_the_observation = False
   l2_regularization = 0
   hidden_layers_activation_function = 'relu'
   optimizer_type = 'Adam'
   async_training = False
   use_separate_networks_per_head = False

   # Agent parameters
   num_consecutive_playing_steps = 1
   num_consecutive_training_steps = 1
   update_evaluation_agent_network_after_every_num_steps = 3000
   bootstrap_total_return_from_old_policy = False
   n_step = -1
   num_episodes_in_experience_replay = 200
   num_transitions_in_experience_replay = None
   discount = 0.99
   policy_gradient_rescaler = 'A_VALUE'
   apply_gradients_every_x_episodes = 5
   beta_entropy = 0
   num_steps_between_gradient_updates = 20000  # t_max
   num_steps_between_copying_online_weights_to_target = 1000
   rate_for_copying_weights_to_target = 1.0
   monte_carlo_mixing_rate = 0.1
   gae_lambda = 0.96
   step_until_collecting_full_episodes = False
   targets_horizon = 'N-Step'
   replace_mse_with_huber_loss = False
   load_memory_from_file_path = None
   collect_new_data = True
   input_rescaler = 255.0

   # PPO related params
   target_kl_divergence = 0.01
   initial_kl_coefficient = 1.0
   high_kl_penalty_coefficient = 1000
   value_targets_mix_fraction = 0.1
   clip_likelihood_ratio_using_epsilon = None
   use_kl_regularization = True
   estimate_value_using_gae = False

   # DFP related params
   num_predicted_steps_ahead = 6
   goal_vector = [1.0, 1.0]
   future_measurements_weights = [0.5, 0.5, 1.0]

   # NEC related params
   dnd_size = 500000
   l2_norm_added_delta = 0.001
   new_value_shift_coefficient = 0.1
   number_of_knn = 50
   DND_key_error_threshold = 0.01

   # Framework support
   neon_support = False
   tensorflow_support = True

   # distributed agents params
   shared_optimizer = True
   share_statistics_between_workers = True


class EnvironmentParameters(Parameters):
   type = 'Doom'
   level = 'basic'
   observation_stack_size = 4
   frame_skip = 4
   desired_observation_width = 76
   desired_observation_height = 60
   normalize_observation = False
   crop_observation = False
   random_initialization_steps = 0
   reward_scaling = 1.0
   reward_clipping_min = None
   reward_clipping_max = None
   human_control = False


class ExplorationParameters(Parameters):
   # Exploration policies
   policy = 'EGreedy'
   evaluation_policy = 'Greedy'
   # -- bootstrap dqn parameters
   bootstrapped_data_sharing_probability = 0.5
   architecture_num_q_heads = 1
   # -- dropout approximation of thompson sampling parameters
   dropout_discard_probability = 0
   initial_keep_probability = 0.0  # unused
   final_keep_probability = 0.99  # unused
   keep_probability_decay_steps = 50000  # unused
   # -- epsilon greedy parameters
   initial_epsilon = 0.5
   final_epsilon = 0.01
   epsilon_decay_steps = 50000
   evaluation_epsilon = 0.05
   # -- epsilon greedy at end of episode parameters
   average_episode_length_over_num_episodes = 20
   # -- boltzmann softmax parameters
   initial_temperature = 100.0
   final_temperature = 1.0
   temperature_decay_steps = 50000
   # -- additive noise
   initial_noise_variance_percentage = 0.1
   final_noise_variance_percentage = 0.1
   noise_variance_decay_steps = 1
   # -- Ornstein-Uhlenbeck process
   mu = 0
   theta = 0.15
   sigma = 0.3
   dt = 0.01
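
The mu, theta, sigma and dt entries at the end parameterize the Ornstein-Uhlenbeck process used for continuous-action exploration (OUExploration in the Carla_DDPG preset). A generic discretized OU process with these values looks like this (sketch, not Coach's implementation):

import numpy as np

class OrnsteinUhlenbeckNoise(object):
    # Generic sketch: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1),
    # added to the deterministic policy's action during exploration.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3, dt=0.01):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.state = np.ones(action_dim) * mu

    def sample(self):
        dx = (self.theta * (self.mu - self.state) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape))
        self.state = self.state + dx
        return self.state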


class GeneralParameters(Parameters):
   train = True
   framework = Frameworks.TensorFlow
   threads = 1
   sess = None

   # distributed training options
   num_threads = 1
   synchronize_over_num_threads = 1
   distributed = False

   # Agent blocks
   memory = 'EpisodicExperienceReplay'
   architecture = 'GeneralTensorFlowNetwork'

   # General parameters
   clip_gradients = None
   kl_divergence_constraint = 100000
   num_training_iterations = 10000000000
   num_heatup_steps = 1000
   heatup_using_network_decisions = False
   batch_size = 32
   save_model_sec = None
   save_model_dir = None
   checkpoint_restore_dir = None
   learning_rate = 0.00025
   learning_rate_decay_rate = 0
   learning_rate_decay_steps = 0
   evaluation_episodes = 5
   evaluate_every_x_episodes = 1000000
   evaluate_every_x_training_iterations = 0
   rescaling_interpolation_type = 'bilinear'
   current_episode = 0

   # setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
   # the form of different workers starting at different times, and getting different assignments of CPU
   # time from the OS.
   seed = None

   checkpoints_path = ''

   # Testing parameters
   test = False
   test_min_return_threshold = 0
   test_max_step_threshold = 1
   test_num_workers = 1
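
clip_gradients is None by default and set to 40 in the Carla_A3C preset, which matches the global-norm clipping commonly used with A3C. Assuming clipping by global norm (the clipping mode itself is not visible in this listing), the operation is:

import numpy as np

def clip_by_global_norm(gradients, clip_gradients=40.0):
    # Generic sketch of global-norm clipping (the assumed meaning of clip_gradients):
    # rescale all gradients jointly if their combined L2 norm exceeds the threshold.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in gradients))
    if global_norm > clip_gradients:
        gradients = [g * (clip_gradients / global_norm) for g in gradients]
    return gradients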



class Atari(EnvironmentParameters):
   type = 'Gym'
   frame_skip = 4
   observation_stack_size = 4
   desired_observation_height = 84
   desired_observation_width = 84
   reward_clipping_max = 1.0
   reward_clipping_min = -1.0
   random_initialization_steps = 30
   crop_observation = False  # in the original paper the observation is cropped but not in the Nature paper
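
These Atari defaults encode the standard DQN preprocessing: a frame skip of 4, a stack of 4 observations resized to 84x84, and rewards clipped to [-1, 1]. A generic sketch of the reward clipping and observation stacking (not Coach's pipeline; frames are assumed to arrive already resized):

import numpy as np
from collections import deque

def clip_reward(reward, reward_clipping_min=-1.0, reward_clipping_max=1.0):
    # Reward clipping as in the DQN Atari setup.
    return float(np.clip(reward, reward_clipping_min, reward_clipping_max))

class ObservationStack(object):
    # Keeps the last `observation_stack_size` frames as the agent's state.
    def __init__(self, observation_stack_size=4):
        self.frames = deque(maxlen=observation_stack_size)

    def push(self, frame):
        if not self.frames:
            # At episode start, pad the stack with copies of the first frame.
            self.frames.extend([frame] * self.frames.maxlen)
        else:
            self.frames.append(frame)
        return np.stack(self.frames, axis=-1)  # shape: height x width x stack size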


class Doom(EnvironmentParameters):
   type = 'Doom'
   frame_skip = 4
   observation_stack_size = 3
   desired_observation_height = 60
   desired_observation_width = 76


class Carla(EnvironmentParameters):
   type = 'Carla'
   frame_skip = 1
   observation_stack_size = 4
   desired_observation_height = 128
   desired_observation_width = 180
   normalize_observation = False
   server_height = 256
   server_width = 360
   config = 'environments/CarlaSettings.ini'
   level = 'town1'
   verbose = True
   stereo = False
   semantic_segmentation = False
   depth = False
   episode_max_time = 100000  # milliseconds for each episode
   continuous_to_bool_threshold = 0.5
   allow_braking = False





(coach_env) ubuntu@ubuntu-Default-string:~/github/coach$ python3 coach.py -l

Warning: failed to import the following packages - RoboSchool, CARLA, Neon, ViZDoom, GymExtensions, PyBullet

Available Presets:

Alien_DQN

Alien_NEC

AntBullet_A3C

AntMaze_A3C

Ant_A3C

Ant_ClippedPPO

Ant_DDPG

Atari_DQN_TestBench

BipedalWalker_A3C

Breakout_A3C

Breakout_C51

Breakout_DDQN

Breakout_DQN

Breakout_Dueling_DDQN

Breakout_NEC

Breakout_QRDQN

Carla_A3C

Carla_BC

Carla_DDPG

CartPole_A2C

CartPole_A3C

CartPole_Bootstrapped_DQN

CartPole_C51

CartPole_ClippedPPO

CartPole_DQN

CartPole_Dueling_DDQN

CartPole_MMC

CartPole_NEC

CartPole_NStepQ

CartPole_OneStepQ

CartPole_PAL

CartPole_PG

CartPole_PPO

CartPole_QRDQN

Doom_Basic_A2C

Doom_Basic_A3C

Doom_Basic_BC

Doom_Basic_DFP

Doom_Basic_DQN

Doom_Basic_Dueling_DDQN

Doom_Basic_Dueling_DQN

Doom_Basic_NEC

Doom_Basic_NStepQ

Doom_Basic_OneStepQ

Doom_Basic_PG

Doom_Basic_QRDQN

Doom_Deadly_Corridor_Bootstrapped_DQN

Doom_Deathmatch_BC

Doom_Defend_BC

Doom_Health_DFP

Doom_Health_DQN

Doom_Health_MMC

Doom_Health_NEC

HalfCheetah_ClippedPPO_Roboschool

HalfCheetah_DDPG

HopperBullet_A3C

HopperIceWall_A3C

HopperStairs_A3C

Hopper_A3C

Hopper_ClippedPPO

Hopper_ClippedPPO_Distributed

Hopper_ClippedPPO_Roboschool

Hopper_DDDPG

Hopper_DDPG

Hopper_DDPG_Roboschool

Hopper_DPPO

Hopper_NAF

Hopper_PPO

Hopper_PPO_Roboschool

Humanoid_A3C

Humanoid_ClippedPPO

InvertedPendulum_A3C

InvertedPendulum_ClippedPPO

InvertedPendulum_ClippedPPO_Roboschool

InvertedPendulum_DDPG

InvertedPendulum_NAF

InvertedPendulum_PG

InvertedPendulum_PPO

Kuka_ClippedPPO

Minitaur_ClippedPPO

MontezumaRevenge_BC

Montezuma_NEC

MountainCar_A3C

Pendulum_A3C

Pendulum_ClippedPPO

Pendulum_DDPG

Pendulum_NAF

Pendulum_PG

Pong_A3C

Pong_DQN

Pong_NEC

Pong_NEC_LSTM

Walker_A3C

Walker_PPO
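
Any of these presets can be run by passing its name to coach.py, e.g. python3 coach.py -p Carla_A3C (this assumes the standard -p/--preset flag of coach.py; whether a given preset actually runs still depends on the optional packages mentioned in the warning above).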

