Coach: the most modular reinforcement learning framework
Posted by CreateAMind
Adding an agent
http://coach.nervanasys.com/contributing/add_agent/index.html

The linked page describes how a new agent is added on top of the existing agent class hierarchy, the relevant part of which is:
class Agent(object):
class PolicyOptimizationAgent(Agent):
class ActorCriticAgent(PolicyOptimizationAgent):
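A minimal, hypothetical sketch of that pattern (the import path and the learn_from_batch method name follow the old Coach layout and are assumptions, not something shown in this post; check the linked page for the exact API of your Coach version):

# hypothetical sketch - import path and method name are assumptions
from agents.actor_critic_agent import ActorCriticAgent

class MyActorCriticAgent(ActorCriticAgent):
    def learn_from_batch(self, batch):
        # place to change how targets/advantages are computed for a batch of
        # transitions; this placeholder simply defers to the parent update
        return ActorCriticAgent.learn_from_batch(self, batch)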
presets.py ties an agent, an environment and an exploration policy together; the three CARLA presets look like this:
class Carla_A3C(Preset):
    def __init__(self):
        Preset.__init__(self, ActorCritic, Carla, EntropyExploration)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.agent.policy_gradient_rescaler = 'GAE'
        self.learning_rate = 0.0001
        self.num_heatup_steps = 0
        # self.env.reward_scaling = 1.0e9
        self.agent.discount = 0.99
        self.agent.apply_gradients_every_x_episodes = 1
        self.agent.num_steps_between_gradient_updates = 30
        self.agent.gae_lambda = 1
        self.agent.beta_entropy = 0.01
        self.clip_gradients = 40
        self.agent.middleware_type = MiddlewareTypes.FC


class Carla_DDPG(Preset):
    def __init__(self):
        Preset.__init__(self, DDPG, Carla, OUExploration)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.learning_rate = 0.0001
        self.num_heatup_steps = 1000
        self.agent.num_consecutive_training_steps = 5


class Carla_BC(Preset):
    def __init__(self):
        Preset.__init__(self, BC, Carla, ExplorationParameters)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.agent.load_memory_from_file_path = 'datasets/carla_town1.p'
        self.learning_rate = 0.0005
        self.num_heatup_steps = 0
        self.evaluation_episodes = 5
        self.batch_size = 120
        self.evaluate_every_x_training_iterations = 5000
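Defining a new experiment follows the same pattern: subclass Preset, hand it an agent parameters class, an environment parameters class and an exploration parameters class (all defined in the configuration section below), then override individual fields. A hypothetical example reusing only classes shown in this post (the class name and the chosen values are illustrative):

class Doom_Basic_MyPG(Preset):
    def __init__(self):
        # PolicyGradient, Doom and ExplorationParameters are the parameter
        # classes listed in the configuration section below
        Preset.__init__(self, PolicyGradient, Doom, ExplorationParameters)
        self.learning_rate = 0.0001
        self.num_heatup_steps = 0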
The configuration parameter classes referenced by the presets:
class Preset(GeneralParameters):
    def __init__(self, agent, env, exploration, visualization=VisualizationParameters):
        """
        :type agent: AgentParameters
        :type env: EnvironmentParameters
        :type exploration: ExplorationParameters
        :type visualization: VisualizationParameters
        """
        self.visualization = visualization
        self.agent = agent
        self.env = env
        self.exploration = exploration
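Note that the presets pass the parameter classes themselves and this constructor just stores the references, so an override such as self.agent.beta_entropy = 0.01 in Carla_A3C mutates a class attribute of ActorCritic. A small illustration, assuming the constructor behaves exactly as shown above:

# illustration only, assuming Preset stores the classes as shown above
preset = Carla_A3C()
assert preset.agent is ActorCritic         # the class itself, not an instance
assert ActorCritic.beta_entropy == 0.01    # the preset's override lands on the class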
class ActorCritic(AgentParameters):
    type = 'ActorCriticAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V, OutputTypes.Pi]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'A_VALUE'
    hidden_layers_activation_function = 'elu'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
    gae_lambda = 0.96
    shared_optimizer = True
    estimate_value_using_gae = False
    async_training = True


class PolicyGradient(AgentParameters):
    type = 'PolicyGradientsAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Pi]
    loss_weights = [1.0]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # this is called t_max in all the papers
    async_training = True


class DDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    shared_optimizer = True
    async_training = True
class AgentParameters(Parameters):
    agent = ''

    # Architecture parameters
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    middleware_type = MiddlewareTypes.FC
    loss_weights = [1.0]
    stop_gradients_from_head = [False]
    embedder_complexity = EmbedderComplexity.Shallow
    num_output_head_copies = 1
    use_measurements = False
    use_accumulated_reward_as_measurement = False
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 0
    hidden_layers_activation_function = 'relu'
    optimizer_type = 'Adam'
    async_training = False
    use_separate_networks_per_head = False

    # Agent parameters
    num_consecutive_playing_steps = 1
    num_consecutive_training_steps = 1
    update_evaluation_agent_network_after_every_num_steps = 3000
    bootstrap_total_return_from_old_policy = False
    n_step = -1
    num_episodes_in_experience_replay = 200
    num_transitions_in_experience_replay = None
    discount = 0.99
    policy_gradient_rescaler = 'A_VALUE'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # t_max
    num_steps_between_copying_online_weights_to_target = 1000
    rate_for_copying_weights_to_target = 1.0
    monte_carlo_mixing_rate = 0.1
    gae_lambda = 0.96
    step_until_collecting_full_episodes = False
    targets_horizon = 'N-Step'
    replace_mse_with_huber_loss = False
    load_memory_from_file_path = None
    collect_new_data = True
    input_rescaler = 255.0

    # PPO related params
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = None
    use_kl_regularization = True
    estimate_value_using_gae = False

    # DFP related params
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]

    # NEC related params
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1
    number_of_knn = 50
    DND_key_error_threshold = 0.01

    # Framework support
    neon_support = False
    tensorflow_support = True

    # distributed agents params
    shared_optimizer = True
    share_statistics_between_workers = True
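The agent-specific classes above (ActorCritic, PolicyGradient, DDPG) override only a subset of these defaults; a new agent type gets its configuration the same way. A hypothetical template (the type string must name an agent class that actually exists, so treat this as a pattern rather than a working configuration):

class MyNStepQ(AgentParameters):
    type = 'MyNStepQAgent'  # hypothetical agent class name
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    targets_horizon = 'N-Step'
    num_steps_between_copying_online_weights_to_target = 1000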
class EnvironmentParameters(Parameters):
    type = 'Doom'
    level = 'basic'
    observation_stack_size = 4
    frame_skip = 4
    desired_observation_width = 76
    desired_observation_height = 60
    normalize_observation = False
    crop_observation = False
    random_initialization_steps = 0
    reward_scaling = 1.0
    reward_clipping_min = None
    reward_clipping_max = None
    human_control = False
class ExplorationParameters(Parameters):
    # Exploration policies
    policy = 'EGreedy'
    evaluation_policy = 'Greedy'

    # -- bootstrap dqn parameters
    bootstrapped_data_sharing_probability = 0.5
    architecture_num_q_heads = 1

    # -- dropout approximation of thompson sampling parameters
    dropout_discard_probability = 0
    initial_keep_probability = 0.0  # unused
    final_keep_probability = 0.99  # unused
    keep_probability_decay_steps = 50000  # unused

    # -- epsilon greedy parameters
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05

    # -- epsilon greedy at end of episode parameters
    average_episode_length_over_num_episodes = 20

    # -- boltzmann softmax parameters
    initial_temperature = 100.0
    final_temperature = 1.0
    temperature_decay_steps = 50000

    # -- additive noise
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 1

    # -- Ornstein-Uhlenbeck process
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01
class GeneralParameters(Parameters):
    train = True
    framework = Frameworks.TensorFlow
    threads = 1
    sess = None

    # distributed training options
    num_threads = 1
    synchronize_over_num_threads = 1
    distributed = False

    # Agent blocks
    memory = 'EpisodicExperienceReplay'
    architecture = 'GeneralTensorFlowNetwork'

    # General parameters
    clip_gradients = None
    kl_divergence_constraint = 100000
    num_training_iterations = 10000000000
    num_heatup_steps = 1000
    heatup_using_network_decisions = False
    batch_size = 32
    save_model_sec = None
    save_model_dir = None
    checkpoint_restore_dir = None
    learning_rate = 0.00025
    learning_rate_decay_rate = 0
    learning_rate_decay_steps = 0
    evaluation_episodes = 5
    evaluate_every_x_episodes = 1000000
    evaluate_every_x_training_iterations = 0
    rescaling_interpolation_type = 'bilinear'
    current_episode = 0
    # setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable
    # noise in the form of different workers starting at different times, and getting different
    # assignments of CPU time from the OS.
    seed = None
    checkpoints_path = ''

    # Testing parameters
    test = False
    test_min_return_threshold = 0
    test_max_step_threshold = 1
    test_num_workers = 1
class Atari(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 4
    observation_stack_size = 4
    desired_observation_height = 84
    desired_observation_width = 84
    reward_clipping_max = 1.0
    reward_clipping_min = -1.0
    random_initialization_steps = 30
    crop_observation = False  # in the original paper the observation is cropped but not in the Nature paper


class Doom(EnvironmentParameters):
    type = 'Doom'
    frame_skip = 4
    observation_stack_size = 3
    desired_observation_height = 60
    desired_observation_width = 76
class Carla(EnvironmentParameters):
    type = 'Carla'
    frame_skip = 1
    observation_stack_size = 4
    desired_observation_height = 128
    desired_observation_width = 180
    normalize_observation = False
    server_height = 256
    server_width = 360
    config = 'environments/CarlaSettings.ini'
    level = 'town1'
    verbose = True
    stereo = False
    semantic_segmentation = False
    depth = False
    episode_max_time = 100000  # milliseconds for each episode
    continuous_to_bool_threshold = 0.5
    allow_braking = False
Listing the available presets with coach.py's -l flag:

(coach_env) ubuntu@ubuntu-Default-string:~/github/coach$ python3 coach.py -l
Warning: failed to import the following packages - RoboSchool, CARLA, Neon, ViZDoom, GymExtensions, PyBullet
Available Presets:
Alien_DQN
Alien_NEC
AntBullet_A3C
AntMaze_A3C
Ant_A3C
Ant_ClippedPPO
Ant_DDPG
Atari_DQN_TestBench
BipedalWalker_A3C
Breakout_A3C
Breakout_C51
Breakout_DDQN
Breakout_DQN
Breakout_Dueling_DDQN
Breakout_NEC
Breakout_QRDQN
Carla_A3C
Carla_BC
Carla_DDPG
CartPole_A2C
CartPole_A3C
CartPole_Bootstrapped_DQN
CartPole_C51
CartPole_ClippedPPO
CartPole_DQN
CartPole_Dueling_DDQN
CartPole_MMC
CartPole_NEC
CartPole_NStepQ
CartPole_OneStepQ
CartPole_PAL
CartPole_PG
CartPole_PPO
CartPole_QRDQN
Doom_Basic_A2C
Doom_Basic_A3C
Doom_Basic_BC
Doom_Basic_DFP
Doom_Basic_DQN
Doom_Basic_Dueling_DDQN
Doom_Basic_Dueling_DQN
Doom_Basic_NEC
Doom_Basic_NStepQ
Doom_Basic_OneStepQ
Doom_Basic_PG
Doom_Basic_QRDQN
Doom_Deadly_Corridor_Bootstrapped_DQN
Doom_Deathmatch_BC
Doom_Defend_BC
Doom_Health_DFP
Doom_Health_DQN
Doom_Health_MMC
Doom_Health_NEC
HalfCheetah_ClippedPPO_Roboschool
HalfCheetah_DDPG
HopperBullet_A3C
HopperIceWall_A3C
HopperStairs_A3C
Hopper_A3C
Hopper_ClippedPPO
Hopper_ClippedPPO_Distributed
Hopper_ClippedPPO_Roboschool
Hopper_DDDPG
Hopper_DDPG
Hopper_DDPG_Roboschool
Hopper_DPPO
Hopper_NAF
Hopper_PPO
Hopper_PPO_Roboschool
Humanoid_A3C
Humanoid_ClippedPPO
InvertedPendulum_A3C
InvertedPendulum_ClippedPPO
InvertedPendulum_ClippedPPO_Roboschool
InvertedPendulum_DDPG
InvertedPendulum_NAF
InvertedPendulum_PG
InvertedPendulum_PPO
Kuka_ClippedPPO
Minitaur_ClippedPPO
MontezumaRevenge_BC
Montezuma_NEC
MountainCar_A3C
Pendulum_A3C
Pendulum_ClippedPPO
Pendulum_DDPG
Pendulum_NAF
Pendulum_PG
Pong_A3C
Pong_DQN
Pong_NEC
Pong_NEC_LSTM
Walker_A3C
Walker_PPO
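Any preset in this list can then be launched by name; the preset is normally selected with coach.py's -p flag (for example python3 coach.py -p Carla_DDPG), and python3 coach.py --help lists the remaining options for the installed version.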