Reinforcement Learning: DDPG
Posted by zle1992
Deep Deterministic Policy Gradient
Logic walkthrough:

1. DDPG is an Actor-Critic (AC) model; each stored transition consists of (S, A, R, S_).
2. Actor
   input: S
   output: a
   loss: maximize q (equivalently, minimize -q), where q comes from the Critic
3. Critic
   input: S, A
   output: q
   loss: the TD error, R + GAMMA * q_ - q

So how do we get q_? Feeding (S_, a_) into a Critic network yields q_, but we should not reuse the very network that is being trained, so we keep a time-lagged copy, Critic2 (a target network that is not trained directly).

Critic2 in turn needs a_. The Actor network can map S_ to a_, and by the same argument we use a lagged copy, Actor2 (also not trained directly), to produce a_ (see the soft-update sketch right after this walkthrough).
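To make the target-network trick concrete, here is a minimal NumPy sketch (not from the original post) of the soft "Polyak" replacement that keeps Actor2/Critic2 lagging behind the trained networks. TAU matches the soft-replacement coefficient in the full code below; the parameter arrays and the soft_update helper are purely illustrative:

import numpy as np

TAU = 0.01  # soft-replacement rate: target nets move 1% toward the eval nets per update

def soft_update(target_params, eval_params, tau=TAU):
    # theta_target <- (1 - tau) * theta_target + tau * theta_eval, per parameter array
    for t, e in zip(target_params, eval_params):
        t[...] = (1 - tau) * t + tau * e

# illustrative parameter arrays: trainable ("eval") nets and their lagged target copies
actor_eval    = [np.random.randn(3, 30), np.random.randn(30, 1)]
actor_target  = [p.copy() for p in actor_eval]    # "Actor2": updated only via soft_update
critic_eval   = [np.random.randn(4, 30), np.random.randn(30, 1)]
critic_target = [p.copy() for p in critic_eval]   # "Critic2"

# called once after every gradient step on the eval networks
soft_update(actor_target, actor_eval)
soft_update(critic_target, critic_eval)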
Code:
1 """ 2 Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. 3 DDPG is Actor Critic based algorithm. 4 Pendulum example. 5 View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 6 Using: 7 tensorflow 1.0 8 gym 0.8.0 9 """ 10 11 import tensorflow as tf 12 import numpy as np 13 import gym 14 import time 15 16 17 ##################### hyper parameters #################### 18 19 MAX_EPISODES = 200 20 MAX_EP_STEPS = 200 21 LR_A = 0.001 # learning rate for actor 22 LR_C = 0.002 # learning rate for critic 23 GAMMA = 0.9 # reward discount 24 TAU = 0.01 # soft replacement 25 MEMORY_CAPACITY = 10000 26 BATCH_SIZE = 32 27 28 RENDER = False 29 ENV_NAME = ‘Pendulum-v0‘ 30 31 ############################### DDPG #################################### 32 33 class DDPG(object): 34 def __init__(self, a_dim, s_dim, a_bound,): 35 self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) 36 self.pointer = 0 37 self.sess = tf.Session() 38 39 self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 40 self.S = tf.placeholder(tf.float32, [None, s_dim], ‘s‘) 41 self.S_ = tf.placeholder(tf.float32, [None, s_dim], ‘s_‘) 42 self.R = tf.placeholder(tf.float32, [None, 1], ‘r‘) 43 44 with tf.variable_scope(‘Actor‘): 45 self.a = self._build_a(self.S, scope=‘eval‘, trainable=True) 46 a_ = self._build_a(self.S_, scope=‘target‘, trainable=False) 47 with tf.variable_scope(‘Critic‘): 48 # assign self.a = a in memory when calculating q for td_error, 49 # otherwise the self.a is from Actor when updating Actor 50 q = self._build_c(self.S, self.a, scope=‘eval‘, trainable=True) 51 q_ = self._build_c(self.S_, a_, scope=‘target‘, trainable=False) 52 53 # networks parameters 54 self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Actor/eval‘) 55 self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Actor/target‘) 56 self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Critic/eval‘) 57 self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Critic/target‘) 58 59 # target net replacement 60 self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 61 for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 62 63 q_target = self.R + GAMMA * q_ 64 # in the feed_dic for the td_error, the self.a should change to actions in memory 65 td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 66 self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params) 67 68 a_loss = - tf.reduce_mean(q) # maximize the q 69 self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params) 70 71 self.sess.run(tf.global_variables_initializer()) 72 73 def choose_action(self, s): 74 return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 75 76 def learn(self): 77 # soft target replacement 78 self.sess.run(self.soft_replace) 79 80 indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) 81 bt = self.memory[indices, :] 82 bs = bt[:, :self.s_dim] 83 ba = bt[:, self.s_dim: self.s_dim + self.a_dim] 84 br = bt[:, -self.s_dim - 1: -self.s_dim] 85 bs_ = bt[:, -self.s_dim:] 86 87 self.sess.run(self.atrain, {self.S: bs}) 88 self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_}) 89 90 def store_transition(self, s, a, r, s_): 91 transition = np.hstack((s, a, [r], s_)) 92 index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory 93 self.memory[index, :] = transition 94 self.pointer += 1 95 96 def 
_build_a(self, s, scope, trainable): 97 with tf.variable_scope(scope): 98 net = tf.layers.dense(s, 30, activation=tf.nn.relu, name=‘l1‘, trainable=trainable) 99 a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name=‘a‘, trainable=trainable) 100 return tf.multiply(a, self.a_bound, name=‘scaled_a‘) 101 102 def _build_c(self, s, a, scope, trainable): 103 with tf.variable_scope(scope): 104 n_l1 = 30 105 w1_s = tf.get_variable(‘w1_s‘, [self.s_dim, n_l1], trainable=trainable) 106 w1_a = tf.get_variable(‘w1_a‘, [self.a_dim, n_l1], trainable=trainable) 107 b1 = tf.get_variable(‘b1‘, [1, n_l1], trainable=trainable) 108 net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 109 return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a) 110 111 ############################### training #################################### 112 113 env = gym.make(ENV_NAME) 114 env = env.unwrapped 115 env.seed(1) 116 117 s_dim = env.observation_space.shape[0] 118 a_dim = env.action_space.shape[0] 119 a_bound = env.action_space.high 120 121 ddpg = DDPG(a_dim, s_dim, a_bound) 122 123 var = 3 # control exploration 124 t1 = time.time() 125 for i in range(MAX_EPISODES): 126 s = env.reset() 127 ep_reward = 0 128 for j in range(MAX_EP_STEPS): 129 if RENDER: 130 env.render() 131 132 # Add exploration noise 133 a = ddpg.choose_action(s) 134 a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration 135 s_, r, done, info = env.step(a) 136 137 ddpg.store_transition(s, a, r / 10, s_) 138 139 if ddpg.pointer > MEMORY_CAPACITY: 140 var *= .9995 # decay the action randomness 141 ddpg.learn() 142 143 s = s_ 144 ep_reward += r 145 if j == MAX_EP_STEPS-1: 146 print(‘Episode:‘, i, ‘ Reward: %i‘ % int(ep_reward), ‘Explore: %.2f‘ % var, ) 147 if ep_reward > -300:RENDER = True 148 break 149 print(‘Running time: ‘, time.time() - t1)
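A quick sanity check (not in the original post) on the replay-buffer layout used above: store_transition packs each row as [s (s_dim values) | a (a_dim values) | r (1 value) | s_ (s_dim values)], and the slices in learn() recover the four pieces. A tiny sketch with Pendulum-v0-like dimensions (s_dim=3, a_dim=1); the concrete numbers are made up:

import numpy as np

s_dim, a_dim = 3, 1                        # Pendulum-v0: 3-dim observation, 1-dim action
s, a, r, s_ = np.arange(3.), [9.0], -1.5, np.arange(3.) + 10

row = np.hstack((s, a, [r], s_))           # same layout as store_transition: [s | a | r | s_]
bt = row[np.newaxis, :]                    # pretend this is a sampled batch of one row

bs  = bt[:, :s_dim]                        # -> s   (columns 0..2)
ba  = bt[:, s_dim: s_dim + a_dim]          # -> a   (column 3)
br  = bt[:, -s_dim - 1: -s_dim]            # -> r   (column 4), kept 2-D to match the [None, 1] R placeholder
bs_ = bt[:, -s_dim:]                       # -> s_  (columns 5..7)
print(bs, ba, br, bs_)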