Reinforcement Learning: DDPG
Posted by zle1992
Deep Deterministic Policy Gradient
Logic walkthrough:

1. DDPG is an Actor-Critic (AC) model; each stored transition consists of (S, A, R, S_).
2. Actor
   input: S
   output: a
   loss: maximize q (equivalently, minimize -q), where q comes from the Critic
3. Critic
   input: S, A
   output: q
   loss: the TD error, R + GAMMA * q_ - q

So how do we get q_? Feeding (S_, a_) into a Critic network yields q_, but we should not reuse the very network that is being trained, so we keep a time-lagged copy, Critic2 (a target network that is not trained directly).

Critic2 in turn needs a_. The Actor network can map S_ to a_, and by the same argument we use a lagged copy, Actor2 (also not trained directly), to produce a_ (see the soft-update sketch right after this walkthrough).
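To make the target-network trick concrete, here is a minimal NumPy sketch (not from the original post) of the soft "Polyak" replacement that keeps Actor2/Critic2 lagging behind the trained networks. TAU matches the soft-replacement coefficient in the full code below; the parameter arrays and the soft_update helper are purely illustrative:

import numpy as np

TAU = 0.01  # soft-replacement rate: target nets move 1% toward the eval nets per update

def soft_update(target_params, eval_params, tau=TAU):
    # theta_target <- (1 - tau) * theta_target + tau * theta_eval, per parameter array
    for t, e in zip(target_params, eval_params):
        t[...] = (1 - tau) * t + tau * e

# illustrative parameter arrays: trainable ("eval") nets and their lagged target copies
actor_eval    = [np.random.randn(3, 30), np.random.randn(30, 1)]
actor_target  = [p.copy() for p in actor_eval]    # "Actor2": updated only via soft_update
critic_eval   = [np.random.randn(4, 30), np.random.randn(30, 1)]
critic_target = [p.copy() for p in critic_eval]   # "Critic2"

# called once after every gradient step on the eval networks
soft_update(actor_target, actor_eval)
soft_update(critic_target, critic_eval)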
Code:
1 """ 2 Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. 3 DDPG is Actor Critic based algorithm. 4 Pendulum example. 5 View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 6 Using: 7 tensorflow 1.0 8 gym 0.8.0 9 """ 10 11 import tensorflow as tf 12 import numpy as np 13 import gym 14 import time 15 16 17 ##################### hyper parameters #################### 18 19 MAX_EPISODES = 200 20 MAX_EP_STEPS = 200 21 LR_A = 0.001 # learning rate for actor 22 LR_C = 0.002 # learning rate for critic 23 GAMMA = 0.9 # reward discount 24 TAU = 0.01 # soft replacement 25 MEMORY_CAPACITY = 10000 26 BATCH_SIZE = 32 27 28 RENDER = False 29 ENV_NAME = ‘Pendulum-v0‘ 30 31 ############################### DDPG #################################### 32 33 class DDPG(object): 34 def __init__(self, a_dim, s_dim, a_bound,): 35 self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) 36 self.pointer = 0 37 self.sess = tf.Session() 38 39 self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 40 self.S = tf.placeholder(tf.float32, [None, s_dim], ‘s‘) 41 self.S_ = tf.placeholder(tf.float32, [None, s_dim], ‘s_‘) 42 self.R = tf.placeholder(tf.float32, [None, 1], ‘r‘) 43 44 with tf.variable_scope(‘Actor‘): 45 self.a = self._build_a(self.S, scope=‘eval‘, trainable=True) 46 a_ = self._build_a(self.S_, scope=‘target‘, trainable=False) 47 with tf.variable_scope(‘Critic‘): 48 # assign self.a = a in memory when calculating q for td_error, 49 # otherwise the self.a is from Actor when updating Actor 50 q = self._build_c(self.S, self.a, scope=‘eval‘, trainable=True) 51 q_ = self._build_c(self.S_, a_, scope=‘target‘, trainable=False) 52 53 # networks parameters 54 self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Actor/eval‘) 55 self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Actor/target‘) 56 self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Critic/eval‘) 57 self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘Critic/target‘) 58 59 # target net replacement 60 self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 61 for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 62 63 q_target = self.R + GAMMA * q_ 64 # in the feed_dic for the td_error, the self.a should change to actions in memory 65 td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 66 self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params) 67 68 a_loss = - tf.reduce_mean(q) # maximize the q 69 self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params) 70 71 self.sess.run(tf.global_variables_initializer()) 72 73 def choose_action(self, s): 74 return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 75 76 def learn(self): 77 # soft target replacement 78 self.sess.run(self.soft_replace) 79 80 indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) 81 bt = self.memory[indices, :] 82 bs = bt[:, :self.s_dim] 83 ba = bt[:, self.s_dim: self.s_dim + self.a_dim] 84 br = bt[:, -self.s_dim - 1: -self.s_dim] 85 bs_ = bt[:, -self.s_dim:] 86 87 self.sess.run(self.atrain, {self.S: bs}) 88 self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_}) 89 90 def store_transition(self, s, a, r, s_): 91 transition = np.hstack((s, a, [r], s_)) 92 index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory 93 self.memory[index, :] = transition 94 self.pointer += 1 95 96 def 
_build_a(self, s, scope, trainable): 97 with tf.variable_scope(scope): 98 net = tf.layers.dense(s, 30, activation=tf.nn.relu, name=‘l1‘, trainable=trainable) 99 a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name=‘a‘, trainable=trainable) 100 return tf.multiply(a, self.a_bound, name=‘scaled_a‘) 101 102 def _build_c(self, s, a, scope, trainable): 103 with tf.variable_scope(scope): 104 n_l1 = 30 105 w1_s = tf.get_variable(‘w1_s‘, [self.s_dim, n_l1], trainable=trainable) 106 w1_a = tf.get_variable(‘w1_a‘, [self.a_dim, n_l1], trainable=trainable) 107 b1 = tf.get_variable(‘b1‘, [1, n_l1], trainable=trainable) 108 net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 109 return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a) 110 111 ############################### training #################################### 112 113 env = gym.make(ENV_NAME) 114 env = env.unwrapped 115 env.seed(1) 116 117 s_dim = env.observation_space.shape[0] 118 a_dim = env.action_space.shape[0] 119 a_bound = env.action_space.high 120 121 ddpg = DDPG(a_dim, s_dim, a_bound) 122 123 var = 3 # control exploration 124 t1 = time.time() 125 for i in range(MAX_EPISODES): 126 s = env.reset() 127 ep_reward = 0 128 for j in range(MAX_EP_STEPS): 129 if RENDER: 130 env.render() 131 132 # Add exploration noise 133 a = ddpg.choose_action(s) 134 a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration 135 s_, r, done, info = env.step(a) 136 137 ddpg.store_transition(s, a, r / 10, s_) 138 139 if ddpg.pointer > MEMORY_CAPACITY: 140 var *= .9995 # decay the action randomness 141 ddpg.learn() 142 143 s = s_ 144 ep_reward += r 145 if j == MAX_EP_STEPS-1: 146 print(‘Episode:‘, i, ‘ Reward: %i‘ % int(ep_reward), ‘Explore: %.2f‘ % var, ) 147 if ep_reward > -300:RENDER = True 148 break 149 print(‘Running time: ‘, time.time() - t1)
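A quick sanity check (not in the original post) on the replay-buffer layout used above: store_transition packs each row as [s (s_dim values) | a (a_dim values) | r (1 value) | s_ (s_dim values)], and the slices in learn() recover the four pieces. A tiny sketch with Pendulum-v0-like dimensions (s_dim=3, a_dim=1); the concrete numbers are made up:

import numpy as np

s_dim, a_dim = 3, 1                        # Pendulum-v0: 3-dim observation, 1-dim action
s, a, r, s_ = np.arange(3.), [9.0], -1.5, np.arange(3.) + 10

row = np.hstack((s, a, [r], s_))           # same layout as store_transition: [s | a | r | s_]
bt = row[np.newaxis, :]                    # pretend this is a sampled batch of one row

bs  = bt[:, :s_dim]                        # -> s   (columns 0..2)
ba  = bt[:, s_dim: s_dim + a_dim]          # -> a   (column 3)
br  = bt[:, -s_dim - 1: -s_dim]            # -> r   (column 4), kept 2-D to match the [None, 1] R placeholder
bs_ = bt[:, -s_dim:]                       # -> s_  (columns 5..7)
print(bs, ba, br, bs_)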