Actor-Critic Algorithm Source Code Analysis
The Actor-Critic algorithm mainly addresses a limitation of the Policy Gradient algorithm: parameters can only be updated once a full episode has finished. Put simply, you have to wait until the game is over before you can update anything. Policy Gradient looks at the episode as a whole, increasing the weight of good actions and decreasing the weight of bad ones. Actor-Critic instead introduces a critic that estimates state values and produces a TD error at every step, so the policy can be updated after each single step rather than once per episode. Reference code for both algorithms is given below.
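In standard notation, the two updates differ only in the weight applied to each action's log-probability (these are the textbook REINFORCE and one-step actor-critic formulas, stated here for reference rather than taken from the original post; G_t corresponds to tf_vt in the first script and \delta_t to td_error in the second):

\nabla_\theta J(\theta) = \mathbb{E}\big[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\, G_t\big], \qquad G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k \quad \text{(Policy Gradient)}

\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \nabla_\theta J(\theta) = \mathbb{E}\big[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\, \delta_t\big] \quad \text{(Actor-Critic)}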
Policy Gradient
import numpy as np
import tensorflow as tf

# reproducible
np.random.seed(1)
tf.set_random_seed(1)


class PolicyGradient:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

        self._build_net()

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # http://0.0.0.0:6006/
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

    def _build_net(self):
        with tf.name_scope('inputs'):
            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # fc1: the network is two fully connected layers; the input is the state features
        layer = tf.layers.dense(
            inputs=self.tf_obs,  # input states, shape n_state * n_features
            units=10,
            activation=tf.nn.tanh,  # tanh activation
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1'
        )
        # fc2
        all_act = tf.layers.dense(
            inputs=layer,
            units=self.n_actions,  # one output unit per action, shape n_state * n_actions
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2'
        )

        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')  # use softmax to convert to probability

        with tf.name_scope('loss'):
            # tf.log(self.all_act_prob) has shape n_state * n_actions.
            # self.tf_acts holds the actions taken during the episode, identified by index.
            # tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions)
            # picks out the log-probability of the action actually taken in each state.
            neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)
            # tf_vt holds the (discounted, normalized) return for each state and weights the loss
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward guided loss

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

    def choose_action(self, observation):
        # Actions are not chosen greedily here; they are sampled from the
        # probability distribution output by the network.
        prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)  # state
        self.ep_as.append(a)   # action taken in that state
        self.ep_rs.append(r)   # reward received

    def learn(self):
        # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()

        # train on episode
        self.sess.run(self.train_op, feed_dict={
            self.tf_obs: np.vstack(self.ep_obs),  # shape=[None, n_obs] observations
            self.tf_acts: np.array(self.ep_as),   # shape=[None, ] actions
            self.tf_vt: discounted_ep_rs_norm,    # shape=[None, ] discounted returns
        })

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data; this episode's update is done
        return discounted_ep_rs_norm

    def _discount_and_norm_rewards(self):
        # Discounting and normalizing the returns reduces the variance of the
        # gradient estimate in this model-free setting.
        discounted_ep_rs = np.zeros_like(self.ep_rs)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):  # walk backwards to compute each state's discounted return
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize episode rewards
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
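For completeness, a minimal training loop for the PolicyGradient class above might look like the following sketch. It assumes the same gym CartPole-v0 environment used in the actor-critic script below; the hyperparameters and episode count are illustrative, not from the original.

import gym

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)

for i_episode in range(300):
    observation = env.reset()
    while True:
        action = RL.choose_action(observation)            # sample an action from the policy
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)  # buffer the whole episode
        if done:
            ep_rs_sum = sum(RL.ep_rs)
            vt = RL.learn()  # the one and only update, after the episode ends
            print("episode:", i_episode, "  reward:", int(ep_rs_sum))
            break
        observation = observation_

Note that learn() is called exactly once per episode; this is precisely the limitation the actor-critic script below removes.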
"""Actor-Critic using TD-error as the Advantage, Reinforcement Learning.The cart pole example. Policy is oscillated.View more on my tutorial page: https://morvanzhou.github.io/tutorials/Using:tensorflow 1.0gym 0.8.0"""import numpy as npimport tensorflow as tfimport gymnp.random.seed(2)tf.set_random_seed(2) # reproducible# SuperparametersOUTPUT_GRAPH = FalseMAX_EPISODE = 3000DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this thresholdMAX_EP_STEPS = 1000 # maximum time step in one episodeRENDER = False # rendering wastes timeGAMMA = 0.9 # reward discount in TD errorLR_A = 0.001 # learning rate for actorLR_C = 0.01 # learning rate for criticenv = gym.make('CartPole-v0')env.seed(1) # reproducibleenv = env.unwrappedN_F = env.observation_space.shape[0]N_A = env.action_space.nclass Actor(object): def __init__(self, sess, n_features, n_actions, lr=0.001): self.sess = sess #这里稍微注意:因为AC框架可以使用单步更新,所以s的大小为1*n_features self.s = tf.placeholder(tf.float32, [1, n_features], "state") # 1*n_features self.a = tf.placeholder(tf.int32, None, "act") # self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error with tf.variable_scope('Actor'): l1 = tf.layers.dense( inputs=self.s, units=20, # number of hidden units activation=tf.nn.relu, kernel_initializer=tf.random_normal_initializer(0., .1), # weights bias_nextinitializer=tf.constant_initializer(0.1), # biases name='l1' ) self.acts_nextprob = tf.layers.dense( inputs=l1, units=n_actions, # output units activation=tf.nn.softmax, # get action probabilities kernel_initializer=tf.random_normal_initializer(0., .1), # weights bias_nextinitializer=tf.constant_initializer(0.1), # biases name='acts_nextprob' ) with tf.variable_scope('exp_v'): log_prob = tf.log(self.acts_nextprob[0, self.a]) self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss with tf.variable_scope('train'): self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) def learn(self, s, a, td): s = s[np.newaxis, :] feed_dict = {self.s: s, self.a: a, self.td_error: td} _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) return exp_v def choose_action(self, s): s = s[np.newaxis, :] probs = self.sess.run(self.acts_nextprob, {self.s: s}) # get probabilities for all actions return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a intclass Critic(object): def __init__(self, sess, n_features, lr=0.01): self.sess = sess self.s = tf.placeholder(tf.float32, [1, n_features], "state") self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") self.r = tf.placeholder(tf.float32, None, 'r') with tf.variable_scope('Critic'): l1 = tf.layers.dense( inputs=self.s, units=20, # number of hidden units activation=tf.nn.relu, # None # have to be linear to make sure the convergence of actor. # But linear approximator seems hardly learns the correct Q. 
kernel_initializer=tf.random_normal_initializer(0., .1), # weights bias_nextinitializer=tf.constant_initializer(0.1), # biases name='l1' ) self.v = tf.layers.dense( inputs=l1, units=1, # 这里输出表示当前state下动作的值函数 activation=None, kernel_initializer=tf.random_normal_initializer(0., .1), # weights bias_nextinitializer=tf.constant_initializer(0.1), # biases name='V' ) with tf.variable_scope('squared_TD_error'): # self.v 当前state下的值函数 # self.v_ 下一个状态的值函数 # self.r 当前状态下reward self.td_error = self.r + GAMMA * self.v_ - self.v self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval with tf.variable_scope('train'): self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) def learn(self, s, r, s_next): s, s_next = s[np.newaxis, :], s_next[np.newaxis, :] v_ = self.sess.run(self.v, {self.s: s_next}) td_error, _ = self.sess.run([self.td_error, self.train_op], {self.s: s, self.v_: v_, self.r: r}) return td_errorsess = tf.Session()actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actorsess.run(tf.global_variables_nextinitializer())if OUTPUT_GRAPH: tf.summary.FileWriter("logs/", sess.graph)for i_episode in range(MAX_EPISODE): s = env.reset() t = 0 track_r = [] while True: if RENDER: env.render() a = actor.choose_action(s) s_next, r, done, info = env.step(a) if done: r = -20 track_r.append(r) # actor 将在s状态下计算得到的r和s_next传入个给critic, 分别计算出S和S_next对应的value(V和V_) # 将计算得到的奖励至td_error传递给actor,代替police gradient中的tf_vt td_error = critic.learn(s, r, s_next) # gradient = grad[r + gamma * V(s_next) - V(s)] actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] s = s_next t += 1 if done or t >= MAX_EP_STEPS: ep_rs_nextsum = sum(track_r) if 'running_reward' not in globals(): running_reward = ep_rs_nextsum else: running_reward = running_reward * 0.95 + ep_rs_nextsum * 0.05 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering print("episode:", i_episode, " reward:", int(running_reward)) break
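The practical difference between the two learning signals can be seen in a few lines of numpy. The reward sequence and value estimates below are made-up illustrative numbers, not output from the scripts above:

import numpy as np

gamma = 0.9
rewards = [1.0, 1.0, 1.0, -20.0]  # toy 4-step episode; the pole falls on the last step

# Policy Gradient signal: discounted returns, only computable after the episode ends
returns = np.zeros(len(rewards))
running_add = 0.0
for t in reversed(range(len(rewards))):
    running_add = running_add * gamma + rewards[t]
    returns[t] = running_add
print(returns)  # [-11.87 -14.3  -17.   -20.  ]

# Actor-Critic signal: a TD error is available at every step, using the
# critic's (hypothetical) value estimates V(s_0) .. V(s_4)
V = np.array([0.5, 0.4, 0.2, -5.0, 0.0])
td_errors = [rewards[t] + gamma * V[t + 1] - V[t] for t in range(len(rewards))]
print(td_errors)  # approx. [0.86, 0.78, -3.7, -15.0]

Every entry of td_errors is available as soon as the corresponding step finishes, which is why the critic can start teaching the actor before the episode is over.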