Actor-Critic Algorithm Source Code Analysis


    The Actor-Critic algorithm mainly addresses a limitation of the Policy Gradient algorithm: parameters can only be updated after an entire episode has finished, i.e. only once the game is over. Policy Gradient looks at the episode as a whole and increases the weight of good actions while decreasing the weight of bad ones; Actor-Critic, in contrast, can update after every single step. Reference code for both algorithms is given below.
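To make the difference concrete, here is a minimal NumPy sketch (not part of the original post; the toy rewards and value estimates are made up) contrasting the two training signals: Policy Gradient must wait for the whole episode and weights each action by its discounted, normalized return, while Actor-Critic can produce a signal (the TD error) right after every single step.

import numpy as np

gamma = 0.9
rewards = [1.0, 1.0, 1.0, -20.0]   # rewards of one toy episode

# Policy Gradient: usable only after the episode ends --
# the discounted return of every step is computed backwards, then normalized.
returns = np.zeros(len(rewards))
running_add = 0.0
for t in reversed(range(len(rewards))):
    running_add = running_add * gamma + rewards[t]
    returns[t] = running_add
returns = (returns - returns.mean()) / returns.std()
print("per-episode weights:", returns)

# Actor-Critic: usable after every single step --
# the critic's value estimates turn one transition (s, r, s') into a TD error.
def td_error(r, v_s, v_s_next, gamma=0.9):
    """One-step training signal: delta = r + gamma * V(s') - V(s)."""
    return r + gamma * v_s_next - v_s

print("per-step weight:", td_error(r=1.0, v_s=0.5, v_s_next=0.6))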


Policy Gradient

import numpy as np
import tensorflow as tf

# reproducible
np.random.seed(1)
tf.set_random_seed(1)


class PolicyGradient:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

        self._build_net()

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            # http://0.0.0.0:6006/
            # tf.train.SummaryWriter soon be deprecated, use following
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

    def _build_net(self):
        with tf.name_scope('inputs'):
            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # fc1: the network consists of two fully connected layers; the input is the observation features, i.e. the state
        layer = tf.layers.dense(
            inputs=self.tf_obs,  # input states, shape n_state * n_features
            units=10,
            activation=tf.nn.tanh,  # tanh activation
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1'
        )
        # fc2
        all_act = tf.layers.dense(
            inputs=layer,
            units=self.n_actions,  # one output unit per action, shape n_state * n_actions
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2'
        )

        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')  # use softmax to convert to probability

        with tf.name_scope('loss'):
            # tf.log(self.all_act_prob) has shape n_state * n_actions
            # self.tf_acts holds the actions taken during the episode, identified by their indices
            # tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions) picks out
            # the log-probability of the action actually taken in each state
            neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)
            # tf_vt: the discounted, normalized return of each state, used as the encouragement weight
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward guided loss

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

    def choose_action(self, observation):
        # actions are not chosen greedily; one is sampled according to the network's output probabilities
        prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)  # state
        self.ep_as.append(a)   # action taken in that state
        self.ep_rs.append(r)   # reward received

    def learn(self):
        # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()

        # train on episode
        self.sess.run(self.train_op, feed_dict={
             self.tf_obs: np.vstack(self.ep_obs),  # shape=[None, n_obs]  observations
             self.tf_acts: np.array(self.ep_as),   # shape=[None, ]       actions
             self.tf_vt: discounted_ep_rs_norm,    # shape=[None, ]       discounted rewards
        })

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # clear the episode buffers; this episode has been used for the update
        return discounted_ep_rs_norm

    def _discount_and_norm_rewards(self):
        # discounting and normalizing the rewards reduces the variance of this model-free gradient estimate
        discounted_ep_rs = np.zeros_like(self.ep_rs)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):  # walk backwards to accumulate each state's discounted future reward
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize episode rewards
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
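For completeness, a minimal training loop for this class (a sketch in the style of the tutorial the class comes from; the CartPole-v0 environment and the hyperparameter values here are illustrative assumptions, not part of the original post). Note that learn() is called only once per episode, after done is True.

import gym

# assumed setup: environment and hyperparameters are illustrative
env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)

for i_episode in range(3000):
    observation = env.reset()
    ep_reward = 0
    while True:
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)  # collect the whole episode first
        ep_reward += reward
        if done:
            RL.learn()                                     # update only after the episode ends
            print("episode:", i_episode, "  reward:", int(ep_reward))
            break
        observation = observation_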



AC Model

"""Actor-Critic using TD-error as the Advantage, Reinforcement Learning.The cart pole example. Policy is oscillated.View more on my tutorial page: https://morvanzhou.github.io/tutorials/Using:tensorflow 1.0gym 0.8.0"""import numpy as npimport tensorflow as tfimport gymnp.random.seed(2)tf.set_random_seed(2)  # reproducible# SuperparametersOUTPUT_GRAPH = FalseMAX_EPISODE = 3000DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater then this thresholdMAX_EP_STEPS = 1000   # maximum time step in one episodeRENDER = False  # rendering wastes timeGAMMA = 0.9     # reward discount in TD errorLR_A = 0.001    # learning rate for actorLR_C = 0.01     # learning rate for criticenv = gym.make('CartPole-v0')env.seed(1)  # reproducibleenv = env.unwrappedN_F = env.observation_space.shape[0]N_A = env.action_space.nclass Actor(object):    def __init__(self, sess, n_features, n_actions, lr=0.001):        self.sess = sess        #这里稍微注意:因为AC框架可以使用单步更新,所以s的大小为1*n_features        self.s = tf.placeholder(tf.float32, [1, n_features], "state") # 1*n_features        self.a = tf.placeholder(tf.int32, None, "act") #         self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error        with tf.variable_scope('Actor'):            l1 = tf.layers.dense(                inputs=self.s,                units=20,    # number of hidden units                activation=tf.nn.relu,                kernel_initializer=tf.random_normal_initializer(0., .1),    # weights                bias_nextinitializer=tf.constant_initializer(0.1),  # biases                name='l1'            )            self.acts_nextprob = tf.layers.dense(                inputs=l1,                units=n_actions,    # output units                activation=tf.nn.softmax,   # get action probabilities                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights                bias_nextinitializer=tf.constant_initializer(0.1),  # biases                name='acts_nextprob'            )        with tf.variable_scope('exp_v'):            log_prob = tf.log(self.acts_nextprob[0, self.a])            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss        with tf.variable_scope('train'):            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)    def learn(self, s, a, td):        s = s[np.newaxis, :]        feed_dict = {self.s: s, self.a: a, self.td_error: td}        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)        return exp_v    def choose_action(self, s):        s = s[np.newaxis, :]        probs = self.sess.run(self.acts_nextprob, {self.s: s})   # get probabilities for all actions        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())   # return a intclass Critic(object):    def __init__(self, sess, n_features, lr=0.01):        self.sess = sess        self.s = tf.placeholder(tf.float32, [1, n_features], "state")        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")        self.r = tf.placeholder(tf.float32, None, 'r')        with tf.variable_scope('Critic'):            l1 = tf.layers.dense(                inputs=self.s,                units=20,  # number of hidden units                activation=tf.nn.relu,  # None                # have to be linear to make sure the convergence of actor.                # But linear approximator seems hardly learns the correct Q.                
kernel_initializer=tf.random_normal_initializer(0., .1),  # weights                bias_nextinitializer=tf.constant_initializer(0.1),  # biases                name='l1'            )            self.v = tf.layers.dense(                inputs=l1,                units=1,  # 这里输出表示当前state下动作的值函数                activation=None,                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights                bias_nextinitializer=tf.constant_initializer(0.1),  # biases                name='V'            )        with tf.variable_scope('squared_TD_error'):            # self.v 当前state下的值函数            # self.v_ 下一个状态的值函数            # self.r 当前状态下reward            self.td_error = self.r + GAMMA * self.v_ - self.v            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval        with tf.variable_scope('train'):            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)    def learn(self, s, r, s_next):        s, s_next = s[np.newaxis, :], s_next[np.newaxis, :]        v_ = self.sess.run(self.v, {self.s: s_next})        td_error, _ = self.sess.run([self.td_error, self.train_op],                                          {self.s: s, self.v_: v_, self.r: r})        return td_errorsess = tf.Session()actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)critic = Critic(sess, n_features=N_F, lr=LR_C)     # we need a good teacher, so the teacher should learn faster than the actorsess.run(tf.global_variables_nextinitializer())if OUTPUT_GRAPH:    tf.summary.FileWriter("logs/", sess.graph)for i_episode in range(MAX_EPISODE):    s = env.reset()    t = 0    track_r = []    while True:        if RENDER: env.render()        a = actor.choose_action(s)        s_next, r, done, info = env.step(a)        if done: r = -20        track_r.append(r)        # actor 将在s状态下计算得到的r和s_next传入个给critic,  分别计算出S和S_next对应的value(V和V_)        # 将计算得到的奖励至td_error传递给actor,代替police gradient中的tf_vt        td_error = critic.learn(s, r, s_next)  # gradient = grad[r + gamma * V(s_next) - V(s)]        actor.learn(s, a, td_error)     # true_gradient = grad[logPi(s,a) * td_error]        s = s_next        t += 1        if done or t >= MAX_EP_STEPS:            ep_rs_nextsum = sum(track_r)            if 'running_reward' not in globals():                running_reward = ep_rs_nextsum            else:                running_reward = running_reward * 0.95 + ep_rs_nextsum * 0.05            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering            print("episode:", i_episode, "  reward:", int(running_reward))            break
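To summarize the update rules the code above implements (the notation here is mine, not from the original post): the critic minimizes the squared TD error, and the actor is pushed along the log-probability of the chosen action, weighted by that same TD error.

\[
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
\]
\[
L_{\text{critic}} = \delta_t^2, \qquad
L_{\text{actor}} = -\log \pi(a_t \mid s_t)\,\delta_t
\]

This corresponds directly to self.loss = tf.square(self.td_error) in the Critic and to minimizing -self.exp_v in the Actor.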

