20171124学习强化学习，尽快出论文，Q-learning2.1

来源：互联网 | 发布：在手机上怎么开淘宝店 | 编辑：程序博客网 | 时间：2024/05/21 22:24

ubuntu安装中文输入法：

https://jingyan.baidu.com/article/a3aad71aa1abe7b1fa009641.html

调出输入法：https://jingyan.baidu.com/article/adc815134f4b92f722bf7350.html

python2.7 转自莫烦Q-learning 2.1

code：

"""Tabular Q-learning demo on a one-dimensional world.

An agent ('o') starts at the left end of a row of N_STATES cells and learns
to walk to the treasure ('T') in the right-most cell.  The only non-zero
reward (1) is given for the move that reaches the treasure.
"""
import sys
import time

import numpy as np
import pandas as pd

np.random.seed(2)  # reproducible

N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy policy: probability of taking the greedy action
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 13   # maximum episodes
FRESH_TIME = 0.3    # fresh time for one move


def build_q_table(n_states, actions):
    """Return a zero-initialised Q-table.

    The result is a DataFrame with one row per state (0 .. n_states - 1)
    and one column per entry of ``actions``.
    """
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # q_table initial values
        columns=actions,    # actions's name
    )
    return table


def choose_action(state, q_table):
    """Pick an action name for ``state`` with an epsilon-greedy rule.

    A random action from ACTIONS is returned with probability 1 - EPSILON,
    or whenever every action value of the state is still zero.  Otherwise
    the label of the highest-valued action is returned.
    """
    state_actions = q_table.iloc[state, :]
    # `(state_actions == 0).all()` is true only when *every* value is zero.
    # The former `state_actions.all() == 0` was true as soon as *any* value
    # was zero, which kept the agent exploring even after it had learned a
    # value for one of the actions.
    if (np.random.uniform() > EPSILON) or (state_actions == 0).all():
        # act non-greedy, or this state has no learned value yet
        action_name = np.random.choice(ACTIONS)
    else:
        # act greedy; idxmax() returns the column label, whereas argmax()
        # returns an integer position in current pandas releases
        action_name = state_actions.idxmax()
    return action_name


def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return ``(S_, R)``.

    ``S_`` is the next state index, or the string 'terminal' when the
    treasure is reached.  ``R`` is 1 for reaching the treasure, else 0.
    """
    if A == 'right':    # move right
        # States run from 0 to N_STATES - 1, so N_STATES - 2 is the cell
        # just left of the treasure: moving right from it ends the episode.
        if S == N_STATES - 2:
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # reach the wall
        else:
            S_ = S - 1
    return S_, R


def _render(text):
    """Write ``text`` at the start of the current console line and flush."""
    sys.stdout.write('\r{}'.format(text))
    # Nothing written here ends in a newline, so flush explicitly to make
    # the animation visible immediately.
    sys.stdout.flush()


def update_env(S, episode, step_counter):
    """Redraw the one-line world for state ``S``.

    On 'terminal' the episode summary is shown for two seconds and then
    blanked out; otherwise the agent is drawn at index ``S`` and the
    function sleeps for FRESH_TIME seconds.
    """
    env_list = ['-'] * (N_STATES - 1) + ['T']   # e.g. '-----T' for 6 states
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        _render(interaction)
        time.sleep(2)
        _render('                                ')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        _render(interaction)
        time.sleep(FRESH_TIME)


def rl():
    """Run MAX_EPISODES episodes of Q-learning and return the Q-table."""
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
            # .loc replaces the removed DataFrame.ix indexer: S is a row
            # label of the default integer index, A is a column label.
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                # next state is not terminal: bootstrap from its best value
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update
            S = S_  # move to next state

            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

阅读全文

0 0