Machine Learning: Q-learning


Adapted from the Q-learning tutorial by MorvanZhou (莫烦Python).
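The script below implements the small example from that tutorial: a one-dimensional world of N_STATES = 6 cells, where the agent 'o' starts at the left end and the treasure 'T' sits at the right end. Reaching 'T' gives a reward of 1; every other step gives 0. Actions are chosen ε-greedily (EPSILON = 0.9), and the tabular value update used in the loop is

Q(S, A) ← (1 − α)·Q(S, A) + α·(R + γ·max_a Q(S', a))    (Q-learning)
Q(S, A) ← (1 − α)·Q(S, A) + α·(R + γ·Q(S', A'))          (Sarsa)

with learning rate α = ALPHA and discount factor γ = GAMMA.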

import numpy as np
import time

np.random.seed(2)  # reproducible

N_STATES = 6                 # the length of the 1-dimensional world
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy policy
ALPHA = 0.1                  # learning rate
GAMMA = 0.9                  # discount factor
MAX_EPISODES = 53            # maximum episodes
FRESH_TIME = 0.05            # fresh time for one move


def build_q_table(n_states, actions):
    # Q table: one row per state, one column per action, initialized to 0
    table = [[0 for j in range(len(actions))] for i in range(n_states)]
    print(table)    # show table
    return table


def list_Zero(lis):
    # return True if every entry of the list is 0
    for li in lis:
        if li != 0:
            return False
    return True


def max_Index(lis):
    # index of the maximum value in the list
    mmax = -999999
    index = 0
    for j in range(len(lis)):
        if lis[j] > mmax:
            mmax = lis[j]
            index = j
    return index


def max_Value(lis):
    # maximum value in the list
    mmax = -999999
    for j in range(len(lis)):
        if lis[j] > mmax:
            mmax = lis[j]
    return mmax


def choose_action(state, q_table):
    # This is how to choose an action (epsilon-greedy)
    state_actions = q_table[state]
    if (np.random.uniform() > EPSILON) or list_Zero(state_actions):
        # act non-greedy, or this state's actions have no value yet
        action_num = np.random.randint(0, len(ACTIONS))
        action_name = ACTIONS[action_num]
    else:
        # act greedy
        action_num = max_Index(state_actions)
        action_name = ACTIONS[action_num]
    return action_name, action_num


def get_feedback(S, A):
    # This is how the agent interacts with the environment
    if A == 'right':            # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:                       # move left
        R = 0
        if S == 0:
            S_ = S              # reach the wall
        else:
            S_ = S - 1
    return S_, R


def update(S, episode, step_counter):
    # This is how the environment is updated (rendered)
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' is our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction))
        time.sleep(1)
        print('\r')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction))
        time.sleep(FRESH_TIME)


def RL(method):
    # main part of the RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    print(q_table)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update(S, episode, step_counter)
        while not is_terminated:
            A, Anum = choose_action(S, q_table)
            S_, R = get_feedback(S, A)  # take action & get next state and reward
            if S_ != 'terminal':
                if method == 'Qlearning':
                    q_reward = R + GAMMA * max_Value(q_table[S_])
                elif method == 'Sarsa':
                    A_, Anum_ = choose_action(S_, q_table)
                    q_reward = R + GAMMA * q_table[S_][Anum_]
            else:
                q_reward = R            # next state is terminal
                is_terminated = True    # terminate this episode
            q_table[S][Anum] = (1 - ALPHA) * q_table[S][Anum] + ALPHA * q_reward
            S = S_                      # move to next state
            update(S, episode, step_counter + 1)
            step_counter += 1
    return q_table


if __name__ == "__main__":
    # q_table = RL('Qlearning')
    q_table = RL('Sarsa')
    print('\r\nQ-table:\n')
    print(q_table)
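After training, the returned q_table is just a nested list, so reading the learned policy out of it takes one more loop. A minimal sketch (the helper name print_policy is made up here and is not part of the original script; it assumes the N_STATES, ACTIONS and max_Index definitions above):

def print_policy(q_table):
    # Print the greedy (argmax) action and Q-values for each non-terminal state.
    # Hypothetical helper: reuses max_Index and ACTIONS from the script above.
    for state in range(N_STATES - 1):
        best = max_Index(q_table[state])
        print('state {}: go {} (Q = {})'.format(state, ACTIONS[best], q_table[state]))

# example usage after training:
# q_table = RL('Qlearning')
# print_policy(q_table)

For this environment the greedy action in every state should end up being 'right', since only moving right ever reaches the reward.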