Study notes on the companion simulation code for "视觉机器学习20讲" (Visual Machine Learning in 20 Lectures) --- Reinforcement Learning

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Purpose: demonstrate a reinforcement learning algorithm applied to computer vision
%          (target classification based on reinforcement learning)
% Environment: Win7, Matlab 2012b
% Modi: NUDT-VAP
% Date: 2014-02-04
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA


global3 % runs the global3.m script, which initializes the global parameters (see the parameter blocks below)


stat=struct('Q',zeros(NS,NA),'iter',0,'old_action',1,'old_state',1,'current_state',1,'rimm',0,'total_reward',0);
done=0; % Simulation flag: 1 means the learning run has ended,
        % 0 means it should continue
while 0==done
    [stat,done]=jump_learn(stat);
end
policy=pol_finder(stat);
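
For context, the driver above repeatedly simulates transitions and applies Q-learning until ITERMAX iterations have elapsed, then extracts the greedy policy. As a sketch (this equation is not in the original code), the Q-factors it estimates are the fixed point of the Bellman optimality equation for the discounted MDP defined by the TPM, TRM and LAMBDA parameters initialized below:

\[
Q^*(i,a)=\sum_{j=1}^{NS}\mathrm{TPM}(i,j,a)\,\bigl[\mathrm{TRM}(i,j,a)+\lambda\,\max_{a'}Q^*(j,a')\bigr],\qquad \lambda=\mathrm{LAMBDA}.
\]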




&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

function action=action_selector(stat)


global NA 




ran=rand(1);


candidate=1;


cum_prob=1/NA; % cumulative probability covered by the candidate actions so far


complete=0;


% Selecting each action with equal probability 


while 0==complete
        if ran<cum_prob
            % action selected
            action=candidate;
            complete=1;
        else
            % test whether ran falls in the next action's interval
            candidate=candidate+1;
            cum_prob=cum_prob+(1/NA);
        end
end
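
The loop above selects each of the NA actions with equal probability (pure exploration, no greediness). A minimal equivalent sketch in MATLAB, assuming only that NA is the number of actions:

% Equivalent one-liner (sketch, not part of the original code):
% draw an action index uniformly at random from 1..NA
action = randi(NA);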


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

function [stat,done]=jump_learn(stat)


global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM


% This function simulates a jump and also updates the learning stats


old_state=stat.old_state;


old_action=stat.old_action;


% Determine current state 


current_state=state_finder(stat);


% Record Feedback in stat 


stat.current_state=current_state;


stat.rimm=TRM(old_state,current_state,old_action);


% DO LEARNING 


stat=qlearn(stat);


% Select next action 


next_action=action_selector(stat);


% Get ready to get out of this function 


stat.old_state=current_state;


stat.old_action=next_action;


if stat.iter>=ITERMAX
    % Learning should end
    done=1;
else
    done=0;
end














&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

function policy=pol_finder(stat)


global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM




for state=1:NS


[maxQfactor,index]=max(stat.Q(state,:));


policy(state)=index;


value_function(state)=maxQfactor;


end


% Display the greedy policy and the corresponding value function
policy

value_function

% Print every Q-factor for inspection
for state=1:NS
    for action=1:NA
        state
        action
        stat.Q(state,action)
    end
end
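
The per-state loop above just takes a row-wise maximum over the Q-table. A minimal vectorized sketch, assuming stat.Q is the NS-by-NA Q-factor matrix used throughout this code:

% Vectorized equivalent (sketch, not part of the original code)
[value_function, policy] = max(stat.Q, [], 2); % max over actions, one row per state
policy = policy';                              % greedy action for each state
value_function = value_function';              % value of each state under that policy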










&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

function stat=qlearn(stat)


global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA


% Q-Learning 


% Finding the Max factor in the current state 


q_next=max(stat.Q(stat.current_state,:));




stat.iter=stat.iter+1;




%learn_rate=1/(stat.iter);


learn_rate=log(stat.iter+1)/(stat.iter+1);


%learn_rate=0.5*300/(300+stat.iter);


q=stat.Q(stat.old_state,stat.old_action);


q=q*(1-learn_rate)+(learn_rate*(stat.rimm+(LAMBDA*q_next)));


stat.Q(stat.old_state,stat.old_action)=q;
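
For reference, the update implemented above is the standard tabular Q-learning rule with discount factor LAMBDA and the decaying step size chosen in the code:

\[
Q(s_t,a_t)\leftarrow(1-\alpha_t)\,Q(s_t,a_t)+\alpha_t\Bigl[r_{t+1}+\lambda\,\max_{a'}Q(s_{t+1},a')\Bigr],\qquad \alpha_t=\frac{\ln(t+1)}{t+1},\quad \lambda=\mathrm{LAMBDA}.
\]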








&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&


function candidate=state_finder(stat)
% Sample the next state, given stat.old_state and stat.old_action, from the transition matrix TPM


global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM


ran=rand(1);


old_action=stat.old_action;
old_state=stat.old_state;


cum_prob=TPM(old_state,1,old_action); % cumulative probability of the candidate next states


candidate=1;


complete=0;


while 0==complete
        if ran<cum_prob
            % next state found
            complete=1;
        else
            % test the next candidate state
            candidate=candidate+1;
            cum_prob=cum_prob+TPM(old_state,candidate,old_action);
        end
end
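
The loop above performs inverse-transform sampling on the row TPM(old_state,:,old_action). An equivalent vectorized sketch under the same indexing assumptions:

% Vectorized equivalent (sketch, not part of the original code)
probs = squeeze(TPM(old_state,:,old_action));           % transition probabilities out of old_state
candidate = find(rand(1) < cumsum(probs), 1, 'first');  % sampled index of the next state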
                  


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA




NO_REPLICATIONS=30; % No of replications of simulation 
ITERMAX=10000; % No of iterations of learning 
NA=2; % Number of actions in each state 
NS=2; % Number of states 


LAMBDA=0.8; % discount factor


SMALL=-1000000; 


TPM(:,:,1)=[0.7,0.3;0.4,0.6];


TPM(:,:,2)=[0.9,0.1;0.2,0.8];


TRM(:,:,1)=[6,-5;7,12];
    
TRM(:,:,2)=[10,17;-14,13];
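
Reading state_finder and jump_learn, the indexing convention is TPM(i,j,a) = P(next state j | current state i, action a) and TRM(i,j,a) = immediate reward for that transition, so every row of each TPM slice must sum to 1. A minimal sanity-check sketch under that assumption (not part of the original code):

% Sanity check: each row of TPM(:,:,a) must be a probability distribution
for a = 1:NA
    assert(all(abs(sum(TPM(:,:,a),2) - 1) < 1e-12));
end

The near-identical blocks that follow appear to be alternative parameter files (the driver script runs the one named global3); they differ only in the reward matrices TRM.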


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA




NO_REPLICATIONS=30; % No of replications of simulation 
ITERMAX=10000; % No of iterations of learning 
NA=2; % Number of actions in each state 
NS=2; % Number of states 


LAMBDA=0.8; % discount factor


SMALL=-1000000; 


TPM(:,:,1)=[0.7,0.3;0.4,0.6];


TPM(:,:,2)=[0.9,0.1;0.2,0.8];


TRM(:,:,1)=[6,5;7,12];
    
TRM(:,:,2)=[10,17;14,13];


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA




NO_REPLICATIONS=30; % No of replications of simulation 
ITERMAX=10000; % No of iterations of learning 
NA=2; % Number of actions in each state 
NS=2; % Number of states 


LAMBDA=0.8; % discount factor


SMALL=-1000000; 


TPM(:,:,1)=[0.7,0.3;0.4,0.6];


TPM(:,:,2)=[0.9,0.1;0.2,0.8];


TRM(:,:,1)=[6,-5;7,12];
    
TRM(:,:,2)=[12,17;-14,13];


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

global NO_REPLICATIONS ITERMAX NA NS SMALL TPM TRM LAMBDA




NO_REPLICATIONS=30; % No of replications of simulation 
ITERMAX=10000; % No of iterations of learning 
NA=2; % Number of actions in each state 
NS=2; % Number of states 


LAMBDA=0.8; % discount factor


SMALL=-1000000; 


TPM(:,:,1)=[0.7,0.3;0.4,0.6];


TPM(:,:,2)=[0.9,0.1;0.2,0.8];


TRM(:,:,1)=[16,-5;7,12];
    
TRM(:,:,2)=[0,17;-14,13];


&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

