% tracker_ensemble: Correlation filter tracking with convolutional features
%
% Input:
%   - video_path:          path to the image sequence
%   - img_files:           list of image names
%   - pos:                 initialized center position of the target in (row, col)
%   - target_sz:           initialized target size in (Height, Width)
%   - padding:             padding parameter for the search area
%   - lambda:              regularization term for ridge regression
%   - output_sigma_factor: spatial bandwidth for the Gaussian label
%   - interp_factor:       learning rate for model update
%   - cell_size:           spatial quantization level
%   - show_visualization:  set to true for showing intermediate results
%
% Output:
%   - positions:           predicted target position at each frame
%   - time:                time spent for tracking



function [positions, time] = tracker_ensemble(video_path, img_files, pos, target_sz, ...
    padding, lambda, output_sigma_factor, interp_factor, cell_size, show_visualization)
% Track a target through an image sequence with per-layer correlation
% filters learned on convolutional features, plus a scale-estimation step.
%
%   video_path, img_files : folder prefix and cell array of frame file names
%   pos, target_sz        : initial centre (row, col) and size (height, width)
%   padding               : passed to get_search_window to size the search area
%   lambda                : ridge-regression regulariser
%   output_sigma_factor   : bandwidth factor of the Gaussian label
%   interp_factor         : learning rate of the online model update
%   cell_size             : spatial quantisation between image and feature map
%   show_visualization    : when true, draw the estimated box on every frame
%
%   positions : numel(img_files)-by-2 matrix of estimated centres, [row, col]
%   time      : accumulated tic/toc time of the tracking loop

% ------------------------------------------------------------------------
% Environment setting
% ------------------------------------------------------------------------
indLayers = [37, 28, 19];   % relu5-4, relu4-4, and relu3-4 in VGG Net
nweights  = [1, 0.5, 0.25]; % Weights for combining correlation filter responses
numLayers = length(indLayers);

% Get image size and search window size. Per the original author's note the
% search window is the target-centred region enlarged by the padding.
im_sz     = size(imread([video_path img_files{1}]));
window_sz = get_search_window(target_sz, im_sz, padding);

% Bandwidth of the Gaussian regression label, proportional to the target
% size and expressed in feature-map cells.
output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;

% Size of the search window measured in cells.
l1_patch_num = floor(window_sz / cell_size);

% Pre-compute the Fourier transform of the Gaussian label.
yf = fft2(gaussian_shaped_labels(output_sigma, l1_patch_num));

% Pre-compute and cache the cosine window (same size as yf); it is handed to
% the feature extractor to suppress boundary discontinuities.
cos_window = hann(size(yf,1)) * hann(size(yf,2))';

% Create the video interface only when it is going to be used.
if show_visualization
    update_visualization = show_video(img_files, video_path);
end

% Initialize variables for calculating FPS and distance precision.
time      = 0;
positions = zeros(numel(img_files), 2);   % returned target centres, [row, col]
%rects = zeros(numel(img_files),4);
nweights  = reshape(nweights, 1, 1, []);  % 1x1xL so it broadcasts over the layer dimension

% Note: variables ending with 'f' are in the Fourier domain.
% Original author's note: these are the two parts of the filter; they do not
% correspond one-to-one to A and B in the paper.
model_xf     = cell(1, numLayers);
model_alphaf = cell(1, numLayers);

% Scale factor relative to the initial target size.
% NOTE(review): the initial value 1 is reconstructed; the original
% initialisation is not visible in this file -- confirm against estimate_scale.
current_scale_factor = 1;

% ------------------------------------------------------------------------
% Start tracking
% ------------------------------------------------------------------------
for frame = 1:numel(img_files)
    im = imread([video_path img_files{frame}]); % Load the image at the current frame
    if ismatrix(im)
        % Grayscale input: replicate to three channels.
        im = cat(3, im, im, im);
    end

    tic(); % start timing this frame

    % --------------------------------------------------------------------
    % Predicting the object position from the learned object model
    % --------------------------------------------------------------------
    if frame > 1
        % Extracting hierarchical convolutional features
        feat = extractFeature(im, pos, window_sz, cos_window, indLayers);

        % Predict position
        pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
            model_xf, model_alphaf);

        % Scale estimation
        current_scale_factor = estimate_scale(rgb2gray(im), pos, current_scale_factor);
    else
        % First frame: set up the scale estimator once.
        % NOTE(review): init_scale_para returns nothing here, so it presumably
        % keeps its state internally (globals/persistents) -- confirm.
        init_scale_para(rgb2gray(im), target_sz, pos);
    end

    % --------------------------------------------------------------------
    % Learning correlation filters over hierarchical convolutional features
    % --------------------------------------------------------------------
    % Extracting hierarchical convolutional features
    feat = extractFeature(im, pos, window_sz, cos_window, indLayers);

    % Model update (initialisation on the first frame)
    [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
        model_xf, model_alphaf);

    % --------------------------------------------------------------------
    % Save predicted position and timing
    % --------------------------------------------------------------------
    positions(frame,:) = pos;

    % Target size at the current scale.
    % NOTE(review): reconstructed as target_sz * current_scale_factor; the
    % original definition of target_sz_t is not visible in this file.
    target_sz_t = target_sz * current_scale_factor;

    % [x, y, w, h] of the estimated target, used to draw the box.
    box = [pos([2,1]) - target_sz_t([2,1])/2, target_sz_t([2,1])];
    %     rects(frame,:)=box;

    time = time + toc(); % stop timing and accumulate

    % Visualization
    if show_visualization
        %box = [pos([2,1]) - target_sz([2,1])/2,target_sz([2,1])];
        stop = update_visualization(frame, box);
        if stop, break, end % user pressed Esc, stop early

        drawnow % flush graphics so the frame is shown while the loop runs
        %          pause(0.05)  % uncomment to run slower
    end
end







function pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
    model_xf, model_alphaf)
% Estimate the new target centre from the per-layer correlation responses.
%
%   feat         : cell array, one feature map per layer, extracted around pos
%   pos          : previous target centre, [row, col]
%   indLayers    : layer indices; only its length is used here
%   nweights     : 1x1xL weights used to fuse the layer responses
%   cell_size    : image pixels per feature-map cell
%   l1_patch_num : [rows, cols] of the response map (search window in cells)
%   model_xf, model_alphaf : learned filter model per layer (Fourier domain)
%
%   pos (output) : updated target centre, [row, col]

% ------------------------------------------------------------------------
% Compute correlation filter responses at each layer
% ------------------------------------------------------------------------
numLayers = length(indLayers);
res_layer = zeros([l1_patch_num, numLayers]);

for ii = 1:numLayers
    zf  = fft2(feat{ii});
    kzf = sum(zf .* conj(model_xf{ii}), 3) / numel(zf);

    % Equation for fast detection; fftshift moves the zero-displacement
    % bin to the centre of the map.
    temp = real(fftshift(ifft2(model_alphaf{ii} .* kzf)));

    % Store the layer response so it takes part in the fusion below.
    % Each layer is scaled by its own peak so the fixed layer weights act on
    % comparable magnitudes (as in the published HCF formulation --
    % NOTE(review): confirm this matches the intended variant). A non-positive
    % peak is left unscaled to avoid dividing by zero or flipping the sign.
    peak = max(temp(:));
    if peak > 0
        temp = temp / peak;
    end
    res_layer(:,:,ii) = temp;
end

% Combine responses from multiple layers (see Eqn. 5): weighted sum over layers.
response = sum(bsxfun(@times, res_layer, nweights), 3);

% ------------------------------------------------------------------------
% Find target location
% ------------------------------------------------------------------------
% The target is at the maximum response. Because of the fftshift above, a
% target that did not move peaks at index floor(n/2)+1 in each dimension,
% so subtracting floor(n/2) here and 1 below yields the displacement in cells.
[vert_delta, horiz_delta] = find(response == max(response(:)), 1);
vert_delta  = vert_delta  - floor(size(response,1)/2);
horiz_delta = horiz_delta - floor(size(response,2)/2);

% Map the displacement from feature-map cells back to image pixels.
pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];



function [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
    model_xf, model_alphaf)
% Train a correlation filter on the current features for every layer and
% either initialise the model (first frame) or blend it into the running model.
%
%   feat          : cell array with one feature map per layer
%   yf            : Fourier transform of the regression label
%   interp_factor : learning rate of the running average
%   lambda        : regularisation term added to the denominator
%   frame         : 1-based frame index; frame == 1 triggers initialisation
%   model_xf, model_alphaf : current model per layer (Fourier domain);
%                            returned updated

numLayers      = length(feat);
is_first_frame = (frame == 1);

for ii = 1:numLayers
    % Fast training in the Fourier domain.
    xf     = fft2(feat{ii});
    % Auto-correlation summed over the third dimension, divided by the
    % element count.
    kf     = sum(xf .* conj(xf), 3) / numel(xf);
    alphaf = yf ./ (kf + lambda);

    if is_first_frame
        % First frame: train with a single image.
        model_alphaf{ii} = alphaf;
        model_xf{ii}     = xf;
    else
        % Online model update using learning rate interp_factor.
        model_alphaf{ii} = (1 - interp_factor) * model_alphaf{ii} + interp_factor * alphaf;
        model_xf{ii}     = (1 - interp_factor) * model_xf{ii}     + interp_factor * xf;
    end
end







function feat = extractFeature(im, pos, window_sz, cos_window, indLayers)
% Crop the search window of size window_sz around pos from im (via
% get_subwindow) and hand the crop, together with cos_window and indLayers,
% to get_features, which returns the hierarchical convolutional features.
% Original author's note: the features of the indLayers layers are
% cosine-windowed to avoid boundary discontinuities.
% NOTE(review): the windowing itself happens inside get_features, which is
% not visible here -- confirm.

feat = get_features(get_subwindow(im, pos, window_sz), cos_window, indLayers);


