Linear Spatial Pyramid Matching Using Sparse Coding for Image Classification代码解读

来源:互联网 发布:成人频道直播软件 编辑:程序博客网 时间:2024/06/05 16:06
function [beta] = sc_approx_pooling(feaSet, B, pyramid, gamma, knn)
%================================================
% Usage:
% Compute the linear spatial pyramid feature using approximate sparse
% coding: each local feature is coded over only its knn most correlated
% dictionary atoms.
%
% Inputs:
%   feaSet        -structure defining the feature set of an image
%                       .feaArr     local feature array extracted from the
%                                   image, column-wise
%                       .x          x locations of each local feature, 2nd
%                                   dimension of the matrix
%                       .y          y locations of each local feature, 1st
%                                   dimension of the matrix
%                       .width      width of the image
%                       .height     height of the image
%   B             -sparse dictionary, column-wise
%   pyramid       -defines structure of pyramid, e.g. [1 2 4]
%   gamma         -sparsity regularization parameter
%   knn           -k nearest neighbors selected for sparse coding
%
% Output:
%   beta          -multiscale max pooling feature (column vector,
%                  L2-normalized)
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May. 2010
%===============================================

dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
sc_codes = zeros(dSize, nSmp);

% For each local feature, pick the knn dictionary atoms with the largest
% correlation (inner product) -- the approximate support for coding.
D = feaSet.feaArr'*B;
IDX = zeros(nSmp, knn);
for ii = 1:nSmp,
    d = D(ii, :);
    [dummy, idx] = sort(d, 'descend');
    IDX(ii, :) = idx(1:knn);
end

% Solve sparse coding restricted to the selected atoms.
% sc_codes holds the sparse coefficients S in X = B*S.
for ii = 1:nSmp,
    y = feaSet.feaArr(:, ii);
    idx = IDX(ii, :);
    BB = B(:, idx);
    sc_codes(idx, ii) = feature_sign(BB, y, 2*gamma);
end

sc_codes = abs(sc_codes);

% spatial levels
pLevels = length(pyramid);
% spatial bins on each level, e.g. [1 4 16]
pBins = pyramid.^2;
% total spatial bins
tBins = sum(pBins);

beta = zeros(dSize, tBins);
bId = 0;

for iter1 = 1:pLevels,

    nBins = pBins(iter1);

    wUnit = img_width / pyramid(iter1);
    hUnit = img_height / pyramid(iter1);

    % find to which spatial bin each local descriptor belongs
    xBin = ceil(feaSet.x / wUnit);
    yBin = ceil(feaSet.y / hUnit);
    idxBin = (yBin - 1)*pyramid(iter1) + xBin;

    for iter2 = 1:nBins,
        bId = bId + 1;
        % BUG FIX: semicolon added -- the original line had none and
        % printed sidxBin to the console for every bin of every image.
        sidxBin = find(idxBin == iter2);
        if isempty(sidxBin),
            continue;
        end
        % max(s, [], 2): row-wise max. Max pooling per dictionary atom:
        % z_j = max(|s_j1|, |s_j2|, ..., |s_jm|), with X = B*S,
        % X: n*m, B: n*k, S: k*m. beta has 1+4+16 columns in total.
        beta(:, bId) = max(sc_codes(:, sidxBin), [], 2);
    end
end

if bId ~= tBins,
    error('Index number error!');
end

beta = beta(:);
beta = beta./sqrt(sum(beta.^2));

有两点需要注意:

1)在求解beta时,作者将原始图像用spatial pyramid划分为3 levels,相应的cells个数分别为1,4,16。在论文中由于X=UV(U为稀疏编码,V为字典),所以池化公式为

$$z_j = \max\{|u_{1j}|, |u_{2j}|, \ldots, |u_{Mj}|\}$$

其中 $U \in \mathbb{R}^{M \times K}$,每行是一个局部特征的稀疏编码。

但是在代码中,作者是按照X=BS(B为字典,S为稀疏编码)求解的,所以池化公式为

$$z_j = \max\{|s_{j1}|, |s_{j2}|, \ldots, |s_{jm}|\}$$

其中 $X \in \mathbb{R}^{n \times m}$,$B \in \mathbb{R}^{n \times k}$,$S \in \mathbb{R}^{k \times m}$,即对S的每一行取最大值。
2)在论文中作者说z和U有相同的列数,但在代码中作者是在3个level的每一个cell上求最大值,因此z的列数为1+4+16=21。


function [beta] = sc_approx_pooling(feaSet, B, pyramid, gamma, knn)
%================================================
% Usage:
% Compute the linear spatial pyramid feature using approximate sparse
% coding over the knn most correlated dictionary atoms per feature.
%
% Inputs:
%   feaSet        -structure defining the feature set of an image
%                       .feaArr     local feature array extracted from the
%                                   image, column-wise
%                       .x          x locations of each local feature, 2nd
%                                   dimension of the matrix
%                       .y          y locations of each local feature, 1st
%                                   dimension of the matrix
%                       .width      width of the image
%                       .height     height of the image
%   B             -sparse dictionary, column-wise
%   pyramid       -defines structure of pyramid, e.g. [1 2 4]
%   gamma         -sparsity regularization parameter
%   knn           -k nearest neighbors selected for sparse coding
%
% Output:
%   beta          -multiscale max pooling feature (column vector,
%                  L2-normalized)
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May. 2010
%===============================================

dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
sc_codes = zeros(dSize, nSmp);

% Select, for each local feature, the knn dictionary atoms with the
% highest correlation to it (approximate support selection).
D = feaSet.feaArr'*B;
IDX = zeros(nSmp, knn);
for ii = 1:nSmp,
    d = D(ii, :);
    [dummy, idx] = sort(d, 'descend');
    IDX(ii, :) = idx(1:knn);
end

% Sparse-code each feature over its selected atoms (S in X = B*S).
for ii = 1:nSmp,
    y = feaSet.feaArr(:, ii);
    idx = IDX(ii, :);
    BB = B(:, idx);
    sc_codes(idx, ii) = feature_sign(BB, y, 2*gamma);
end

sc_codes = abs(sc_codes);

% spatial levels
pLevels = length(pyramid);
% spatial bins on each level, e.g. [1 4 16]
pBins = pyramid.^2;
% total spatial bins
tBins = sum(pBins);

beta = zeros(dSize, tBins);
bId = 0;

for iter1 = 1:pLevels,

    nBins = pBins(iter1);

    wUnit = img_width / pyramid(iter1);
    hUnit = img_height / pyramid(iter1);

    % find to which spatial bin each local descriptor belongs
    xBin = ceil(feaSet.x / wUnit);
    yBin = ceil(feaSet.y / hUnit);
    idxBin = (yBin - 1)*pyramid(iter1) + xBin;

    for iter2 = 1:nBins,
        bId = bId + 1;
        % BUG FIX: semicolon added -- without it MATLAB echoes sidxBin
        % to the console for every bin of every image.
        sidxBin = find(idxBin == iter2);
        if isempty(sidxBin),
            continue;
        end
        % row-wise max over the codes of the descriptors in this bin
        beta(:, bId) = max(sc_codes(:, sidxBin), [], 2);
    end
end

if bId ~= tBins,
    error('Index number error!');
end

beta = beta(:);
beta = beta./sqrt(sum(beta.^2));

function [beta] = sc_pooling(feaSet, B, pyramid, gamma)
%================================================
% Usage:
% Compute the linear spatial pyramid feature using (exact) sparse coding
% over the full dictionary.
%
% Inputs:
% feaSet        -structure defining the feature set of an image
%                   .feaArr     local feature array extracted from the
%                               image, column-wise
%                   .x          x locations of each local feature, 2nd
%                               dimension of the matrix
%                   .y          y locations of each local feature, 1st
%                               dimension of the matrix
%                   .width      width of the image
%                   .height     height of the image
% B             -sparse dictionary, column-wise
% gamma         -sparsity regularization parameter
% pyramid       -defines structure of pyramid, e.g. [1 2 4]
%
% Output:
% beta          -multiscale max pooling feature (column vector,
%                L2-normalized)
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May. 2010
%===============================================

dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
sc_codes = zeros(dSize, nSmp);

% L1-regularized sparse coding of every local feature.
% FIX: the original reused the output variable `beta` for this small
% ridge regularizer; renamed to `reg` to avoid shadowing the result
% (numerically identical).
reg = 1e-4;
A = B'*B + 2*reg*eye(dSize);
Q = -B'*feaSet.feaArr;

for iter1 = 1:nSmp,
    % column iter1 of sc_codes = {u_1j, ..., u_Mj}, M: size of the dictionary
    sc_codes(:, iter1) = L1QP_FeatureSign_yang(gamma, A, Q(:, iter1));
end

sc_codes = abs(sc_codes);

% spatial levels
pLevels = length(pyramid);
% spatial bins on each level, e.g. [1 4 16]
pBins = pyramid.^2;
% total spatial bins
tBins = sum(pBins);

beta = zeros(dSize, tBins);
bId = 0;

for iter1 = 1:pLevels,

    nBins = pBins(iter1);

    wUnit = img_width / pyramid(iter1);
    hUnit = img_height / pyramid(iter1);

    % find to which spatial bin each local descriptor belongs
    xBin = ceil(feaSet.x / wUnit);
    yBin = ceil(feaSet.y / hUnit);
    idxBin = (yBin - 1)*pyramid(iter1) + xBin;

    for iter2 = 1:nBins,
        bId = bId + 1;
        sidxBin = find(idxBin == iter2);
        if isempty(sidxBin),
            continue;
        end
        % max(x,[],1): column-wise max; max(x,[],2): row-wise max.
        % Max pooling per dictionary atom over this spatial bin.
        beta(:, bId) = max(sc_codes(:, sidxBin), [], 2);
    end
end

if bId ~= tBins,
    error('Index number error!');
end

beta = beta(:);
beta = beta./sqrt(sum(beta.^2));

main函数

% This is an example code for running the ScSPM algorithm described in "Linear 
% Spatial Pyramid Matching using Sparse Coding for Image Classification" (CVPR'09)
%
% Written by Jianchao Yang @ IFP UIUC
% For any questions, please email to jyang29@ifp.illinois.edu.
% 
% Revised May, 2010 by Jianchao Yang

clear all;
clc;

%% set path
addpath('large_scale_svm');
addpath('sift');
addpath(genpath('sparse_coding'));

%% parameter setting

% directory setup
img_dir = 'image';                  % directory for dataset images
data_dir = 'data';                  % directory to save the sift features of the chosen dataset
dataSet = 'Caltech101';

% sift descriptor extraction
skip_cal_sift = true;              % if 'skip_cal_sift' is false, set the following parameter
gridSpacing = 6;
patchSize = 16;
maxImSize = 300;
nrml_threshold = 1;                 % low contrast region normalization threshold (descriptor length)

% dictionary training for sparse coding
skip_dic_training = true;           % have provided dict_Caltech101_1024, so skip dic training
nBases = 1024;
nsmp = 200000;
beta = 1e-5;                        % a small regularization for stablizing sparse coding
num_iters = 50;

% feature pooling parameters
pyramid = [1, 2, 4];                % spatial block number on each level of the pyramid
gamma = 0.15;
knn = 200;                          % find the k-nearest neighbors for approximate sparse coding
                                    % if set 0, use the standard sparse coding

% classification test on the dataset
nRounds = 5;                        % number of random tests
lambda = 0.1;                       % regularization parameter for w
tr_num = 30;                        % training number per category

rt_img_dir = fullfile(img_dir, dataSet);
rt_data_dir = fullfile(data_dir, dataSet);

%% calculate sift features or retrieve the database directory
if skip_cal_sift,
    database = retr_database_dir(rt_data_dir);
else
    [database, lenStat] = CalculateSiftDescriptor(rt_img_dir, rt_data_dir, gridSpacing, patchSize, maxImSize, nrml_threshold);
end;

%% load sparse coding dictionary (one dictionary trained on Caltech101 is provided: dict_Caltech101_1024.mat)
Bpath = ['dictionary/dict_' dataSet '_' num2str(nBases) '.mat'];
Xpath = ['dictionary/rand_patches_' dataSet '_' num2str(nsmp) '.mat'];

if ~skip_dic_training,
    try 
        load(Xpath);
    catch
        % if no pre-sampled patches are provided, sample nsmp patches
        % from the database ourselves
        X = rand_sampling(database, nsmp);
        save(Xpath, 'X');
    end
    [B, S, stat] = reg_sparse_coding(X, nBases, eye(nBases), beta, gamma, num_iters);
    save(Bpath, 'B', 'S', 'stat');
else
    load(Bpath);
end
nBases = size(B, 2);                    % size of the dictionary

%% calculate the sparse coding feature
dimFea = sum(nBases*pyramid.^2);        % feature dimension per image: 1024*(1+4+16)
numFea = length(database.path);         % number of images; database.path contains the path of each image of each class
sc_fea = zeros(dimFea, numFea);         % one feature per column, numFea features in total
sc_label = zeros(numFea, 1);

disp('==================================================');
fprintf('Calculating the sparse coding feature...\n');
fprintf('Regularization parameter: %f\n', gamma);
disp('==================================================');

for iter1 = 1:numFea,  
    if ~mod(iter1, 50),
        fprintf('.\n');
    else
        fprintf('.');
    end;
    fpath = database.path{iter1};
    load(fpath);
    if knn,
        sc_fea(:, iter1) = sc_approx_pooling(feaSet, B, pyramid, gamma, knn);
    else
        sc_fea(:, iter1) = sc_pooling(feaSet, B, pyramid, gamma);
    end
    sc_label(iter1) = database.label(iter1);
end;

%% evaluate the performance of the computed feature using linear SVM
[dimFea, numFea] = size(sc_fea);

clabel = unique(sc_label);
nclass = length(clabel);
accuracy = zeros(nRounds, 1);

for ii = 1:nRounds,
    fprintf('Round: %d...\n', ii);
    tr_idx = [];
    ts_idx = [];
    
    for jj = 1:nclass,
        idx_label = find(sc_label == clabel(jj));
        num = length(idx_label);
        
        idx_rand = randperm(num);
        
        tr_idx = [tr_idx; idx_label(idx_rand(1:tr_num))];
        ts_idx = [ts_idx; idx_label(idx_rand(tr_num+1:end))];
    end;
    
    tr_fea = sc_fea(:, tr_idx);
    tr_label = sc_label(tr_idx);
    
    ts_fea = sc_fea(:, ts_idx);
    ts_label = sc_label(ts_idx);
    
    [w, b, class_name] = li2nsvm_multiclass_lbfgs(tr_fea', tr_label, lambda); % train multi-class via lbfgs, one-against-all decomposition
    [C, Y] = li2nsvm_multiclass_fwd(ts_fea', w, b, class_name); % make multi-class prediction
    acc = zeros(length(class_name), 1);
    
    % per-class accuracy, then averaged (mean-per-class accuracy)
    for jj = 1 : length(class_name)
        c = class_name(jj);
        idx = find(ts_label == c);
        curr_pred_label = C(idx);
        curr_gnd_label = ts_label(idx);    
        acc(jj) = length(find(curr_pred_label == curr_gnd_label))/length(idx);
    end; 
    
    accuracy(ii) = mean(acc); 
end;

fprintf('Mean accuracy: %f\n', mean(accuracy));
fprintf('Standard deviation: %f\n', std(accuracy));