[MATLAB] Simple TF-IDF implementation

来源:互联网 发布:刘縯 刘邦 知乎 编辑:程序博客网 时间:2024/05/16 00:52

The TF-IDF (term frequency–inverse document frequency) word weighting scheme is one of the most widely used ways of normalizing document-term matrices in text mining and information retrieval.

See the Wikipedia article on tf–idf for details.

 

tfidf

 

function Y = tfidf( X )
% FUNCTION computes a TF-IDF weighted version of a count matrix.
%
%   Y = tfidf( X );
%
% INPUT :
%   X        - document-term count matrix with one ROW per document and
%              one COLUMN per term, i.e. X(i, j) is the number of times
%              term j occurs in document i
%
% OUTPUT :
%   Y        - TF-IDF weighted matrix, same size as X
%
% NOTE :
%   tf() rescales every row by its own sum and idf() derives one weight
%   per column from the number of non-zero entries in that column, so the
%   result is the usual TF-IDF only for the row = document orientation
%   described above. If your documents are stored in columns, call
%   tfidf(X.').' instead.
%

% integer / logical / char arithmetic would round or saturate the
% fractional frequencies, so promote such inputs to double first
if ~isfloat(X)
    X = double(X);
end

% term frequencies: each row divided by its row sum
X = tf(X);

% inverse document frequencies: one weight per column of X
I = idf(X);

% scale column j by its weight I(j), for all columns at once
Y = bsxfun(@times, X, reshape(I, 1, []));
 
 
function X = tf(X)
% SUBFUNCTION converts raw counts to relative frequencies, row by row
%
% Every row of X is divided by the sum of that row. With one document per
% row this yields, for each document, the fraction of its term occurrences
% that belongs to each term. Rows whose sum is zero are set to all zeros
% instead of producing NaNs from 0/0.

% integer / logical / char storage cannot hold fractions: promote to double
if ~isfloat(X)
    X = double(X);
end

% total count in each row (column vector, one entry per row)
rowTotal = full(sum(X, 2));

% rows that sum to zero: divide by 1 for now, then blank them out below
isEmptyRow = (rowTotal == 0);
rowTotal(isEmptyRow) = 1;

% divide every row by its own total
X = bsxfun(@rdivide, X, rowTotal);

% avoiding NaNs : zero-sum rows become all zeros
X(isEmptyRow, :) = 0;
 
 
function I = idf(X)
% SUBFUNCTION computes one inverse-document-frequency weight per column
%
%   I(j) = log( m / nz_j )
%
% where m is the number of rows of X and nz_j is the number of non-zero
% entries in column j. With one document per row and one term per column,
% m is the number of documents and nz_j the number of documents that
% contain term j. Columns without any non-zero entry keep the weight 0,
% which avoids log(m / 0) = Inf.
%
% Returns an n-by-1 column vector, n being the number of columns of X.

% m - number of rows    (documents)
% n - number of columns (terms)
[m, n] = size(X);

% number of non-zero entries in every column, as a full n-by-1 vector
nz = reshape(full(sum(X ~= 0, 1)), [], 1);

% weights default to zero; only columns that are actually used get a log
I = zeros(n, 1);
used = (nz > 0);
I(used) = log(m ./ nz(used));