PCA主成分分析

来源：互联网　发布：photoshop mac版破解　编辑：程序博客网　时间：2024/05/17 23:14
不废话，先上代码。
// ===== File: Main.java =====
package com.pca;

/**
 * Demo driver: runs {@link PrincipalComponentAnalysis} on a small hard-coded
 * data set (15 samples x 8 attributes) and prints every intermediate result,
 * each section separated by a "======" line.
 */
public class Main {

    public static void main(String[] args) {
        // Raw data: one row per sample, one column per attribute.
        double[][] rawData = new double[][] {
                { 40.4, 24.7, 7.2, 6.1, 8.3, 8.7, 2.442, 20.0 },
                { 25.0, 12.7, 11.2, 11.0, 12.9, 20.2, 3.542, 9.1 },
                { 13.2, 3.3, 3.9, 4.3, 4.4, 5.5, 0.578, 3.6 },
                { 22.3, 6.7, 5.6, 3.7, 6.0, 7.4, 0.176, 7.3 },
                { 34.3, 11.8, 7.1, 7.1, 8.0, 8.9, 1.726, 27.5 },
                { 35.6, 12.5, 16.4, 16.7, 22.8, 29.3, 3.017, 26.6 },
                { 22.0, 7.8, 9.9, 10.2, 12.6, 17.6, 0.847, 10.6 },
                { 48.4, 13.4, 10.9, 9.9, 10.9, 13.9, 1.772, 1.772 },
                { 40.6, 19.1, 19.8, 19.0, 29.7, 39.6, 2.449, 35.8 },
                { 24.8, 8.0, 9.8, 8.9, 11.9, 16.2, 0.789, 13.7 },
                { 12.5, 9.7, 4.2, 4.2, 4.6, 6.5, 0.874, 3.9 },
                { 1.8, 0.6, 0.7, 0.7, 0.8, 1.1, 0.056, 1.0 },
                { 32.3, 13.9, 9.4, 8.3, 9.8, 13.3, 2.126, 17.1 },
                { 38.5, 9.1, 11.3, 9.5, 12.2, 16.4, 1.327, 11.6 },
                { 26.2, 10.1, 5.6, 15.6, 7.7, 30.1, 0.126, 25.9 } };

        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.buildPCA(rawData);

        // Indices of the selected eigenvalues.
        int[] selected = pca.getSelected();
        for (int i = 0; i < selected.length; i++) {
            System.out.print(selected[i] + " ");
        }
        System.out.println();
        System.out.println("======");

        // All eigenvalues.
        printVector(pca.getEigenValues());
        System.out.println("======");

        // All eigenvectors.
        printMatrix(pca.getEigenVectors());
        System.out.println();
        System.out.println("======");

        // Eigenvalues of the selected principal components.
        printVector(pca.getPrincipalEigenValues());
        System.out.println("======");

        // Eigenvectors of the selected principal components.
        printMatrix(pca.getPrincipalEigenVectors());
        System.out.println();
        System.out.println("======");

        // Data projected onto the principal components.
        printMatrix(pca.getPrincipalData());
    }

    /** Prints the values on one line, each followed by a space, then a newline. */
    private static void printVector(double[] values) {
        for (int i = 0; i < values.length; i++) {
            System.out.print(values[i] + " ");
        }
        System.out.println();
    }

    /** Prints the matrix row by row, one line per row. */
    private static void printMatrix(double[][] matrix) {
        for (int i = 0; i < matrix.length; i++) {
            // Column count is taken from the first row, as the matrices are rectangular.
            for (int j = 0; j < matrix[0].length; j++) {
                System.out.print(matrix[i][j] + " ");
            }
            System.out.println();
        }
    }
}

// ===== File: PrincipalComponentAnalysis.java =====
package com.pca;

import no.uib.cipr.matrix.DenseMatrix;
import no.uib.cipr.matrix.EVD;
import no.uib.cipr.matrix.Matrix;
import no.uib.cipr.matrix.NotConvergedException;
import org.apache.log4j.Logger;

/**
 * Principal component analysis based on the sample correlation matrix.
 *
 * <p>Usage: construct (optionally with a target proportion), call
 * {@link #buildPCA(double[][])}, then read the results through the getters.
 * The getters return the internal arrays directly (no defensive copies).
 */
public class PrincipalComponentAnalysis {

    private static final Logger logger = Logger
            .getLogger(PrincipalComponentAnalysis.class);

    private double[][] rawData = null; // raw input data
    private double[][] principalData = null; // data after principal component extraction
    private double[] principalEigenValues = null; // eigenvalues of the selected components
    private double[][] principalEigenVectors = null; // eigenvectors of the selected components
    private double[] eigenValues = null; // all eigenvalues
    private double[][] eigenVectors = null; // all eigenvectors (one per column)
    private int[] selected = null; // indices of the selected eigenvalues
    private double proportion = 0.9; // cumulative proportion threshold, 90% by default

    /**************** Constructors: configure parameters ****************/

    /** Creates an analysis with the default proportion threshold of 0.9. */
    public PrincipalComponentAnalysis() {
    }

    /**
     * Creates an analysis with a custom proportion threshold.
     *
     * @param proportion cumulative eigenvalue proportion used when selecting
     *                   principal components
     */
    public PrincipalComponentAnalysis(double proportion) {
        this.proportion = proportion;
    }

    /**************** Getters ****************/

    public double[][] getRawData() {
        return rawData;
    }

    public double[][] getPrincipalData() {
        return principalData;
    }

    public double[] getPrincipalEigenValues() {
        return principalEigenValues;
    }

    public double[][] getPrincipalEigenVectors() {
        return principalEigenVectors;
    }

    public double[] getEigenValues() {
        return eigenValues;
    }

    public double[][] getEigenVectors() {
        return eigenVectors;
    }

    public int[] getSelected() {
        return selected;
    }

    public double getProportion() {
        return proportion;
    }

    /**************** Internal PCA steps ****************/

    /**
     * Standardizes the raw data column by column: subtracts the column mean
     * and divides by the sample standard deviation (N - 1 denominator).
     *
     * @param rawData N x p matrix, rows are samples and columns are attributes
     * @return the standardized N x p matrix, or {@code null} when
     *         {@code rawData} is {@code null} or has no rows
     */
    private double[][] calcStandardlizer(double[][] rawData) {
        if (rawData == null || rawData.length == 0) {
            logger.info("There is no raw data.");
            return null;
        }

        int N = rawData.length; // number of rows = number of samples
        int p = rawData[0].length; // number of columns = number of attributes
        // Each column is a sample of one attribute and approximates its distribution.
        double[] average = new double[p]; // mean of each column
        double[] var = new double[p]; // variance of each column
        double[][] standardData = new double[N][p];

        // Column means.
        for (int k = 0; k < p; k++) {
            double sum = 0;
            for (int i = 0; i < N; i++) {
                sum += rawData[i][k];
            }
            average[k] = sum / N;
        }

        // Column sample variances.
        for (int k = 0; k < p; k++) {
            double squares = 0;
            for (int i = 0; i < N; i++) {
                double deviation = rawData[i][k] - average[k];
                squares += deviation * deviation;
            }
            var[k] = squares / (N - 1);
        }

        // Standardized matrix: mean 0 and variance 1 per column, which
        // simplifies the correlation coefficient formula.
        // NOTE(review): a column with zero variance (or N == 1) produces
        // non-finite values here; no guard is applied.
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < p; j++) {
                standardData[i][j] = (rawData[i][j] - average[j])
                        / Math.sqrt(var[j]);
            }
        }
        return standardData;
    }

    /**
     * Computes the sample correlation matrix: every pair of the p columns is
     * combined, giving a p x p matrix. The input must already be
     * standardized, so the coefficient reduces to a scaled dot product.
     *
     * @param standardData standardized n x p matrix, may be {@code null}
     * @return the p x p correlation matrix, or {@code null} for null input
     */
    private double[][] calcCoefficientOfAssociation(double[][] standardData) {
        double[][] assosiationMatrix = null;
        if (standardData != null) {
            int n = standardData.length; // number of rows
            int p = standardData[0].length; // number of columns
            assosiationMatrix = new double[p][p];
            for (int i = 0; i < p; i++) {
                for (int j = 0; j < p; j++) {
                    double dot = 0;
                    for (int k = 0; k < n; k++) {
                        dot += standardData[k][i] * standardData[k][j];
                    }
                    assosiationMatrix[i][j] = dot / (n - 1);
                }
            }
        }
        return assosiationMatrix;
    }

    /**
     * Computes the eigenvalues of the (square) correlation matrix.
     *
     * @return the real parts of the eigenvalues, or {@code null} when the
     *         input is {@code null} or the decomposition does not converge
     */
    private double[] calcEigenValue(double[][] assosiationMatrix) {
        double[] eigenValues = null;
        if (assosiationMatrix != null) {
            DenseMatrix association = new DenseMatrix(assosiationMatrix);
            EVD evd = new EVD(assosiationMatrix.length);
            try {
                evd.factor(association);
                eigenValues = evd.getRealEigenvalues();
            } catch (NotConvergedException e) {
                logger.error("Eigenvalue decomposition did not converge.", e);
            }
        }
        return eigenValues;
    }

    /**
     * Computes the eigenvectors of the (square) correlation matrix.
     *
     * @return a len x len matrix copied from the decomposition's left
     *         eigenvectors, or {@code null} when the input is {@code null}
     *         or the decomposition does not converge
     */
    private double[][] calcEigenVector(double[][] assosiationMatrix) {
        double[][] eigenVectors = null;
        if (assosiationMatrix != null) {
            DenseMatrix association = new DenseMatrix(assosiationMatrix);
            int len = assosiationMatrix.length;
            EVD evd = new EVD(len);
            try {
                evd.factor(association);
                DenseMatrix vectors = evd.getLeftEigenvectors();
                eigenVectors = new double[len][len];
                for (int i = 0; i < len; i++) {
                    for (int j = 0; j < len; j++) {
                        eigenVectors[i][j] = vectors.get(i, j);
                    }
                }
            } catch (NotConvergedException e) {
                logger.error("Eigenvector decomposition did not converge.", e);
            }
        }
        return eigenVectors;
    }

    /**
     * Selects the indices of the largest eigenvalues. Eigenvalues are taken
     * in descending order; one more is added for as long as the cumulative
     * share accumulated so far is still {@code <= proportion}.
     *
     * @param eigenValues all eigenvalues, in decomposition order
     * @return indices into {@code eigenValues} of the selected components,
     *         ordered from largest to smallest eigenvalue
     */
    private int[] selectPrincipalComponent(double[] eigenValues) {
        int p = eigenValues.length; // number of eigenvalues
        int[] sortedEigenValueIndex = new int[p]; // indices ordered by descending eigenvalue
        for (int i = 0; i < p; i++) {
            sortedEigenValueIndex[i] = i;
        }
        double[] sorted = new double[p]; // working copy of the eigenvalues
        System.arraycopy(eigenValues, 0, sorted, 0, p);

        // Descending bubble sort that keeps the index array in step.
        // Pass number `pass` must compare pairs up to position p - pass;
        // the previous bound (p - pass - 1) never examined the last element.
        for (int pass = 1; pass < p; pass++) {
            for (int j = 0; j < p - pass; j++) {
                if (sorted[j] < sorted[j + 1]) {
                    double value = sorted[j];
                    sorted[j] = sorted[j + 1];
                    sorted[j + 1] = value;
                    int index = sortedEigenValueIndex[j];
                    sortedEigenValueIndex[j] = sortedEigenValueIndex[j + 1];
                    sortedEigenValueIndex[j + 1] = index;
                }
            }
        }

        double total = 0.0; // sum of all eigenvalues
        for (int i = 0; i < p; i++) {
            total += sorted[i];
        }

        int count = 0;
        double sum = 0.0;
        for (int i = 0; i < p; i++) {
            if (sum / total <= proportion) {
                sum += sorted[i];
                count++;
            } else {
                // The running sum no longer changes, so nothing more is selected.
                break;
            }
        }

        int[] chosen = new int[count];
        System.arraycopy(sortedEigenValueIndex, 0, chosen, 0, count);
        return chosen;
    }

    /**
     * Picks the eigenvalues of the selected principal components.
     *
     * @param eigenValues all eigenvalues
     * @param selected    indices into {@code eigenValues}
     */
    private double[] calcPrincipalEigenValues(double[] eigenValues,
            int[] selected) {
        double[] principalEigenValues = new double[selected.length];
        for (int i = 0; i < selected.length; i++) {
            principalEigenValues[i] = eigenValues[selected[i]];
        }
        return principalEigenValues;
    }

    /**
     * Builds the transformation matrix from the selected eigenvector columns.
     *
     * @param eigenVectors p x p matrix of all eigenvectors
     * @param selected     column indices to keep
     * @return a p x selected.length matrix
     */
    private double[][] calcPrincipalEigenVectors(double[][] eigenVectors,
            int[] selected) {
        int p = eigenVectors.length;
        double[][] principalEigenVectors = new double[p][selected.length];
        for (int i = 0; i < selected.length; i++) {
            for (int j = 0; j < p; j++) {
                principalEigenVectors[j][i] = eigenVectors[j][selected[i]];
            }
        }
        return principalEigenVectors;
    }

    /**
     * Projects the data onto the selected principal components by
     * multiplying it with {@code principalEigenVectors}.
     *
     * <p>NOTE(review): this multiplies the raw (unstandardized) data, while
     * the eigenvectors come from the correlation matrix of the standardized
     * data. Projecting the standardized data looks more consistent - confirm
     * the intent before changing, since it alters the output.
     *
     * @param rawData N x p data matrix
     * @return the N x k projected data, k being the number of selected components
     */
    private double[][] calcPrincipalComponent(double[][] rawData) {
        Matrix A = new DenseMatrix(rawData);
        Matrix B = new DenseMatrix(principalEigenVectors);
        Matrix C = new DenseMatrix(rawData.length,
                principalEigenVectors[0].length);
        A.mult(B, C); // C = A * B

        double[][] principalData = new double[C.numRows()][C.numColumns()];
        for (int i = 0; i < C.numRows(); i++) {
            for (int j = 0; j < C.numColumns(); j++) {
                principalData[i][j] = C.get(i, j);
            }
        }
        return principalData;
    }

    /**************** Main PCA flow ****************/

    /**
     * Runs the whole analysis and stores every intermediate result in this
     * object: standardize, build the correlation matrix, decompose it,
     * select the principal components and project the data.
     *
     * @param rawData N x p matrix, rows are samples and columns are attributes
     * @throws IllegalStateException when no eigen decomposition is available,
     *                               i.e. {@code rawData} is null or empty, or
     *                               the decomposition did not converge
     */
    public void buildPCA(double[][] rawData) {
        this.rawData = rawData;
        double[][] standardData = calcStandardlizer(rawData);
        double[][] assosiationMatrix = calcCoefficientOfAssociation(standardData);
        this.eigenValues = calcEigenValue(assosiationMatrix);
        this.eigenVectors = calcEigenVector(assosiationMatrix);
        if (this.eigenValues == null || this.eigenVectors == null) {
            // Fail with a clear message instead of a NullPointerException below.
            throw new IllegalStateException(
                    "PCA failed: no raw data or eigen decomposition did not converge.");
        }
        this.selected = selectPrincipalComponent(eigenValues);
        this.principalEigenValues = calcPrincipalEigenValues(eigenValues,
                selected);
        this.principalEigenVectors = calcPrincipalEigenVectors(eigenVectors,
                selected);
        this.principalData = calcPrincipalComponent(rawData);
    }
}
PCA的原理和过程不难理解
1. 先将原始数据按列标准化（均值为0、方差为1），再计算其相关系数矩阵
2. 求相关系数矩阵的特征值和特征向量
3. 选择最大的n个特征值所对应的特征向量，组成转换矩阵
4. 原始数据矩阵和转换矩阵相乘，得到结果
本质上，PCA是一个坐标转换：将原来不好的坐标系转换为好的坐标系，使每条样本的原始属性变为新属性。新的坐标系要满足以下两点：
第一，各个属性之间线性不相关（注意：不相关并不等同于相互独立）
第二，使得各个属性方差较大，数据具有区分度
0 0