
来源:互联网 发布:帮人送东西的软件 编辑:程序博客网 时间:2024/05/04 06:19







2、对数据进行重采样:对小类的数据样本进行采样来增加小类的数据样本个数,即过采样(over-sampling,采样的个数大于该类样本的个数)。对大类的数据样本进行采样来减少该类数据样本的个数,即欠采样(under-sampling,采样的个数少于该类样本的个数)。




强烈建议不要对每一个分类问题都使用自己喜欢而熟悉的分类算法。应该使用不同的算法对其进行比较,因为不同的算法适用于不同的任务与数据。决策树往往在类别不均衡数据上表现不错。它使用基于类变量的划分规则去创建分类树,因此可以强制地将不同类别的样本分开。目前流行的决策树算法有:C4.5、C5.0、CART和Random Forest等。




/**
 * This file is part of the Java Machine Learning Library
 * 
 * The Java Machine Learning Library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * The Java Machine Learning Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with the Java Machine Learning Library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * Copyright (c) 2006-2012, Thomas Abeel
 * 
 * Project: 
 * 
 */
package com.gddx;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import be.abeel.util.Pair;
import net.sf.javaml.classification.Classifier;
import net.sf.javaml.classification.KNearestNeighbors;
import net.sf.javaml.core.Dataset;
import net.sf.javaml.core.DefaultDataset;
import net.sf.javaml.core.DenseInstance;
import net.sf.javaml.core.Instance;
import net.sf.javaml.sampling.Sampling;
import net.sf.javaml.tools.data.FileHandler;

/**
 * This tutorial shows how to use the k-nearest neighbors classifier together
 * with repeated sub-sampling and majority voting.
 * 
 * @author Thomas Abeel
 * 
 */
public class TutorialKNN {

    /** Location of the labelled data set used for sampling and training. */
    private static final String TRAINING_PATH = "D:\\tmp\\javaml-0.1.7-src\\UCI-small\\iris\\";

    /** Location of the data set whose rows are to be classified. */
    private static final String PREDICTION_PATH = "D:\\tmp\\javaml-0.1.7-src\\UCI-small\\iris\\";

    /** Column separator used by both input files. */
    private static final String SEPARATOR = ",";

    /** Index of the class-label column in the training file. */
    private static final int CLASS_INDEX = 4;

    /** Number of leading feature columns copied from each prediction row. */
    private static final int FEATURE_COUNT = 4;

    /** Index of the sample-id column in the prediction file. */
    private static final int ID_COLUMN = 4;

    /** Number of neighbours passed to the KNN classifier. */
    private static final int NEIGHBORS = 5;

    /** How many times the sample / train / predict cycle is repeated. */
    private static final int SAMPLING_ROUNDS = 10;

    /** Fraction of the training set requested from the sampler in each round. */
    private static final double SAMPLE_FRACTION = 0.8;

    /**
     * Class labels that take part in the vote. The order matters: when two
     * labels have the same number of votes, the one listed first wins.
     */
    private static final String[] CLASS_LABELS = { "Iris-setosa", "Iris-versicolor", "Iris-virginica" };

    /**
     * Trains a KNN classifier on repeated sub-samples of the training data,
     * prints the correct/wrong counts of every round, lets every round vote
     * on the class of each prediction row, and finally prints the majority
     * class per sample id.
     * 
     * @param args
     *            unused
     * @throws Exception
     *             if loading a data set fails
     */
    public static void main(String[] args) throws Exception {
        /* Load the labelled data set. */
        Dataset data = FileHandler.loadDataset(new File(TRAINING_PATH), CLASS_INDEX, SEPARATOR);
        /* The prediction rows never change between rounds, so load them once. */
        Dataset unlabeled = FileHandler.loadDataset(new File(PREDICTION_PATH), SEPARATOR);

        Sampling sampler = Sampling.SubSampling;
        int sampleSize = (int) (data.size() * SAMPLE_FRACTION);

        /* Sample id -> votes per label, indexed like CLASS_LABELS; sorted by id. */
        Map<Integer, int[]> votes = new TreeMap<Integer, int[]>();

        for (int round = 0; round < SAMPLING_ROUNDS; round++) {
            /* x() is used for training and y() for evaluation below. */
            Pair<Dataset, Dataset> split = sampler.sample(data, sampleSize);
            Classifier knn = new KNearestNeighbors(NEIGHBORS);
            knn.buildClassifier(split.x());

            /* Counters for correct and wrong predictions. */
            int correct = 0, wrong = 0;
            /* Classify all evaluation instances and check with the correct class values. */
            for (Instance inst : split.y()) {
                Object predictedClassValue = knn.classify(inst);
                Object realClassValue = inst.classValue();
                if (predictedClassValue.equals(realClassValue)) {
                    correct++;
                } else {
                    wrong++;
                }
            }
            System.out.println("Correct predictions  " + correct);
            System.out.println("Wrong predictions " + wrong);

            /* Predict every unlabeled row and record this round's vote. */
            for (Instance inst : unlabeled) {
                /* Only the feature columns are passed on; the id column is kept aside. */
                double[] features = new double[FEATURE_COUNT];
                for (int i = 0; i < FEATURE_COUNT; i++) {
                    features[i] = inst.value(i);
                }
                Object predicted = knn.classify(new DenseInstance(features));
                recordVote(votes, (int) inst.value(ID_COLUMN), predicted);
            }
        }

        /* After all rounds, decide each sample's class by majority vote. */
        for (Map.Entry<Integer, int[]> entry : votes.entrySet()) {
            System.out.println("样本:" + entry.getKey() + "的类别是:" + majorityLabel(entry.getValue()));
        }
    }

    /**
     * Adds one vote for {@code predicted} to the tally of sample {@code id}.
     * Predictions that are not one of {@link #CLASS_LABELS} are ignored.
     */
    private static void recordVote(Map<Integer, int[]> votes, int id, Object predicted) {
        int labelIndex = indexOfLabel(String.valueOf(predicted));
        if (labelIndex < 0) {
            return;
        }
        int[] tally = votes.get(id);
        if (tally == null) {
            tally = new int[CLASS_LABELS.length];
            votes.put(id, tally);
        }
        tally[labelIndex]++;
    }

    /** Returns the position of {@code label} in {@link #CLASS_LABELS}, or -1 if absent. */
    private static int indexOfLabel(String label) {
        for (int i = 0; i < CLASS_LABELS.length; i++) {
            if (CLASS_LABELS[i].equals(label)) {
                return i;
            }
        }
        return -1;
    }

    /** Returns the label with the most votes; on a tie the earlier label wins. */
    private static String majorityLabel(int[] tally) {
        int best = 0;
        for (int i = 1; i < tally.length; i++) {
            /* Strict comparison keeps the earliest label on ties. */
            if (tally[i] > tally[best]) {
                best = i;
            }
        }
        return CLASS_LABELS[best];
    }
}

