数据挖掘-Knn算法实现

来源:互联网 发布:php外卖订餐系统源码 编辑:程序博客网 时间:2024/04/29 18:21
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Minimal k-nearest-neighbour classifier.
 *
 * <p>Training samples are kept as two parallel lists: {@code data_tag} holds the class
 * label of each sample and {@code data_var} holds its feature vector as a single
 * space-separated string of numbers. Features are min-max scaled before Euclidean
 * distances are computed.
 */
public class Knn {

    /** Input file read by the no-argument constructor. */
    private static final String DEFAULT_INPUT_PATH = "F:/数据挖掘--算法实现/Knn算法/input.txt";

    /** Feature vectors of the training samples, one space-separated string per sample. */
    List<String> data_var = new ArrayList<String>();

    /** Class labels of the training samples, parallel to {@link #data_var}. */
    List<String> data_tag = new ArrayList<String>();

    /**
     * Loads the training data from the default input file.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public Knn() throws IOException {
        this(DEFAULT_INPUT_PATH);
    }

    /**
     * Loads the training data from the given file. Each non-blank line must have the
     * form {@code "<label> <v1> <v2> ..."}: the text before the first space is the
     * label, the remainder is the feature vector.
     *
     * @param inputPath path of the training file
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalArgumentException if a non-blank line has no feature part
     */
    public Knn(String inputPath) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(inputPath));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // A trailing or stray blank line is not a sample; skip it instead of crashing.
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = line.split(" ", 2);
                if (parts.length < 2) {
                    throw new IllegalArgumentException(
                            "Expected \"<label> <features...>\" but got: " + line);
                }
                this.data_tag.add(parts[0]);
                this.data_var.add(parts[1]);
            }
        } finally {
            // Always release the file handle, even when reading or parsing fails.
            reader.close();
        }
    }

    /**
     * Builds a classifier from in-memory training data.
     *
     * @param tags class label of each sample
     * @param vars space-separated feature vector of each sample, parallel to {@code tags}
     * @throws IllegalArgumentException if either list is null or the sizes differ
     */
    public Knn(List<String> tags, List<String> vars) {
        if (tags == null || vars == null) {
            throw new IllegalArgumentException("tags and vars must not be null");
        }
        if (tags.size() != vars.size()) {
            throw new IllegalArgumentException(
                    "tags and vars differ in size: " + tags.size() + " vs " + vars.size());
        }
        this.data_tag.addAll(tags);
        this.data_var.addAll(vars);
    }

    /**
     * In-place quicksort of {@code list_sort[low..high]} in ascending order. Every move
     * is mirrored on {@code list_tag}, so after the call the labels are ordered by
     * increasing distance.
     *
     * @param list_sort values to sort (distances)
     * @param low       first index of the range, inclusive
     * @param high      last index of the range, inclusive
     * @param list_tag  labels parallel to {@code list_sort}
     */
    public void Quick_sort(List<Float> list_sort, int low, int high, List<String> list_tag) {
        if (low >= high) {
            return;
        }
        int first = low;
        int last = high;
        // The first element is the pivot; its slot becomes the initial "hole".
        float key = list_sort.get(first);
        String key_tag = list_tag.get(first);
        while (first < last) {
            // Scan from the right for an element smaller than the pivot and move it left.
            while (first < last && list_sort.get(last) >= key) {
                --last;
            }
            list_sort.set(first, list_sort.get(last));
            list_tag.set(first, list_tag.get(last));
            // Scan from the left for an element larger than the pivot and move it right.
            while (first < last && list_sort.get(first) <= key) {
                ++first;
            }
            list_sort.set(last, list_sort.get(first));
            list_tag.set(last, list_tag.get(first));
        }
        // The hole is now the pivot's final position.
        list_sort.set(first, key);
        list_tag.set(first, key_tag);
        Quick_sort(list_sort, low, first - 1, list_tag);
        Quick_sort(list_sort, first + 1, high, list_tag);
    }

    /**
     * Classifies one sample: returns the most frequent label among the {@code k}
     * training samples closest to it.
     *
     * <p>The query is scaled together with the training data, but it is never ranked
     * as its own neighbour. {@code k} larger than the training set is treated as
     * "use all training samples".
     *
     * @param predict_var space-separated feature vector of the sample to classify
     * @param k           number of neighbours that vote
     * @return the winning label, or an empty string when no neighbour can vote
     *         ({@code k <= 0} or no training data)
     */
    public String Knn_method(String predict_var, int k) {
        int trainingSize = this.data_var.size();

        // Scale training samples and the query together so they share min/max per feature.
        List<String> samples = new ArrayList<String>(this.data_var);
        samples.add(predict_var);
        List<String> scaled = Standard(samples);
        String scaledQuery = scaled.get(trainingSize);

        List<Float> distances = new ArrayList<Float>(trainingSize);
        for (int i = 0; i < trainingSize; i++) {
            distances.add(Eu_distance(scaledQuery, scaled.get(i)));
        }

        // Sort a copy of the labels by distance; the stored training labels stay untouched.
        List<String> rankedTags = new ArrayList<String>(this.data_tag);
        Quick_sort(distances, 0, trainingSize - 1, rankedTags);
        return K_Near(rankedTags, k);
    }

    /**
     * Returns the label that occurs most often among the first {@code k} entries of
     * {@code data_tag_stand}. When several labels share the highest count, the one
     * met first while iterating the vote map wins.
     *
     * @param data_tag_stand labels ordered by increasing distance
     * @param k              number of leading labels to count; clamped to the list size
     * @return the most frequent label, or an empty string if nothing was counted
     */
    private String K_Near(List<String> data_tag_stand, int k) {
        int limit = Math.min(k, data_tag_stand.size());
        Map<String, Integer> votes = new HashMap<String, Integer>();
        for (int i = 0; i < limit; i++) {
            String tag = data_tag_stand.get(i);
            Integer seen = votes.get(tag);
            votes.put(tag, seen == null ? 1 : seen + 1);
        }

        String winner = "";
        int best = 0;
        for (Map.Entry<String, Integer> vote : votes.entrySet()) {
            if (vote.getValue() > best) {
                winner = vote.getKey();
                best = vote.getValue();
            }
        }
        return winner;
    }

    /**
     * Euclidean distance between two space-separated feature vectors. The number of
     * components of {@code a} determines how many dimensions are compared.
     *
     * @param a first vector
     * @param b second vector; must have at least as many components as {@code a}
     * @return the distance
     * @throws IllegalArgumentException if {@code b} has fewer components than {@code a}
     * @throws NumberFormatException if a component is not a valid float
     */
    public float Eu_distance(String a, String b) {
        String[] left = a.split(" ");
        String[] right = b.split(" ");
        if (right.length < left.length) {
            throw new IllegalArgumentException(
                    "Vector \"" + b + "\" has fewer components than \"" + a + "\"");
        }
        float sum = 0;
        for (int i = 0; i < left.length; i++) {
            float diff = Float.parseFloat(left[i]) - Float.parseFloat(right[i]);
            sum = sum + diff * diff;
        }
        return (float) Math.sqrt(sum);
    }

    /**
     * Min-max scales every feature column: {@code (v - min) / (max - min)}, which
     * removes the effect of differing units. A column whose values are all equal is
     * mapped to 0 instead of dividing by zero.
     *
     * @param data_var_stand space-separated feature vectors; the first row fixes the
     *                       number of columns
     * @return the scaled vectors in the same order and format; empty for empty input
     * @throws IllegalArgumentException if a row has fewer columns than the first row
     * @throws NumberFormatException if a component is not a valid float
     */
    public List<String> Standard(List<String> data_var_stand) {
        List<String> result = new ArrayList<String>();
        if (data_var_stand == null || data_var_stand.isEmpty()) {
            return result;
        }

        int rows = data_var_stand.size();
        int cols = data_var_stand.get(0).split(" ").length;

        // Parse every row exactly once instead of re-splitting the strings per cell.
        float[][] values = new float[rows][cols];
        for (int i = 0; i < rows; i++) {
            String[] fields = data_var_stand.get(i).split(" ");
            if (fields.length < cols) {
                throw new IllegalArgumentException(
                        "Row " + i + " has " + fields.length + " columns, expected " + cols);
            }
            for (int j = 0; j < cols; j++) {
                values[i][j] = Float.parseFloat(fields[j]);
            }
        }

        float[] min = values[0].clone();
        float[] max = values[0].clone();
        for (int i = 1; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                if (values[i][j] > max[j]) {
                    max[j] = values[i][j];
                }
                if (values[i][j] < min[j]) {
                    min[j] = values[i][j];
                }
            }
        }

        for (int i = 0; i < rows; i++) {
            StringBuilder row = new StringBuilder();
            for (int j = 0; j < cols; j++) {
                float range = max[j] - min[j];
                // A constant column carries no information; avoid 0/0 = NaN.
                float scaledValue = (range == 0f) ? 0f : (values[i][j] - min[j]) / range;
                if (j > 0) {
                    row.append(' ');
                }
                row.append(String.valueOf(scaledValue));
            }
            result.add(row.toString());
        }
        return result;
    }

    /**
     * Classifies every training sample with k = 5 and prints the predicted label next
     * to the true label.
     *
     * @param args unused
     * @throws IOException if the default input file cannot be read
     */
    public static void main(String[] args) throws IOException {
        Knn a = new Knn();
        for (int i = 0; i < a.data_tag.size(); i++) {
            System.out.print("预测类:" + a.Knn_method(a.data_var.get(i), 5));
            System.out.println("    真实类:" + a.data_tag.get(i));
        }
    }
}

训练样本:

每行格式:类别 变量1 变量2 变量3(以空格分隔,第一列为类别,其余各列为变量)

1 1.5 1.2 0.3
1 2.5 0.3 0.0
1 0.8 0.2 0.3
1 1.1 0.3 0.0
1 1.1 0.2 1.0
2 0.3 2.2 0.3
2 0.1 1.2 1.3
2 0.5 1.3 0.3
2 1.1 1.3 1.0
2 1.5 1.6 0.3
2 1.1 1.4 1.0
3 1.9 3.9 1.0
3 1.1 2.4 1.0
3 2.1 2.9 1.8
4 2.5 0.2 1.3
4 3.1 1.3 1.0
4 3.5 1.2 1.3
4 2.5 1.2 0.3
4 2.5 1.5 0.6
4 3.1 1.7 1.4
4 4.5 1.8 1.3
4 3.5 1.1 0.4
4 4.1 0.8 1.2
5 1.3 1.3 1.0
5 2.2 1.3 1.0
5 1.4 1.2 1.3
5 1.6 1.2 1.9
5 1.7 1.3 1.0
5 2.1 1.5 1.0


对训练样本进行检验(k=5),输出:

预测类:2    真实类:1
预测类:1    真实类:1
预测类:1    真实类:1
预测类:1    真实类:1
预测类:2    真实类:1
预测类:2    真实类:2
预测类:2    真实类:2
预测类:2    真实类:2
预测类:5    真实类:2
预测类:2    真实类:2
预测类:5    真实类:2
预测类:3    真实类:3
预测类:2    真实类:3
预测类:3    真实类:3
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:4    真实类:4
预测类:5    真实类:5
预测类:5    真实类:5
预测类:5    真实类:5
预测类:4    真实类:5
预测类:5    真实类:5
预测类:5    真实类:5

0 0
原创粉丝点击