Java 实现关联分析算法 Apriori

来源:互联网 发布:runoob 知乎 编辑:程序博客网 时间:2024/06/02 03:42

package com.dataming.association;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.apache.log4j.Logger;

public class Apriori {

 private static final Logger log = Logger.getLogger(Apriori.class);
 
 private int min_sport = 2;
 
 private List<String> items; //这里面的内容一定要按照顺序存放
 private List<List<Integer>> bitVectorList = new ArrayList<List<Integer>>();
 private List<CFCon> candidateList = new ArrayList<CFCon>();
 private List<CFCon> freqenceList = new ArrayList<CFCon>();
 
 public static void main(String args[]){
  Apriori apriori = new Apriori();
  
  apriori.generateData();
  apriori.apriMain();
  apriori.printFreqItems();
  
 }
 
 private void printFreqItems(){
  CFCon cfcL = freqenceList.get(freqenceList.size() - 1);
  for(CF cf : cfcL.cfList){
   String kk = "";
   List<String> itemList = cf.itemList;
   for(int i = 0; i < itemList.size(); i++){
    if(i == 0){
     kk = itemList.get(i);
    } else {
     kk += "," + itemList.get(i);
    }
   }
   log.info("freqence: " + kk + " supCount:" + cf.supCount);
  }
 }
 
 private void apriMain(){
  //C1
  CFCon cfcC1 = find_frequent_1_itemsets();
  
  candidateList.add(cfcC1);
  CFCon cfcL1 = candidateToFreqent(cfcC1);
  freqenceList.add(cfcL1);
  
  CFCon cfcL = cfcL1;
  
  HashSet<String> set = new HashSet<String>();
  for(int k = 2; cfcL != null && cfcL.cfList != null && cfcL.cfList.size() > 0; k++){
   CFCon cfcCk = getCandateFroFreq(cfcL);
   //为cfcC计数
   for(List<Integer> bitVector : bitVectorList){
    set.clear();
    for(int i = 0; i < items.size(); i++){
     int bit = bitVector.get(i);
     if(bit == 1){
      set.add(items.get(i));
     }
    }
    List<CF> cfList = cfcCk.cfList;
    for(CF cf : cfList){
     List<String> itemList = cf.itemList;
     boolean isAdd = true;
     for(String item : itemList){
      if(!set.contains(item)){
       isAdd = false;
       break;
      }
     }
     if(isAdd)cf.supCount++;
    }
   }
   
   cfcL = candidateToFreqent(cfcCk);
   if(cfcCk.cfList != null && cfcCk.cfList.size() > 0)candidateList.add(cfcCk);
   if(cfcL.cfList != null && cfcL.cfList.size() > 0)freqenceList.add(cfcL);
  }
 }
 
 /**
  * 从L(k-1) 生成 C(k);
  *
  * @param cfc
  * @return
  */
 private CFCon getCandateFroFreq(CFCon cfcL){
  CFCon cfcC = null;
  
  if(cfcL != null){
   cfcC = new CFCon(1, cfcL.iteratNum + 1);
   List<CF> cfList = cfcL.cfList;
   for(int outIndex = 0; outIndex < cfList.size(); outIndex++){
    CF cfOut = cfList.get(outIndex);
    List<String> itemOutList = cfOut.itemList;
    for(int inIndex = outIndex + 1; inIndex < cfList.size(); inIndex++){
     if(outIndex == inIndex) continue;
     
     CF cfIn = cfList.get(inIndex);
     List<String> itemInList = cfIn.itemList;
     
     List<String> itemList = new ArrayList<String>();
     
     boolean same = true;
     for(int index = 0; index < itemOutList.size() - 1; index++){
      String out = itemOutList.get(index);
      String in = itemInList.get(index);
      if(out == null || in == null || !out.equals(in)){
       same = false;
       break;
      }
      itemList.add(out);
     }
     if(same){
      String out = itemOutList.get(itemOutList.size() - 1 );
      String in = itemInList.get(itemInList.size() - 1);
      if(out != null && in != null && !out.equals(in)){
       if(out.compareTo(in) >= 0){
        itemList.add(in);
        itemList.add(out);
       } else {
        itemList.add(out);
        itemList.add(in);
       }
       CF cf = new CF(itemList, 0);
       if(!has_infreqent_subset(itemList, cfcL)){
        cfcC.cfList.add(cf);
       }
      }
     }
    }
   }
  }
  
  return cfcC;
 } 
 
 /**
  * 在L(k-1)查找是否存在,cList(k-1)子集
  *
  * @param cList
  * @param cfc L(k-1)
  * @return
  */
 private boolean has_infreqent_subset(List<String> cList, CFCon cfc){
  HashSet<String> set = new HashSet<String>();
  
  List<CF> cfList = cfc.cfList;
  for(int index = 0; index < cfList.size(); index++){
   CF cf = cfList.get(index);
   List<String> itemList = cf.itemList;
   String key = "";
   boolean first = true;
   for(String item : itemList){
    if(first){
     first = false;
     key = item;
    } else {
     key += "," + item;
    }
   }
   set.add(key);
  }
  
  StringBuilder sb = new StringBuilder();  
  for(int index = 0; index < cList.size(); index++){
   
   sb.delete(0, sb.length());
   boolean first = true;
   for(int index2 = 0; index2 < cList.size(); index2++){
    if(index2 == index)continue;
    else {
     if(first){
      sb.append(cList.get(index2));
      first = false;
     } else {
      sb.append(",");
      sb.append(cList.get(index2));
     }
    }
   }
   boolean setCon = set.contains(sb.toString());
   if(!setCon) return true;
  }
  
  return false;
 }
 
 private class CFCon {
  
  List<CF> cfList;
  int cOrf;  //1.候选集,2,频繁集
  int iteratNum; //迭代次数
  
  public CFCon(int cOrf, int iteratNum){
   cfList = new ArrayList<CF>();
   this.cOrf = cOrf;
   this.iteratNum = iteratNum;
  }
  
  public CFCon(int n, int cOrf, int iteratNum){
   this.cOrf = cOrf;
   this.iteratNum = iteratNum;
   cfList = new ArrayList<CF>();
   for(int index = 0; index < n; index++){
    List<String> itemList = new ArrayList<String>();
    itemList.add(items.get(index));
    
    CF cf = new CF(itemList, 0);
    
    cfList.add(cf);
   }
  }
 }
 
 private class CF {
  List<String> itemList;
  int supCount;
  
  public CF(List<String> itemList, int supCount){
   this.itemList = itemList;
   this.supCount = supCount;
  }
 }
 
 private CFCon find_frequent_1_itemsets(){
  
  CFCon cfc = null;
  if(bitVectorList != null && items != null){
   cfc = new CFCon(items.size(), 1, 1);
   for(List<Integer> bitVector : bitVectorList){
    if(bitVector != null){
     for(int index = 0; index < bitVector.size(); index++){
      int bit = bitVector.get(index);
      CF cf = cfc.cfList.get(index);
      if(bit == 1) cf.supCount++;
     }
    }
   }
  }
  return cfc;
 }
 
 /**
  * 通过min_suport过滤掉最小的
  *
  * @param cfcC
  * @return
  */
 private CFCon candidateToFreqent(CFCon cfcC){
  List<CF> cfList = cfcC.cfList;
  
  CFCon cfcL = new CFCon(2, cfcC.iteratNum);
  if(cfList != null){
   for(int index = cfList.size() - 1; index >= 0; index--){
    CF cf = cfList.get(index);
    int supCount = cf.supCount;
    if(supCount >= min_sport){
     cfcL.cfList.add(cf);
    }
   }
  }
  return cfcL;
 }
 
 private void generateData(){
  items = new ArrayList<String>();
  for(int index = 1; index <=5; index++) items.add("I" + index);
  
  bitVectorList.add(getStrList("1,1,0,0,1"));
  bitVectorList.add(getStrList("0,1,0,1,0"));
  bitVectorList.add(getStrList("0,1,1,0,0"));
  bitVectorList.add(getStrList("1,1,0,1,0"));
  bitVectorList.add(getStrList("1,0,1,0,0"));
  bitVectorList.add(getStrList("0,1,1,0,0"));
  bitVectorList.add(getStrList("1,0,1,0,0"));
  bitVectorList.add(getStrList("1,1,1,0,1"));
  bitVectorList.add(getStrList("1,1,1,0,0"));
  
 }
 
 private List<Integer> getStrList(String bitVector){
  List<Integer> list = new ArrayList<Integer>();
  if(bitVector != null){
   String[] bitArr = bitVector.split(",");
   for(String bit : bitArr){
    list.add(Integer.parseInt(bit));
   }
  }
  return list;
 }
}

原创粉丝点击