数据挖掘笔记-关联规则-FPGrowth-简单实现

来源：互联网发布：java有参构造方法示例编辑：程序博客网时间：2024/05/17 01:09

Apriori算法需要多次扫描事务数据库，还需要生成候选项集，大大增加了时间与空间的代价。FP Growth算法利用了巧妙的数据结构，大大降低了Apriori挖掘算法的代价：它不需要不断地生成候选项目队列，也不需要不断地扫描整个数据库进行比对。为了达到这样的效果，它采用了一种简洁的数据结构，叫做frequent-pattern tree(频繁模式树)。FP-growth算法比Apriori算法快一个数量级，在空间复杂度方面也比Apriori有数量级级别的优化。不过对于海量数据，FP-growth的时空复杂度仍然很高，可以采用的改进方法包括数据库划分、数据采样等等。

FPGrowth算法的介绍与实例说明可以参考下面这个链接，里面讲得很详细。

http://hi.baidu.com/nefzpohtpndhovr/item/9d5c371ba2dbdc0ed1d66dca

Apriori和FP-Tree都是寻找频繁项集的算法，后面根据频繁项集产生关联规则的步骤都是一样的，就不在这里重复了。

FPGrowth算法Java简单实现:

public class FPGrowthBuilder {/** 最小支持度 */private int minSupport = 2;/** 频繁集集合*/private List<List<ItemSet>> frequencies = new ArrayList<List<ItemSet>>();//创建头表public List<FPTreeNode> buildHeadTables(Data data) {//统计各项出现频次Map<String, Integer> map = new HashMap<String, Integer>();for (Instance instance : data.getInstances()) {for (String value : instance.getValues()) {Integer mValue = map.get(value);map.put(value, null == mValue ? 1 : mValue + 1);}}//过滤掉未满足最小支持度的项List<Map.Entry<String, Integer>> entries = new ArrayList<Map.Entry<String, Integer>>(); for (Map.Entry<String, Integer> entry : map.entrySet()) {if (entry.getValue() >= minSupport) {entries.add(entry);}}//根据出现频次排序项Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {return ((Integer) o2.getValue()).compareTo((Integer) o1.getValue());}});//数据集的过滤重排for (Instance instance : data.getInstances()) {instance.replaceValues(entries);ShowUtils.print(instance.getValues());}//建立头表List<FPTreeNode> headerTables = new ArrayList<FPTreeNode>();for (Map.Entry<String, Integer> entry : entries) {headerTables.add(new FPTreeNode(entry.getKey(), entry.getValue()));}return headerTables;}//创建FPGrowthTreepublic FPTreeNode buildFPGrowthTree(Data data, List<FPTreeNode> headerTables) {FPTreeNode rootNode = new FPTreeNode();for (Instance instance : data.getInstances()) {LinkedList<String> items = instance.getValuesList();FPTreeNode tempNode = rootNode;//如果节点已经存在则加1FPTreeNode childNode = tempNode.findChild(items.peek());while (!items.isEmpty() && null != childNode) {childNode.incrementCount();tempNode = childNode;items.poll();childNode = tempNode.findChild(items.peek());}//如果节点不存在则新增addNewTreeNode(tempNode, items, headerTables);}return rootNode;}//新增树节点private void addNewTreeNode(FPTreeNode parent, LinkedList<String> items, List<FPTreeNode> headerTables) {while (items.size() > 0) {String item = items.poll();FPTreeNode child = new 
FPTreeNode(item, 1);child.setParent(parent);parent.addChild(child);//建立节点之间的关联关系for (FPTreeNode headerTable : headerTables) {if (item.equals(headerTable.getName())) {while (null != headerTable.getNext()) {headerTable = headerTable.getNext();}headerTable.setNext(child);break;}}addNewTreeNode(child, items, headerTables);}}//构建频繁项集public void build(Data data, List<String> postfixs) {List<FPTreeNode> headerTables = buildHeadTables(data);FPTreeNode treeNode = buildFPGrowthTree(data, headerTables);FPTreeNodeHelper.print(treeNode, 0);if (treeNode.getChildren().size() == 0) {return;}//收集频繁项集List<ItemSet> frequency = new ArrayList<ItemSet>();frequencies.add(frequency);for (FPTreeNode header : headerTables) {ItemSet itemSet = new ItemSet(header.getName(), header.getCount());if(null != postfixs){for (String postfix : postfixs) {itemSet.add(postfix);}}frequency.add(itemSet);}//进入下一步迭代for (FPTreeNode headerTable : headerTables) {List<String> newPostfix = new LinkedList<String>();newPostfix.add(headerTable.getName());if (null != postfixs) {newPostfix.addAll(postfixs);}Data newData = new Data();FPTreeNode startNode = headerTable.getNext();while (null != startNode) {List<String> prefixNodes = new ArrayList<String>();FPTreeNode parent = startNode;while (null != (parent = parent.getParent()).getName()) {prefixNodes.add(parent.getName());}int count = startNode.getCount();while (count-- > 0 && prefixNodes.size() > 0) {Instance instance = new Instance();instance.setValues(prefixNodes.toArray(new String[0]));newData.getInstances().add(instance);}startNode = startNode.getNext();}build(newData, newPostfix);}}public void print(List<List<ItemSet>> itemSetss) {System.out.println("Frequency Item Set");System.out.println(itemSetss.size());for (List<ItemSet> itemSets : itemSetss) {for (ItemSet itemSet : itemSets) {System.out.print(itemSet.getSupport() + "\t");System.out.println(itemSet.getItems());}}}public void build() {Data data = DataLoader.load("d:\\apriori.txt");build(data, 
null);print(frequencies);}public static void main(String[] args) {FPGrowthBuilder fpg = new FPGrowthBuilder();fpg.build();}}

数据集格式可以参考以下数据：

1 豆奶,莴苣
2 莴苣,尿布,葡萄酒,甜菜
3 豆奶,尿布,葡萄酒,橙汁
4 莴苣,豆奶,尿布,葡萄酒
5 莴苣,豆奶,尿布,橙汁
6 莴苣,尿布,葡萄酒

结果如下：

5	[莴苣]
5	[尿布]
4	[豆奶]
4	[葡萄酒]
2	[橙汁]
4	[尿布, 莴苣]
3	[莴苣, 豆奶]
3	[尿布, 豆奶]
2	[尿布, 莴苣, 豆奶]
4	[尿布, 葡萄酒]
3	[莴苣, 葡萄酒]
2	[葡萄酒, 豆奶]
3	[尿布, 莴苣, 葡萄酒]
2	[尿布, 葡萄酒, 豆奶]
2	[橙汁, 豆奶]
2	[尿布, 橙汁]
2	[尿布, 橙汁, 豆奶]

[莴苣]------>[尿布,葡萄酒]{confidence: 0.75}
[尿布]------>[莴苣,葡萄酒]{confidence: 1.0}
[葡萄酒]------>[尿布,莴苣]{confidence: 0.75}
[尿布]------>[橙汁,豆奶]{confidence: 1.0}
[豆奶]------>[尿布,橙汁]{confidence: 1.0}
[葡萄酒]------>[尿布,豆奶]{confidence: 0.6666666666666666}
[尿布,葡萄酒]------>[莴苣]{confidence: 0.6}
[尿布]------>[莴苣,豆奶]{confidence: 0.6666666666666666}
[莴苣]------>[尿布,豆奶]{confidence: 0.6666666666666666}
[橙汁]------>[尿布,豆奶]{confidence: 0.6666666666666666}
[尿布,豆奶]------>[橙汁]{confidence: 1.0}
[莴苣,葡萄酒]------>[尿布]{confidence: 0.6}
[尿布,莴苣]------>[葡萄酒]{confidence: 0.75}
[尿布]------>[葡萄酒,豆奶]{confidence: 1.0}

代码托管:https://github.com/fighting-one-piece/repository-datamining.git

0 0