Apriori算法实现
来源:互联网 发布:小旭音乐 知乎 编辑:程序博客网 时间:2024/06/07 04:00
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.google.common.collect.Lists;
/**
 * Apriori frequent-itemset miner over a comma-separated transaction file.
 *
 * <p>Every line of the input file is one transaction whose items are separated
 * by commas. Support is an absolute number of transactions and is compared
 * against {@code minSup}. Itemsets are stored as sorted {@link TreeSet}s so that
 * the prefix-based join in {@link #aprioriGen(List)} sees items in a stable order.
 */
public class Apriori {
    /** Minimum number of transactions that must contain an itemset for it to be frequent. */
    private int minSup;
    /** Raw transaction lines; assigned by {@code main}. */
    private static List<String> data;
    /** Parsed transactions, one sorted item set per input line; assigned by {@link #buildData(String)}. */
    private static List<Set<String>> dataSet;

    /**
     * Loads the transactions, then prints the frequent itemsets level by level
     * until a level comes back empty, followed by the elapsed time.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        Apriori apriori = new Apriori();
        // Minimum support, as an absolute transaction count.
        apriori.setMinSup(100);
        // NOTE(review): "item.cvs" looks like a typo of "item.csv" - confirm
        // against the real data file name before changing it.
        data = apriori.buildData("item.cvs");
        // Level 1: frequent single items.
        List<Set<String>> f1Set = apriori.findF1Item(data);
        apriori.printSet(f1Set, 1);
        List<Set<String>> result = f1Set;
        int i = 2;
        do {
            System.out.println("频繁项:" + i);
            result = apriori.aprioriGen(result);
            apriori.printSet(result, i);
            i++;
        } while (result.size() != 0);
        long endTime = System.currentTimeMillis();
        System.out.println("共用时: " + (endTime - startTime) + "ms");
    }

    /**
     * Reads the transaction file and rebuilds the shared parsed data set.
     *
     * <p>An I/O failure is reported on stderr and the lines read so far are
     * still used, so a missing file yields an empty data set rather than an
     * exception.
     *
     * @param fileName path of a UTF-8 text file, one comma-separated transaction per line
     * @return the raw lines that were read, in file order
     */
    public List<String> buildData(String fileName) {
        List<String> lines = new ArrayList<String>();
        FileInputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(new File(fileName));
            reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    // Closing the reader also closes the wrapped stream.
                    reader.close();
                } else if (in != null) {
                    // Reader construction failed after the stream was opened.
                    in.close();
                }
            } catch (IOException ignored) {
                // Reading is already finished; a failed close cannot be recovered here.
            }
        }
        dataSet = new ArrayList<Set<String>>();
        for (String line : lines) {
            dataSet.add(parseTransaction(line));
        }
        return lines;
    }

    /**
     * Sets the minimum support threshold.
     *
     * @param minSup minimum number of transactions that must contain an itemset
     */
    public void setMinSup(int minSup) {
        this.minSup = minSup;
    }

    /**
     * Finds the frequent 1-itemsets.
     *
     * <p>An item is counted once per transaction, even if the line repeats it,
     * so the numbers agree with the per-transaction counting used for larger
     * itemsets.
     *
     * @param data raw transaction lines, items separated by commas
     * @return one single-element set per frequent item, in ascending item order
     */
    public List<Set<String>> findF1Item(List<String> data) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String line : data) {
            for (String item : parseTransaction(line)) {
                Integer seen = counts.get(item);
                counts.put(item, seen == null ? 1 : seen + 1);
            }
        }
        List<Set<String>> result = new ArrayList<Set<String>>();
        // Sort the keys so that level 1 is emitted in a deterministic order.
        for (String item : new TreeSet<String>(counts.keySet())) {
            if (counts.get(item) >= minSup) {
                Set<String> single = new TreeSet<String>();
                single.add(item);
                result.add(single);
            }
        }
        return result;
    }

    /**
     * Generates the frequent k-itemsets from the frequent (k-1)-itemsets.
     *
     * <p>Pairs that share their first k-2 items are joined, candidates with an
     * infrequent (k-1)-subset are pruned, and the survivors are filtered by
     * support against the loaded transactions.
     *
     * @param preSet frequent (k-1)-itemsets
     * @return frequent k-itemsets; empty when none exist
     */
    public List<Set<String>> aprioriGen(List<Set<String>> preSet) {
        List<Set<String>> candidates = new ArrayList<Set<String>>();
        int preSetSize = preSet.size();
        // Materialise each itemset once instead of once per compared pair.
        String[][] items = new String[preSetSize][];
        for (int i = 0; i < preSetSize; i++) {
            items[i] = preSet.get(i).toArray(new String[0]);
        }
        for (int i = 0; i < preSetSize - 1; i++) {
            for (int j = i + 1; j < preSetSize; j++) {
                if (isCanLink(items[i], items[j])) {
                    // Join: all of the first itemset plus the last item of the second.
                    Set<String> candidate = new TreeSet<String>(preSet.get(i));
                    candidate.add(items[j][items[j].length - 1]);
                    if (!isNeedCut(preSet, candidate)) {
                        candidates.add(candidate);
                    }
                }
            }
        }
        System.out.println("aprioriGen:" + candidates.size());
        return checkSupport(candidates);
    }

    /**
     * Keeps the candidates whose support reaches {@code minSup}.
     *
     * @param setList candidate itemsets
     * @return the candidates contained in at least {@code minSup} transactions, in input order
     * @throws IllegalStateException if there are candidates but no data set has been loaded
     */
    private List<Set<String>> checkSupport(List<Set<String>> setList) {
        List<Set<String>> result = new ArrayList<Set<String>>();
        if (setList.isEmpty()) {
            return result;
        }
        if (dataSet == null) {
            throw new IllegalStateException("buildData must be called before support can be counted");
        }
        for (int i = 0; i < setList.size(); i++) {
            Set<String> candidate = setList.get(i);
            int support = 0;
            for (Set<String> transaction : dataSet) {
                // The size test is a cheap reject before the element-wise check.
                if (candidate.size() <= transaction.size() && transaction.containsAll(candidate)) {
                    support++;
                }
            }
            if (i % 10000 == 0) {
                System.out.println("checkSupport:" + i);
            }
            if (support >= minSup) {
                result.add(candidate);
            }
        }
        return result;
    }

    /**
     * Tells whether two sorted itemsets can be joined into a candidate.
     *
     * @param s1 items of the first itemset, sorted
     * @param s2 items of the second itemset, sorted
     * @return {@code true} only if both have the same non-zero length, agree on
     *         every item but the last, and differ in the last item
     */
    boolean isCanLink(String[] s1, String[] s2) {
        if (s1.length == 0 || s1.length != s2.length) {
            return false;
        }
        int last = s1.length - 1;
        for (int i = 0; i < last; i++) {
            if (!s1[i].equals(s2[i])) {
                return false;
            }
        }
        return !s1[last].equals(s2[last]);
    }

    /**
     * Tells whether a candidate k-itemset must be pruned.
     *
     * @param setList frequent (k-1)-itemsets
     * @param set candidate k-itemset
     * @return {@code true} if at least one (k-1)-subset of {@code set} is missing from {@code setList}
     */
    boolean isNeedCut(List<Set<String>> setList, Set<String> set) {
        for (Set<String> subSet : getSubset(set)) {
            if (!isContained(setList, subSet)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether an itemset occurs in a list of itemsets.
     *
     * @param setList itemsets to search
     * @param set itemset to look for
     * @return {@code true} if some element of {@code setList} holds exactly the items of {@code set}
     */
    boolean isContained(List<Set<String>> setList, Set<String> set) {
        // Set equality compares size and membership, so partial prefix matches
        // and itemsets of a different size can never be reported as found.
        return setList.contains(set);
    }

    /**
     * Builds every subset obtained by dropping exactly one item.
     *
     * @param set a k-itemset
     * @return its k subsets of size k-1, in the iteration order of the dropped item
     */
    List<Set<String>> getSubset(Set<String> set) {
        List<Set<String>> result = new ArrayList<Set<String>>();
        for (String omitted : set) {
            Set<String> subSet = new TreeSet<String>(set);
            subSet.remove(omitted);
            result.add(subSet);
        }
        return result;
    }

    /**
     * Prints one level of frequent itemsets to standard output.
     *
     * @param setList itemsets to print
     * @param i size k of the itemsets, used only in the heading
     */
    void printSet(List<Set<String>> setList, int i) {
        System.out.println("频繁" + i + "项集: 共" + setList.size() + "项: ");
        System.out.println("{");
        for (Set<String> set : setList) {
            System.out.print("[");
            for (String str : set) {
                System.out.print(str + " ");
            }
            System.out.println("] ");
        }
        System.out.println("}");
    }

    /**
     * Splits one transaction line into its distinct items.
     *
     * <p>Tokens are trimmed and empty tokens are dropped, so blank lines and
     * stray separators do not create an empty-string item.
     */
    private static Set<String> parseTransaction(String line) {
        Set<String> items = new TreeSet<String>();
        for (String token : line.split(",")) {
            String item = token.trim();
            if (item.length() > 0) {
                items.add(item);
            }
        }
        return items;
    }
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.google.common.collect.Lists;
/**
 * Apriori frequent-itemset miner over a comma-separated transaction file.
 *
 * <p>Every line of the input file is one transaction whose items are separated
 * by commas. Support is an absolute number of transactions and is compared
 * against {@code minSup}. Itemsets are stored as sorted {@link TreeSet}s so that
 * the prefix-based join in {@link #aprioriGen(List)} sees items in a stable order.
 */
public class Apriori {
    /** Minimum number of transactions that must contain an itemset for it to be frequent. */
    private int minSup;
    /** Raw transaction lines; assigned by {@code main}. */
    private static List<String> data;
    /** Parsed transactions, one sorted item set per input line; assigned by {@link #buildData(String)}. */
    private static List<Set<String>> dataSet;

    /**
     * Loads the transactions, then prints the frequent itemsets level by level
     * until a level comes back empty, followed by the elapsed time.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        Apriori apriori = new Apriori();
        // Minimum support, as an absolute transaction count.
        apriori.setMinSup(100);
        // NOTE(review): "item.cvs" looks like a typo of "item.csv" - confirm
        // against the real data file name before changing it.
        data = apriori.buildData("item.cvs");
        // Level 1: frequent single items.
        List<Set<String>> f1Set = apriori.findF1Item(data);
        apriori.printSet(f1Set, 1);
        List<Set<String>> result = f1Set;
        int i = 2;
        do {
            System.out.println("频繁项:" + i);
            result = apriori.aprioriGen(result);
            apriori.printSet(result, i);
            i++;
        } while (result.size() != 0);
        long endTime = System.currentTimeMillis();
        System.out.println("共用时: " + (endTime - startTime) + "ms");
    }

    /**
     * Reads the transaction file and rebuilds the shared parsed data set.
     *
     * <p>An I/O failure is reported on stderr and the lines read so far are
     * still used, so a missing file yields an empty data set rather than an
     * exception.
     *
     * @param fileName path of a UTF-8 text file, one comma-separated transaction per line
     * @return the raw lines that were read, in file order
     */
    public List<String> buildData(String fileName) {
        List<String> lines = new ArrayList<String>();
        FileInputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(new File(fileName));
            reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    // Closing the reader also closes the wrapped stream.
                    reader.close();
                } else if (in != null) {
                    // Reader construction failed after the stream was opened.
                    in.close();
                }
            } catch (IOException ignored) {
                // Reading is already finished; a failed close cannot be recovered here.
            }
        }
        dataSet = new ArrayList<Set<String>>();
        for (String line : lines) {
            dataSet.add(parseTransaction(line));
        }
        return lines;
    }

    /**
     * Sets the minimum support threshold.
     *
     * @param minSup minimum number of transactions that must contain an itemset
     */
    public void setMinSup(int minSup) {
        this.minSup = minSup;
    }

    /**
     * Finds the frequent 1-itemsets.
     *
     * <p>An item is counted once per transaction, even if the line repeats it,
     * so the numbers agree with the per-transaction counting used for larger
     * itemsets.
     *
     * @param data raw transaction lines, items separated by commas
     * @return one single-element set per frequent item, in ascending item order
     */
    public List<Set<String>> findF1Item(List<String> data) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String line : data) {
            for (String item : parseTransaction(line)) {
                Integer seen = counts.get(item);
                counts.put(item, seen == null ? 1 : seen + 1);
            }
        }
        List<Set<String>> result = new ArrayList<Set<String>>();
        // Sort the keys so that level 1 is emitted in a deterministic order.
        for (String item : new TreeSet<String>(counts.keySet())) {
            if (counts.get(item) >= minSup) {
                Set<String> single = new TreeSet<String>();
                single.add(item);
                result.add(single);
            }
        }
        return result;
    }

    /**
     * Generates the frequent k-itemsets from the frequent (k-1)-itemsets.
     *
     * <p>Pairs that share their first k-2 items are joined, candidates with an
     * infrequent (k-1)-subset are pruned, and the survivors are filtered by
     * support against the loaded transactions.
     *
     * @param preSet frequent (k-1)-itemsets
     * @return frequent k-itemsets; empty when none exist
     */
    public List<Set<String>> aprioriGen(List<Set<String>> preSet) {
        List<Set<String>> candidates = new ArrayList<Set<String>>();
        int preSetSize = preSet.size();
        // Materialise each itemset once instead of once per compared pair.
        String[][] items = new String[preSetSize][];
        for (int i = 0; i < preSetSize; i++) {
            items[i] = preSet.get(i).toArray(new String[0]);
        }
        for (int i = 0; i < preSetSize - 1; i++) {
            for (int j = i + 1; j < preSetSize; j++) {
                if (isCanLink(items[i], items[j])) {
                    // Join: all of the first itemset plus the last item of the second.
                    Set<String> candidate = new TreeSet<String>(preSet.get(i));
                    candidate.add(items[j][items[j].length - 1]);
                    if (!isNeedCut(preSet, candidate)) {
                        candidates.add(candidate);
                    }
                }
            }
        }
        System.out.println("aprioriGen:" + candidates.size());
        return checkSupport(candidates);
    }

    /**
     * Keeps the candidates whose support reaches {@code minSup}.
     *
     * @param setList candidate itemsets
     * @return the candidates contained in at least {@code minSup} transactions, in input order
     * @throws IllegalStateException if there are candidates but no data set has been loaded
     */
    private List<Set<String>> checkSupport(List<Set<String>> setList) {
        List<Set<String>> result = new ArrayList<Set<String>>();
        if (setList.isEmpty()) {
            return result;
        }
        if (dataSet == null) {
            throw new IllegalStateException("buildData must be called before support can be counted");
        }
        for (int i = 0; i < setList.size(); i++) {
            Set<String> candidate = setList.get(i);
            int support = 0;
            for (Set<String> transaction : dataSet) {
                // The size test is a cheap reject before the element-wise check.
                if (candidate.size() <= transaction.size() && transaction.containsAll(candidate)) {
                    support++;
                }
            }
            if (i % 10000 == 0) {
                System.out.println("checkSupport:" + i);
            }
            if (support >= minSup) {
                result.add(candidate);
            }
        }
        return result;
    }

    /**
     * Tells whether two sorted itemsets can be joined into a candidate.
     *
     * @param s1 items of the first itemset, sorted
     * @param s2 items of the second itemset, sorted
     * @return {@code true} only if both have the same non-zero length, agree on
     *         every item but the last, and differ in the last item
     */
    boolean isCanLink(String[] s1, String[] s2) {
        if (s1.length == 0 || s1.length != s2.length) {
            return false;
        }
        int last = s1.length - 1;
        for (int i = 0; i < last; i++) {
            if (!s1[i].equals(s2[i])) {
                return false;
            }
        }
        return !s1[last].equals(s2[last]);
    }

    /**
     * Tells whether a candidate k-itemset must be pruned.
     *
     * @param setList frequent (k-1)-itemsets
     * @param set candidate k-itemset
     * @return {@code true} if at least one (k-1)-subset of {@code set} is missing from {@code setList}
     */
    boolean isNeedCut(List<Set<String>> setList, Set<String> set) {
        for (Set<String> subSet : getSubset(set)) {
            if (!isContained(setList, subSet)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether an itemset occurs in a list of itemsets.
     *
     * @param setList itemsets to search
     * @param set itemset to look for
     * @return {@code true} if some element of {@code setList} holds exactly the items of {@code set}
     */
    boolean isContained(List<Set<String>> setList, Set<String> set) {
        // Set equality compares size and membership, so partial prefix matches
        // and itemsets of a different size can never be reported as found.
        return setList.contains(set);
    }

    /**
     * Builds every subset obtained by dropping exactly one item.
     *
     * @param set a k-itemset
     * @return its k subsets of size k-1, in the iteration order of the dropped item
     */
    List<Set<String>> getSubset(Set<String> set) {
        List<Set<String>> result = new ArrayList<Set<String>>();
        for (String omitted : set) {
            Set<String> subSet = new TreeSet<String>(set);
            subSet.remove(omitted);
            result.add(subSet);
        }
        return result;
    }

    /**
     * Prints one level of frequent itemsets to standard output.
     *
     * @param setList itemsets to print
     * @param i size k of the itemsets, used only in the heading
     */
    void printSet(List<Set<String>> setList, int i) {
        System.out.println("频繁" + i + "项集: 共" + setList.size() + "项: ");
        System.out.println("{");
        for (Set<String> set : setList) {
            System.out.print("[");
            for (String str : set) {
                System.out.print(str + " ");
            }
            System.out.println("] ");
        }
        System.out.println("}");
    }

    /**
     * Splits one transaction line into its distinct items.
     *
     * <p>Tokens are trimmed and empty tokens are dropped, so blank lines and
     * stray separators do not create an empty-string item.
     */
    private static Set<String> parseTransaction(String line) {
        Set<String> items = new TreeSet<String>();
        for (String token : line.split(",")) {
            String item = token.trim();
            if (item.length() > 0) {
                items.add(item);
            }
        }
        return items;
    }
}
0 0
- Apriori算法实现
- Apriori算法实现
- Python 实现Apriori算法
- Apriori算法实现
- Apriori算法实现
- Apriori算法实现
- Apriori算法的实现
- Apriori算法c++实现
- apriori算法 初步实现
- java实现Apriori算法
- apriori算法简单实现
- C++实现apriori算法
- Apriori算法实现
- Apriori算法Matlab实现
- Apriori算法实现
- apriori算法实现
- nodejs实现apriori算法
- apriori算法 python实现
- int main(int argc, char** argv)
- http与https的理解
- Hadoop回顾--Hive常用函数
- 类型强转的那些坑
- 欢迎使用CSDN-markdown编辑器
- apriori算法实现
- FPgrowth实现
- 上传下载
- VM虚拟主机设置网络
- 单链表实现查找中间结点
- VC++ 防火墙 Win7 XP MFC
- 数据库--索引的类型及特点
- jQuery学习笔记(6)——自定义插件
- UVa272