Apriori算法的R语言实现

来源:互联网 发布:windows文件夹加密软件 编辑:程序博客网 时间:2024/05/18 00:08


1)数据准备

library(arules)

 a<-matrix(0,7,4,dimnames=list(c("basket1","basket2","basket3","basket4",
+ "basket5","basket6","basket7"),c("item1","item2","item3","item4")))
> a[1,]<-c(1,0,1,0)
> a[2,]<-c(0,0,1,1)
> a[3,]<-c(1,1,1,1)
> a[4,]<-c(1,1,0,0)
> a[5,]<-c(0,0,1,0)
> a[6,]<-c(1,0,1,1)
> a[7,]<-c(0,1,1,1)

 a.class<-as(a,"transactions")

2)使用apriori函数进行关联分析

# Mine association rules from the baskets with a minimum support of 0.2 and a
# minimum confidence of 0.6, then print the rules that were found.
rules <- apriori(
  a.class,
  parameter = list(target = "rules", supp = 0.2, conf = 0.6)
)

inspect(rules)

   lhs              rhs     support   confidence lift
1  {}            => {item3} 0.8571429 0.8571429  1.0000000
2  {item2}       => {item1} 0.2857143 0.6666667  1.1666667
3  {item2}       => {item4} 0.2857143 0.6666667  1.1666667
4  {item2}       => {item3} 0.2857143 0.6666667  0.7777778
5  {item1}       => {item3} 0.4285714 0.7500000  0.8750000
6  {item4}       => {item3} 0.5714286 1.0000000  1.1666667
7  {item3}       => {item4} 0.5714286 0.6666667  1.1666667
8  {item2,item4} => {item3} 0.2857143 1.0000000  1.1666667
9  {item2,item3} => {item4} 0.2857143 1.0000000  1.7500000
10 {item1,item4} => {item3} 0.2857143 1.0000000  1.1666667
11 {item1,item3} => {item4} 0.2857143 0.6666667  1.1666667

 rules<-apriori(a.class,parameter=list(supp=0.2,conf=0.6,target="rules"),appearance=
+ list(rhs="item3",default="lhs"))

inspect(rules)

  lhs              rhs     support   confidence lift
1 {}            => {item3} 0.8571429 0.8571429  1.0000000
2 {item2}       => {item3} 0.2857143 0.6666667  0.7777778
3 {item1}       => {item3} 0.4285714 0.7500000  0.8750000
4 {item4}       => {item3} 0.5714286 1.0000000  1.1666667
5 {item2,item4} => {item3} 0.2857143 1.0000000  1.1666667
6 {item1,item4} => {item3} 0.2857143 1.0000000  1.1666667

# Keep only the rules whose lift is greater than 1.
rules <- rules[quality(rules)$lift > 1]

# Order the remaining rules from highest to lowest lift.
rules.sorted <- sort(rules, by = "lift", decreasing = TRUE)

# Export the sorted rules (not the unsorted `rules`, which would leave
# `rules.sorted` computed but never used) as a "|"-separated text file.
write(rules.sorted, file = "apriori_rules.txt", sep = "|", col.names = NA)


序列关联分析——可以挖掘带有因果效应的关联

library(arulesSequences)
# Toy event log made of three parallel vectors: item[k] was observed in
# sequence seqid[k] at event time eventid[k].
item <- factor(c("A", "B", "B", "A", "B", "A", "C", "A", "B", "C", "B", "A",
                 "B", "A", "A", "B", "A", "B"))
seqid <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4)
eventid <- c(10, 10, 20, 30, 30, 20, 20, 30, 30, 30, 50, 10, 30, 40, 30, 30,
             40, 50)

# Wrap the items in a one-column data frame before coercing. No data frame
# named `data` exists yet at this point of the script, so without this line
# the coercion below has nothing valid to convert.
data <- data.frame(item = item)
data.tran <- as(data, "transactions")

# Attach the sequence id and event id of every transaction; cspade() is run
# on the transactions once these two columns are in place.
transactionInfo(data.tran)$sequenceID <- seqid
transactionInfo(data.tran)$eventID <- eventid

# Show the per-transaction metadata that was just attached.
transactionInfo(data.tran)

# Mine sequential patterns with a minimum support of 0.5, order them by
# support, and print them.
result <- cspade(
  data.tran,
  parameter = list(support = 0.5),
  control = list(verbose = TRUE)
)
result <- sort(result, by = "support")
inspect(result)

实例:用户点击页面的行为分析

# Load the raw msweb log; every line is read as one fixed-width field of 60
# characters and ends up in column V1.
root <- "C:/"
tmpp <- read.fwf(paste0(root, "anonymous-msweb.data"), widths = c(60))
train_list <- tmpp$V1

# Accumulators for the parsed events (one entry per "V" line).
tmp_page <- c(0)
tmp_sequenceid <- c(0)
tmp_eventid <- c(0)
m <- 0
sequenceid <- 0
# Start from a defined scalar so that a "V" line can never pick up a stale
# `eventid` left behind by earlier code in the session.
eventid <- 0
train_length <- length(train_list)

# Records are processed from line 302 onwards. The range is guarded because
# 302:train_length would count backwards on a file with fewer lines.
record_rows <- if (train_length >= 302) 302:train_length else integer(0)
for (i in record_rows) {
  tmp <- unlist(strsplit(as.character(train_list[i]), ","))
  if (tmp[1] == "C") {
    # A "C" line opens a new sequence and restarts the event counter.
    sequenceid <- sequenceid + 1
    eventid <- 0
  } else if (tmp[1] == "V") {
    # A "V" line is the next event of the current sequence; its second
    # comma-separated field is stored as the page id.
    m <- m + 1
    eventid <- eventid + 1
    tmp_sequenceid[m] <- sequenceid
    tmp_eventid[m] <- eventid
    tmp_page[m] <- as.numeric(tmp[2])
  }
}

# One row per event: page id (as a factor), sequence id, position in sequence.
tmp_page <- factor(tmp_page)
data <- data.frame(page = tmp_page, seqid = tmp_sequenceid, eventid = tmp_eventid)


# Page whose visitors are analysed.
user.page <- 1034

# Ids of all sequences that contain the target page at least once.
user.sequenceid <- unique(data$seqid[which(data$page == user.page)])

# Keep every event that belongs to one of those sequences. A single filter
# replaces growing the data frame with rbind() inside a loop (seeded with a
# dummy first row and driven by an unused counter initialised as `i<-i`).
data.user <- data[data$seqid %in% user.sequenceid, ]


# Data preparation: turn the selected users' page views into transactions and
# attach the sequence id and event id of each view.
library(arulesSequences)
tmp_data <- data.frame(page = data.user$page)
data.tran <- as(tmp_data, "transactions")
transactionInfo(data.tran)$sequenceID <- data.user$seqid
transactionInfo(data.tran)$eventID <- data.user$eventid

# Run the analysis: mine sequences with support = 0 and maxlen = 2, then
# order them by support.
result <- cspade(
  data.tran,
  parameter = list(support = 0, maxlen = 2),
  control = list(verbose = TRUE)
)
result <- sort(result, by = "support")

# Keep the sequences whose text matches "page=<user.page>" followed, with no
# other "}" in between, by the closing "}>".
page.2 <- paste0(".*page=", user.page, "[^\\}]*\\}>")
result.2 <- result[grep(page.2, as(result, "data.frame")$sequence)]
inspect(result.2)




# Select the key pages and measure how well they lead users onwards.
# The first element of result.2 is dropped before conversion
# (NOTE(review): presumably the sequence made of the target page alone - confirm).
result.data.frame <- as(result.2[-1], "data.frame")

# persent: each sequence's share of the total support;
# sum.persent: running total of those shares.
persent <- result.data.frame$support / sum(result.data.frame$support)
sum.persent <- cumsum(persent)
result.data.frame$persent <- persent
result.data.frame$sum.persent <- sum.persent

# Keep the rows whose cumulative share does not exceed the threshold.
max.persent <- 0.7
result.data.frame <- result.data.frame[
  which(result.data.frame$sum.persent <= max.persent), ,
  drop = FALSE
]
result.data.frame


# Pull the page id out of every retained sequence string: the text between
# the first "<{page=" and the first "}".
# Done in one vectorised call instead of growing `page` from a dummy 0 with a
# manual counter, so an empty input yields an empty vector rather than a
# leftover 0.
seq_text <- as.character(result.data.frame$sequence)
page <- substr(
  seq_text,
  regexpr("<\\{page=", seq_text) + 7, # 7 = number of characters in "<{page="
  regexpr("\\}", seq_text) - 1
)


# For every key page, count the distinct sequences in the full click data
# that contain it. vapply() replaces the manual counter loop that grew `uv`
# from a dummy 0.
uv <- vapply(
  page,
  function(i.page) length(unique(data$seqid[which(data$page == i.page)])),
  numeric(1),
  USE.NAMES = FALSE
)

# support * nsequences, divided by the page's own sequence count.
# NOTE(review): nsequences is presumably the number of sequences mined by
# cspade(), making `conf` the share of a page's visitors who go on to the
# target page - confirm.
conf <- result.data.frame$support * result@info$nsequences / uv
result.data.frame <- cbind(result.data.frame, conf = conf, page = page)


# Draw the result chart: bars for each page's share of support (persent), a
# red line with percentage labels for conf, and the page ids on the x axis.
local({
  # x positions shared by the line, the labels and the axis ticks.
  bar_x <- 0.5 + c(1:nrow(result.data.frame))
  lead_conf <- result.data.frame$conf

  # NOTE(review): as.matrix() on a vector appears to ignore nrow=, giving a
  # one-column matrix that beside = TRUE draws as one bar per page - confirm.
  barplot(
    as.matrix(result.data.frame$persent, nrow = 1),
    ylim = c(0, 1),
    beside = TRUE,
    xlab = "页面名称",
    main = "引导用户进入关键页面1034的重点页面分析"
  )
  lines(bar_x, lead_conf, type = "b", col = "red")
  text(bar_x, lead_conf, labels = paste0(round(lead_conf * 100, 2), "%"))
  axis(1, at = bar_x, labels = result.data.frame$page, tick = FALSE)
  invisible(NULL)
})

















0 0
原创粉丝点击