挖掘DBLP作者合作关系，FP-Growth算法实践（2）：从DBLP数据集中提取信息，三种源码（dom，sax，string）

来源：互联网　发布：交换机的端口镜像　编辑：程序博客网　时间：2024/04/29 10:24

上篇文章：http://blog.csdn.net/mmc2015/article/details/50988375 （挖掘DBLP作者合作关系，FP-Growth算法实践（1）：从DBLP数据集中提取目标信息（会议、作者等））

大家反映代码不能用，主要是太慢了。好吧，我也承认慢：DOM 方式要在内存里构造整棵树，慢是肯定的！

这次给出另外两种实现：SAX 方式，以及把每行当作字符串处理的方式。

为了完整，先给出dom：

#do not use this code!
def DomParser():
    """Extract conference papers from the DBLP dump with a DOM parser.

    Kept for completeness only: ``parse`` builds the whole document tree in
    memory before the first record is examined, which makes this variant
    slow on a large input file.

    Module-level names used (all defined outside this function):
      fileName      path handed to ``parse``.
      fromYear      lower bound compared against the year text with ``<``;
                    presumably a year string such as "2000" -- confirm at
                    its definition.
      confNameDict  container of accepted conference names (first word of
                    <booktitle>).
      allList       list that receives one line per kept paper.
      authorDict    author -> [frequency, first year, last year]; updated
                    in place.

    Files written (both truncated first):
      allDB.txt     "confName \\t year \\t title \\t author1|author2|..|authorn"
      authorDB.txt  "author \\t frequency \\t yearStart \\t yearEnd", sorted by
                    the [frequency, yearStart, yearEnd] triple, descending.

    NOTE(review): the ``str(...)`` conversions are kept from the original;
    under Python 2 they raise UnicodeEncodeError for non-ASCII text.
    """
    domTree = parse(fileName)
    dblp = domTree.documentElement

    for paper in dblp.getElementsByTagName("inproceedings"):
        yearStr = str(_node_text(paper.getElementsByTagName("year")[0]))
        if yearStr < fromYear:
            continue
        print("yearStr %s %s" % (yearStr, "==" * 20))

        booktitleStr = str(_node_text(paper.getElementsByTagName("booktitle")[0]))
        # Keep only the first word, e.g. "ICML" for
        # "<booktitle>ICML Unsupervised and Transfer Learning</booktitle>".
        booktitleStr = booktitleStr.split(" ")[0]
        if booktitleStr not in confNameDict:
            continue
        print("booktitleStr %s %s" % (booktitleStr, "^^" * 20))

        # Collect all text below <title> so inline markup such as <i>..</i>
        # neither truncates the title nor raises AttributeError.
        titleStr = str(_node_text(paper.getElementsByTagName("title")[0]))

        authorNames = []
        for author in paper.getElementsByTagName("author"):
            authorStr = str(_node_text(author))
            authorNames.append(authorStr)
            if authorStr in authorDict:
                stats = authorDict[authorStr]
                stats[0] += 1
                if yearStr < stats[1]:
                    stats[1] = yearStr
                elif yearStr > stats[2]:
                    stats[2] = yearStr
            else:
                authorDict[authorStr] = [1, yearStr, yearStr]

        # Joining avoids the trailing "|" the original left on every line,
        # matching the documented format and the SAX / line-based parsers.
        allList.append("\t".join(
            [booktitleStr, yearStr, titleStr, "|".join(authorNames)]))

    with open("allDB.txt", "w") as wf:
        wf.write("\n".join(allList))

    # key= replaces the Python-2-only cmp argument; the ordering is the same.
    rankedAuthors = sorted(
        authorDict.items(), key=lambda item: item[1], reverse=True)
    with open("authorDB.txt", "w") as wf:
        wf.write("\n".join(
            author + "\t" + str(frequence) + "\t" + yearStart + "\t" + yearEnd
            for author, (frequence, yearStart, yearEnd) in rankedAuthors))


def _node_text(node):
    """Return the concatenated text of *node* and all of its descendants."""
    parts = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            parts.append(child.data)
        else:
            parts.append(_node_text(child))
    return "".join(parts)

再给出sax：

class SAX_PARSER(xml.sax.ContentHandler):
    """SAX handler that appends selected <inproceedings> records to allDB.txt.

    Callback summary (xml.sax.ContentHandler):
      startDocument()           called once when the document starts.
      endDocument()             called once when the parser reaches the end.
      startElement(name, attrs) called for every opening tag.
      endElement(name)          called for every closing tag.
      characters(content)       called with the text found between two tags
                                (opening or closing) or up to a line end.
                                The parser may deliver one text node in
                                several calls, and it also reports the
                                whitespace between elements.

    Text is therefore buffered per field and stored only when the field's
    closing tag arrives. The original assigned fields directly inside
    characters(), so the newline after a closing tag overwrote title, year
    and booktitle, and a split text node produced several "authors".

    A record is written when the first word of its <booktitle> is in the
    module-level ``confNameDict`` and its year is ``>=`` the module-level
    ``fromYear`` (both defined outside this class). Line format:
      "confName \\t year \\t title \\t author1|author2|..|authorn"

    Args:
      verbose: when true (the default, matching the original behaviour)
        every callback prints a trace line. Pass False for large inputs.
    """

    # Direct children of <inproceedings> whose text is collected.
    _FIELDS = ("author", "title", "year", "booktitle")

    def __init__(self, verbose=True):
        xml.sax.ContentHandler.__init__(self)
        self.verbose = verbose
        self._reset_record()

    def _reset_record(self):
        """Clear all per-record state."""
        self.authorList = ""   # "author1|author2|...|" (trailing "|" kept)
        self.title = ""
        self.year = ""
        self.booktitle = ""
        self.flag = 0          # 1 while inside an <inproceedings> element
        self.tag = ""          # name of the direct child being read, or ""
        self._depth = 0        # element nesting depth below <inproceedings>
        self._chunks = []      # text pieces of the current direct child

    def startDocument(self):
        if self.verbose:
            print("Document start %s" % ("==" * 20))

    def endDocument(self):
        if self.verbose:
            print("Document end %s" % ("==" * 20))

    def startElement(self, tag, attributes):
        if self.verbose:
            print("startElement %s %s" % ("ss" * 20, tag))
        if tag == "inproceedings":
            self.flag = 1
        elif self.flag == 1:
            self._depth += 1
            # Only a direct child starts a new field; markup nested inside
            # it (e.g. <i> within <title>) keeps feeding the same buffer.
            if self._depth == 1:
                self.tag = tag
                self._chunks = []

    def endElement(self, tag):
        if self.verbose:
            print("endElement %s %s" % ("ee" * 20, tag))
        if self.flag != 1:
            return

        if tag == "inproceedings":
            if self.booktitle in confNameDict and self.year >= fromYear:
                # [:-1] drops the "|" that follows the last author.
                record = (self.booktitle + "\t" + self.year + "\t" +
                          self.title + "\t" + self.authorList[:-1] + "\n")
                with open("allDB.txt", "a") as wf:
                    wf.write(record)
            self._reset_record()
            return

        if self._depth == 1:
            self._store_field("".join(self._chunks))
            # Clearing the tag makes characters() ignore the whitespace
            # that follows the closing tag.
            self.tag = ""
            self._chunks = []
        if self._depth > 0:
            self._depth -= 1

    def _store_field(self, text):
        """Store the completed text of the current direct child."""
        if self.tag == "author":
            self.authorList += text + "|"
        elif self.tag == "title":
            self.title = text
        elif self.tag == "year":
            self.year = text
        elif self.tag == "booktitle":
            # Keep only the first word, e.g. "ICML" for
            # "<booktitle>ICML Unsupervised and Transfer Learning</booktitle>".
            self.booktitle = text.split(" ")[0]

    def characters(self, content):
        if self.verbose:
            print("characters %s %s" % ("cc" * 20, content))
        if self.flag == 1:  # inside an <inproceedings> element
            if self.verbose:
                print(self.tag)
            if self._depth >= 1 and self.tag in self._FIELDS:
                self._chunks.append(content)

最后给出string，把每行看成字符串来处理的方式：

def XmlLineParser(fileName, verbose=True):
    """Extract conference papers from the DBLP dump by scanning it line by line.

    The file is never parsed as XML: each line is matched by its prefix, so
    the input must have every child element of <inproceedings> starting at
    the beginning of its own line.

    A record is kept when its year is not below the module-level
    ``fromYear`` and the first word of its <booktitle> is in the
    module-level ``confNameDict`` (both defined outside this function).
    Kept records are appended to tranDB.txt as
      "confName \\t year \\t title \\t author1|author2|..|authorn"

    Differences from the original implementation:
      * tranDB.txt is opened once (append mode) instead of once per record,
        so it is created even when no record matches.
      * <author> tags carrying attributes are recognised.
      * a title containing inline markup is no longer cut at the first "</".
      * a record without authors keeps its trailing tab (empty author column).

    Args:
      fileName: path of the DBLP XML file.
      verbose: when true (the default, matching the original behaviour)
        the header line and every scanned record line are printed.
    """
    # Compiled once; "[^>]*" tolerates attributes on the <author> tag.
    authorPattern = re.compile(r'<author[^>]*>(.*?)</author>')

    with open(fileName, "r") as rf, open("tranDB.txt", "a") as wf:
        for line in rf:
            if not line.startswith("<inproceedings"):
                continue
            if verbose:
                print("line [1] %s" % line)

            booktitle = ""
            year = ""
            title = ""
            authors = []
            # The inner loop advances the same file iterator, consuming the
            # lines of this record; the outer loop resumes after the break.
            for line in rf:
                if verbose:
                    print("line [2] %s" % line)
                if line.startswith("<author"):
                    authors.extend(authorPattern.findall(line))
                elif line.startswith("<title"):
                    title = line
                elif line.startswith("<year"):
                    year = line[6:10]  # the four digits after "<year>"
                    if year < fromYear:
                        break
                elif line.startswith("<booktitle"):
                    # Keep only the first word of the text after "<booktitle>".
                    booktitle = line[11:].split("</")[0].split(" ")[0]
                    if booktitle not in confNameDict:
                        break
                elif line.startswith("</inproceedings"):
                    # Cut at the closing tag itself so inline markup such as
                    # "</i>" inside the title does not truncate it.
                    titleText = title[7:].rsplit("</title>", 1)[0].rstrip("\r\n")
                    wf.write("\t".join(
                        [booktitle, year, titleText, "|".join(authors)]) + "\n")
                    break  # record finished; look for the next one

直接调用：

# Run the line-based extractor; fileName is not defined in this snippet and
# must be set to the DBLP XML path beforehand.
XmlLineParser(fileName)

拿走不谢，代码写得有点水，见谅。

1 0