大数据学习笔记（四）-构建全文搜索引擎

来源：互联网发布：jar软件下载网站编辑：程序博客网时间：2024/05/11 21:40

完整代码在这
对于搜索引擎，我们几乎每天都要用到，这个也是集体智慧算法中最重要的算法之一。其中Google的pagerank算法是引领搜索引擎前进的一大重要算法。当然，这儿学习的是小数据搜索。
1.获取数据，建立索引
这儿用的是sqlite数据库，因为它轻量。没有c/s架构，直接可以用。因为中文的分词也是一门科学，所以这儿用的是英文文档来实现简单的搜索引擎。这儿的网页全部来源于维基百科。
获取数据的过程就是对所有的网页进行抓取，分析，将文章中的单词全部拆分出来，然后建立数据库，将我们需要的信息存放进数据库。这儿涉及了python 网页解析的beautifulsoup模块的使用，就自己理解。关于获取数据这块，自己理解理解就行了。因为我敲完了代码，发现集体智慧编程里面的那个网页已经找不到了。完全没法解析，没法获取数据。不过这个获取数据这个过程挺重要的。下面是代码。

  # coding: utf8
import re
from sqlite3 import dbapi2 as sqlite

try:  # Python 2 module names, as used by the original listing
    import urllib2
    from urlparse import urljoin
except ImportError:  # Python 3 moved both into the urllib package
    import urllib.request as urllib2
    from urllib.parse import urljoin

# Very common English words that are skipped while indexing.  The original
# listing referenced this name without defining it (NameError on first use).
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

# Splits on runs of non-word characters.  '\W+' rather than '\W*': since
# Python 3.7 re.split() also splits on empty matches, which would break the
# text into single characters.
_WORD_SPLITTER = re.compile(r'\W+')


class crawler:
    """Crawl web pages and store a word/link index in an SQLite database.

    Tables (see createindextables): urllist(url), wordlist(word),
    wordlocation(urlid, wordid, location), link(fromid, toid) and
    linkwords(wordid, linkid).
    """

    def __init__(self, dbname):
        """Open (or create) the SQLite database named `dbname`."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # __init__ may have failed before `con` was assigned.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in `table` whose `field` equals `value`.

        If no such row exists it is inserted and the new rowid is returned,
        unless `createnew` is false, in which case None is returned.

        `table` and `field` are interpolated into the SQL text (identifiers
        cannot be bound as parameters), so they must be trusted names;
        `value` is bound as a parameter and may contain any characters.
        """
        cur = self.con.execute(
            'select rowid from %s where %s=?' % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        if not createnew:
            return None
        cur = self.con.execute(
            'insert into %s (%s) values (?)' % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Index every word of the page `soup` under `url`.

        Does nothing when the URL is already indexed.  Words listed in
        `ignorewords` are skipped, but still count towards the position
        of the words that follow them.
        """
        if self.isindexed(url):
            return
        print('Indexing %s ' % url)
        # Extract the individual words
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # Get the id of the URL
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this URL together with its position
        for location, word in enumerate(words):
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                'insert into wordlocation(urlid,wordid,location) '
                'values(?,?,?)', (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of a parsed HTML node, without tags.

        A node whose `.string` is None is expanded recursively over its
        `.contents`, with a newline appended after each child's text.
        """
        v = soup.string
        if v is None:
            return ''.join(
                self.gettextonly(child) + '\n' for child in soup.contents)
        return v.strip()

    def separatewords(self, text):
        """Split `text` on non-word characters; return lower-cased words."""
        return [s.lower() for s in _WORD_SPLITTER.split(text) if s != '']

    def isindexed(self, url):
        """Return True if `url` is known and has at least one indexed word."""
        u = self.con.execute(
            'select rowid from urllist where url=?', (url,)).fetchone()
        if u is None:
            return False
        # The URL may be listed (e.g. as a link target) without having
        # been crawled yet, so also check for word locations.
        v = self.con.execute(
            'select * from wordlocation where urlid=?', (u[0],)).fetchone()
        return v is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Record a link from `urlFrom` to `urlTo` and the words of its text.

        Links from a page to itself are ignored.
        """
        # Fixed: the original called the non-existent self.separateWords.
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            'insert into link(fromid,toid) values (?,?)', (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                'insert into linkwords(linkid,wordid) values (?,?)',
                (linkid, wordid))

    def crawl(self, pages, depth=2):
        """Breadth-first crawl starting from `pages`, `depth` levels deep.

        Every page that can be fetched is indexed, and each of its links is
        recorded.  Pages that cannot be fetched are reported and skipped.
        """
        # Third-party parser; imported here so that the indexing methods
        # remain usable when it is not installed.
        import bs4

        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    response = urllib2.urlopen(page)
                    try:
                        html = response.read()
                    finally:
                        response.close()
                except Exception:
                    # Best effort: one unreachable page must not stop the crawl.
                    print('Could not open %s' % page)
                    continue
                soup = bs4.BeautifulSoup(
                    html, 'html.parser', from_encoding='utf8')
                self.addtoindex(page, soup)
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # URLs containing an apostrophe no longer have to be
                    # skipped: all values are bound as SQL parameters.
                    url = url.split('#')[0]  # drop the fragment part
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            # Fixed: the original assigned to `page`, so every level
            # re-crawled the starting pages instead of the new links.
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes if they do not exist."""
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer ,toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists wordurlidx on wordlocation(wordid)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()

关于我们获取的数据库表有5张，分别是link,urllist,linkwords,wordlist,wordlocation.
1）.urllist 表（存放所有的url）:rowid->行号(也是url的id号),url->具体的url
2）.wordlist 表(存放所有的单词):rowid->行号（也是word的id号）,word->具体的word
3）.linkwords表（存放链接文字中的单词id和对应链接的id）:wordid->链接文字中单词的id（对应wordlist表的rowid）,linkid->对应链接的id（即link表的rowid，而不是url的id）
4）.link表，rowid->行号,fromid->链接的源url的id，toid->链接到的url的id
5）.wordlocation表（存放每个url中每个单词的位置信息）:urlid->url的id号，wordid->word的id号，location->该wordid对应的单词在该url文档中的位置，从0开始计数(如url对应的文档是 “hello world,hello python”，那么python是第4个单词，对应的location为3)
关于表之间的联系如下图：这里写图片描述
关于一些字段的作用，后面会一一用到。
以上代码就是获取数据的过程，关于数据问题，我找了很久，终于在GitHub上找到了一个存好了的数据库。现成的数据库在这儿
2.构建排名
以下几个函数是对于查询请求，对于不同的评价方法，将各个网页以格式化列表形式输出,详细看注释

    # q -> the user's query string, e.g. "word1 word2 word3 ..."
    # Returns (rows, wordids): each row is (urlid, location of word1 in that
    # url, location of word2 in that url, ...); wordids holds the ids of the
    # query words that were found in the index.
    def getmatchrows(self, q):
        """Find every document that contains all indexed words of query `q`.

        Words that are not present in `wordlist` are dropped from the query.
        When none of the words is indexed, ([], []) is returned instead of
        running a malformed SQL statement.
        """
        # Pieces of the join query, built up word by word
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        tablenumber = 0
        # split() without an argument ignores repeated/leading whitespace
        for word in q.split():
            # Look up the word id; the word is user input, so it is bound
            # as a parameter rather than pasted into the SQL text.
            wordrow = self.con.execute(
                'select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow is None:
                continue
            wordid = wordrow[0]
            wordids.append(wordid)
            if tablenumber > 0:
                tablelist += ','
                clauselist += ' and '
                clauselist += 'w%d.urlid=w%d.urlid and ' % (
                    tablenumber - 1, tablenumber)
            fieldlist += ',w%d.location' % tablenumber
            tablelist += 'wordlocation w%d' % tablenumber
            clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
            tablenumber += 1
        if not wordids:
            # No table/where clause was built: nothing can match.
            return [], wordids
        # Assemble the query from its parts (only integers from the
        # database are interpolated here)
        fullquery = 'select %s from %s where %s' % (
            fieldlist, tablelist, clauselist)
        print(fullquery)
        rows = list(self.con.execute(fullquery))
        return rows, wordids

    # Take the matched rows, score every url and return {urlid: score}
    def getscoredlist(self, rows, wordids):
        """Return a dict mapping each matched urlid to its weighted score.

        `rows` and `wordids` are the two values returned by getmatchrows.
        An empty `rows` yields an empty dict.
        """
        totalscores = dict([(row[0], 0) for row in rows])
        if not totalscores:
            return totalscores
        # This is where the scoring functions are plugged in
        # weights=[(1.0,self.frequencyscore(rows))] # word frequency
        # weights=[(1.0,self.locationscore(rows))] # word location in the document
        # weights=[(1.0,self.frequencyscore(rows)),(1.5,self.locationscore(rows))] # weighted frequency and location
        # weights=[(1.0,self.distancescore(rows))]  # relative distance between the words
        # weights=[(1.0,self.inboundlinkscore(rows))] # inbound links
        # weights=[(1.0,self.locationscore(rows)),(1.0,self.frequencyscore(rows)),(1.0,self.pagerankscore(rows))]
        weights = [(1.0, self.linktextscore(rows, wordids))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def geturlname(self, id):
        """Return the url stored in `urllist` under rowid `id`."""
        return self.con.execute(
            'select url from urllist where rowid=?', (id,)).fetchone()[0]

    # Print the 10 highest scoring pages for a query
    def query(self, q):
        """Run query `q` and print score and url of the top 10 results."""
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[0:10]:
            print('%f\t%s' % (score, self.geturlname(urlid)))

使用归一化的方法，让不同评价结果拥有相同的值域，方便多种评价方法一起配合使用

    # Normalize scores so that different metrics share the same range
    def normalizescores(self, scores, smallIsBetter=0):
        """Return a new dict with the values of `scores` rescaled.

        scores: dict mapping an id to a numeric score.
        smallIsBetter: when true, each score becomes min / score, so the
            smallest score ranks highest; otherwise each score is divided
            by the largest score.

        An empty `scores` yields an empty dict (min()/max() would raise).
        """
        if not scores:
            return {}
        vsmall = 0.00001  # avoids division by zero
        if smallIsBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])

有6种评价方法：
第一种：根据单词频度来对网页进行评价。
单词在网页中出现的次数，可以作为我们搜索引擎的一个评价方法。
因为getmatchrows返回的rows是所有匹配的url，以及单词在该url中的位置。我们可以利用每个url在rows中出现的次数来对该网页进行评价。如我们输入一个”hello world” ，假设rows中返回的是[(1,22,28),(1,56,87),(1,98,100),(2,34,78),(2,99,181),(3,199,288)] 那么urlid=1的网页有三组匹配，所以1的评价度最高。

    # Score pages by word frequency
    def frequencyscore(self, rows):
        """Score each url by the number of match rows it appears in.

        rows: match rows whose first element is the urlid.
        Returns the per-url counts passed through self.normalizescores.
        """
        tally = {}
        for match in rows:
            urlid = match[0]
            tally[urlid] = tally.get(urlid, 0) + 1
        return self.normalizescores(tally)

第二种:根据各个单词在文档出现的前后顺序来评价
在搜索中，如果一个网页与待搜索的单词相关，则该单词就更可能靠近网页开始处出现。
因为我们获取的rows是(urlid,word1在该url中的位置，word2在该url中的位置,word3在该url中的位置,….)记录了各个单词在该url中的位置，所以我们就可以这样实现:

    # Score pages by where the query words occur in the document
    def locationscore(self, rows):
        """Score each url by its smallest sum of word locations.

        rows: match rows of the form (urlid, location, location, ...).
        For every url the lowest location sum over its rows is kept (capped
        at 10000000) and handed to self.normalizescores with
        smallIsBetter=1, so earlier occurrences rank higher.
        """
        best = {}
        for match in rows:
            urlid = match[0]
            best[urlid] = min(best.get(urlid, 10000000), sum(match[1:]))
        return self.normalizescores(best, smallIsBetter=1)

因为这儿是位置越靠前（数值越小）越好，所以进行归一化的时候smallIsBetter为1

第三种:根据单词在文档中的相对距离
在进行搜索时，通常用户输入的单词是连在一起的，所以输入单词在文档中的相对位置也可以作为评判标准
具体实现如下：

  #根据单词在文档中的相对距离    def distancescore(self,rows):        #如果只有一个单词，所有得分都一样        if len(rows[0])<=2:            return dict([(row[0],0) for row in rows])        mindistance=dict([(row[0],10000000) for row in rows])        for row in rows:            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])            if dist<mindistance[row[0]]:                mindistance[row[0]]=dist        return self.normalizescores(mindistance,smallIsBetter=1)

第四种:根据外部回指链接来评价
一个包含了用户输入单词的网页，如果其他包含了用户输入单词的网页中，很多网页都链接到了这个网页，那么这个网页就是一个比较好的搜索网页。
实现如下:

 #根据外部回指链接来评价    def inboundlinkscore(self,rows):        uniqueurls=set([row[0] for row in rows])        inboundcount=dict([(u,self.con.execute('select count(*) from link where toid=%d '%u).fetchone()[0] )for u in uniqueurls])        return self.normalizescores(inboundcount)

第五种：google 公司发明的pagerank评价
算法思想：网页的重要性是依据指向该网页的所有其他网页的重要性，以及这些网页中所包含的链接数求得。
这儿有一个阻尼因子为0.85
如计算A的pagerank值
这里写图片描述
箭头表示链接关系，如B->A表示B链接到A。下面数字表示该网页的pagerank的值。links(B)->B中链接的个数。现在求A的pagerank值
计算如下

PR(A) = 0.15 + 0.85 * (PR(B)/links(B) + PR(C)/links(C) + PR(D)/links(D))
      = 0.15 + 0.85 * (0.5/4 + 0.7/5 + 0.2/1)
      = 0.54525

这样我们就可以通过迭代的方法求出每个网页的pagerank。因为这儿数据量小，大约20次迭代就可以求出每个网页的pagerank值。

#通过迭代为每个网页计算怕个rank    def calculatepagerank(self,iterations=20):        #清除当前的PageRank表        self.con.execute('drop table if exists pagerank')        self.con.execute('create table pagerank(urlid primary key ,score)')        #初始化每个url,令其pagerank值为1        self.con.execute('insert into pagerank select rowid,1.0 from urllist')        self.dbcommit()        for i in range(iterations):            print "Iteration %d"%(i)            for (urlid,) in self.con.execute('select rowid from urllist'):                pr=0.15                #循环遍历指向当前网页的所有的其他网页                for (linker,) in self.con.execute('select distinct fromid from link where toid=%d '%urlid):                    #得到链接源对应的pagerank值                    linkingpr=self.con.execute('select score from pagerank where urlid=%d'%linker).fetchone()[0]                    #根据链接源，求得总的连接数                    linkingcount=self.con.execute('select count(*) from link where fromid =%d' % linker).fetchone()[0]                    pr+=0.85*(linkingpr/linkingcount)                self.con.execute('update pagerank set score=%f where urlid=%d' %(pr,urlid))            self.dbcommit()

通过pagerank对网页评分代码如下

    def pagerankscore(self, rows):
        """Score each matched url by its stored PageRank, scaled by the maximum.

        rows: match rows whose first element is the urlid.
        Reads the `score` column of the `pagerank` table once per distinct
        urlid and divides every score by the largest one.  Empty `rows`
        yields an empty dict; if the largest score is 0 every url scores 0.0.
        """
        pageranks = {}
        for row in rows:
            urlid = row[0]
            if urlid in pageranks:
                # A url appears once per match combination; query it once.
                continue
            pageranks[urlid] = self.con.execute(
                'select score from pagerank where urlid=?',
                (urlid,)).fetchone()[0]
        if not pageranks:
            return {}
        maxrank = max(pageranks.values())
        if maxrank == 0:
            return dict([(u, 0.0) for u in pageranks])
        normalizedscores = dict(
            [(u, float(l) / maxrank) for (u, l) in pageranks.items()])
        return normalizedscores

第六种：利用链接文本进行评价
大多数时候，相比于被连接的网页自身所提供的信息而言，我们从指向该网页的链接中所得到的信息会更有价值。
实现如下

#根据链接文本进行评分#rows和wordids是getmatchrows函数的两个返回值    def linktextscore(self,rows,wordids):        linkscores=dict([(row[0],0) for row in rows])        for wordid in wordids:            cur=self.con.execute('select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid'%wordid)            for (fromid,toid) in cur:                if toid in linkscores:                    pr=self.con.execute('select score from pagerank where urlid=%d'%fromid).fetchone()[0]                    linkscores[toid]+=pr        maxscore=max(linkscores.values())        normalizedscores=dict([(u,float(l)/maxscore) for (u,l) in linkscores.items()])        return normalizedscores

以上6种算法，可以使用不同的权重分配来共同对网页进行评价。

0 0