Python Crawler, Lesson 1: Building a Search Engine
from BeautifulSoup import *
from urlparse import urljoin
import re

ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
Our search engine is keyword-based, so conjunctions and articles like these are ignored.
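To see what that means in practice, here is roughly how a sentence gets reduced to indexable keywords, using the ignorewords set above and the same splitting regex that separatewords uses below (a quick illustrative sketch, not part of the crawler itself):

splitter = re.compile('\\W*')
sentence = 'The history of the university is long'
words = [s.lower() for s in splitter.split(sentence) if s != '']
keywords = [w for w in words if w not in ignorewords]
print keywords   # ['history', 'university', 'long']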
The code below is the crawler; it stores the text of each page it visits in our SQLite database (note that this is Python 2 code, using the urllib2 and BeautifulSoup 3 libraries). Don't worry if you can't follow every line; it is enough to know what each function does.
from sqlite3 import dbapi2 as sqlite
import urllib2

class crawler:
    def __init__(self, dbname):
        # Connect to (and create) the database; dbname can be anything, e.g. 'xxx.db'
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        # Return the rowid of an entry, inserting it first if it is not there yet
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def addtoindex(self, url, soup):
        if self.isindexed(url): return
        print 'Indexing', url
        # Get the words on the page
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # Get the URL id
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this URL, recording its position
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords: continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)"
                % (urlid, wordid, i))

    def gettextonly(self, soup):
        # Recursively extract the plain text from a page (or tag)
        v = soup.string
        if v == None:
            c = soup.contents
            resulttext = ''
            for t in c:
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()

    def separatewords(self, text):
        # Split text on any non-word character
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s != '']

    def isindexed(self, url):
        u = self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u != None:
            # Make sure it has actually been crawled, not just linked to
            v = self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v != None: return True
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    def crawl(self, pages, depth=2):
        # Breadth-first crawl starting from pages, down to the given depth
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open", page
                    continue
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)

                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1: continue
                        url = url.split('#')[0]  # remove the fragment portion
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)

                self.dbcommit()
            pages = newpages

    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

Now that we have a crawler, let's list the pages we want to crawl:
pagelist = [['http://en.xjtu.edu.cn/'],
            ['http://www.lib.xjtu.edu.cn/'],
            ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]

Create the database:
mycrawler = crawler('searchindex.db')
mycrawler.createindextables()

Crawl:
mycrawler.crawl(pagelist[0])
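Crawling two levels from these seed pages can take a while. Before moving on, it can be reassuring to peek into searchindex.db and confirm that the tables created by createindextables actually filled up; a minimal check, assuming the schema above:

from sqlite3 import dbapi2 as sqlite

con = sqlite.connect('searchindex.db')
print 'pages indexed   :', con.execute('select count(*) from urllist').fetchone()[0]
print 'distinct words  :', con.execute('select count(*) from wordlist').fetchone()[0]
print 'word occurrences:', con.execute('select count(*) from wordlocation').fetchone()[0]
con.close()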
The search engine:

class searcher:
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def getmatchrows(self, q):
        # Strings used to build the query
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []

        # Split the query into words on spaces
        words = q.split(' ')
        tablenumber = 0

        for word in words:
            # Get the word ID
            wordrow = self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1

        # Create the query from the separate parts
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        print fullquery
        cur = self.con.execute(fullquery)
        rows = [row for row in cur]
        return rows, wordids

    def geturlname(self, id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]

    def normalizescores(self, scores, smallIsBetter=0):
        # Scale every score into the range 0..1
        vsmall = 0.00001  # avoid division by zero
        if smallIsBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0: maxscore = vsmall
            return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])

    # Scoring methods
    def frequencyscore(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)

    def locationscore(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalizescores(locations, smallIsBetter=1)

    def distancescore(self, rows):
        # With only one word, every page scores the same
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        mindistance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normalizescores(mindistance, smallIsBetter=1)

    def getscoredlist(self, rows, wordids):
        totalscores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyscore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
        for (score, urlid) in rankedscores[:10]:
            print '%f\t%s' % (score, self.geturlname(urlid))
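The core of the searcher is getmatchrows: it adds one wordlocation alias per query word and joins them on urlid, so only pages containing every word survive. For a two-word query such as 'xjtu college', the fullquery it prints looks roughly like this (12 and 34 stand in for whatever rowids those two words happen to have in wordlist):

select w0.urlid,w0.location,w1.location
  from wordlocation w0,wordlocation w1
 where w0.wordid=12 and w0.urlid=w1.urlid and w1.wordid=34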
Connect the search engine to the database:

e = searcher('searchindex.db')

Search:
e.query('xjtu college')

And with that, your first search engine is up and running:
1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm
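This ranking comes from frequencyscore alone, because the weights list in getscoredlist contains a single entry. locationscore and distancescore are already defined, so a natural next experiment is to blend them in; a sketch of a modified getscoredlist, with arbitrary 1.0 weights you would tune by hand:

    def getscoredlist(self, rows, wordids):
        totalscores = dict([(row[0], 0) for row in rows])
        # Combine several metrics instead of relying on word frequency alone
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.0, self.locationscore(rows)),
                   (1.0, self.distancescore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores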