Wrote a small web crawler

import hashlib

from mysql import connector


class Quercy(object):
    def __init__(self, mysqlDB='crawler', mysqlUser='root'):
        self.db = connector.Connect(db=mysqlDB, user=mysqlUser)
        self.cursor = self.db.cursor()

    def times(self, website):
        '''Return how many times this page has been visited (0 if unknown).'''
        md5 = self.__md5Digest__(website)
        quercy = '''SELECT times
                    FROM crawler.website
                    WHERE md5 = %s'''
        self.cursor.execute(quercy, (md5,))
        times = self.cursor.fetchone()
        if times is None:
            return 0
        else:
            return times[0]

    def restore(self, website, keywords):
        '''Store the page in the database; bump the counter if it is already there.'''
        md5 = self.__md5Digest__(website)
        quercy = '''INSERT INTO website (md5, pri_website, keywords)
                    VALUES (%s, %s, %s)
                    ON DUPLICATE KEY UPDATE times = times + 1'''
        self.cursor.execute(quercy, (md5, website, keywords))
        self.db.commit()

    def __md5Digest__(self, website):
        '''Turn the raw URL into an md5 hex digest.'''
        md5 = hashlib.md5()
        md5.update(website.encode('utf-8'))
        return md5.hexdigest()


if __name__ == '__main__':
    q = Quercy()
    website = 'www.baidu.com'
    q.restore(website, 'baidu')
    print(q.times(website))
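
The post never shows the table these queries run against, so here is a minimal guess at the setup, inferred from the columns the SQL references: md5 has to be the unique key for ON DUPLICATE KEY UPDATE to fire, and times presumably defaults to 1 so that a stored page reads back as at least one visit. The column types are assumptions.

# Hypothetical one-off setup script: create the crawler.website table that the
# Quercy class above assumes. Column names come from its queries; the types and
# the DEFAULT 1 on `times` are guesses.
from mysql import connector

db = connector.Connect(user='root')
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS crawler")
cursor.execute('''
    CREATE TABLE IF NOT EXISTS crawler.website (
        md5         CHAR(32)      NOT NULL,            -- md5 of the raw URL
        pri_website VARCHAR(2048) NOT NULL,            -- the raw URL itself
        keywords    TEXT,                              -- comma-joined keywords
        times       INT           NOT NULL DEFAULT 1,  -- visit counter
        PRIMARY KEY (md5)
    )
''')
db.commit()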


from html.parser import HTMLParser


class WebParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tag = None
        self.href = []      # hyperlinks found on the page
        self.keyowrds = []  # keywords found on the page

    def handle_starttag(self, tag, attrs):
        # Only the meta, title and a tags are of interest
        if tag == 'meta' or tag == 'title' or tag == 'a':
            self.tag = tag
        else:
            self.tag = None

        # Collect hyperlinks and keywords from the attributes
        for attr in attrs:
            if attr[0] == 'href' and attr[1].startswith("http"):
                self.href.append(attr[1])
            if attr[0] == 'content':
                key = attr[1].split()
                self.keyowrds += key

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        if self.tag is None:
            return
        if self.tag == 'title':
            self.keyowrds.append(data)


if __name__ == '__main__':
    from urllib.request import urlopen

    parser = WebParser()
    data = urlopen("http://www.taobao.com/")
    print(data.info())
    d = data.read().decode('gbk')
    parser.feed(d)
    print(parser.href)
    print(parser.keyowrds)
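
To see what the parser actually collects without going over the network, it can be fed a small hand-written page. A quick sketch with made-up example HTML, assuming the class above is saved as webparser.py (which is how the crawler below imports it):

# Offline check of WebParser on a tiny hand-written page (example data only).
from webparser import WebParser

page = ('<html><head>'
        '<title>demo page</title>'
        '<meta name="keywords" content="crawler breadth-first mysql">'
        '</head><body>'
        '<a href="http://www.example.com/a">a</a>'
        '<a href="/relative">skipped, does not start with http</a>'
        '</body></html>')

parser = WebParser()
parser.feed(page)
print(parser.href)      # ['http://www.example.com/a']
print(parser.keyowrds)  # ['demo page', 'crawler', 'breadth-first', 'mysql']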

import re
from collections import deque
from urllib.request import urlopen

from query import Quercy
from webparser import WebParser


class Crawler(object):
    def __init__(self):
        self.urllist = deque()   # breadth-first frontier: discovered, not yet fetched

    def addURL(self, *urllist):
        for eachurl in urllist:
            self.urllist.append(eachurl)

    def visit(self, website):
        html = self.__open__(website)
        parser = WebParser()
        parser.feed(html)

        key = ''
        for each in parser.keyowrds:
            key += each
            key += ','
        return parser.href, key

    def __open__(self, url):
        data = urlopen(url)
        header = str(data.info())

        # Guess the charset from the Content-Type header, default to utf-8
        regex = re.compile(r'charset=(\w*)')
        result = regex.search(header)
        charset = 'utf-8'
        if result is not None:
            charset = result.group(1)

        html = data.read().decode(charset)
        return html

    def __popurl__(self):
        if len(self.urllist) == 0:
            exit(0)   # nothing left to crawl
        return self.urllist.popleft()

    def __puturl__(self, urllist):
        for url in urllist:
            self.urllist.append(url)

    def run(self):
        while True:
            url = self.__popurl__()
            q = Quercy()
            times = q.times(url)
            if times == 0:
                urllist, key = self.visit(url)
                self.__puturl__(urllist)
                q.restore(url, key)


if __name__ == '__main__':
    c = Crawler()
    c.addURL('http://www.taobao.com/')
    c.run()
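
To actually run the three scripts you need a local MySQL server with the crawler database (see the schema sketch above), the mysql-connector-python package (pip install mysql-connector-python), and the parser and database classes saved as webparser.py and query.py, which is what the imports at the top of the crawler assume.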

There are only three classes: Quercy talks to MySQL, WebParser parses the pages, and Crawler does the breadth-first traversal.

It crawls around Taobao more or less at random and stores every fetched page, together with its keywords, in the database.

Because the traversal is breadth-first, URLs that have been discovered but not yet fetched stay in memory, and once that queue grows to a certain size the program simply grinds to a halt.
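
One small tweak (my own suggestion, not in the code above) that slows the frontier's growth is to remember which URLs have already been queued and skip duplicates before appending them to the deque. A minimal sketch of such a frontier:

# Sketch of a deduplicating frontier (my own addition): the crawler's deque,
# plus a set of every URL ever queued, so the same link is never enqueued twice.
from collections import deque


class DedupFrontier(object):
    def __init__(self):
        self.urllist = deque()   # discovered but not yet fetched
        self.seen = set()        # every URL that was ever queued

    def put(self, urllist):
        for url in urllist:
            if url not in self.seen:
                self.seen.add(url)
                self.urllist.append(url)

    def pop(self):
        return self.urllist.popleft()   # raises IndexError when empty


if __name__ == '__main__':
    f = DedupFrontier()
    f.put(['http://www.taobao.com/', 'http://www.taobao.com/'])
    print(len(f.urllist))   # 1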

As for efficiency... still looking into it...


