Learning Python: Web Scraping (Part 2)


This installment adds a further extraction step: from each downloaded page we pull out details such as the character encoding and the page title.


#!/usr/bin/env python
# -*- coding: GBK -*-
'''
Created on Jul 17, 2013
@author: belong
'''
import urllib
import re


class Tools:
    def write_log(self, level, info):
        print '[' + level + ']' + info

    def match_regex(self, pattern, src):
        result = ""
        com = re.compile(pattern, re.I)
        matchers = com.findall(src)
        print matchers
        for matcher in matchers:
            result += matcher + " "
        return result.strip()  # strip() drops the whitespace at both ends


class Crawler:
    # Collect every URL found in the page at the given url
    def get_url(self, url):
        html = urllib.urlopen(url)
        # pattern = re.compile("http://.*?\.com", re.I)
        pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html?', re.I)
        data = html.read()  # read the whole page in one go, so links are
        html.close()        # not lost between chunked reads
        return pattern.findall(data)

    # Download the page at url and save it to filename
    def download_url(self, url, filename):
        Tools().write_log("info", "starting download")
        html = urllib.urlopen(url)
        page = html.read()
        html.close()
        f = open(filename, 'w')
        f.write(page)
        f.close()
        Tools().write_log("info", "page downloaded")
        return page  # return the full page, not the final (empty) read

    # Breadth-first traversal, visiting at most `number` pages
    def broad_traverse(self, start_url, number):
        Tools().write_log("info", "starting traversal")
        visited = []
        unvisited = [start_url]
        while unvisited and len(visited) < number:
            url = unvisited.pop(0)
            print url, len(visited)
            i = len(visited)
            # self.download_url(url, str(i) + '.html')
            visited.append(url)
            for eachlink in self.get_url(url):
                if eachlink not in unvisited and eachlink not in visited:
                    unvisited.append(eachlink)
        Tools().write_log("info", "traversal finished")
        return visited

    def main(self):
        start_url = "http://www.baidu.com"
        self.broad_traverse(start_url, 10)


# Data extraction class
class DataExtractor:
    # Extract the page title
    def get_title(self, data):
        title = Tools().match_regex('<title>.*?</title>', data)
        if title == "":
            Tools().write_log("Error", "title match failed")
        return title

    # Extract the content-type meta tag
    def get_type(self, data):
        return Tools().match_regex('<meta.*content=.*?/>', data)

    # Extract the character-set meta tag
    def get_charset(self, data):
        return Tools().match_regex('<meta.*?charset=.*?/>', data)

    def get_info(self, url):
        try:
            data = Crawler().download_url(url, str(11) + '.html')
        except:
            Tools().write_log("error", url + " fetch failed")
            raise
        Tools().write_log("info", "starting data matching")
        rst = {}  # rst is a dictionary
        rst['title'] = self.get_title(data)
        print "title:", rst['title']
        rst['type'] = self.get_type(data)
        print "type:", rst['type']
        rst['charset'] = self.get_charset(data)
        print "charset:", rst['charset']
        Tools().write_log("DEBUG", 'title=%s,type=%s,charset=%s'
                          % (rst['title'], rst['type'], rst['charset']))
        return rst


def main():
    start_url = "http://www.baidu.com"
    crawler = Crawler()
    url_list = crawler.broad_traverse(start_url, 10)
    dataExtractor = DataExtractor()
    for url in url_list:
        dataExtractor.get_info(url)


main()
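To sanity-check the extraction patterns without touching the network, you can run them directly against an inline HTML string. The following is a minimal sketch in the same Python 2 style as the listing above; the sample snippet and the expected output in the comments are invented for illustration and are not from the original post.

# Offline demo of the extraction regexes on an invented HTML snippet.
import re

sample = ('<html><head>'
          '<meta http-equiv="Content-Type" content="text/html; charset=gb2312"/>'
          '<title>Baidu</title>'
          '</head><body>hello</body></html>')

print re.findall('<title>.*?</title>', sample, re.I)
# ['<title>Baidu</title>']
print re.findall('<meta.*?charset=.*?/>', sample, re.I)
# ['<meta http-equiv="Content-Type" content="text/html; charset=gb2312"/>']

Note that both patterns return the whole matched tag rather than the bare value; pulling out just the encoding name would take a capture group such as charset=([\w-]+).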


 
