Python 中用 BeautifulSoup 提取集搜客网站的信息

来源：互联网发布：get music软件编辑：程序博客网时间：2024/05/21 17:34

1. 爬取集搜客网站论坛页面（http://www.gooseeker.com/cn/forum/7）上面的信息

2看下这个源代码

3. 使用 BeautifulSoup 按标签和 class 把信息提取出来

# coding:utf8import urllib2import timefrom bs4 import BeautifulSoupclass YZW():#先是定义一个模块然后就是用来包含用到的函数    def __init__(self):        self.user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64)'#设置它的用户代理，模仿浏览器来访问        self.header={'User_Agent':self.user_agent}#利用header的字典来传入    def getHtml(url,page):        try:#运用try except 函数            urls='http://www.gooseeker.com/cn/forum/7?page='+str(page)#如果要爬取多页，观察多页网站规律            page=urllib2.urlopen(urls)            html=page.read()            return html        except urllib2.URLError, g:#如果出现URLError的错误，会执行以下代码            if hasattr (g,"reason"):                print u'loding error', g.reason                return None    def getitem(self):        for i in range(12):            html=self.getHtml(i)            time.sleep(1)            soup=BeautifulSoup(html,"html.parser")            Data=soup.find_all('tr',class_='odd')#观察自己要提取信息的标签，然后来提取信息            for item in Data:                lists = item.get_text("|")#得到其中的文本，然后就是用 | 来代替以前的分割                lists = lists.split('|')                try:                    print lists[0],lists[1],lists[3]                except:                    print 'None'            Data=soup.find_all('tr',class_='even')            for item in Data:                lists = item.get_text("|")                lists = lists.split('|')

4然后看下这个运行结果

0 0