Python Targeted Crawler: Campus Forum Post Information


Introduction

I wrote this small crawler to scrape internship postings from the campus forum; it is built mainly on the Requests library.

Source Code

URLs.py

Its main job is to take an initial URL (one containing a page parameter) and build a list of URLs covering every page from the current page number up to pageNum.

import re


def getURLs(url, attr, pageNum=1):
    all_links = []
    try:
        now_page_number = int(re.search(attr + '=(\d+)', url, re.S).group(1))
        for i in range(now_page_number, pageNum + 1):
            # rewrite the page parameter in the URL for page number i
            new_url = re.sub(attr + '=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be string."
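A quick usage sketch (the board URL and page count here are just placeholders): getURLs only rewrites the page parameter, producing one URL per page.

from cc98 import URLs

links = URLs.getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
# links == ['http://www.cc98.org/list.asp?boardid=459&page=1&action=',
#           'http://www.cc98.org/list.asp?boardid=459&page=2&action=',
#           'http://www.cc98.org/list.asp?boardid=459&page=3&action=']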

uni_2_native.py

The Chinese text in pages fetched from the forum comes back as numeric character references of the form &#XXXX;, so the crawled content has to be converted back to native characters.

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')


def get_native(raw):
    """Decode HTML numeric character references (&#XXXX;) into native characters."""
    tostring = raw
    while True:
        obj = re.search('&#(.*?);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
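A small sanity check (the sample string is made up): &#20320; and &#22909; are the numeric references for 你 and 好, so get_native decodes them back.

from cc98 import uni_2_native

print uni_2_native.get_native(u'&#20320;&#22909;, hello')
# prints: 你好, hello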

Saving to a MySQL database: saveInfo.py

# -*- coding: utf-8 -*-
import MySQLdb


class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        # collect one post's fields into a dict and buffer it in memory
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        # empty the info table, then bulk-insert everything buffered in infoList
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306,
                               db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'],
                           each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'


if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
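toMySQL assumes the database and an info table already exist. The column types below are my own guess based on the INSERT statement, not part of the original post; adjust them to your needs.

import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS info (
        `title`  VARCHAR(255),
        `author` VARCHAR(64),
        `url`    VARCHAR(255),
        `date`   VARCHAR(32),
        `reply`  INT,
        `view`   INT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()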

Main crawler code

import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a request header to match the site you are crawling (fill in the values yourself)
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}

url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # not very well using for, though just one element in list
            # but I don't know why I cannot get the data by index
            for each_href in href:
                link = cc98 + each_href
            title_author_time = each.xpath('./td[2]/a/@title')
            # print len(title_author_time)
            for info in title_author_time:
                info_split = info.split('\n')
                title = info_split[0][1:len(info_split[0]) - 1]
                author = info_split[1][3:]
                date = info_split[2][3:]
            hot = each.xpath('./td[4]/text()')
            # print len(hot)
            for hot_num in hot:
                reply_view = hot_num.strip().split('/')
                reply, view = reply_view[0], reply_view[1]
            savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"
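The slicing of the @title attribute assumes it has three lines: a bracketed title, an author line, and a date line, each starting with a fixed-width prefix. The sample value and prefixes below are invented for illustration; the real attribute text on cc98 may differ.

# Hypothetical @title value, shaped the way the slicing above expects
info = u'[求暑期实习内推]\n发帖:someone\n时间:2017-03-01 12:00:00'
info_split = info.split('\n')
title = info_split[0][1:len(info_split[0]) - 1]   # drop the surrounding brackets
author = info_split[1][3:]                        # drop the 3-character author prefix
date = info_split[2][3:]                          # drop the 3-character date prefix
# title == u'求暑期实习内推', author == u'someone', date == u'2017-03-01 12:00:00'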