单页爬虫

来源:互联网 发布:cisco 通过ip查端口 编辑:程序博客网 时间:2024/06/06 04:08

1.项目简介

任务:爬取某基金网站数据, 以单页单条模式存储。
性能:抓取各式数据,以dict模式存入数据库,以json模式导出到文本。

最后实现90分钟内对全站约3500支基金数据更新, 约100条error。

2.版块说明

List_page.py
简单抓取列表页面基金代号,方便后面的url队列生成。

Crawl.py
分为Gen, Parse, Clean三个部分。
Gen为url队列生成, Parse做单页解析, Clean做简单的数据清洗。
-库调用:
requests + bs4 + pymongo 为主要的爬取函数
codecs + json 链接文本输出
threading + datetime 控制多线程运行,以及负责进度反馈

-各部分:
Gen:通过生成器函数实现,节省空间,结构也更清晰。
Parse:观察网页结构后,划出相应的版块解析,一方面条理更清晰,方便调试,另一方面则实现了并行。
Clean:在这个项目里并没有太多处理,只是在网页原始数据中把该拆分的部分拆分出来。

3.总结

通过本次项目实践,对于数据采集有了比较全面的认识,其间的各种细节技术都是在实践中学来的。
在项目中做了很多新尝试,比如报错系统和进度反馈,这些可有可无的部分既可以方便调试,也是为了让数据更加可靠。
为了使程序自由度高,没有使用爬虫框架,尽量用外层函数库编写。这样可以根据不同网页情况自己修改。
在多线程控制上仍然存在一些问题,为什么在每个单页解析里都要初始进程list呢,我试过把list放在外面,进程控制会报错。

-对于数据存储结构的心得:
根据不同需要以及数据本身的特征,我找到了一些简单的存储方式。决定存储方式的当然还是dict以及json本身的格式。
1. dict: {‘attribute’ : …}
2. list: [a, b, c…]
对于一些表格,在不同的需要下有不同的存储方式:
1. dict: {date : {f(d), g(d)}, d : {}, d: {}…}
2. list: [{date, f(d), g(d)}, {}, {}…]
3. multi-list: {date : [d1, d2, d3…], f : [f1, f2, f3…], g : [g1, g2, g3…]}
dict方便对table单行查询,list能保证数据的完整性,multi-list便于对数据可视化。


又丑又长的代码我本是不想贴的, 只是还没找到合适的管理方式。
代码块中的地址为了保护网页权益擦掉了。

# -*- coding: utf-8 -*-
"""Single-page fund crawler (reconstructed from a newline-stripped paste).

Gen() yields one detail-page URL per fund listed in ``fund.json``;
Getsoup() fetches and parses a page; Part01/Part02/Part03 scrape three
independent sections of the page, each in its own thread; Parse() drives
the threads; Clean() post-processes two raw fields.  Each fund's result
dict ``home`` is appended as a JSON document to an output text file (an
optional MongoDB sink is commented out).

The real site URLs were redacted by the original author (``url = ...``
and the ``'...'`` output path); fill them in before running.
"""
import codecs
import datetime
import json
import threading

import requests
from bs4 import BeautifulSoup
import pymongo

# Global error counter, incremented by the Part scrapers.
eros = 0


def Gen():
    """Yield the detail-page URL for every fund in fund.json.

    Side effects: rebinds the module globals ``dataframe`` (the current
    fund record) and ``home`` (the per-fund result dict) on every
    iteration, and prints a progress report every 100 funds.
    """
    global dataframe
    global home
    tit = 0
    # BUG FIX: the original called ``file.close`` without parentheses, so
    # the handle was never closed; ``with`` closes it deterministically.
    with codecs.open('fund.json', 'r', 'utf-8') as file:
        tmp = json.load(file)
    datagram = tmp.get('fund')
    for dataframe in datagram:
        home = {}
        tit += 1
        if tit % 100 == 0:
            now = datetime.datetime.now()
            print(tit, ' / ', len(datagram))
            print(now.strftime('%Y-%m-%d %H:%M:%S'))
            print(str(tit * 100 / len(datagram)) + '%')
        # Redacted by the author; build the page URL from the fund code.
        url = ...
        yield url


def Getsoup(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')


def _scrape_fields(cells, keys):
    """Map each cell's text after the ':' separator onto *keys*.

    Raises IndexError when the page section has fewer cells than *keys*,
    which the callers catch as "this layout does not apply".
    """
    out = {}
    for i, key in enumerate(keys):
        tmp = str(cells[i].get_text())
        p = tmp.find(':')
        out[key] = tmp[p + 1:].replace('\xa0', '')
    return out


def Part01():
    """Scrape section 01 (``fundInfoItem``): basic info plus return data."""
    global soup
    global home
    global eros
    item = {}
    try:
        head = soup.find(attrs={'class': 'fundInfoItem'})
    except Exception:
        print('Error 01: at', dataframe.get('code'))
        return
    # 01.1 -- infoOfFund: five labelled <td> cells.
    try:
        div = head.find(attrs={'class': 'infoOfFund'})
        # 'damin' is kept as-is: it is an output key other tooling may read.
        item['info'] = _scrape_fields(
            div.find_all('td'),
            ['type', 'scale', 'manager', 'est_date', 'damin'])
    except Exception:
        print('Error 01.1: at' + dataframe.get('code'))
    # 01.2 -- dataOfFund: nine <dd> cells (ordinary funds).
    done = False
    try:
        div = head.find(attrs={'class': 'dataOfFund'})
        item['data'] = _scrape_fields(
            div.find_all('dd'),
            ['estimation', '1month', '1year', 'unit_net', '3month',
             '3year', 'accum_net', '6month', 'since_est'])
        done = True
    except Exception:
        pass
    # 01.3 -- monetary-fund layout: ten <dd> cells directly under the head.
    if not done:
        try:
            item['data'] = _scrape_fields(
                head.find_all('dd'),
                ['per_million', '7day', '14day', '28day', '1month',
                 '1year', '3month', '3year', '6month', 'since_est'])
            done = True
        except Exception:
            pass
    if not done:
        eros += 1
        print('Error 01.2/3: at' + dataframe.get('code'))
    home['item'] = item


def Part02():
    """Scrape section 02 (``historyReturnRate``): the NAV history table.

    Stores parallel column lists under ``home['history']`` — the
    "multi-list" layout described in the write-up, chosen for plotting.
    """
    global soup
    global home
    global eros
    # 02.1 -- monetary funds: date / per-million return / 7-day yield.
    try:
        head = soup.find(attrs={'id': 'historyReturnRate'})
        rows = head.find_all('tr')
        date, per_million, seven_day = [], [], []
        for tr in rows[1:]:                    # rows[0] is the header
            tds = tr.find_all('td')            # hoisted: one lookup per row
            date.append(tds[0].get_text())
            per_million.append(tds[1].get_text())
            seven_day.append(tds[2].get_text())
        home['history'] = {'date': date, 'per_million': per_million,
                           '7day': seven_day}
        return
    except Exception:
        pass
    # 02.2 -- stock/bond funds: date / unit NAV / accumulated NAV / rate.
    try:
        head = soup.find(attrs={'id': 'Div2'})
        rows = head.find('table').find_all('tr')
        date, unit_net, accum_net, rate = [], [], [], []
        for tr in rows[1:]:
            tds = tr.find_all('td')
            date.append(tds[0].get_text())
            unit_net.append(tds[1].get_text())
            accum_net.append(tds[2].get_text())
            rate.append(tds[3].span.get_text())
        home['history'] = {'date': date, 'unit_net': unit_net,
                           'accum_net': accum_net, 'rate': rate}
    except Exception:
        eros += 1
        print('Error 02: at' + dataframe.get('code'))


def Part03():
    """Scrape section 03 (``IncreaseAmount``): the period-increase table.

    Stores one dict per period under ``home['increase']`` — the
    "list of records" layout, which keeps each row self-contained.
    """
    global soup
    global home
    global eros
    try:
        head = soup.find(attrs={'class': 'IncreaseAmount'})
        rows = head.find_all('tr')
    except Exception:
        return
    try:
        # Row 0 holds the period headers; rows 1..4 hold the four series.
        period = [th.get_text() for th in rows[0].find_all('th')[1:]]
        series = [[td.get_text() for td in rows[r].find_all('td')[1:]]
                  for r in range(1, 5)]
        inc, avg, hs300, rank = series
        increase = []
        for i in range(len(period)):
            increase.append({'period': period[i], 'inc': inc[i],
                             'avg': avg[i], 'hs300': hs300[i],
                             'rank': rank[i]})
        home['increase'] = increase
    except Exception:
        eros += 1
        print('Error 03 at:' + dataframe.get('code'))


def Parse():
    """Record the fund identity, then run the three Part scrapers in parallel."""
    global home
    global col
    global partision
    home['fund'] = {'name': dataframe.get('name'),
                    'code': dataframe.get('code')}
    for thread in partision:
        # ``Thread.setDaemon`` is deprecated; assign the attribute instead.
        thread.daemon = True
        thread.start()
    for thread in partision:
        thread.join()


def Clean():
    """Split the raw 'type' and 'scale' strings into structured dicts."""
    tmp = str(home.get('item').get('info').get('type'))
    p = tmp.find('|')
    home['item']['info']['type'] = {'a': tmp[:p], 'b': tmp[p + 1:]}
    tmp = str(home.get('item').get('info').get('scale'))
    p = tmp.find('(')
    # scale reads like "NUM(DATE)"; strip the closing parenthesis.
    home['item']['info']['scale'] = {'num': tmp[:p],
                                     'date': tmp[p + 1:len(tmp) - 1]}


def main():
    """Crawl every fund page, parse it, and append the result as JSON."""
    global soup
    global partision
    global eros
    # ``with`` guarantees the output file is closed even on error.
    with codecs.open('...', 'w', 'utf-8') as file_w:   # path redacted
        for url in Gen():
            # NOTE(review): the author reports the thread list must be
            # rebuilt per page — Thread objects cannot be restarted, which
            # is why hoisting this out of the loop fails.
            partision = [threading.Thread(target=Part01),
                         threading.Thread(target=Part02),
                         threading.Thread(target=Part03)]
            soup = Getsoup(url)
            Parse()
            Clean()
            # col.insert(dict(home))  # optional MongoDB sink
            file_w.write(json.dumps(home, ensure_ascii=False, indent=4)
                         + '\n')
        print('Errors in tot: %d' % eros)


#col = pymongo.MongoClient('localhost', 27017).Easy2.Easy2

if __name__ == '__main__':
    main()