【python 爬虫】全国失信被执行人名单爬虫

来源:互联网 发布:9008端口刷机工具 编辑:程序博客网 时间:2024/04/27 17:25

一、需求说明
通过百度的接口,爬取全国失信被执行人名单。翻页爬虫,获取姓名,身份证等信息。

(图片缺失:原文此处为接口返回结果的截图)

二、python实现

版本1:

# -*- coding: utf-8 -*-
"""Scrape Baidu's public "dishonest judgment debtor" (laolai) API.

Pages through the JSON endpoint, collects each person's name and
ID-card number, de-duplicates them in a pandas DataFrame and prints
the result.  (The Python 2 ``reload(sys)``/``setdefaultencoding``
hack from the original has been removed; it is unnecessary and
dangerous.)
"""
import time

import pandas as pd
import requests

time1 = time.time()

# Request headers are loop-invariant: build the dict once, outside the loop.
HEAD = {
    "Host": "sp0.baidu.com",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Accept": "*/*",
    "Referer": "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=95943715_hao_pg&wd=%E8%80%81%E8%B5%96&oq=%25E8%2580%2581%25E8%25B5%2596&rsv_pq=ec5e631d0003d8eb&rsv_t=b295wWZB5DEWWt%2FICZvMsf2TZJVPmof2YpTR0MpCszb28dLtEQmdjyBEidZohtPIr%2FBmMrB3&rqlang=cn&rsv_enter=0&prefixsug=%25E8%2580%2581%25E8%25B5%2596&rsp=0&rsv_sug9=es_0_1&rsv_sug=9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Collected columns: person name and ID-card number.
iname = []
icard = []

# Pages 1..100; the API paginates with pn = page * 10.
for i in range(1, 101):
    print('正在抓取第' + str(i) + "页.................................")
    url = ("https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php"
           "?resource_id=6899&query=%E8%80%81%E8%B5%96&pn=" + str(i * 10)
           + "&ie=utf-8&oe=utf-8&format=json")
    # timeout= keeps a single stalled request from hanging the whole run.
    resp = requests.get(url, headers=HEAD, timeout=30)
    html_json = resp.json()  # decodes the JSON body directly
    for group in html_json['data']:
        # Distinct names for the two loop levels: the original reused
        # `each` for both, shadowing the outer variable.
        for person in group['result']:
            print(person['iname'], person['cardNum'])
            iname.append(person['iname'])
            icard.append(person['cardNum'])

# Organize the collected columns into a DataFrame and drop duplicates.
data = pd.DataFrame({"name": iname, "IDCard": icard})
data1 = data.drop_duplicates()
print(data1)
# Optional: export to Excel / report elapsed time (disabled, as in the original).
# pd.DataFrame.to_excel(data1,"F:\\iname_icard.xlsx",header=True,encoding='gbk',index=False)
# time2 = time.time()
# print('ok,爬虫结束!')
# print('总共耗时:' + str(time2 - time1) + 's')

三、效果展示

"D:\Program Files\Python27\python.exe" D:/PycharmProjects/learn2017/全国失信被执行人名单爬虫.py正在抓取第1页.................................陈虹桦 44170219860****4223党训 32022219720****1898林战 44032119680****0038江祖藕 44172119821****2546智彩玲 15042819840****4543孙嘉佳 41232219680****1826简光粉 53223119630****2120魏凡东 37092119691****3612张夫连 37092019660****6483沈新建 32102519661****1658戴光明 32102619680****5935梁伦奇 33262319710****5959王义拓 41140319890****6612王怀旭 41092319651****601X许耀明 32022219580****6555

版本2:

# -*- coding: utf-8 -*-
"""Scrape Baidu's "dishonest judgment debtor" API — detailed version.

Same pagination as version 1, but also captures court name, area,
case code, duty, performance status, disrupt-type and publish date,
building a de-duplicated detail DataFrame.  Fixes from the original:
removed the Python 2 ``setdefaultencoding`` hack, added a request
timeout, hoisted the loop-invariant headers, and stopped reusing the
loop variable ``each`` at two nesting levels.
"""
import time

import pandas as pd
import requests

time1 = time.time()

# Request headers are loop-invariant: build the dict once, outside the loop.
HEAD = {
    "Host": "sp0.baidu.com",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Accept": "*/*",
    "Referer": "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=95943715_hao_pg&wd=%E8%80%81%E8%B5%96&oq=%25E8%2580%2581%25E8%25B5%2596&rsv_pq=ec5e631d0003d8eb&rsv_t=b295wWZB5DEWWt%2FICZvMsf2TZJVPmof2YpTR0MpCszb28dLtEQmdjyBEidZohtPIr%2FBmMrB3&rqlang=cn&rsv_enter=0&prefixsug=%25E8%2580%2581%25E8%25B5%2596&rsp=0&rsv_sug9=es_0_1&rsv_sug=9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# One parallel list per output column.
iname = []
icard = []
courtName = []
areaName = []
caseCode = []
duty = []
performance = []
disruptTypeName = []
publishDate = []

# Pages 1..100; the API paginates with pn = page * 10.
for i in range(1, 101):
    print('正在抓取第' + str(i) + "页.................................")
    url = ("https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php"
           "?resource_id=6899&query=%E8%80%81%E8%B5%96&pn=" + str(i * 10)
           + "&ie=utf-8&oe=utf-8&format=json")
    # timeout= keeps a single stalled request from hanging the whole run.
    resp = requests.get(url, headers=HEAD, timeout=30)
    html_json = resp.json()  # decodes the JSON body directly
    for group in html_json['data']:
        # Distinct names per nesting level (original shadowed `each`).
        for person in group['result']:
            print(person['iname'], person['cardNum'], person['courtName'],
                  person['areaName'], person['caseCode'], person['duty'],
                  person['performance'], person['disruptTypeName'],
                  person['publishDate'])
            iname.append(person['iname'])
            icard.append(person['cardNum'])
            courtName.append(person['courtName'])
            areaName.append(person['areaName'])
            caseCode.append(person['caseCode'])
            duty.append(person['duty'])
            performance.append(person['performance'])
            disruptTypeName.append(person['disruptTypeName'])
            publishDate.append(person['publishDate'])

# Organize all columns into one detail DataFrame and drop exact duplicates.
detail_data = pd.DataFrame(
    {"name": iname, "IDCard": icard, "courtName": courtName,
     "areaName": areaName, "caseCode": caseCode, "duty": duty,
     "performance": performance, "disruptTypeName": disruptTypeName,
     "publishDate": publishDate})
detail_data1 = detail_data.drop_duplicates()
print(detail_data1)
# Optional: export to Excel / report elapsed time (disabled, as in the original).
# pd.DataFrame.to_excel(detail_data1,"F:\\iname_icard.xlsx",header=True,encoding='gbk',index=False)
# time2 = time.time()
# print('ok,爬虫结束!')
# print('总共耗时:' + str(time2 - time1) + 's')

(图片缺失:原文此处为详细字段抓取结果的截图)

原创粉丝点击