job爬虫

来源:互联网 发布:龙宫礼奈 知乎 编辑:程序博客网 时间:2024/06/05 16:00

要爬取的数据以 JSON 的形式由一个 Ajax 接口返回。在拉勾网首页输入框中输入关键字后,可以发现请求的网址是 http://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false, 打开 Headers,可以看到有一项是 Form Data,里面有三个参数,分别为 first:true、pn:1、kd:python。其中 first 应该是用来判断是不是第一页(只有第一页的 first 是 true,其余页都是 false),kd 就是你输入的关键字,pn 就是页码。这就是 POST 请求中提交的数据;再加上 URL 中的 city,我们可以通过提交这四个参数得到响应页面。

# -*- coding:utf-8 -*-
# Lagou job-listing scraper.
#
# Queries the positionAjax.json endpoint for a keyword and a city, walks the
# result pages and writes every returned position to output.xls.
# The code runs on both Python 2 and Python 3.
import json
import re
import urllib
from contextlib import closing

import pandas as pd
from pandas import DataFrame, Series

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote, urlopen

SEARCH_URL = 'http://www.lagou.com/jobs/positionAjax.json'


def _build_url(city, position, is_first, page_no):
    """Return the search URL for one result page (values already encoded)."""
    first = 'true' if is_first else 'false'
    return (SEARCH_URL + '?city=' + city + '&first=' + first
            + '&kd=' + position + '&pn=' + str(page_no))


def _fetch_json(url):
    """Download *url*, close the connection and return the decoded JSON."""
    with closing(urlopen(url)) as response:
        payload = response.read()
    if isinstance(payload, bytes):
        # json.loads needs text on Python 3; harmless on Python 2.
        payload = payload.decode('utf-8')
    return json.loads(payload)


# String-processing helper
def ProcessingString(string):
    """Percent-encode *string* so it can be used as a URL query value.

    Accepts text or UTF-8 encoded bytes. Bytes that are not valid UTF-8 are
    dropped. Every reserved character (including '/') is escaped.

    Returns:
        The percent-encoded value as a native ``str``.
    """
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'ignore')
    # Encode explicitly: Python 2's quote() cannot handle non-ASCII unicode.
    return quote(string.encode('utf-8'), safe='')


# Compute the total number of pages
def SearchPageCount(position, city):
    """Request the first result page and return the number of pages to crawl.

    Args:
        position: search keyword, already percent-encoded.
        city: city name, already percent-encoded.

    Returns:
        ``content.pageSize`` from the response, as an int.
    """
    payload = _fetch_json(_build_url(city, position, True, 1))
    print(payload)
    content = payload["content"]
    # NOTE(review): "pageSize" reads like results-per-page rather than a page
    # count; kept as in the original -- confirm against the API response.
    count = int(content["pageSize"])
    totalCount = int(content["positionResult"]["totalCount"])
    print('本次搜索到%d个职位' % totalCount)
    return count


def LaGouSpiderWithKeyWord(position, city):
    """Crawl every result page for *position* in *city* into output.xls.

    Args:
        position: search keyword as typed by the user.
        city: city name as typed by the user.

    Each position record gets its ``companyLabelList`` flattened into a
    '-'-joined ``companyLabelListTotal`` column. All records are written to
    sheet ``sheet1`` of ``output.xls`` once the last page has been read.
    """
    positionTemp = ProcessingString(position)
    cityTemp = ProcessingString(city)
    # Get the total number of pages
    pageCount = SearchPageCount(positionTemp, cityTemp)

    rows = []
    for page_index in range(pageCount):
        page_no = page_index + 1  # the endpoint numbers pages from 1
        url = _build_url(cityTemp, positionTemp, page_index == 0, page_no)
        results = _fetch_json(url)['content']["positionResult"]['result']
        for row_index, row in enumerate(results):
            # A missing or null label list is treated as "no labels".
            labels = row.pop('companyLabelList', None) or []
            row['companyLabelListTotal'] = '-'.join(labels)
            print(row['companyLabelListTotal'])
            print(row)
            if row_index == 0:
                # Debug output retained from the original script.
                print(DataFrame(Series(data=row)))
            rows.append(row)
        print('正在解析第%d页...' % page_no)

    # Build the frame once; this also works when no rows were collected.
    totaldata = DataFrame(rows)
    totaldata.to_excel('output.xls', sheet_name='sheet1')


if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    position = read_line('请输入你要爬取的职位')
    city = read_line('请输入你要爬取的城市')
    LaGouSpiderWithKeyWord(position, city)
0 0