# 爬取拉勾网

# 来源:互联网 发布:linux安全加固脚本 编辑:程序博客网 时间:2024/06/08 01:36
# -*- coding: utf-8 -*-
# @Time    : 2017/8/29 15:14
# @Author  : z
# @File    : 拉勾网.py
# @Software: PyCharm
import json
import time
from urllib.parse import urlencode

import pandas
import requests
from bs4 import BeautifulSoup


class LaGou(object):
    """Collect job postings for one search keyword from Lagou's JSON search endpoint.

    Rows accumulate in ``self.list`` (header row first, then one row per
    posting) and the whole table is rewritten to ``gg.xls`` after every page.
    """

    # Column titles stored as the first row of the table.
    HEADER = ['公司', "福利", '地址', '岗位', '薪资', '发布时间', '学历', '工作经验']
    # Result pages requested by parse_html(): 1 through 19.
    PAGES = range(1, 20)
    # Seconds to wait before retrying a failed page request.
    RETRY_DELAY = 1
    # Seconds before a stalled HTTP request is abandoned (and then retried).
    TIMEOUT = 10

    def __init__(self, kd='python爬虫'):
        """Store the search keyword and the fixed request settings.

        Args:
            kd: Search keyword sent as the ``kd`` form field.
        """
        self.url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
                        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
                        'Host': 'www.lagou.com'}
        self.kd = kd
        self.list = []

    def parse_html(self):
        """Fetch every page in ``PAGES``, append its postings, and save after each page."""
        # Only add the header once, so calling parse_html() again on the same
        # instance does not insert a second header row in the middle of the data.
        if not self.list:
            self.list.append(list(self.HEADER))
        for page in self.PAGES:
            jobs = self._fetch_page(page)
            print('------------------------', page, '-------------------')
            for job in jobs:
                self.list.append(self._row_from_job(job))
            # Saved after every page so partial results survive an interrupted run.
            self.to_file(self.list)

    def _fetch_page(self, page):
        """Return the list of posting dicts for one result page, retrying until it succeeds."""
        self.data = {'kd': self.kd, 'pn': page, 'first': 'true'}
        while True:
            try:
                response = requests.post(self.url, headers=self.headers,
                                         data=self.data, timeout=self.TIMEOUT)
                payload = json.loads(response.text)
                return payload['content']['positionResult']['result']
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network failure, non-JSON body, or a reply without the expected
                # keys: wait and try the same page again. Unlike a bare
                # ``except:``, this lets Ctrl-C and SystemExit stop the script.
                time.sleep(self.RETRY_DELAY)

    @staticmethod
    def _row_from_job(job):
        """Build one table row, in ``HEADER`` column order, from a posting dict."""
        return [
            job['companyFullName'],
            # NOTE(review): presumably the label list can be null for some
            # postings; it is treated as "no labels" instead of crashing. Confirm.
            ','.join(job['companyLabelList'] or []),
            job['district'],
            job['positionName'],
            job['salary'],
            job['createTime'],
            job['education'],
            job['workYear'],
        ]

    def to_file(self, list=None):
        """Write rows to ``gg.xls`` in the current working directory.

        Args:
            list: Rows to write; defaults to ``self.list``. The parameter name
                shadows the builtin but is kept so keyword callers still work.
        """
        rows = self.list if list is None else list
        frame = pandas.DataFrame(rows)
        # NOTE(review): writing ``.xls`` depends on the installed pandas having
        # an xls writer engine available; confirm for the target environment.
        frame.to_excel('gg.xls')
if __name__ == '__main__':
    # Run the scraper with the default keyword when executed as a script.
    LaGou().parse_html()
# 原创粉丝点击