拉勾网爬虫_面向对象

来源:互联网 发布:微信o2o系统源码下载 编辑:程序博客网 时间:2024/06/05 22:52

ajax请求下,动态页面数据的抓取,主要是构建post请求头,模拟请求

import json
import sys

import requests
from lxml import etree


class Lagou(object):
    """Scrape job postings from lagou.com for one (position, city) pair.

    Lagou renders search results through an AJAX endpoint, so the scraper
    first loads the HTML search page to read the total page count, then
    POSTs to the JSON endpoint once per result page and appends each
    posting (one JSON object per line) to a local file named '<city>.xml'.

    NOTE(review): the hard-coded Cookie/session headers below expire; the
    site also rate-limits, so repeated runs may return an anti-bot page.
    """

    def __init__(self, city=None, position=None):
        # Accept city/position as constructor arguments; fall back to the
        # module-level globals set under __main__ so the original call
        # style ``Lagou()`` keeps working unchanged.
        self.city = city if city is not None else globals().get('city')
        self.position = position if position is not None else globals().get('position')
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Cookie': 'user_trace_token=20170920183457-58cc73d5-9def-11e7-9c29-525400f775ce; LGUID=20170920183457-58cc7899-9def-11e7-9c29-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=search_code; _gid=GA1.2.1674249864.1506152972; _ga=GA1.2.661938952.1505903691; LGRID=20170924124247-cffed120-a0e2-11e7-9278-5254005c3644; JSESSIONID=ABAAABAACDBABJB5CBA63393ECA49354BFB77C6B0BD0B5B; SEARCH_ID=4b82b54b5eab451392d2f73823a14a00',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99AB?px=default&city=%E6%88%90%E9%83%BD',
            # requests requires header values to be str/bytes; the original
            # used int 0 and None here, which raises InvalidHeader.
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': '',
        }
        self.base_url = 'https://www.lagou.com/jobs/list_%s?px=default&city=%s'

    def get_combine_list(self):
        """Fetch the HTML search page and return a list of crawl targets.

        Returns:
            list[dict]: one dict per (city, position) pair with keys
            'city', 'position' and 'total_page' (int, number of result
            pages reported by the page footer).
        """
        combine_list = []
        url = self.base_url % (self.position, self.city)
        html = requests.get(url, headers=self.headers).content.decode()
        selector = etree.HTML(html)
        pages = selector.xpath('//span[@class="span totalNum"]/text()')
        combine_list.append({
            'city': self.city,
            'position': self.position,
            # xpath returns a list of text nodes; an anti-bot/empty page
            # yields [], so default to a single page instead of crashing.
            'total_page': int(pages[0]) if pages else 1,
        })
        return combine_list

    def get_position_data(self, url, post_data):
        """POST one page query to the AJAX endpoint and extract postings.

        Args:
            url: the positionAjax.json URL with the city filled in.
            post_data: form fields {'first', 'pn', 'kd'} for the request.

        Returns:
            list[dict]: one flattened record per posting.
        """
        raw = requests.post(url, data=post_data, headers=self.headers).content
        json_data = json.loads(raw.decode('utf-8'))
        position_data = json_data['content']['positionResult']['result']
        data_list = []
        for positions in position_data:
            # Key spellings ('creat_time', 'positon') are kept from the
            # original output format so existing consumers still parse it.
            data_list.append({
                'position_city': positions['city'],
                'creat_time': positions['createTime'],
                'company_name': positions['companyFullName'],
                'district': positions['district'],
                'education': positions['education'],
                'workyear': positions['workYear'],
                'salary': positions['salary'],
                'positon': positions['positionLables'],
            })
        return data_list

    def save_data(self, data_list):
        """Append each record as a JSON line (trailing comma) to <city>.xml."""
        filename = self.city + '.xml'
        # Explicit utf-8: the records contain non-ASCII text and
        # ensure_ascii=False writes it verbatim.
        with open(filename, 'a', encoding='utf-8') as f:
            for data in data_list:
                f.write(json.dumps(data, ensure_ascii=False) + ',\n')

    def run(self):
        """Crawl every result page for the configured city/position."""
        json_url = ('https://www.lagou.com/jobs/positionAjax.json'
                    '?px=default&city=%s&needAddtionalResult=false&isSchoolJob=0')
        for combo in self.get_combine_list():
            url = json_url % combo['city']
            for page in range(1, combo['total_page'] + 1):
                post_data = {
                    # Lagou expects first=true only on the first page.
                    'first': 'true' if page == 1 else 'false',
                    'pn': str(page),
                    'kd': combo['position'],
                }
                self.save_data(self.get_position_data(url, post_data))


if __name__ == "__main__":
    # usage: python lagou.py <city> <position>
    if len(sys.argv) < 3:
        sys.exit('usage: python lagou.py <city> <position>')
    city = sys.argv[1]
    position = sys.argv[2]
    lagou = Lagou(city, position)
    lagou.run()