拉勾网爬虫_面向对象
来源:互联网 发布:微信o2o系统源码下载 编辑:程序博客网 时间:2024/06/05 22:52
在 Ajax 请求下抓取动态页面数据,关键在于构建 POST 请求头、模拟浏览器请求。
import requests
from lxml import etree
import json
import sys


class Lagou(object):
    """Spider for lagou.com job listings.

    The listing page is rendered via Ajax, so the actual job data is fetched
    by POSTing to the `positionAjax.json` endpoint with headers copied from a
    real browser session (cookies, Referer, X-Requested-With).

    NOTE(review): this script reads the module-level globals `city` and
    `position` (set from sys.argv in the __main__ block) inside
    get_combine_list() and save_data() — it only works when run as a script.
    """

    def __init__(self):
        # Headers mimic a logged-in browser session; the Cookie/Referer values
        # are session-specific and will expire — refresh them before running.
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Cookie': 'user_trace_token=20170920183457-58cc73d5-9def-11e7-9c29-525400f775ce; LGUID=20170920183457-58cc7899-9def-11e7-9c29-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=search_code; _gid=GA1.2.1674249864.1506152972; _ga=GA1.2.661938952.1505903691; LGRID=20170924124247-cffed120-a0e2-11e7-9278-5254005c3644; JSESSIONID=ABAAABAACDBABJB5CBA63393ECA49354BFB77C6B0BD0B5B; SEARCH_ID=4b82b54b5eab451392d2f73823a14a00',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99AB?px=default&city=%E6%88%90%E9%83%BD',
            # BUG FIX: header values must be strings — an int here makes
            # `requests` raise InvalidHeader (and a None value simply drops
            # the header, which is the documented way to omit it).
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': None,
        }
        self.base_url = 'https://www.lagou.com/jobs/list_%s?px=default&city=%s'

    def get_combine_list(self):
        """Fetch the listing page for (position, city) and scrape the total
        page count.

        Returns a list with one dict: {'city', 'position', 'total_page'},
        where 'total_page' is the raw xpath result (a list of text nodes —
        empty when the page is an anti-bot / no-result page).
        """
        combine_list = []
        url = self.base_url % (position, city)
        html = requests.get(url, headers=self.headers).content.decode()
        selector = etree.HTML(html)
        # Text of <span class="span totalNum"> holds the page count.
        total_page = selector.xpath('//span[@class="span totalNum"]/text()')
        combine_list.append({
            'city': city,
            'position': position,
            'total_page': total_page,
        })
        return combine_list

    def get_position_data(self, url, post_data):
        """POST one page query to the Ajax endpoint and extract job records.

        Returns a list of dicts with the fields of interest per position.
        Raises KeyError if the response is an anti-scrape error payload
        without content.positionResult.result.
        """
        data = requests.post(url, data=post_data, headers=self.headers).content
        json_data = json.loads(data.decode('utf-8'))
        position_data = json_data['content']['positionResult']['result']
        data_list = []
        for positions in position_data:
            # NOTE(review): 'creat_time' / 'positon' typos are preserved so
            # the output schema stays identical for downstream consumers.
            item = {
                'position_city': positions['city'],
                'creat_time': positions['createTime'],
                'company_name': positions['companyFullName'],
                'district': positions['district'],
                'education': positions['education'],
                'workyear': positions['workYear'],
                'salary': positions['salary'],
                'positon': positions['positionLables'],
            }
            data_list.append(item)
        return data_list

    def save_data(self, data_list):
        """Append each record as one JSON line (comma-terminated) to
        '<city>.xml'.

        NOTE(review): the '.xml' extension is misleading — the content is
        JSON lines — kept for output compatibility.
        """
        filename = city + '.xml'
        # encoding fixed to utf-8: the records contain non-ASCII text and
        # json.dumps(..., ensure_ascii=False) emits it raw.
        with open(filename, 'a', encoding='utf-8') as f:
            for record in data_list:
                f.write(json.dumps(record, ensure_ascii=False) + ',\n')

    def run(self):
        """Drive the crawl: resolve page count, then fetch and save every
        page of results for each (city, position) combination."""
        for combine in self.get_combine_list():
            target_city = combine['city']
            target_position = combine['position']
            pages = combine['total_page']
            # BUG FIX: the original iterated the xpath list to rebind
            # total_page to an int; with an empty list (anti-bot page) it
            # stayed a list and range() raised TypeError. Guard explicitly.
            if not pages:
                print('no result pages found for %s/%s — skipping'
                      % (target_city, target_position))
                continue
            total_pages = int(pages[0])
            json_url = ('https://www.lagou.com/jobs/positionAjax.json'
                        '?px=default&city=%s&needAddtionalResult=false&isSchoolJob=0')
            url = json_url % target_city
            for page in range(1, total_pages + 1):
                post_data = {
                    'first': 'true',  # presumably should be 'false' after page 1 — TODO confirm against the endpoint
                    'pn': str(page),
                    'kd': target_position,
                }
                data_list = self.get_position_data(url, post_data)
                self.save_data(data_list)


if __name__ == "__main__":
    # Usage: python lagou.py <city> <position>
    city = sys.argv[1]
    position = sys.argv[2]
    lagou = Lagou()
    lagou.run()
阅读全文
0 0
- 拉勾网爬虫_面向对象
- 面向对象_静态
- 面向对象_封装
- 面向对象_继承
- 面向对象_多态
- 面向对象_继承
- 面向对象_总结
- 面向对象_封装
- 面向对象_静态
- 面向对象_匿名对象
- 面向对象_上部总结
- 黑马程序员_面向对象
- 黑马程序员_面向对象
- 黑马程序员_面向对象
- 黑马程序员_面向对象
- 黑马程序员_面向对象
- 面向对象_内部类
- 面向对象_异常总结
- Java常用算法之堆排序
- selenium 问题定期总结
- 7种数据库事务传递性代码实例详解
- Python编程——Python基础知识之字典(三)
- 虚拟机性能监控和故障处理工具
- 拉勾网爬虫_面向对象
- Error creating bean with name 'sessionFactory' defined in class path resource
- JFreeChart的使用
- linux笔记
- 数据库分库分表中间件 Sharding-JDBC 源码分析 —— SQL 路由(一)分库分表配置
- Java CalendarApi
- static 详细分析
- 八数码问题有解条件&推广N×N,N×N×N
- 学习c++:(0)安装与教程