爬取智联招聘的招聘信息

来源：互联网 | 发布：网络恐怖漫画 | 编辑：程序博客网 | 时间：2024/04/25 18:08

#!/usr/bin/python
#encoding:utf-8
import requests
from bs4 import BeautifulSoup
import codecs
import xlwt
from xlutils.copy import copy
from xlrd import open_workbook
import os

class Spider():
    """Scrape Zhaopin job postings into an ``.xls`` spreadsheet.

    Creating an instance writes a fresh workbook containing only the title
    row to ``self.filename``.  :meth:`Zlzp` then requests one search result
    page and appends one spreadsheet row per posting it can parse.
    """

    def __init__(self):
        """Configure the search request and create the output workbook."""
        self.url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        }

        # Query-string parameters sent with the search request.
        # NOTE(review): 'ji' may be a typo for the site's location key
        # ('jl') -- confirm against the live search URL before changing it.
        self.data = {
            'ji': '上海',
            'kw': '大数据',
            'p': 1,
            'isadv': 0,
        }
        self.filename = './zlzp.xls'

        # Start every run from a workbook that holds only the title row.
        # NOTE(review): the last title is spelled 'data' although get_info
        # treats that field as a date; kept unchanged because it is output.
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet('sheet1')
        titles = ['position', 'company', 'salary', 'address', 'data']
        for col, title in enumerate(titles):
            sheet.write(0, col, title)
        workbook.save(self.filename)

    def Zlzp(self):
        """Fetch the search result page and store every posting found.

        Each posting's detail page is fetched through :meth:`get_info`.
        Postings whose detail page cannot be parsed are skipped; all others
        are printed and appended to the spreadsheet, which is saved after
        every row.
        """
        html = requests.get(self.url, headers=self.headers, params=self.data)
        soup = BeautifulSoup(html.text, 'html.parser')

        newlist = soup.find('div', {'class': 'newlist_list_content'})
        tables = newlist.findAll('table', {'class': 'newlist'})

        line = 1  # row 0 holds the column titles
        # The first table is skipped -- presumably it is the header of the
        # result list rather than a posting (TODO confirm against the page).
        for table in tables[1:]:
            link = str(table.find('a')['href'])
            info = self.get_info(link)
            if info is None:
                # get_info has already printed the reason; move on to the
                # next posting instead of failing on the tuple unpacking.
                continue
            print(b' '.join(info))
            self._append_row(line, [field.decode('utf-8') for field in info])
            line += 1

    def _append_row(self, line, values):
        """Write *values* into row *line* of the saved workbook and re-save it."""
        # xlutils.copy converts the workbook read by xlrd into an xlwt
        # workbook that can be modified and written back.
        workbook = copy(open_workbook(self.filename))
        sheet = workbook.get_sheet(0)
        for col, value in enumerate(values):
            sheet.write(line, col, value)
        # Saving writes over the existing file, so it is not removed first;
        # a failed save then cannot leave the run without any file on disk.
        workbook.save(self.filename)

    def get_info(self, link):
        """Fetch one posting's detail page and extract its key fields.

        Args:
            link: URL of the job detail page.

        Returns:
            A tuple ``(position, company, salary, address, data)`` of UTF-8
            encoded strings, or ``None`` (after printing the error) when an
            expected element is missing from the page.
        """
        html = requests.get(link, headers=self.headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        try:
            tfb = soup.find('div', {'class': 'top-fixed-box'})
            position = tfb.find('h1').text.encode('utf-8')  # job title
            company = tfb.find('h2').text.encode('utf-8')  # company name

            tpl = soup.find('div', {'class': 'terminalpage-left'})
            tuc = tpl.find('ul', {'class': 'terminal-ul clearfix'})
            lis = tuc.findAll('li')

            salary = lis[0].find('strong').text.encode('utf-8')  # salary
            address = lis[1].find('strong').text.encode('utf-8')  # location
            data = lis[2].find('strong').text.encode('utf-8')  # date
        except (AttributeError, IndexError) as e:
            # A missing element shows up as find() returning None or as a
            # list of <li> items that is too short.
            print(e)
            return None
        return position, company, salary, address, data

# Script entry point: build the spider (which creates the output workbook)
# and scrape the search result page.
if __name__ == '__main__':
    Spider().Zlzp()

阅读全文

0 0