拉勾网

来源:互联网 发布:r2r梯形网络 编辑:程序博客网 时间:2024/04/28 18:08
import requests,re,time,random,os,pinyin
from bs4 import BeautifulSoup
# Scraper for job listings on lagou.com.
#
# Reads the category names from the home page's "mainNavs" block, then for
# each category requests up to MAX_PAGES listing pages and appends one line
# per listing to d://lg//<category>.txt.  A category whose output file already
# exists is skipped.

HOME_URL = 'http://www.lagou.com/'
LISTING_URL = 'http://www.lagou.com/zhaopin/%s/%d/?filterOption=%d'
OUTPUT_DIR = 'd://lg'
OUTPUT_PATH = 'd://lg//%s.txt'
MAX_PAGES = 30
# requests waits indefinitely unless a timeout is given.
REQUEST_TIMEOUT = 10

# Same character set the original alternation removed: / \ * > < ? : " |
FORBIDDEN_FILENAME_CHARS = re.compile(r'[/\\*><?:"|]')
WHITESPACE_RUN = re.compile(r'\s+')


def parse_category_names(nav_text):
    """Return the category names contained in the navigation text.

    The text is split on whitespace runs and the first two tokens are
    dropped, exactly as before.  Empty tokens (produced when the text ends
    with whitespace) are discarded so they do not become a bogus category.
    """
    tokens = WHITESPACE_RUN.split(nav_text)
    return [token for token in tokens[2:] if token]


def safe_filename(name):
    """Return *name* with the characters / \\ * > < ? : " | removed."""
    return FORBIDDEN_FILENAME_CHARS.sub('', name)


def output_path(name):
    """Return the output file path used for category *name*."""
    return OUTPUT_PATH % safe_filename(name)


def flatten_text(text):
    """Replace every whitespace run with ',' and drop leading commas."""
    return WHITESPACE_RUN.sub(',', text).lstrip(',')


def fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    return requests.get(url, timeout=REQUEST_TIMEOUT).content.decode('utf-8')


def scrape_category(name, path):
    """Append the listings of category *name* to the file at *path*.

    Pages are requested in order; the first page without any "p_top" div
    ends the category.
    """
    slug = pinyin.get(name, format="strip")
    for page in range(1, MAX_PAGES + 1):
        url = LISTING_URL % (slug, page, page)
        print(url)
        # Random pause before each request to avoid hammering the site.
        time.sleep(random.randint(1, 3))
        data = fetch_html(url)
        print('得到数据')
        soup = BeautifulSoup(data, 'lxml')
        tops = soup.find_all("div", "p_top")
        lefts = soup.find_all("div", "li_b_l")
        companies = soup.find_all("div", "company_name")
        if not tops:
            break
        lines = [
            flatten_text(top.text) + flatten_text(left.text)
            + flatten_text(company.text) + '\n'
            for top, left, company in zip(tops, lefts, companies)
        ]
        # Only touch the file when there is something to write, so an empty
        # page never creates a file that would mark the category as done.
        if lines:
            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding.
            with open(path, 'a', encoding='utf-8', errors='replace') as f:
                f.writelines(lines)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this pause ran once per page or once per record; it is
        # applied once per page here -- confirm.
        time.sleep(1)


def main():
    """Scrape every category listed in the home page navigation."""
    home = BeautifulSoup(fetch_html(HOME_URL), 'lxml')
    nav = home.find("div", "mainNavs")
    if nav is None:
        raise RuntimeError('no <div class="mainNavs"> found on %s' % HOME_URL)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for name in parse_category_names(nav.text):
        # The existence check uses the same sanitized path that is written
        # to, so a finished category is recognised on the next run.
        path = output_path(name)
        if os.path.isfile(path):
            print('已存在')
            continue
        scrape_category(name, path)


if __name__ == "__main__":
    main()
0 0