Scraping job postings from the major recruitment sites with Python
Python scrapers for the major Chinese job boards; source code kept here for my own reference.
1. Lagou (拉勾)
from bs4 import BeautifulSoup
import urllib.parse
import re
import queue
from time import sleep
from selenium import webdriver
import pymysql

# Search parameters: keyword, job type (full-time), city.
what1 = urllib.parse.quote('数据挖掘')
what2 = urllib.parse.quote('全职')
what3 = urllib.parse.quote('北京')

driver = webdriver.PhantomJS()
# driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
# Alternative entry point (unused):
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)

# Queue up the detail-page links from the first result page.
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')
que = queue.Queue()
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))

# Click through the next few result pages and queue their links too.
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
    print(times)
    if times == 3:
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)

driver2 = webdriver.PhantomJS()
# driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
while not que.empty():  # a Queue object is always truthy; test empty() instead
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        ### company / department
        company = job_info.find('div', class_='company')
        reg1 = re.compile("<[^>]*>")
        company = reg1.sub('', company.prettify())
        ### job title
        job = job_info.find('span', class_='name')
        reg2 = re.compile("<[^>]*>")
        job = reg2.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            reg3 = re.compile("<[^>]*>")
            all_info.append(reg3.sub('', i.prettify()))
        mod = re.compile('/')
        salary = mod.sub('', all_info[0]).strip('\n')
        address = mod.sub('', all_info[1]).strip('\n')
        exp = mod.sub('', all_info[2]).strip('\n')
        edu = mod.sub('', all_info[3]).strip('\n')
        ### job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### perks ("职位诱惑")
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        reg4 = re.compile("<[^>]*>")
        job_lu = reg4.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        reg5 = re.compile("<[^>]*>")
        job_res = str(reg5.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        reg6 = re.compile("<[^>]*>")
        job_ad = reg6.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company category, stage, size
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            reg7 = re.compile("<[^>]*>")
            all_info2.append(reg7.sub('', i.prettify()))
        # Strip the field labels: 领域 (field), 发展阶段 (funding stage), 规模 (headcount).
        a1 = re.compile('领域')
        a2 = re.compile('发展阶段')
        a3 = re.compile('规模')
        com_cat = a1.sub('', all_info2[0].strip('\n')).strip()
        com_qua = a2.sub('', all_info2[1].strip('\n')).strip()
        com_peo = a3.sub('', all_info2[-2].strip('\n')).strip()
        db = pymysql.connect(host='localhost', user='root', password='xin123456789',
                             database='test', charset='utf8')
        cursor = db.cursor()
        # Parameterized query so the driver handles quoting and escaping.
        sql = ("INSERT INTO lagou_wajue (job_name, com_name, com_addr, com_cat, com_qua, "
               "com_peo, exp1, edu, salary, com_resp) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo,
                             exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except Exception:
        print('could not fetch this page')

driver.close()
driver2.close()
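All four scrapers repeat the same text-extraction trick: pretty-print a BeautifulSoup node, then delete every tag with the `re.compile("<[^>]*>")` substitution. A minimal sketch of that pattern as a reusable helper (the name `strip_tags` is mine, not from the original scripts):

import re
from bs4 import BeautifulSoup

TAG_RE = re.compile(r'<[^>]*>')

def strip_tags(node):
    # Same approach as the scripts above: prettify, regex out the tags,
    # then trim the leftover newlines and spaces.
    return TAG_RE.sub('', node.prettify()).strip('\n').strip()

bs = BeautifulSoup('<div class="company"><a>ACME Inc.</a></div>', 'html.parser')
print(strip_tags(bs.find('div', class_='company')))  # -> ACME Inc.

BeautifulSoup's built-in `node.get_text(strip=True)` does essentially the same job without the regex, and copes better with stray `<` characters in the markup.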
2. Liepin (猎聘)
2.1 Collecting the job links
import urllib.parse
from selenium import webdriver

what1 = urllib.parse.quote('数据分析')  # search keyword
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
# driver = webdriver.PhantomJS()

for page in range(5):
    url = ('https://www.liepin.com/zhaopin/?pubTime=&ckid=5ac323b614701474&fromSearchBtn=2'
           '&compkind=&isAnalysis=&init=-1&searchType=1&dqs=070020&industryType=&jobKind='
           '&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=%s'
           '&clean_condition=&headckid=5ac323b614701474&curPage=%d') % (what1, page)
    # url = 'https://www.liepin.com/bj/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=%s' % what1
    driver.get(url)
    driver.implicitly_wait(100)
    links = driver.find_elements_by_xpath("//div[@class='job-info']/h3")
    # Append this page's detail links to a text file for step 2.2.
    w = open('e:/myurl2.txt', 'a', encoding='utf-8')
    for link in links:
        final = link.find_element_by_xpath("./a")
        print(final.get_attribute('href'))
        w.write(final.get_attribute('href') + '\n')
    w.close()
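The file is opened in append mode, so rerunning the collector leaves duplicate links behind. A small sketch (same file path as above) that step 2.2 could use to load the URLs while dropping duplicates in first-seen order:

import queue

seen = set()
que = queue.Queue()
with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
    for line in f:
        url = line.strip()
        if url and url not in seen:  # skip blanks and repeats
            seen.add(url)
            que.put(url)
print(que.qsize(), 'unique links queued')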
2.2 Scraping the job details
from bs4 import BeautifulSoup
from selenium import webdriver
import pymysql
import re
from time import sleep
import threading
import queue


class MyThread(threading.Thread):
    """Thin wrapper that runs a function with its arguments on a thread."""

    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test empty() instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='title-info')
            ### company
            company = job_info.find('h3').find('a')
            reg1 = re.compile("<[^>]*>")
            company = reg1.sub('', company.prettify()).strip('\n').strip()
            print(company)
            ### job title
            job = job_info.find('h1')
            reg2 = re.compile("<[^>]*>")
            job = reg2.sub('', job.prettify()).strip('\n')
            print(job)
            ### salary, location, experience, education
            job_req = bs2.find('div', class_='job-title-left')
            salary = job_req.p.contents[0].strip()
            com_addr = job_req.find('p', class_='basic-infor').find('span').find('a').text
            qua = job_req.find('div', class_='job-qualifications')
            need = []
            for i in qua.find_all('span'):
                need.append(i.text)
            edu = need[0]
            exps = need[1]
            print(edu)
            print(exps)
            print(com_addr)
            print(salary)
            ### responsibilities and requirements
            response = bs2.find('div', class_='job-item main-message').find('div', class_='content content-word')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', response.prettify()).strip('\n').strip()
            print(job_res)
            ### company category, size, nature
            com_info = bs2.find('div', class_='company-infor').find('ul').find_all('li')
            infom = []
            for i in com_info:
                infom.append(i.text)
                print(i.text)
            com_cat = infom[0].strip('\n').strip()
            com_peo = infom[1]
            com_qua = infom[2]
            sleep(1)
            db = pymysql.connect(host='localhost', user='root', password='xin123456789',
                                 database='test', charset='utf8')
            cursor = db.cursor()
            # Parameterized query so the driver handles quoting and escaping.
            sql = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, "
                   "com_peo, exp1, edu, salary, com_resp) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            print('error processing this page')


def main():
    # Load the links saved in step 2.1 into a queue.
    w = open('e:/myurl2.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        urls.append(i.strip())
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
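`MyThread` is defined above but never used; `main()` drives a single browser. A sketch, appended to the script above, of how it could fan `getcontent` out across several browsers sharing the one thread-safe queue (the worker count is my choice, not from the original):

def main_threaded(n_workers=3):
    # Build the queue exactly as main() does, then share it across workers.
    que = queue.Queue()
    with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
        for line in f:
            que.put(line.strip())
    threads = []
    for k in range(n_workers):
        driver = webdriver.PhantomJS()  # one browser per worker thread
        t = MyThread(getcontent, (que, driver), name='worker-%d' % k)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for the queue to drain

One caveat: the `empty()` check followed by a blocking `get()` in `getcontent` can leave a worker stuck on the last items when several threads race; `que.get(timeout=...)` inside `getcontent` would be more robust.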
3. 51job (前程无忧)
from bs4 import BeautifulSoup
from selenium import webdriver
import pymysql
import re
import threading
import queue


class MyThread(threading.Thread):
    """Thin wrapper that runs a function with its arguments on a thread."""

    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test empty() instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='cn')
            ### company
            company = job_info.find('p', class_='cname').find('a')
            company = company.get('title')
            ### job title
            job = job_info.find('h1')
            job = job.get('title')
            ### salary and location; nature / size / category come as "a | b | c"
            com_addr = job_info.find('span', class_='lname').text
            salary = job_info.find('strong').text
            com_all = job_info.find('p', class_='msg ltype').text.strip('\t').strip('\n').split('|')
            com_qua = com_all[0].strip('\n').strip()
            com_peo = com_all[1].strip('\n').strip()
            com_cat = com_all[2].strip('\n').strip()
            print(com_qua)
            ### experience and education
            job_main = bs2.find('div', class_='tCompany_main')
            info_all = []
            for i in job_main.find_all('span', class_='sp4'):
                info_all.append(i.text)
            exps = info_all[0].strip()
            edu = info_all[1].strip()
            # Null out the fields when a page omits or reorders them.
            if '经验' not in exps:
                exps = None
            if edu not in ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']:
                edu = None
            ### responsibilities and requirements
            job_res = job_main.find('div', class_='bmsg job_msg inbox')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            # Remove the "分享" (share) and "举报" (report) widget text.
            reg4 = re.compile('分享')
            reg5 = re.compile('举报')
            job_res = reg4.sub('', job_res).strip('\n').strip()
            job_res = reg5.sub('', job_res).strip('\n').strip()
            db = pymysql.connect(host='localhost', user='root', password='xin123456789',
                                 database='test', charset='utf8')
            cursor = db.cursor()
            sql = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, "
                   "com_peo, exp1, edu, salary, com_resp) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            print('error processing this page')


def main():
    w = open('e:/myurl10.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        urls.append(i.strip())
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
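Each stored row opens and closes its own MySQL connection. A sketch of holding one connection for the whole run instead (same credentials and table as above), trading a little robustness for far fewer round trips:

db = pymysql.connect(host='localhost', user='root', password='xin123456789',
                     database='test', charset='utf8')
cursor = db.cursor()
SQL = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, com_peo, "
       "exp1, edu, salary, com_resp) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

def save_row(row):
    # row is the 10-tuple built in getcontent(); committing per row keeps
    # at most one page's data at risk if the run dies mid-way.
    cursor.execute(SQL, row)
    db.commit()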
4. ChinaHR (中华英才)
from bs4 import BeautifulSoup
from selenium import webdriver
import pymysql
import re
from time import sleep
import threading
import queue


class MyThread(threading.Thread):
    """Thin wrapper that runs a function with its arguments on a thread."""

    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test empty() instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='base_info')
            ### job title
            job = job_info.find('div').find('h1').find('span').text
            ### salary, location, education, experience
            min_info = job_info.find('div', class_='job_require')
            all_in = []
            for i in min_info.find_all('span'):
                print(i.text)
                all_in.append(i.text)
            print(all_in)
            salary = all_in[0].strip()
            com_addr = all_in[1].strip()
            edu = all_in[3].strip()
            exps = all_in[4].strip()
            ### responsibilities and requirements
            job_main = bs2.find('div', class_='job_intro_wrap')
            job_res = job_main.find('div', class_='job_intro_info')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            ### company name and profile table
            com_intro = bs2.find('div', class_='job-company jrpadding')
            company = com_intro.find('h4').find('a').text.strip()
            print(company)
            # Each table row is label / value; keep the second <td> of every row.
            com_info = com_intro.find('tbody').find_all('tr')
            com_s = []
            for i in com_info:
                times = 0
                for j in i.find_all('td'):
                    times += 1
                    if times == 2:
                        com_s.append(j.text)
            com_cat = com_s[0].strip()
            com_qua = com_s[2].strip()
            com_peo = com_s[1].strip()
            print(job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res)
            sleep(1)
            db = pymysql.connect(host='localhost', user='root', password='xin123456789',
                                 database='test', charset='utf8')
            cursor = db.cursor()
            sql = ("INSERT INTO lagou (job_name, com_name, com_addr, com_cat, com_qua, "
                   "com_peo, exp1, edu, salary, com_resp) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            print('error processing this page')


def main():
    w = open('e:/myurl8.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        urls.append(i.strip())
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
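The company table above is walked with a manual `times` counter to pick out the second `<td>` (the value cell) of each row. The same extraction, as a sketch over the `com_intro` node from the script, indexing the cells directly:

# First <td> holds the label, second holds the value; skip malformed rows.
com_s = [tr.find_all('td')[1].text
         for tr in com_intro.find('tbody').find_all('tr')
         if len(tr.find_all('td')) >= 2]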