Scraping job postings from major recruitment sites with Python


Python scrapers for several major Chinese recruitment sites (Lagou, Liepin, 51job, ChinaHR); each script collects posting details and writes them into a local MySQL database. Source code kept here for my own reference.

1、Lagou (拉勾)

from bs4 import BeautifulSoup
import requests
import urllib
import queue
import re
import pymysql
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains

# search parameters: keyword, job type, city (URL-encoded for the query string)
what1 = '数据挖掘'
what2 = '全职'
what3 = '北京'
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)

driver = webdriver.PhantomJS()
# driver = webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)

# collect the job-detail links from the first result page
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')

que = queue.Queue()
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))

# page through the next few result pages and queue their links as well
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
    print(times)
    if times == 3:
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)

# a second browser instance visits each queued detail page and extracts the fields
driver2 = webdriver.PhantomJS()
# driver2 = webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
while not que.empty():
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        company = job_info.find('div', class_='company')
        reg1 = re.compile("<[^>]*>")  # strips HTML tags from prettify() output
        # company / department
        company = reg1.sub('', company.prettify())
        # job title
        job = job_info.find('span', class_='name')
        reg2 = re.compile("<[^>]*>")
        job = reg2.sub('', job.prettify()).strip('\n')
        # salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            reg3 = re.compile("<[^>]*>")
            new_in = reg3.sub('', i.prettify())
            all_info.append(new_in)
        salary = all_info[0]
        mod = re.compile('/')
        salary = mod.sub('', salary).strip('\n')
        address = all_info[1]
        address = mod.sub('', address).strip('\n')
        exp = all_info[2]
        exp = mod.sub('', exp).strip('\n')
        edu = all_info[3]
        edu = mod.sub('', edu).strip('\n')
        # job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        # job perks
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        reg4 = re.compile("<[^>]*>")
        job_lu = reg4.sub('', job_lu.prettify())
        # responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        reg5 = re.compile("<[^>]*>")
        job_res = str(reg5.sub('', job_res.prettify()).strip('\n').strip())
        # work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        reg6 = re.compile("<[^>]*>")
        job_ad = reg6.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        # company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        # company category, funding stage, size
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            reg7 = re.compile("<[^>]*>")
            new_in = reg7.sub('', i.prettify())
            all_info2.append(new_in)
        com_cat = all_info2[0].strip('\n')
        # strip the field labels ("领域", "发展阶段", "规模") from the values
        lingyu = '领域'
        dev = '发展阶段'
        gui = '规模'
        a1 = re.compile(lingyu)
        a2 = re.compile(dev)
        a3 = re.compile(gui)
        com_cat = a1.sub('', com_cat).strip()
        com_qua = all_info2[1].strip('\n')
        com_qua = a2.sub('', com_qua).strip()
        com_peo = all_info2[-2].strip('\n')
        com_peo = a3.sub('', com_peo).strip()
        # write one row per posting into MySQL
        db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
        db.encoding = 'utf-8'
        cursor = db.cursor()
        cursor.execute('set names utf8')
        sql = "INSERT INTO lagou_wajue (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
        cursor.execute(sql % (job, com_name, address, com_cat, com_qua, com_peo, exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except:
        print('该页面无法获取')

driver.close()
driver2.close()
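
The INSERT above assumes a lagou_wajue table already exists in the test database. The original post never shows its schema, so here is a minimal sketch inferred from the column names in the INSERT; the column types and lengths are my assumptions.

import pymysql

# Hypothetical DDL for the lagou_wajue table; names come from the INSERT above,
# types and lengths are guesses. Credentials mirror the ones used in the script.
ddl = """
CREATE TABLE IF NOT EXISTS lagou_wajue (
    id       INT AUTO_INCREMENT PRIMARY KEY,
    job_name VARCHAR(100),
    com_name VARCHAR(100),
    com_addr VARCHAR(100),
    com_cat  VARCHAR(100),
    com_qua  VARCHAR(100),
    com_peo  VARCHAR(100),
    exp1     VARCHAR(50),
    edu      VARCHAR(50),
    salary   VARCHAR(50),
    com_resp TEXT
) DEFAULT CHARSET=utf8
"""

db = pymysql.connect(host='localhost', user='root', password='xin123456789', db='test', charset='utf8')
cursor = db.cursor()
cursor.execute(ddl)
db.commit()
cursor.close()
db.close()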

2、Liepin (猎聘)

2、1 Downloading the job links

from bs4 import BeautifulSoup
import requests
import urllib
import pymysql
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains

# search keyword, URL-encoded for the query string
what1 = '数据分析'
what1 = urllib.parse.quote(what1)

driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
for page in range(5):
    url = 'https://www.liepin.com/zhaopin/?pubTime=&ckid=5ac323b614701474&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=070020&industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=%s&clean_condition=&headckid=5ac323b614701474&curPage=%d' % (what1, page)
    # url = 'https://www.liepin.com/bj/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=%s' % what1
    # driver = webdriver.PhantomJS()
    driver.get(url)
    driver.implicitly_wait(100)
    # each result row links to its detail page from the <h3> element
    links = driver.find_elements_by_xpath("//div[@class='job-info']/h3")
    w = open('e:/myurl2.txt', 'a', encoding='utf-8')
    for link in links:
        final = link.find_element_by_xpath("./a")
        print(final.get_attribute('href'))
        w.write(final.get_attribute('href') + '\n')
    w.close()
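
Because e:/myurl2.txt is opened in append mode, rerunning the collector leaves duplicate URLs in the file. A small clean-up sketch (not part of the original) that de-duplicates the file before the detail pass in 2、2:

# De-duplicate the collected links while preserving their order.
seen = set()
unique_urls = []
with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
    for line in f:
        url = line.strip()
        if url and url not in seen:
            seen.add(url)
            unique_urls.append(url)

with open('e:/myurl2.txt', 'w', encoding='utf-8') as f:
    for url in unique_urls:
        f.write(url + '\n')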

2、2 Scraping the job details

from bs4 import BeautifulSoup
import requests
import urllib
import pymysql
import re
import queue
from time import sleep
import threading
from threading import current_thread, Lock
import multiprocessing
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains


class MyThread(threading.Thread):
    """Thin wrapper that runs funcs(*args) in its own thread."""
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    # pull detail-page URLs off the queue until it is empty
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='title-info')
            company = job_info.find('h3').find('a')
            reg1 = re.compile("<[^>]*>")  # strips HTML tags from prettify() output
            # company name
            company = reg1.sub('', company.prettify()).strip('\n').strip()
            print(company)
            # job title
            job = job_info.find('h1')
            reg2 = re.compile("<[^>]*>")
            job = reg2.sub('', job.prettify()).strip('\n')
            print(job)
            # salary, location, experience, education
            job_req = bs2.find('div', class_='job-title-left')
            salary = job_req.p.contents[0].strip()
            com_addr = job_req.find('p', class_='basic-infor').find('span').find('a').text
            qua = job_req.find('div', class_='job-qualifications')
            need = []
            for i in qua.find_all('span'):
                need.append(i.text)
            edu = need[0]
            exps = need[1]
            print(edu)
            print(exps)
            print(com_addr)
            print(salary)
            # job description
            response = bs2.find('div', class_='job-item main-message').find('div', class_='content content-word')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', response.prettify()).strip('\n').strip()
            print(job_res)
            # company category, size, funding stage
            com_info = bs2.find('div', class_='company-infor').find('ul').find_all('li')
            infom = []
            for i in com_info:
                infom.append(i.text)
                print(i.text)
            com_cat = infom[0].strip('\n').strip()
            com_peo = infom[1]
            com_qua = infom[2]
            sleep(1)
            # write one row per posting into MySQL
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            db.encoding = 'utf-8'
            cursor = db.cursor()
            cursor.execute('set names utf8')
            sql = "INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
            cursor.execute(sql % (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except:
            print('页面发生错误')


def main():
    # load the detail-page URLs collected in step 2、1
    w = open('e:/myurl2.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
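
The MyThread wrapper is defined above but never used in main(). A rough sketch of how it could split the queue across several browser instances; the thread count and the PhantomJS choice are assumptions, and the empty()/get() pair inside getcontent can race between threads, which is acceptable for a quick script but not airtight.

def main_threaded(n_threads=3):
    # hypothetical variant of main(), not in the original post
    que = queue.Queue()
    with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                que.put(line.strip())
    threads = []
    for i in range(n_threads):
        driver = webdriver.PhantomJS()  # one browser per worker thread
        t = MyThread(getcontent, (que, driver), name='worker-%d' % i)
        threads.append(t)
        t.start()
    for t in threads:
        t.join()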

3、51job (前程无忧)

from bs4 import BeautifulSoup
import requests
import urllib
import pymysql
import re
import queue
from time import sleep
import threading
from threading import current_thread, Lock
import multiprocessing
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains


class MyThread(threading.Thread):
    """Thin wrapper that runs funcs(*args) in its own thread."""
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    # pull detail-page URLs off the queue until it is empty
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='cn')
            company = job_info.find('p', class_='cname').find('a')
            # company name
            company = company.get('title')
            # job title
            job = job_info.find('h1')
            job = job.get('title')
            # salary, location, company quality, size, category
            com_addr = job_info.find('span', class_='lname').text
            salary = job_info.find('strong').text
            com_all = job_info.find('p', class_='msg ltype').text.strip('\t').strip('\n').split('|')
            com_qua = com_all[0].strip('\n').strip()
            com_peo = com_all[1].strip('\n').strip()
            com_cat = com_all[2].strip('\n').strip()
            print(com_qua)
            # experience and education; reset to None if the page has no valid value
            job_main = bs2.find('div', class_='tCompany_main')
            info_all = []
            for i in job_main.find_all('span', class_='sp4'):
                info_all.append(i.text)
            exps = info_all[0].strip()
            edu = info_all[1].strip()
            if '经验' not in exps:
                exps = None
            if edu not in ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']:
                edu = None
            # job description; drop the "分享"/"举报" widget text left in the block
            job_res = job_main.find('div', class_='bmsg job_msg inbox')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            reg4 = re.compile('分享')
            reg5 = re.compile('举报')
            job_res = reg4.sub('', job_res).strip('\n').strip()
            job_res = reg5.sub('', job_res).strip('\n').strip()
            # write one row per posting into MySQL
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            db.encoding = 'utf-8'
            cursor = db.cursor()
            cursor.execute('set names utf8')
            sql = "INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
            cursor.execute(sql % (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except:
            print('页面发生错误')


def main():
    # load the previously collected detail-page URLs
    w = open('e:/myurl10.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
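
All of these scripts build the INSERT with Python %-formatting, which breaks as soon as a job description contains a quote character (one reason pages end up in the except branch) and is open to SQL injection. A safer variant, not in the original, is to let pymysql quote the values; a sketch of a write helper the scrapers could call instead:

import pymysql

def save_row(row):
    """row: 10-tuple (job, company, com_addr, com_cat, com_qua, com_peo,
    exps, edu, salary, job_res), matching the columns of the lagou table."""
    db = pymysql.connect(host='localhost', user='root', password='xin123456789', db='test', charset='utf8')
    try:
        cursor = db.cursor()
        sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
               "com_peo,exp1,edu,salary,com_resp) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, row)   # pymysql escapes and quotes each value
        db.commit()
        cursor.close()
    finally:
        db.close()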

4、ChinaHR (中华英才)

from bs4 import BeautifulSoup
import requests
import urllib
import pymysql
import re
import queue
from time import sleep
import threading
from threading import current_thread, Lock
import multiprocessing
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains


class MyThread(threading.Thread):
    """Thin wrapper that runs funcs(*args) in its own thread."""
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getcontent(que, driver):
    # pull detail-page URLs off the queue until it is empty
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='base_info')
            # job title
            job = job_info.find('div').find('h1').find('span').text
            # salary, location, education, experience
            min_info = job_info.find('div', class_='job_require')
            all_in = []
            for i in min_info.find_all('span'):
                print(i.text)
                all_in.append(i.text)
            print(all_in)
            salary = all_in[0].strip()
            com_addr = all_in[1].strip()
            edu = all_in[3].strip()
            exps = all_in[4].strip()
            # job description
            job_main = bs2.find('div', class_='job_intro_wrap')
            job_res = job_main.find('div', class_='job_intro_info')
            reg3 = re.compile("<[^>]*>")  # strips HTML tags from prettify() output
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            # company name
            com_intro = bs2.find('div', class_='job-company jrpadding')
            company = com_intro.find('h4').find('a').text.strip()
            print(company)
            # company info table: the second <td> of each row holds the value
            com_info = com_intro.find('tbody').find_all('tr')
            com_s = []
            for i in com_info:
                times = 0
                for j in i.find_all('td'):
                    times += 1
                    if times == 2:
                        com_s.append(j.text)
            com_cat = com_s[0].strip()
            com_qua = com_s[2].strip()
            com_peo = com_s[1].strip()
            print(job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res)
            sleep(1)
            # write one row per posting into MySQL
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            db.encoding = 'utf-8'
            cursor = db.cursor()
            cursor.execute('set names utf8')
            sql = "INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
            cursor.execute(sql % (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except:
            print('页面发生错误')


def main():
    # load the previously collected detail-page URLs
    w = open('e:/myurl8.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))
    que = queue.Queue()
    for i in urls:
        que.put(i)
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
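
After a run it is easy to check how many rows each scraper actually landed. A quick sanity-check sketch (not in the original) against the two tables used above, lagou_wajue and lagou:

import pymysql

# Count rows per target table; credentials mirror the ones used in the scripts.
db = pymysql.connect(host='localhost', user='root', password='xin123456789', db='test', charset='utf8')
cursor = db.cursor()
for table in ('lagou_wajue', 'lagou'):
    cursor.execute('SELECT COUNT(*) FROM ' + table)
    print(table, cursor.fetchone()[0])
cursor.close()
db.close()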