A Python spider for scraping job listings from Zhilian Zhaopin (智联招聘)

This spider runs into problems at the XPath calls... and, limited by my current skill level, I cannot get it working on my own; I would be grateful if more experienced readers could point out and correct the mistakes. (A diagnostic sketch for the XPath issue follows the code.)
# coding: utf-8
import Queue
import threading

import pymongo
import requests
from bs4 import BeautifulSoup as bs
from lxml import etree

from config import *  # expects MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
}


class Position_Spider(threading.Thread):
    """Worker thread: pulls search-result URLs off a shared queue and scrapes them."""

    def __init__(self, queue):
        # run() must take no arguments for threading.Thread to call it,
        # so the queue is handed over here instead.
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                url = self.queue.get_nowait()
            except Queue.Empty:
                break
            self.request(url)

    def request(self, url):
        # The original bound the response to "re", shadowing the re module;
        # renamed to resp.
        resp = requests.get(url=url, headers=HEADERS)
        if resp.status_code == 200:
            self.get_position(resp.content)
        else:
            print "open website failed"
            print resp.status_code

    def get_position(self, html):
        # The bold links on a search-result page lead to the job-detail pages.
        soup = bs(html, 'lxml')
        position_sites = soup.find_all(name="a", attrs={"style": "font-weight: bold"})
        for position_site in position_sites:
            print position_site.string
            resp = requests.get(url=position_site["href"], headers=HEADERS)
            if resp.status_code == 200:
                self.positions_detail(resp.content)
            else:
                print "open website failed"
                print resp.status_code

    def positions_detail(self, html):
        selector = etree.HTML(html)
        # These positional paths ("the 5th div child", etc.) are the fragile
        # part: one extra banner div shifts every index and the expressions
        # silently return empty lists.
        position_list = selector.xpath("//div[5]/div[1]/div[1]/h1/text()")
        company_list = selector.xpath("//div[5]/div[1]/div[1]/h2/a/text()")
        salary_list = selector.xpath("//div[6]/div[1]/ul/li[1]/strong/text()")
        city_list = selector.xpath("//div[6]/div[1]/ul/li[2]/strong/a/text()")
        record_list = selector.xpath("//div[6]/div[1]/ul/li[6]/strong/text()")
        experience_list = selector.xpath("//div[6]/div[1]/ul/li[5]/strong/text()")
        skill_list = selector.xpath("//div[@class='tab-inner-cont']/p[2]/span/text()")
        # zip() yields nothing as soon as any one list is empty, so a single
        # failed XPath makes the whole page look blank.
        for p, c, s, ci, r, e, sk in zip(position_list, company_list, salary_list,
                                         city_list, record_list, experience_list,
                                         skill_list):
            print p.encode('utf-8')
            print c.encode('utf-8')
            print s.encode('utf-8')
            print ci.encode('utf-8')
            print r.encode('utf-8')
            print e.encode('utf-8')
            print sk.encode('utf-8')
            result = {
                'position': p,
                'company': c,
                'salary': s,
                'city': ci,
                'record': r,
                'experience': e,
                'skill': sk,
            }
            self.save_to_mongo(result)  # was a bare save_to_mongo(): NameError

    def save_to_mongo(self, result):
        if db[MONGO_TABLE].insert(result):
            print "saved to mongo"
            return True
        return False


def main():
    # Build the work queue of search-result pages.
    queue = Queue.Queue()
    for i in range(1, 90):
        queue.put("http://sou.zhaopin.com/jobs/searchresult.ashx"
                  "?bj=160000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&isadv=0&p=" + str(i))
    # The original did threads.append(self.run(queue)), which ran the crawl
    # inline in the main thread, stored None, and then crashed on None.start().
    # Build real Thread objects instead; raise threads_count once one worker works.
    threads_count = 1
    threads = [Position_Spider(queue) for _ in range(threads_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
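Since the reported failure is in the XPath calls, the quickest way to localize it is to run every expression against a single job-detail page and count the matches: whichever field prints 0 is the one emptying the zip() above. Below is a minimal diagnostic sketch under that assumption; DETAIL_URL is a placeholder you must replace with a real detail link, and the diagnose() helper is mine, not part of the spider.

# coding: utf-8
# Minimal XPath diagnostic (assumes the same Python 2 / lxml setup as above).
# DETAIL_URL is a placeholder; paste in an actual job-detail URL.
import requests
from lxml import etree

DETAIL_URL = "http://example.com/some-job-detail"  # placeholder

EXPRESSIONS = {
    'position': "//div[5]/div[1]/div[1]/h1/text()",
    'company': "//div[5]/div[1]/div[1]/h2/a/text()",
    'salary': "//div[6]/div[1]/ul/li[1]/strong/text()",
    'city': "//div[6]/div[1]/ul/li[2]/strong/a/text()",
    'record': "//div[6]/div[1]/ul/li[6]/strong/text()",
    'experience': "//div[6]/div[1]/ul/li[5]/strong/text()",
    'skill': "//div[@class='tab-inner-cont']/p[2]/span/text()",
}


def diagnose(url):
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content
    selector = etree.HTML(html)
    for name, expr in EXPRESSIONS.items():
        matches = selector.xpath(expr)
        # Print the match count and the first hit (if any) for each field.
        print name, len(matches), (matches[0].encode('utf-8') if matches else '-')


if __name__ == '__main__':
    diagnose(DETAIL_URL)

Once the dead selector is identified, anchoring it on a class or id attribute (the way skill_list already uses @class='tab-inner-cont') is usually more robust than counting div children. I don't know the page's current class names, so the concrete replacement has to come from inspecting the live markup.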
