Python爬虫
来源:互联网 发布:怎么在淘宝找工作 编辑:程序博客网 时间:2024/05/14 10:23
之前用来搜职位的Python爬虫
# -*- coding:utf-8 -*-##GB18030import urllibimport urllib2import reimport osimport mathimport sqlite3##import sys##reload(sys)##sys.setdefaultencoding('utf8')import socketsocket.setdefaulttimeout(25)#int re module the () things must add \(\) oh#if you want to insert into database the content must use decode('GB18030').encode('utf8')x=1conn=sqlite3.connect("jobs.db")try : conn.execute("create table jobs(id int primary key,name text,wage text,comment text)")#conn.close()except Exception as e: print "create table",efor i in range(1, 5): url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=190200%2C00&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99&keyword=%E5%B5%8C%E5%85%A5%E5%BC%8F%E8%BD%AF%E4%BB%B6&keywordtype=1&curr_page=2&lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=01&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=1&fromType=14&dibiaoid=-1' user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = { 'User-Agent' : user_agent } try: request = urllib2.Request(url,headers = headers) response = urllib2.urlopen(request) content = response.read().decode('GB18030') #print content #pattern = re.compile('<li>.*<a href="/W/(.*?)" title="(.*?)" class="ah" target="_blank">'+ # '<img src="(.*?)"',re.S) pattern = re.compile('<a adid="" onmousedown="return AdsClick\(\)" href="(.*?)" onclick="zzSearch.acStatRecJob\( 1 \)',re.S) items = re.findall(pattern,content) #print items for item in items: #haveImg = re.search("img",item[3]) #if not haveImg: print i,item url = item request = urllib2.Request(url,headers = headers) response = urllib2.urlopen(request) content = response.read().decode('GB18030').encode('utf8') pattern = re.compile('<div style="padding-bottom:30px;">(.*?)</div>',re.S) newitems = re.findall(pattern,content) for newitem in newitems: print newitem.replace("<br>","\r\n") try : conn.execute("insert into jobs(id,name,wage,comment) values(%d,'%s','%s','%s')" % (x,item.decode('GB18030').encode('utf8'),'2',newitem.replace('<br>','\r\n'))) conn.commit() except Exception as e: print e x+=1 except urllib2.URLError, e: if hasattr(e,"code"): print e.code if hasattr(e,"reason"): print e.reason if isinstance(e.reason, socket.timeout): print e except socket.timeout, e: print e print "\n"retval = conn.execute("select * from jobs")for val in retval: print val[0] print val[1] print val[2] print val[3] conn.close()
0 0
- python爬虫-->爬虫基础
- [爬虫] Python爬虫技巧
- Python爬虫
- python 爬虫
- python 爬虫
- python 爬虫
- python爬虫
- Python爬虫
- Python爬虫
- python 爬虫
- Python爬虫
- python爬虫
- python 爬虫
- python 爬虫
- python爬虫
- python爬虫
- python爬虫
- python 爬虫
- Mac开发中NSUserDefaults用法解析
- 一直在用的通用makefile
- HDU2151——worm
- JSON要点
- collection hierarchy in java
- Python爬虫
- HDOJ 1872-稳定排序
- 教你如何迅速秒杀掉:99%的海量数据处理面试题
- Android 布局学习之——LinearLayout属性baselineAligned的作用及baseline
- leetcode-002
- 数据结构----串
- libevent的大框框
- 第三周项目4穷举法(3)
- 浅谈C++内联函数