Python Web Data Scraping (xpath version)
Changes in this version:
(1) Fetched HTML pages are now cached in SQLite, which greatly speeds up reprocessing the data: the first run takes about 6 hours, but later runs finish in roughly 3 minutes.
(2) xpath replaces the regular expressions previously used for HTML parsing. xpath makes locating elements much simpler and more convenient, and it automatically repairs malformed HTML. xpath is really powerful!
(3) Duplicate results are now removed.
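To make point (2) concrete, here is a minimal, self-contained sketch (not part of the scraper below; the broken markup is made up for illustration) of how lxml tolerates malformed input: etree.HTML silently closes the unclosed tags and wraps the fragment in html/body, so the xpath query still finds every cell.

# coding:utf-8
# Minimal illustration: lxml repairs broken markup before xpath runs against it.
from lxml import etree

broken = "<table class='n_table'><tr><td>Toronto<td>Canada</table>"  # unclosed <tr>/<td>
tree = etree.HTML(broken)   # lxml closes the tags and adds <html><body> around the fragment
for td in tree.xpath("//table[@class='n_table']//td"):
    print td.text           # prints "Toronto" then "Canada"

The full script follows.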
# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import re
import csv
import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3
from lxml import etree

try:
    import cPickle as pickle
except ImportError:
    import pickle

conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()

# if the htmls table does not exist, create it
curs.execute('''CREATE TABLE if not exists htmls
                (url VARCHAR(255) UNIQUE, content TEXT, size INTEGER);''')
conn.commit()


def serialize(value):
    """Convert an object to a compressed pickled string to save in the db."""
    # return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    return value


def deserialize(value):
    """Convert a compressed pickled string from the database back into an object."""
    # return pickle.loads(zlib.decompress(value)) if value else value
    return value


def gethtml(url):
    '''Fetch the target html, consulting the SQLite cache first.'''
    try:
        # look up html_cache.db first
        curs.execute("select * from htmls where url=?;", (url,))
        row = curs.fetchone()
        if row:  # cache hit
            return deserialize(str(row[1]))

        response = urllib2.urlopen(url)
        result = response.read()
        # cache the freshly fetched page in html_cache.db
        curs.execute("insert into htmls values(?,?,?);",
                     (url, serialize(result), len(result)))
        conn.commit()
        print "saved %s into html_cache.db" % url
        return result
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'Failed to reach a server.'
            print 'Reason: ', e.reason
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print 'Error code: ', e.code
        return 'None'
# end def gethtml


def regexmatch(rule, text):
    '''Return a list of all substrings of text matched by rule.'''
    p = re.compile(rule)
    return p.findall(text)
# end def regexmatch


def decodeHtmlEntity(s):
    '''Normalize a text node: drop undecodable bytes and non-breaking spaces.'''
    if not s:
        return ''
    import locale
    enc = locale.getdefaultlocale()[1]
    return s.decode(enc, "ignore").encode(enc).replace("\xc2\xa0", " ")
# end def decodeHtmlEntity


# final result
dining_db = []
total = 0

# debug switch: stop after 10 records when set
debug = 0

print 'Fetching html from http://menupalace.com ...'
html = gethtml('http://menupalace.com')

if html == '' or html == 'None':
    print "Can't get the html from http://menupalace.com"
    sys.exit()

try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    f = open("log.txt", "a")
    f.write(html)
    f.close()
    print "error resolving the html from http://menupalace.com"
    sys.exit()

for node in nodes:
    if debug and total >= 10:
        break

    # the country name is the tail text of the flag image
    n = node.xpath("./tr[1]/td[1]/img")
    country = ""
    if len(n) > 0:
        country = decodeHtmlEntity(n[0].tail).strip()

    # walk every city link in this country's table
    ls = node.xpath(".//a")
    for l in ls:
        if debug and total >= 10:
            break

        city = decodeHtmlEntity(l.text).strip()
        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"

        html = gethtml(link)
        if html == '' or html == 'None':
            print "Can't get the html from " + link
            continue
        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                f = open("log.txt", "a")
                f.write(html)
                f.close()
                print "error resolving the html from " + link
                sys.exit()
            else:
                continue

        for sn in subnodes:
            if debug and total >= 10:
                break
            sls = sn.xpath(".//a")
            for sl in sls:
                if debug and total >= 10:
                    break
                link = prelink + "restaurants/" + sl.get("href")
                print 'Fetching html from ' + link + ' ...'
                html = gethtml(link)
                if html == '' or html == 'None':
                    print "Can't get the html from " + link
                    continue
                try:
                    sstree = etree.HTML(html)
                    ssnodes = sstree.xpath("//table[@width='94%'][@height='80px']")
                except:
                    if debug:
                        f = open("log.txt", "a")
                        f.write(html)
                        f.close()
                        print "error resolving the html from " + link
                        sys.exit()
                    else:
                        continue

                for ssn in ssnodes:
                    if debug and total >= 10:
                        break

                    # name
                    n = ssn.xpath(".//tr[1]/td[1]/a[1]")
                    name = ''
                    if len(n) > 0:
                        name = decodeHtmlEntity(n[0].text).strip()

                    # address: one restaurant may list several locations
                    n = ssn.xpath(".//tr[2]/td[1]")
                    address_arr = []
                    address = ''
                    state = ''
                    if len(n) > 0:
                        address = decodeHtmlEntity(n[0].text)
                        if address.strip() == 'Various Locations':
                            # the real addresses are in a span, separated by <br> tags
                            n = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]")
                            if len(n) > 0:
                                address = decodeHtmlEntity(n[0].text)
                                addrlist = address.split()
                                if len(addrlist) > 4:
                                    state = addrlist[-2]
                                    city = addrlist[-3]
                                    # remove state, city and zip from the address
                                    address = address.replace(state, '')
                                    address = address.replace(city, '')
                                    address = address.replace(addrlist[-1], '')
                                    address = address.strip()
                                address_arr.append((address, city, state))
                                brn = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]/br")
                                for n in brn:
                                    address = decodeHtmlEntity(n.tail)
                                    addrlist = address.split()
                                    if len(addrlist) > 4:
                                        state = addrlist[-2]
                                        city = addrlist[-3]
                                        # remove state, city and zip from the address
                                        address = address.replace(state, '')
                                        address = address.replace(city, '')
                                        address = address.replace(addrlist[-1], '')
                                        address = address.strip()
                                    address_arr.append((address, city, state))
                            else:
                                address_arr.append(('', '', ''))
                        else:
                            addrlist = address.split()
                            if len(addrlist) > 3:
                                state = addrlist[-1]
                                city = addrlist[-2]
                                # remove state and city from the address
                                address = address.replace(state, '')
                                address = address.replace(city, '')
                                address = address.strip()
                            address_arr.append((address, city, state))

                    # website
                    website = ''
                    n = ssn.xpath(".//tr[3]/td[1]/a[1]")
                    if len(n) > 0:
                        website = decodeHtmlEntity(n[0].text).strip()

                    if name and len(address) > 0:
                        for addr in address_arr:
                            dining = {}
                            dining['name'] = name
                            if addr[0] == 'Various Locations':
                                dining['address'] = ''
                            else:
                                dining['address'] = addr[0]
                            dining['city'] = addr[1]
                            dining['state'] = addr[2]
                            dining['country'] = country
                            dining['website'] = website
                            # avoid duplicates
                            if not (dining in dining_db):
                                dining_db.append(dining)
                                total = total + 1
                            if debug and total >= 10:
                                break

# close the database link
conn.close()

# print and save the final result
cf = open("scraping_result.csv", "w")
writer = csv.writer(cf)
writer.writerow(['name', 'address', 'city', 'state', 'country', 'website'])
for item in dining_db:
    writer.writerow([item['name'], item['address'], item['city'],
                     item['state'], item['country'], item['website']])
cf.close()
print 'The result has been saved into scraping_result.csv!'
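After the first (slow) run has populated the cache, it can be sanity-checked with a few lines of Python. The snippet below is a hypothetical helper, not part of the original script; it only assumes the htmls(url, content, size) schema created above.

# coding:utf-8
# Hypothetical cache check: list the five largest cached pages in html_cache.db.
import sqlite3

conn = sqlite3.connect("html_cache.db")
curs = conn.cursor()
curs.execute("select url, size from htmls order by size desc limit 5;")
for url, size in curs.fetchall():
    print "%8d bytes  %s" % (size, url)
conn.close()

One design note on the deduplication: the membership test (if not (dining in dining_db)) scans the whole list for every new record, so each insert costs O(n). At this data volume that is fine, but if the scrape grew much larger, keeping a set of (name, address, city, state, country, website) tuples alongside the list would make the duplicate check constant-time.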