淘宝网页面解析

来源:互联网 发布:时间错乱 知乎 编辑:程序博客网 时间:2024/04/30 12:59

以下代码是我在个人网站项目中用于解析淘宝页面的。网站(fuckinstall.com)的主要功能是在后台整合几个搜索引擎的内容,并对结果进行相似度排序和聚类处理。顺带还做了个谷歌镜像的页面,前端是真心不太会……


#coding=utf8
"""Parse Taobao search-result pages into a list of simple result dicts."""
from ..common import crawlerTool as ct
from HTMLParser import HTMLParser  # output is unicode, which is awkward to handle downstream
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import traceback
import json


def _tag_source(urlinfo):
    """Append the marketplace suffix to ``title`` and set ``source`` from ``url``.

    The dict is modified in place; it must already hold ``url`` and ``title``.
    """
    if 'tmall' in urlinfo['url']:
        urlinfo['title'] = urlinfo['title'] + '-天猫'
        urlinfo['source'] = 'tmall'
    else:
        urlinfo['title'] = urlinfo['title'] + '-淘宝'
        urlinfo['source'] = 'taobao'


def _auction_to_urlinfo(segment):
    """Convert one entry of the ``itemlist`` auctions list into a result dict."""
    urlinfo = {}
    # NOTE(review): the URL is always built on detail.tmall.com, so the
    # 'tmall' check in _tag_source is always true for this layout and every
    # item is tagged as tmall. Kept as in the original; confirm it is intended.
    urlinfo['url'] = 'https://detail.tmall.com/item.htm?id=' + segment['nid']
    urlinfo['title'] = segment['raw_title']
    _tag_source(urlinfo)
    num = segment.get('view_sales', '0')
    price = segment["view_price"]
    urlinfo['info'] = '价格<em>%s</em>元 购买数量<em>%s</em>' % (price, num)
    urlinfo['imglink'] = segment["pic_url"]
    return urlinfo


def _spu_to_urlinfo(segment):
    """Convert one entry of the ``grid`` spus list into a result dict."""
    urlinfo = {}
    urlinfo['url'] = segment['url']
    urlinfo['title'] = segment['title']
    _tag_source(urlinfo)
    importantKey = segment['importantKey']
    price = segment["price"]
    urlinfo['info'] = '价格<em>%s</em>元 <em>%s</em> ' % (price, importantKey)
    urlinfo['imglink'] = segment["pic_url"]
    return urlinfo


# Extract the data we need
def process(keyword, page):
    """Fetch one Taobao search page and return its results.

    Args:
        keyword: search term, interpolated into the query string as-is
            (no URL-encoding is applied here).
        page: 1-based page number; the ``s`` query parameter is set to
            ``(page - 1) * 44`` (presumably 44 results per page - confirm).

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``source``,
        ``info`` and ``imglink``. The list is empty when the embedded
        ``g_page_config`` JSON cannot be found or parsed, or when it holds
        neither of the two known result layouts.
    """
    url = 'https://s.taobao.com/search?q=%s&s=%s' % (keyword, (page - 1) * 44)
    html = ct.crawlerTool.getPage(url)
    g_page_config = ct.crawlerTool.getRegex(r'g_page_config\s*=\s*(.*);', html)

    # Parse the embedded JSON once. The original parsed it twice, the second
    # time outside any try block, so an unparsable page crashed the caller.
    try:
        config = json.loads(g_page_config)
    except (TypeError, ValueError):
        traceback.print_exc()
        return []

    # Regular layout. Some queries do not return it (the original author
    # cites searching for "microwave oven"), so fall back to the grid layout.
    try:
        segments = config['mods']['itemlist']['data']['auctions']
    except (KeyError, TypeError):
        segments = []

    if segments:
        convert = _auction_to_urlinfo
    else:
        try:
            segments = config['mods']['grid']['data']['spus']
        except (KeyError, TypeError):
            traceback.print_exc()
            return []
        convert = _spu_to_urlinfo

    urlinfos = []
    for segment in segments or []:
        # Best effort per item: a malformed entry is reported and skipped.
        try:
            urlinfos.append(convert(segment))
        except Exception:
            traceback.print_exc()
    return urlinfos


def test():
    """Smoke test: fetch the first result page for the keyword ``python``."""
    # The original passed a full search URL as the only argument, which
    # raised TypeError because ``page`` was missing.
    return process("python", 1)