Python抓取华中大二手市场商品信息

来源:互联网 发布:通用规范汉字 软件 编辑:程序博客网 时间:2024/04/28 23:01

最近做项目需要获取一些商品信息,于是就写了个简单的脚本来抓取某电子商场。基本原理是发送request请求然后分析response文本信息,正则匹配想要的内容。

# coding=utf-8
# time:2014/4/29
# author:Li
# OS:windows
"""Scrape goods listings from the HUST second-hand market site.

The script walks the paginated index, collects every goods detail link,
extracts a few fields from each detail page with regular expressions and
appends one JSON-style record per item to ``ershou.txt`` in the current
working directory.

The code is written to run unchanged on Python 2.7 and Python 3.
"""
import io
import json
import os
import re

import requests

HOST_URL = "http://ershou.hustonline.net"
# The listing had 21 index pages when the script was written.
PAGE_COUNT = 21
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10

# Placeholder stored for every field that could not be extracted.
_MISSING = u"无"

# Patterns are compiled once here instead of once per downloaded page.
_LINK_RE = re.compile(r'ui-link-img[^>]+?href="(/goods/details/.+?)"')
_NAME_RE = re.compile(r'stock-info-name.+?>(.+?)</h3>')
_PRICE_RE = re.compile(r'stock-price.+?>(.+?)</span>')
_ATTR_RE = re.compile(r'stock-info-attr.+?>([^<]+?)</div>')

# Order in which the "stock-info-attr" blocks are mapped onto record keys.
_ATTR_KEYS = ("addr", "time", "Tags", "contact", "QQ")

# (label written to the file, key in the record dict), in output order.
_OUTPUT_FIELDS = (
    (u"名称", "name"),
    (u"价格", "price"),
    (u"交易地点", "addr"),
    (u"发布时间", "time"),
    (u"Tags", "Tags"),
    (u"联系人", "contact"),
    (u"QQ", "QQ"),
)


def _fetch(url):
    """Return the body of *url* decoded as UTF-8, or None if the request fails."""
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print("request failed: " + url)
        return None
    resp.encoding = 'utf-8'
    # ``text`` honours the encoding set above; ``content`` would be raw bytes.
    return resp.text


def _parse_good(html):
    """Extract one goods record from the text of a detail page.

    Fields that cannot be found keep the placeholder value, and each group
    of fields is parsed independently so one missing field does not discard
    the others.
    """
    good_info = {
        "name": _MISSING,
        "price": _MISSING,
        "addr": _MISSING,
        "time": _MISSING,
        "Tags": _MISSING,
        "contact": _MISSING,
        "QQ": _MISSING,
    }

    name_match = _NAME_RE.search(html)
    if name_match:
        good_info["name"] = name_match.group(1).strip()

    price_match = _PRICE_RE.search(html)
    if price_match:
        good_info["price"] = price_match.group(1).strip()

    attrs = _ATTR_RE.findall(html)
    # The attributes are identified purely by position, so they are only
    # trusted when all of them are present.
    if len(attrs) >= len(_ATTR_KEYS):
        for key, value in zip(_ATTR_KEYS, attrs):
            good_info[key] = value.strip()

    return good_info


def _format_record(good_info):
    """Serialise one record as a single ``{"label":"value",...}`` line.

    Values go through ``json.dumps`` so quotes and backslashes in scraped
    text are escaped instead of corrupting the record.
    """
    parts = [
        json.dumps(label, ensure_ascii=False) + ":" +
        json.dumps(good_info[key], ensure_ascii=False)
        for label, key in _OUTPUT_FIELDS
    ]
    return "{" + ",".join(parts) + "}\r\n"


def catch_ershou():
    """Scrape the HUST second-hand market and append the results to a file.

    Records are appended to ``ershou.txt`` in the current working directory.
    Pages that cannot be downloaded are reported and skipped; a failure to
    write the output file is reported but not raised.
    """
    # Build the URL of every index page.
    index_url = [
        HOST_URL + '/index/index/' + str(page) + '/all'
        for page in range(1, PAGE_COUNT + 1)
    ]

    # Find all the goods detail page links.
    links = []
    for url in index_url:
        page_text = _fetch(url)
        if page_text is None:
            continue
        links.extend(HOST_URL + path for path in _LINK_RE.findall(page_text))

    # Collect the information of every item.
    good_arr = []
    for link in links:
        print(link)
        page_text = _fetch(link)
        if page_text is None:
            # Nothing was downloaded, so there is no record to store.
            continue
        good_arr.append(_parse_good(page_text))

    print("total links:" + str(len(links)))

    # Write the result to the file.
    out_path = os.path.join(os.getcwd(), 'ershou.txt')
    try:
        # newline='' writes the explicit "\r\n" terminators verbatim instead
        # of letting the platform translate the "\n" a second time.
        with io.open(out_path, 'a', encoding='utf-8', newline='') as fp:
            fp.writelines(_format_record(good_info) for good_info in good_arr)
    except (IOError, OSError):
        print("write reasult in file failed!")

    print("all is done...")


def main():
    """Script entry point: run the scraper once."""
    catch_ershou()


if __name__ == '__main__':
    main()


0 0
原创粉丝点击