python+mongodb初级练习

来源:互联网 发布:linux操作系统下载 编辑:程序博客网 时间:2024/05/21 11:21
将百度百科词条抓取,然后解析html,将对应词条的url提取,然后以json方式存入mongodb
# -*- coding: utf-8 -*-
"""Crawl one Baidu Baike entry page and store its outgoing entry links in MongoDB.

Fetches the page, finds every ``<a>`` whose href matches ``/item/...``,
resolves each (possibly relative) href against the page URL, and inserts one
document per unique link into the ``mydb.set_craw`` collection using the fixed
schema ``{'name': <link text>, 'url': <absolute url>}``.

Fixed relative to the original script:
  * ported from Python 2 (urllib2 / urlparse / print statements) to Python 3;
  * replaced removed ``Collection.insert`` with ``insert_one``;
  * stores link text as a *value*, never as a document key — MongoDB rejects
    keys containing '.' or '$', which scraped page text often contains;
  * deduplicates by resolved URL (``set()`` over bs4 Tag objects hashes by
    identity and does not actually deduplicate equal links).
"""
import re
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from pymongo import MongoClient

# Entry page to crawl (URL-encoded article title).
START_URL = 'https://baike.baidu.com/item/%E8%B5%9B%E9%97%A8%E9%93%81%E5%85%8B'


def crawl(url=START_URL, mongo_host='127.0.0.1', mongo_port=27017):
    """Scrape *url* and persist its ``/item/`` links.

    Parameters
    ----------
    url : str
        Page to fetch; also the base for resolving relative hrefs.
    mongo_host, mongo_port : str, int
        MongoDB server to write to (database ``mydb``, collection ``set_craw``).

    Returns
    -------
    int
        Number of documents inserted.
    """
    # Fetch the raw HTML; the context manager guarantees the response is closed.
    with urlopen(Request(url)) as response:
        data = response.read()

    soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')

    # Collect unique (absolute-url -> link-text) pairs. Keying on the resolved
    # URL deduplicates repeated links to the same entry; last text wins.
    links = {}
    for anchor in soup.find_all('a', href=re.compile(r'/item/.*')):
        links[urljoin(url, anchor['href'])] = anchor.get_text()

    print(len(links))  # progress info, mirroring the original script's output

    conn = MongoClient(mongo_host, mongo_port)
    try:
        collection = conn.mydb.set_craw
        for absolute_url, text in links.items():
            collection.insert_one({'name': text, 'url': absolute_url})
    finally:
        conn.close()

    return len(links)


if __name__ == '__main__':
    crawl()

原创粉丝点击