[python]抓取网页的内容
来源:互联网 发布:mac ps无法退出全屏 编辑:程序博客网 时间:2024/04/30 01:31
#-*- coding: UTF-8 -*-
import urllib2

import BeautifulSoup


def getWebPage(url, usr=None, pwd=None):
    """Fetch a web page and return its markup pretty-printed by BeautifulSoup.

    Args:
        url: Complete URL of the page to download.
        usr: User name, used only when the page requires an account.
        pwd: Password, used only when the page requires an account.

    Returns:
        The formatted string content of the page, as produced by
        ``BeautifulSoup.prettify()``. The result is formatted the same way
        whether or not credentials were supplied.

    Errors raised by ``urllib2`` while opening or reading the URL are not
    caught here and propagate to the caller.
    """
    if not usr and not pwd:
        # No credentials given: a plain anonymous request is enough.
        response = urllib2.urlopen(url)
    else:
        # Register the credentials for this URL and open the page through
        # an opener that answers HTTP basic-auth challenges with them.
        # Passing None as the realm lets the credentials match any realm.
        pwdMgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pwdMgr.add_password(None, url, usr, pwd)
        handler = urllib2.HTTPBasicAuthHandler(pwdMgr)
        opener = urllib2.build_opener(handler)
        response = opener.open(url)

    # Read the body on a single shared path so both branches are handled
    # identically, and always release the connection.
    try:
        page = response.read()
    finally:
        response.close()

    return BeautifulSoup.BeautifulSoup(page).prettify()


# Demo: download a page and print its formatted markup.
url = 'http://www.csdn.net/'
print(getWebPage(url))