[python]抓取网页的内容

来源:互联网 发布:mac ps无法退出全屏 编辑:程序博客网 时间:2024/04/30 01:31
# -*- coding: UTF-8 -*-
# Fetch a web page (optionally behind HTTP Basic authentication) and print
# its markup after re-indenting it with BeautifulSoup.
# NOTE: this is Python 2 code (urllib2, BeautifulSoup 3).
from contextlib import closing

import urllib2

import BeautifulSoup


def getWebPage(url, usr=None, pwd=None):
    """Download ``url`` and return its markup prettified by BeautifulSoup.

    @param url: complete URL of the page to fetch.
    @param usr, pwd: credentials used when the page requires an account.
        When both are empty/None the page is requested without
        authentication; otherwise they are offered via HTTP Basic auth.
    @return: the formatted (prettified) string content of the page. Both
        the anonymous and the authenticated path return the same format.

    Errors raised by urllib2 while opening the URL (e.g. urllib2.URLError)
    are not caught here and propagate to the caller.
    """
    if not usr and not pwd:
        response = urllib2.urlopen(url)
    else:
        # Realm ``None`` registers the credentials for whatever realm the
        # server announces for this URL.
        pwdMgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pwdMgr.add_password(None, url, usr, pwd)
        handler = urllib2.HTTPBasicAuthHandler(pwdMgr)
        response = urllib2.build_opener(handler).open(url)

    # urllib2 responses are not context managers in Python 2, so wrap them
    # with closing() to make sure the connection is released.
    with closing(response) as stream:
        page = stream.read()

    # Prettify on every path so the return value matches the documented
    # contract (previously only the authenticated branch was formatted).
    return BeautifulSoup.BeautifulSoup(page).prettify()


url = 'http://www.csdn.net/'
# Parenthesised single-argument form prints identically under Python 2.
print(getWebPage(url))


原创粉丝点击