python爬虫系列二

来源:互联网 发布:java软件工程师培训班 编辑:程序博客网 时间:2024/06/05 02:40
# encoding: utf-8from bs4 import BeautifulSoupimport urllib2import redef header (url):    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'    header = {'User-Agent':'user_agent'}    re = urllib2.Request(url,headers=header)    respone = urllib2.urlopen(re,'html.parser')    htmlcode = respone.read()    return htmlcodedef file_save(filename,text):    f = open(filename,'w')    f.write(text)    f.close()def load_url(url,begin_page,end_page):    #拼接url    for i in range(begin_page,end_page+1):        zcurl = url + str(i)        zc_html_code = header(zcurl)        st_html(zc_html_code)def st_html(text):    soup = BeautifulSoup(text,'html.parser')    find_html = soup.find_all(attrs={'target':'_blank'})    for i in find_html:        print i.get_text()
重点:
print i.get_text()----获取i里面的string字符串
print i['href']---可以获取url
原创粉丝点击