xpath爬取Python100例

来源：互联网 | 发布：mysql 任务 | 编辑：程序博客网 | 时间：2024/05/15 01:41
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     :2017/8/26 15:55
# @Author   :luoyu_bie
# @File     :python_100_example.py
# @Software :PyCharm Community Edition
"""Scrape the runoob.com "Python 100 examples" exercises into a text file.

Steps:
  1. Fetch the index page and collect the link of every exercise page.
  2. Fetch each exercise page and extract its title and description text.
  3. Write every (title, description) pair to the output file.

Both crawl stages are generators, so pages are downloaded lazily, one at a
time, while the output file is being written.
"""

# Used to write the output file as UTF-8 on both Python 2 and Python 3.
import codecs

import requests
from lxml import etree

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0'
headers = {"User-Agent": agent}

# Index page that links to every exercise.
url = "http://www.runoob.com/python/python-100-examples.html"

# requests applies no timeout unless one is given, so a stalled connection
# would otherwise block the crawl forever.
REQUEST_TIMEOUT = 10  # seconds

OUTPUT_FILE = '1.txt'


def _fetch_tree(page_url):
    """Download *page_url* and return the page parsed by ``etree.HTML``."""
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params`` (the query string), not the headers.
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    return etree.HTML(response.content)


def get_tag_list(url):
    """Yield the absolute URL of every exercise linked from the index page.

    :param url: URL of the index page listing the exercises.
    :return: generator of absolute exercise-page URLs.
    """
    tree = _fetch_tree(url)
    hrefs = tree.xpath('/html/body/div[3]/div/div[2]/div/div[3]/div/ul[1]/li/a/@href')
    for href in hrefs:
        # urljoin copes with root-relative, page-relative and absolute hrefs;
        # plain string concatenation only handled one of those forms.
        yield urljoin(url, href)


def get_text(url_list):
    """Yield a ``(title, content)`` tuple for each exercise page.

    :param url_list: iterable of exercise-page URLs.
    :return: generator of ``(title, content)`` tuples, where ``content`` is
        the text of the 2nd and 3rd ``<p>`` children of the ``#content``
        element joined with newlines.
    """
    for item in url_list:
        tree = _fetch_tree(item)
        # Text of the <h1> inside the #content element.
        titles = tree.xpath('//*[@id="content"]/h1/text()')
        if not titles:
            # The page does not have the expected layout; skip it instead of
            # aborting the whole crawl with an IndexError.
            continue
        # position() is the 1-based index of a <p> among its siblings.
        paragraphs = tree.xpath('//*[@id="content"]/p[position()>1 and position()<4]/text()')
        yield (titles[0], '\n'.join(paragraphs))


def main():
    """Crawl every exercise and write the results to ``OUTPUT_FILE``."""
    # Links of all exercise pages (lazy).
    url_list = get_tag_list(url)
    # Title and description of every exercise page (lazy).
    content_list = get_text(url_list)
    with codecs.open(OUTPUT_FILE, 'w', 'utf-8') as f:
        for title, content in content_list:
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('*' * 20 + '\n')


if __name__ == '__main__':
    main()
阅读全文
1 0