PyQuery
来源:互联网 发布:快刀软件 编辑:程序博客网 时间:2024/05/21 07:13
PyQuery 崔老师爬虫系列课程学习笔记
安装
pip install pyquery
初始化方法
字符串初始化
# Initialise a PyQuery document directly from an HTML string and select
# the <ul> element with a tag selector.
from pyquery import PyQuery as pq

html = '''<div> <ul> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
print(doc('ul'))
URL初始化
# Initialise a PyQuery document by fetching a URL (requires network access)
# and print the <head> element of the downloaded page.
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
print(doc('head'))
文件初始化
# Initialise a PyQuery document from a local HTML file and print every <li>.
# NOTE: 'demo.html' is a relative path, so it is resolved against the
# current working directory.
from pyquery import PyQuery as pq

doc = pq(filename='demo.html')
print(doc('li'))
基本CSS选择器
# Basic CSS selectors: pick every <li> nested inside an element with class
# "list" that is itself nested inside the element with id "container".
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
print(doc('#container .list li'))
查找元素
子元素
# Finding child elements: find() versus children().
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
items = doc('.list')
print(type(items))
print(items)

# find() searches all descendants of the selection for the given selector.
lis = items.find('li')
print(type(lis))
print(lis)

# children() returns only the direct children of the selection.
# (Named `children` rather than `list` to avoid shadowing the builtin.)
children = items.children()
print(type(children))
print(children)

# children() also accepts a CSS selector to filter the direct children.
active_children = items.children('.active')
print(active_children)
父元素
# Finding the parent element: parent() returns the direct parent of the
# selection (here, the <div id="container"> that wraps the list).
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container))
print(container)
# Finding ancestor elements: parents() returns the ancestors of the
# selection, not just the direct parent.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
items = doc('.list')
# A CSS selector may be passed in the parentheses to filter the ancestors.
parents = items.parents()
print(type(parents))
print(parents)
兄弟元素
# Finding sibling elements of a selected node.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
# The selector must be a quoted string. A space means "descendant of";
# classes chained without a space must all be present on the same element.
# The class in the markup is "active" (not "activate").
items = doc('.list .item-0.active')
# Get the sibling elements of the selected <li>.
print(items.siblings())
遍历
单个元素
# Traversal, single element: a selector that matches exactly one node can
# be printed directly.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
item = doc('.item-0.active')
print(item)
# Traversal, multiple elements: items() yields each matched node as its
# own PyQuery object, so the result can be iterated with a for loop.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)
获取信息
获取属性
# Reading an attribute: attribute-style access and the call form are
# equivalent ways to read the href of the selected <a>.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
a = doc('.item-1.active a')
print(a)
print(a.attr.href)
print(a.attr('href'))
获取文本
# Reading text: text() returns the text content of the selection with the
# markup stripped.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())
<a href="link3.html"><span class="bold">third item</span></a>
third item
# Reading text from a page fragment: select the <title> element and print
# its text content.
from pyquery import PyQuery as pq

html = '<html><head><meta charset=utf-8><title>浙大美女校花 甜美神似张子萱</title><meta http-equiv=x-dns-prefetch-control content=on>'

doc = pq(html)
a = doc('title')
print(a.text())
浙大美女校花 甜美神似张子萱
获取HTML
# Reading HTML: html() returns the inner markup of the selected element.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
a = doc('.item-0.active')
print(a.html())
DOM操作
addClass、removeClass
# DOM manipulation: removing and re-adding a CSS class on a selected node.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

# Initialise the document.
doc = pq(html)
# Select the <li> that carries both the "item-1" and "active" classes.
li = doc('.item-1.active')
print(li)

# The selection is modified in place, so printing it shows each change.
li.removeClass('active')
print(li)
li.addClass('active')
print(li)
attr、css
# DOM manipulation: setting an attribute and an inline style on a node.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)
li = doc('.item-0.active')
print(li)

# attr(name, value) sets an attribute on the selected element.
li.attr('name', 'link')
print(li)

# css(property, value) sets an inline style on the selected element.
li.css('font-size', '14px')
print(li)
remove挺重要
# DOM manipulation: remove() deletes the matched nodes from the tree, which
# is handy for stripping unwanted children before extracting text.
from pyquery import PyQuery as pq

html = '''<div class="wrap">Hello, Word!<p> This is a paragraph.</p></div>'''

doc = pq(html)
a = doc('.wrap')
# Text of the wrapper including the nested <p>.
print(a.text())

# Drop the <p>, then print the wrapper's remaining text.
a.find('p').remove()
print(a.text())
其他DOM方法
伪类选择器
# Pseudo-class selectors supported by PyQuery.
from pyquery import PyQuery as pq

html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href=\"link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href=\"link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''

doc = pq(html)

# <li> elements that are the first child of their parent.
li = doc('li:first-child')
print(li)

# <li> elements that are the last child of their parent.
li = doc('li:last-child')
print(li)

# <li> elements that are the second child of their parent.
li = doc('li:nth-child(2)')
print(li)

# <li> elements whose zero-based index is greater than 2,
# i.e. the fourth item onward.
li = doc('li:gt(2)')
print(li)

# <li> elements at even positions (2nd, 4th, ...).
li = doc('li:nth-child(2n)')
print(li)

# <li> elements whose text contains "second".
li = doc('li:contains(second)')
print(li)
阅读全文
2 0
- pyQuery
- PyQuery
- PyQuery介绍
- pyquery安装
- 安装pyquery
- python pyquery
- pyquery安装
- pyquery小记
- pyquery ----用法
- 五. PyQuery
- 007 PyQuery
- PyQuery Tutorial: Basic HTML Parsing with PyQuery
- PyQuery读写html文件
- pyquery的用法
- Windows下安装pyquery
- python之pyquery 学习
- win7下面安装pyquery
- mac os安装pyquery
- C. Watching Fireworks is Fun---dp
- asp.net中“sender as object,e as EventArgs”的sender 与e是做什么用的?
- java的守护线程与非守护线程
- 设计模式之外观模式
- Redis解析之订阅与发布
- PyQuery
- [编程题] 魔力手环
- 最小生成树构造算法(普里姆算法,克鲁斯卡尔算法)
- mac os maverics里commend+tab切换程序不能打开被最小化的程序解决办法
- 关于无序数据快速查询 以及atoi和atof函数的简单实现
- IAR无法跳转问题解决
- Requests
- Django资源大全
- POJ1741 [Tree] 点分治