HTML 解析库

来源:互联网 发布:java二叉树的镜像树 编辑:程序博客网 时间:2024/06/03 22:53

pyquery 使用

参考

from pyquery import PyQuery as pq
from lxml import etree

# Sample markup shared by the first two examples: a <div> wrapping two
# table rows, each with three cells.
SAMPLE_HTML = (
    '<div>'
    ' <tr class="item-0">'
    ' <td>first section</td> <td>1111</td> <td>17-01-28 22:51</td>'
    ' </tr>'
    ' <tr class="item-1">'
    ' <td>second section</td> <td>2222</td> <td>17-01-28 22:53</td>'
    ' </tr>'
    ' </div>'
)

# Four ways to create a PyQuery object.

# 1. From an already-parsed lxml element.
doc1 = pq(etree.fromstring(SAMPLE_HTML))

# 2. Directly from an HTML string.
doc2 = pq(SAMPLE_HTML)

# 3. From an HTML file on disk (a file named 'hello' must exist).
doc3 = pq(filename='hello')

# 4. From a URL (this performs a network request).
doc4 = pq(url='http://google.com')
# `doc` stands for any PyQuery object; here it reuses doc1 from the
# creation examples.
doc = doc1

doc('.class')     # select the elements whose class is "class"
doc('#id')        # select the element whose id is "id"
data = doc('tr')  # all <tr> elements, as a list-like PyQuery object

# items() is a method and must be called: it yields every matched element
# wrapped as its own PyQuery object.
for tr in doc('tr').items():
    # eq() is zero-based, so eq(2) picks the third <td> of the row.
    # text() is a method as well; without the call it prints the bound
    # method instead of the cell text.
    print(tr('td').eq(2).text())

doc('p').attr('id')  # value of the id attribute on the <p> tag
doc('p').find('#n')  # search inside the <p> elements for id "n"

beautifulsoup4

beautifulsoup4

同上

# BeautifulSoup practice
from bs4 import BeautifulSoup

# Sample document to parse.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse with the lxml parser backend.
soup = BeautifulSoup(html_doc, 'lxml')

# Pretty-print the parsed HTML.
print(soup.prettify())

# Text content of the <title> tag.
print(soup.title.string)

# Print the href of every <a> tag.
for link in soup.find_all('a'):
    print(link.get('href'))

# Print all the text in the document, with tags stripped.
print(soup.get_text())

原创粉丝点击