BeautifulSoup的基本用法

来源:互联网 发布:网络棋牌信息 编辑:程序博客网 时间:2024/06/16 11:41
import re

from bs4 import BeautifulSoup

# A sample HTML document used by every example below.
# The adjacent string literals are concatenated by Python into one string;
# the source line breaks between them are NOT part of the document.
html_doc = (
    "<html><head><title>The Dormouse's story</title></head>"
    '<body><p class="title">'
    "<b>The Dormouse's story</b></p>"
    '<p class="story">Once upon a time there were three little sisters; '
    'and their names were'
    '<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,'
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and'
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;'
    'and they lived at the bottom of a well.</p>'
    '<p class="story">...</p>'
)

# Parse the document with the "html.parser" backend.
soup = BeautifulSoup(html_doc, "html.parser")

# Print the whole parsed document, re-indented by prettify().
print(soup.prettify())

# Attribute access on the soup returns the first tag with that name.
print(soup.title)
# <title>The Dormouse's story</title>

# .string gives the text inside the tag.
print(soup.title.string)
# The Dormouse's story

print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

# Tag attributes are read with dict-style indexing; "class" comes back as a list.
print(soup.p['class'])
# ['title']

# find_all() returns every matching tag.
print(soup.find_all('a'))
# Output (printed on one line, wrapped here for readability):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for link in soup.find_all('a'):
    print(link.string)
# Elsie
# Lacie
# Tillie

# find() returns only the first match; here the lookup is by the id attribute.
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

print(soup.find('p'))
# <p class="title"><b>The Dormouse's story</b></p>

# A dict as the second argument filters on attribute values.
print(soup.find('p', {"class": "story"}))
# Output (printed on one line, wrapped here for readability):
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>

# get_text() drops the markup and keeps only the text content.
print(soup.find('p', {'class': 'story'}).get_text())
# Once upon a time there were three little sisters; and their names wereElsie,Lacie andTillie;and they lived at the bottom of a well.

正则表达式:

# Regular-expression filters.
# These examples reuse `soup` and the `re` import from the listing above.

# A compiled pattern passed as the tag filter is matched against tag names;
# "t" selects every tag whose name contains the letter "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
# html
# title

# "^b" selects tags whose name starts with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b

# A pattern can also filter on an attribute value: here, every <a> whose
# href starts with "http://example.com/".
data = soup.find_all('a', href=re.compile(r"^http://example\.com/"))
print(data)
# Output (printed on one line, wrapped here for readability):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
原创粉丝点击