BeautifulSoup的基本用法

来源：互联网 | 发布：网络棋牌信息 | 编辑：程序博客网 | 时间：2024/06/16 11:41

import re

from bs4 import BeautifulSoup

# Sample HTML document used by the examples in this article.
# NOTE(review): the markup is kept character-for-character as it appears in
# the source, i.e. as one unbroken string (there is no whitespace between
# "were" and the first <a>, or between "and" and the third <a>). The line
# breaks were probably lost when the page was extracted -- confirm against
# the original article before relying on the exact whitespace.
html_doc = (
    "<html><head><title>The Dormouse's story</title></head>"
    '<body>'
    '<p class="title">' "<b>The Dormouse's story</b></p>"
    '<p class="story">Once upon a time there were three little sisters; '
    'and their names were'
    '<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,'
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and'
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;'
    'and they lived at the bottom of a well.</p>'
    '<p class="story">...</p>'
)

# Parse the document with the "html.parser" parser.
soup = BeautifulSoup(html_doc, "html.parser")

# Print the whole of html_doc, pretty-printed.
print(soup.prettify())

# Basic lookups on the parsed document `soup`. The comment under each call
# shows the output given in the article.

# Attribute access (soup.<tag>) yields the first tag with that name.
print(soup.title)
# <title>The Dormouse's story</title>

print(soup.title.string)
# The Dormouse's story

print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

print(soup.p['class'])
# ['title']

# find_all() is the current name of the legacy camelCase alias findAll();
# it returns every matching tag.
print(soup.find_all('a'))
# Output (a single list, wrapped here for readability):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for link in soup.find_all('a'):
    print(link.string)
# Elsie
# Lacie
# Tillie

# find() returns a single match; here it is selected by its id attribute.
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

print(soup.find('p'))
# <p class="title"><b>The Dormouse's story</b></p>

# Look up a <p> tag by attribute: the dict restricts the match to tags whose
# class is "story". The comments show the output given in the article.
print(soup.find('p', {"class": "story"}))
# Output (wrapped here for readability):
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>

# get_text() returns only the text of the tag, with the markup removed.
print(soup.find('p', {'class': 'story'}).get_text())
# Output as given in the article:
# Once upon a time there were three little sisters; and their names wereElsie,Lacie andTillie;and they lived at the bottom of a well.

正则表达式：

# Searching with regular expressions (uses `soup` and the `re` module from
# the setup code). The comments show the output given in the article.

# A compiled pattern passed as the first argument is matched against tag
# names: here, every tag whose name contains "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
# html
# title

# Every tag whose name starts with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b

# A pattern can also filter on an attribute value: every <a> whose href
# starts with "http://example.com/". find_all() is the current name of the
# legacy findAll() alias and matches the calls above.
data = soup.find_all('a', href=re.compile(r"^http://example\.com/"))
print(data)
# Output (a single list, wrapped here for readability):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

阅读全文

0 0