BeautifulSoup学习笔记

来源:互联网 发布:淘宝注销后有什么后果 编辑:程序博客网 时间:2024/06/05 14:31
from BeautifulSoup import BeautifulSoupimport re doc = ['<html><head><title>Page title</title></head>',       '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',       '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',       '</html>']soup = BeautifulSoup(''.join(doc))print soup.prettify() 运行结果为:<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQoAAAFfCAIAAAA56hXaAAAOkklEQVR4nO3d3bWjOgyGYddFQdRDNTRDMcwFEy/bkozBwLbhfda5mJ2DEyAo/H1RnAMA4Dpr3fDZudW51bnxmtkBWmKVx3Jkiz80MdC0ybn59+87ymNybjo5a8Cfmp1bgj+346LtAGnbpqffn/6/4bfFr84tvwOqsB5kLSWvAnRgFZ/r629PMsZ7ErlDWINnSPYP6q5mqj6xAR4yGCfQazyBJ8tj/k0/lJWH+5XcUDXjwFPW4KzDP7K5vDzYe6A/8txjk5THHBfA0fLg3AO9Uq9cJeUxiFNztTyW+CR+/D0/V64AAAAAAAAA4FFX3c4buDOI96E8gEhJyv2osDy4V4he7abcXXwffQwGygeH+Fa69SpABwpT7ms8QZLDHeIpR/HghpwiulGeck92CL48RrGjGINdhHruQcodPSlJuY/GcZHf0NUp2XvgDUpS7nI/MwTlMcf7me3BhXMPvMNuyj08vvJbuT81D3cL/tvqY/wgV64AAAAAAAAAoAnWrfcEvdzxcmq7N7U8rL6J9HLHe0xx3qR8464pD24gogNhBkRt2+60QyZryk1SHmpI3hE/QeNkyt0d2ScUTpkJyRNeRIsyp9rXlocVkveIvqNRMuXuri4PKyS/Ye+BpskTgNk4aVY7tJdMae2mOPdAB5IrV4M4EFI7tJdPqYbkuXIFAAAAAAAAAC9Bch7tUu9v3ITkPDqTNLy6NQr1THIeuEyYgPIbn9qhPWm8u90Ct7LriYeT88A1fHmoQdqk3ahvUL0EDzoxgeXJ5DxwAfVzWnZod1p57GbXEw8n54Faarzcb2rh3mMWG2I+uy49mZwHLiDLw+rQbv3UQflB/2PJeeAa6t5D7dA+a0dcanbd8kxyHnjaGBeS9ekOfFFSHhzVAJHw8IZdBwAAAICT1F8rV6chfI7PSRrtED7HR01ar8R8NjaDtu14D7VV4RgUDG3b8VHWLYtFxF1p244PyZxVq9FX2rbjc1btrGPWtm/atuOLkmN9K6lO23Z8VHjlytq4aduOryu5FQgAAAAAAAA8Jul9ePlzAh0L7zZQHviu3ez6HeXBzT50YDe77n53tZP+I2r4XG3bnmQN8y8NtKIwu+5DiqNxQiKDtMmpyyge3BA0RIsOZdeTdqDODp/Ltu3hs6nnHsTU0ajC7LosDyt87jd0dUr2HuhMSXZdlofTdj5W2/YkpWu9NNCi3ey6Wh5q+Fxt2+6/bj7GD3LlCj3hvgQAAAAAAACadeLKFde78BWUBxAJbwveUR7cAUSvkmRHeXZdxtSn+KmW4BnIj6A/MtDuQ4o+KxWGpsJ/qzF1GUn0SB+iG1agPdyCtwa4yXcJtzyvlcP1kS31gIrsOnoiA+2V5eEDv3JHwd4D/ZHnHhufrlUPrvIxdfnldc490KvkylXyBUAXfwdwCEbJmLqfODxs48oV8F/+V2yAT7O+wg582my3PgEAAACAi2UazJWEF7cJ5DmM/6nOJAnW+GWAzBKVmIOr56HLl71yPu/Q4CxdoKY8/C2UJV4j6sXiyk3E+qn1a1lLdIic1cvL45L5lGpW8k2zdA21bXulkvLYymARNxYvn5nHykNdo
kMemNVL5lOqLA85S03cYpZJk0kc21gD5cGA1aG9xBSPDbOP8oXmeHr56pPxnD6DLBdT7TmvvpA6ZbnM8EUsppws6W482Ut0eq426rcb5AtZK9lq7C/XZ2aW/vK+s1yVq0i55yVvp18RmQ7W+e3J2nvI7Sbs1zjG/8iPdXuLmQST5QupU54gh6ufwXIl+y7gS/Bg0lr/0JqfjR9+kS8qX8iac3V4fn1KfxNvLU+554XT7HZoL1FeHnLepuDNy4x1xmLKnvPWC6lTlssMrymPzbk1b31GqDte9YWSmbSGl5RE4s++HCE3psvL4769h/U8o1iuwvJIjgoyL6ROWS4zvKQ8ZrHNOW2rPbTm5faX6cyfvJA6k9bwo+Xxx1+O2E2554UrJRN9L1dfHk68c7LZtlxMtee8+kLWlIXyw3fLQ+2r76r3HotxcJU5vihZyXL4ofJo4ssRuyl31aJ9MlnR93Mzk3khdS0vYrLNIBZKXUy157z6QuqU5dTh6mJayy6PzSrLI3mt8Km2/3aP4uRKthr7F5ZHE1euEn+5I3tQv4uZfO6ov0KBu/S73RzS72Im5bF7Locr9bvdHNL1YoZHQew6AAAA8CrWkXplmLSLmHqlrk9yEJHZMh93U1EeuyiPF1qzf6Icq+6FZHmM4kribqC9Mg+fH75q+e3KRLoaFC8frs6SuurQN1keaobZHYxCSaeH+1m6MJHun2eMh5QPl7OUWXXolXVwtZswtSLlltPDw9m4KpFulUf5cDlL9VEoNOd0eWxkpNxyeri1LdYk0tWgePlwyuMrKstDnVJ1erj/v1cl0q2geHmgXc6SozxeqaQ8dgPY+bOIyuHh2KsS6WpQvHy4OkuUB/7A5ZtafVCcrR+tuLs8dndfEuWBVtyxLVYGxSkPAAAAoBZH1YCJ8gAiSfMrdQIiqPgi2TpRTWU30cQOeJK86p9Jj/9xC1TgMefatv9ZA23geTJMnikP9h74nMK27Zx74KN227Zz5QoAAAAAAAAAAAAA8Alh4/R8Q3J1SuC1whD7bu/xpFUUMV683BQHeGe7ufIQp7MoD7xfeXmMBHjxNZmDq3AC93uQUw58y6gdL6nlER5fsScBAAAAAAAAgL9n/U4xgKh5D+WB75pEr0Rn3BZMRtH8Ci+ntj8c44KhbTu+yPpd1iVO4NK2Hd9idWh3WhqXtu34Itmh3QU59nAyj7bt+JDk/GHQTido247vCq9czdrZCG3bgehWIAAAAAAAAAAAAAAAAAAAQKmRRuuAZaI8AAAAAAAAAKBHdGAATLTkAZwraMZujaJ+8HKFzdgnmrHja2jGDihoxg7soBk7kEMzdiCHZuzAPutWIIdPgInyAEyUBwAAAIC/lLn1vpuH9/cTrWDLM+7O7Vcu5vy7np6s5MyaP0edT3XNWLMkqcMbed+fUFMe/g7j8uryuGQxZZDn8vJQ5zOzZuQsSerwRt73A9RAe6WS8tjuuJes6Ps8UB71i/nAKlLn86bykC/U7i3mJBhiZdfVgXInO8S32I8Kh0/ag+FrzeKF1CnVJVLnU51y1p5zDqZcs1OWywxfjAUPHxzjJZrsJSq3Bk+bDLdmKVnz1nCpxYCSnO9Mdl2VrCa/dvyncrIh5jcdmYZ0cZn5WZrFHi+c4fDffonCwJicT7e37OGU4Xsp57lyjySHqx/Vcs1vMxwGruWyH3o7MivE2nuo63MsWyENxVutg9dw/kp2oOE04RtzbhOZgnUaPk/4n9/oBzE2HOi32mTfkplPddmTT+XkyUPqlOUyw2vKY3Pu7chsDMmf6syfePW2vhyxis/gy8vj0MeVf57wg0fd5z5WHskeKXlydZZOb4vW8JLyCA9v/JqRy35072G9nFWc6vosXCEN7T08ee6xKdwhhqsp3AUv1x1gqG/hcuTgKnlOaz7lsodTztnysKYslB++Wx7q1xDcdXsPuTFY7/t8tjxaPPfYhJ+78jDGsmgfQlPw54n3YxFP6OIPvMWYeDNqM
6++Sep8qsvuP5XDzzZ176FOWU4drq5k9cE5fjCz7OXUFZJ/9enUq7d75SrR3N7tQf0uu/yaWh9bW3f63UTq9bvsSXnkzyhwXr+bSL2ulz085mHXAQAAgF5ZB+WVudHLY6cN6vp8BpEpvqS9Bsk2FeWxi/J4oTX7J8qx6l5IlodMJu8G2ivz8Pnh6w3hczUTXj5cnSV11aFvsjysZLIMxh06WDo93M/SheFz/zxjPKR8uJylzKpDr6yDq90wqUyk550eHs7GVeFzqzzKh8tZqkw9oUWny2MzFm/lp4db22JN+FzNhJcPpzy+orI81ClVp4f7/3tV+NzKhJdn1+UsOcrjlUrKYzdrnT+LqBwejr0qfK5mwsuHq7NEeeAPXL6p1WfC2frRirvLY3f3JVEeaMUd22JlJpzyAAAAAGpxVA2YKA8gkjS/Uicggoovkq0T1VR2u53tgJvIq/6Z9HiLfVGBO5xr295WV23gVjJMnikP9h74nMK27Zx74KN227Zz5QoAAAAAAAAAAPSp5DfOgY+iPACF/NGPQVSLb0obtlinnPAVspV6+Ms4WwIljLurjdOBd5JB3em30xh/f05x1PfojxkAvZJHU/OvQuZfJVAe+Kik5+x2+jH//tfi3MDBFT5rEJl2/53bSXwhZKU2AAAAAAAAAKAR1u8UA4ia91Ae+K5J9Ep0xk8SJ6NofoWXU9sfjnHB0LYdX2T9LusS50Ro245vsTq0u+ALTx5t2/FFskO709LptG3HRyXnD4N2OkHbdnxXeOVq1s5GaNsORLcCAQAAAAAAAAAAAAAAAAAASo38thNgmSgPAAAAAAAAAOgRHRgAEy15AOcKmrFbo6gfvFxhM/aJZuz4GpqxAwqasQM7aMYO5NCMHcihGTuwz7oVyOETYKI8ABPlAQAAAOAC4QWrE6cZRN/xZuEdDMoD37WbXb+jPLiBiA4UZtdHkV0fgrvmo/bgVh5T/PxL8AzET9C009n1MMQe/tuXSnLqMogHN4QX0aLK7HpyPLblecOBYSX4yJZ6QEX0HY06nV0/VB4+8Ct3FOw90LRz2XX14Cp8cBEpd3kBgHMPdOBcdn3UHvTfPk++BCJ//4ArV+jMfTcr5PkMgP/4bRBAMduXjwEAAAB8S2V2PTSLFBbQt5LsuuyDmHFoYqAVp7PrNeXBHUB04Fx2fYrvmm+1NP0eX34HVGE9yFoiP4KmHc2uJ6kQq4vu+iuVyZ44GQI05Fx2fdgrj/k3/VBWHo7sOpp1NLt+eXmw90DTDmXXk/JIkrxHy4NzD3SgJLu+ScpjEKfmanks8Un8+HtRrlyhJzTaAQAAaMO6ruvKkRmgoTwAE+UBmCgPwER54F3UG3PlD8YoD7yOGusofzBAeeA1/gHpLa1rrPfFTQAAAABJRU5ErkJggg==" alt="" /> print soup.contents[0].name#print soup.contents[0].contents[0].name for i in range(len(soup.contents[0])):    print soup.contents[0].contents[i].name
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAJUAAAAlCAIAAADECjxtAAABa0lEQVR4nO2bwY2EMAxFXVcu9EARqSfVpBNOKSZ7iEJsAgvM2qO19J/mgD7JXJ4CGn8Nbdu2ruuyLDRT+0WYrtun9DD3JLGVqYeRhUCX3/yB/w/8+Qb+fPM1f7XWWvEa1Ab+fAN/vnn0++GPBKJKGf4sgD/fwJ9vbvxFNlhp8PlL7GE+C/lK+DPixl8mIjkAq3JBkFuCXBlHCH8mvJt/yiM1/EUZtqSI7fBnwjt/3MphZbheCX92vO4f+BuO323+sjypLSzwZ8bX+iP4MwHzF9/An2/gzzfo/3wDf76BP98oz6/DR7vAx8Cfb+DPNzr90WGuTURJTkoL+wagyKP+aB9m8qkmv5ZV0dgephDo8vT5WYgiUepGG5konlQN4247cwmHzwxDf6HnOHx2PPK39++nz8+pKhrkSTnQ5cbfoWcnWbXv4dVfjeJZXwgUsZ2/XPX1QAtbfzh81lj5y/JXIzAC82vfwJ9v4M81PxgboCx5bfuJAAAAAElFTkSuQmCC" alt="" />
titleTag = soup.html.head.title
titleTag
# <title>Page title</title>
titleTag.string
# u'Page title'
len(soup('p'))
# 2
soup.findAll('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]
soup.find('p', align="center")
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>
soup('p', align="center")[0]['id']
# u'firstpara'
soup.find('p', align=re.compile('^b.*'))['id']
# u'secondpara'
soup.find('p').b.string
# u'one'
soup('p')[1].b.string
# u'two'

0 0