Python爬取表格及图片

来源：互联网　发布：soundhound mac　编辑：程序博客网　时间：2024/05/17 23:00

import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Page to scrape; the table-extraction code further down reads the decoded
# markup from the module-level name ``html``.
url = 'https://baike.baidu.com/item/https/285356?fr=aladdin'

# Download the page. The ``with`` block closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
# ``errors='strict'`` means a body that is not valid UTF-8 raises
# UnicodeDecodeError rather than being silently altered.
with urllib.request.urlopen(url) as page:
    html = page.read().decode(encoding='utf-8', errors='strict')

# Dump the raw markup so the fetched content can be inspected.
print(html)
def earse(strline, ch):
    """Return *strline* with every occurrence of *ch* removed.

    Removal is repeated until *ch* no longer occurs, so occurrences that
    only appear after an earlier removal joins two fragments are removed
    as well (e.g. removing ``"ab"`` from ``"aabb"`` yields ``""``).

    Args:
        strline: The text to clean.
        ch: The substring to strip out. An empty *ch* leaves *strline*
            unchanged.

    Returns:
        The cleaned string.
    """
    # An empty needle is "found" at index 0 forever and replacing it changes
    # nothing, so without this guard the loop below would never terminate.
    if not ch:
        return strline

    while ch in strline:
        strline = strline.replace(ch, '')
    return strline

#爬取table
#def getTable(html):
# reg=''
#tablere=re.compile(reg)
#tablelist=re.findall(tablere,html)
#爬取table结束

# Extract the text of the last <table> on the page into ``min_salary``:
# one list per <tr>, holding the cleaned text of each <p> found in its <td>s.
#
# NOTE(review): the original indentation of this section was lost; the nesting
# below assumes one row list is appended (and one output line ended) per <tr>.

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed, and to avoid bs4's "no parser specified"
# warning.
soup = BeautifulSoup(html, 'html.parser')
min_salary = []
min_salary_rows = []

title_tags = soup('title')
if title_tags:
    print(title_tags[0].string)

tab = soup.find_all('table')

# A page without any table yields no rows instead of raising IndexError.
trs = tab[-1].find_all('tr') if tab else []

for trIter in trs:
    # Start a fresh list for every row; reusing a single list would make all
    # entries of ``min_salary`` alias the same ever-growing list.
    min_salary_rows = []
    for tdIter in trIter.find_all('td'):
        for p in tdIter('p'):
            # ``.string`` is None when the <p> has nested markup; skip those.
            if p.string:
                cell = earse(p.string, ' ').strip()
                min_salary_rows.append(cell)
                # Keep the cells of one row on a single output line.
                print(cell, end=' ')
    min_salary.append(min_salary_rows)
    # Finish the output line for this row.
    print()

print(min_salary)

#html = getHtml("http://www.mohrss.gov.cn/ldgxs/LDGXqiyegongzi/LDGXzuidigongzibiaozhun/201612/t20161213_261789.html")

#爬取图片地址
def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    An image is recognised by a ``src="....jpg"`` attribute that is
    immediately followed by a ``pic_ext`` attribute. Each match is fetched
    and saved in the current working directory as ``0.jpg``, ``1.jpg``, ...
    in the order the matches appear in *html*.

    Args:
        html: Page markup as a string.

    Returns:
        None. If nothing matches, no request is made and no file is written.
    """
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    for x, imgurl in enumerate(imgre.findall(html)):
        # In Python 3 ``urlretrieve`` lives in ``urllib.request``; the
        # Python 2 spelling ``urllib.urlretrieve`` raises AttributeError.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
#爬取图片地址结束

阅读全文

0 0