Python爬取表格及图片
来源:互联网 发布:soundhound mac 编辑:程序博客网 时间:2024/05/17 23:00
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Page to scrape.
url = 'https://baike.baidu.com/item/https/285356?fr=aladdin'

# Download the page and decode the body as UTF-8 (strict: a malformed byte
# raises UnicodeDecodeError). The context manager closes the HTTP response
# as soon as the body has been read instead of leaving the socket open.
with urllib.request.urlopen(url) as page:
    html = page.read().decode(encoding='utf-8', errors='strict')
print(html)
def earse(strline, ch):
    """Return *strline* with every occurrence of *ch* removed.

    Removal is repeated until *ch* no longer occurs, so occurrences that
    only appear after an earlier removal (e.g. removing ``"ab"`` from
    ``"aabb"``) are removed as well.

    Args:
        strline: The text to clean.
        ch: The substring to remove. An empty string removes nothing.

    Returns:
        The cleaned string.
    """
    # An empty pattern is "found" at index 0 forever and replace() would
    # never change the string, so bail out instead of looping endlessly.
    if not ch:
        return strline
    while ch in strline:
        strline = strline.replace(ch, '')
    return strline
# Scrape the table: collect the text of every <p> found in the cells of the
# last <table> on the page, building one list of cell texts per table row.
# Naming the parser explicitly keeps the parse tree independent of which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
min_salary = []        # one entry (a list of cell texts) per table row
min_salary_rows = []   # cell texts of the row currently being processed

title = soup.find('title')
print(title.string if title is not None else None)

tab = soup.find_all('table')
# A page without any table yields an empty result instead of an IndexError.
trs = tab[-1].find_all('tr') if tab else []
for trIter in trs:
    # Start a fresh list for every row; re-using a single list would make
    # every entry of min_salary alias the same, ever-growing list.
    min_salary_rows = []
    for tdIter in trIter.find_all('td'):
        for para in tdIter.find_all('p'):
            # .string is None when the <p> holds nested markup; skip those.
            if para.string:
                cell = earse(para.string, ' ').strip()
                min_salary_rows.append(cell)
                print(cell, end=' ')  # keep the cells of one row on one line
    min_salary.append(min_salary_rows)
    print()  # end the output line of this row
print(min_salary)
# End of table scraping.
#html = getHtml("http://www.mohrss.gov.cn/ldgxs/LDGXqiyegongzi/LDGXzuidigongzibiaozhun/201612/t20161213_261789.html")
# Download the images referenced by the page.
def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    An image is recognised by a ``src="....jpg"`` attribute that is
    immediately followed by `` pic_ext``. Each match is saved relative to
    the current working directory as ``0.jpg``, ``1.jpg``, ... in the order
    the matches appear in *html*.

    Args:
        html: The page source to search, as text.

    Returns:
        None.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for index, imgurl in enumerate(imgre.findall(html)):
        # urlretrieve lives in urllib.request on Python 3; the Python 2
        # spelling urllib.urlretrieve raises AttributeError there.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % index)
# End of image download section.
import re
from bs4 import BeautifulSoup
# Page to scrape.
url = 'https://baike.baidu.com/item/https/285356?fr=aladdin'

# Download the page and decode the body as UTF-8 (strict: a malformed byte
# raises UnicodeDecodeError). The context manager closes the HTTP response
# as soon as the body has been read instead of leaving the socket open.
with urllib.request.urlopen(url) as page:
    html = page.read().decode(encoding='utf-8', errors='strict')
print(html)
def earse(strline, ch):
    """Return *strline* with every occurrence of *ch* removed.

    Removal is repeated until *ch* no longer occurs, so occurrences that
    only appear after an earlier removal (e.g. removing ``"ab"`` from
    ``"aabb"``) are removed as well.

    Args:
        strline: The text to clean.
        ch: The substring to remove. An empty string removes nothing.

    Returns:
        The cleaned string.
    """
    # An empty pattern is "found" at index 0 forever and replace() would
    # never change the string, so bail out instead of looping endlessly.
    if not ch:
        return strline
    while ch in strline:
        strline = strline.replace(ch, '')
    return strline
# Scrape the table: collect the text of every <p> found in the cells of the
# last <table> on the page, building one list of cell texts per table row.
# Naming the parser explicitly keeps the parse tree independent of which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
min_salary = []        # one entry (a list of cell texts) per table row
min_salary_rows = []   # cell texts of the row currently being processed

title = soup.find('title')
print(title.string if title is not None else None)

tab = soup.find_all('table')
# A page without any table yields an empty result instead of an IndexError.
trs = tab[-1].find_all('tr') if tab else []
for trIter in trs:
    # Start a fresh list for every row; re-using a single list would make
    # every entry of min_salary alias the same, ever-growing list.
    min_salary_rows = []
    for tdIter in trIter.find_all('td'):
        for para in tdIter.find_all('p'):
            # .string is None when the <p> holds nested markup; skip those.
            if para.string:
                cell = earse(para.string, ' ').strip()
                min_salary_rows.append(cell)
                print(cell, end=' ')  # keep the cells of one row on one line
    min_salary.append(min_salary_rows)
    print()  # end the output line of this row
print(min_salary)
#html = getHtml("http://www.mohrss.gov.cn/ldgxs/LDGXqiyegongzi/LDGXzuidigongzibiaozhun/201612/t20161213_261789.html")
# Download the images referenced by the page.
def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    An image is recognised by a ``src="....jpg"`` attribute that is
    immediately followed by `` pic_ext``. Each match is saved relative to
    the current working directory as ``0.jpg``, ``1.jpg``, ... in the order
    the matches appear in *html*.

    Args:
        html: The page source to search, as text.

    Returns:
        None.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for index, imgurl in enumerate(imgre.findall(html)):
        # urlretrieve lives in urllib.request on Python 3; the Python 2
        # spelling urllib.urlretrieve raises AttributeError there.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % index)
# End of image download section.
阅读全文
0 0
- Python爬取表格及图片
- Python爬取图片
- Python爬取图片
- python爬取图片
- python爬取图片
- python爬取图片
- python爬虫爬取网页表格数据
- Python爬取网络图片
- python多线程爬取图片
- python爬取网站图片
- python爬取网页图片
- Python爬取网站图片
- Python 爬取百度图片
- python爬虫爬取图片
- python爬取百度图片
- python爬取网上图片
- python爬取网页图片
- python爬取mm131图片
- MFC学习笔记01-(_T()与L的区别)
- 软件测试过程中有哪些风险?
- 单链表的快排实现
- 《并发编程》--5.线程等待结束(join)和线程谦让(yield)
- String-引用计数的写时拷贝
- Python爬取表格及图片
- web.xml 配置-filter
- Highcharts如何设置背景颜色
- 页面表格数据下载到Excel的超实用方法
- android蓝牙
- mysql常见安全加固策略
- hdu2689树状数组
- 【BLE_Mesh】01,Nordic_nRF51-DK之BLE_Mesh编译
- Python3之窗体程序