use_python

来源：互联网　发布：幸运抽奖软件免费版　编辑：程序博客网　时间：2024/06/07 07:41

写csv文件
抓取页面图片①
抓取页面图片②
为爬虫添加代理ip
获取页面内嵌链接
字典的相关用法

August 31, 2017 8:36 AM

写csv文件

import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Fetch the Wikipedia comparison page and parse it with the lxml backend.
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")

# The main comparison table is the first "wikitable" table on the page.
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# newline='' leaves line-ending handling to the csv module; the context
# manager closes the file even if a row fails to serialize.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # Collect the text of every data/header cell in this table row.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        # Write exactly one CSV line per table row. Writing inside the
        # per-cell loop would emit a growing partial row for every cell.
        writer.writerow(csvRow)

抓取页面图片①

import urllib.request

# Download the image bytes. The context manager closes the HTTP response
# (and its socket) as soon as the body has been read.
with urllib.request.urlopen('http://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()

# Binary mode: the payload is raw JPEG data, not text.
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)

抓取页面图片②

import re
import urllib.request


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes."""
    # Close the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as page:
        return page.read()


def getImg(html):
    """Download every ``.jpg`` referenced by a ``src="..." pic_ext`` attribute.

    Images are saved in the current directory as ``0.jpg``, ``1.jpg``, ...
    in the order they appear in *html*.

    Args:
        html: Page source, either ``str`` or the ``bytes`` returned by
            ``getHtml``.

    Returns:
        The list of image URLs that were matched and downloaded (empty when
        the page contains no matching image).
    """
    if isinstance(html, bytes):
        # The pattern is a str, so the page must be str too. Undecodable
        # bytes are replaced rather than raising; the URLs being matched
        # are ASCII, so replacement does not affect them.
        html = html.decode('utf-8', errors='replace')

    reg = r'src="(.+?\.jpg)" pic_ext'
    imglist = re.findall(reg, html)
    for x, imgurl in enumerate(imglist):
        # urlretrieve lives in urllib.request on Python 3.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist


if __name__ == "__main__":
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    print(getImg(html))

为爬虫添加代理ip

import urllib.request
import random

# Route HTTP requests through one randomly chosen proxy from the list,
# then fetch a page and print its decoded body.
url = 'http://whatismyip.com.tw'
iplist = [
    '121.201.97.136:80',
    '117.135.164.170:80',
    '58.247.31.230:80',
]
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# Installing the opener makes every later urlopen() call use it.
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)

# Get the links embedded in the page
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

rawtext = urlopen("http://bbs.gfan.com/android-8397839-1-1.html").read()
soup = BeautifulSoup(rawtext, "html.parser")
# Look up the <div class="pg"> element and gather its anchors.
targetDiv = soup.find('div', {'class': 'pg'})
catalogLinks = targetDiv.find_all('a')
# The first anchor is skipped; keep the href of each remaining one.
indexlist = [l.get('href') for l in catalogLinks[1:]]
for index in indexlist:
    print(index)

字典的相关用法

# A nested structure mixing dicts and a list of dicts.
test = {
    "post": {
        "content": "",
    },
    "replys": [
        {
            "content": "",
        },
    ],
}

# Update nested values by chaining key / index lookups.
test["post"]["content"] = "xx"
test["replys"][0]["content"] = "yy"
# Assigning to a missing key adds it to the inner dict.
test["replys"][0]["value"] = "zz"
# Append a second reply record to the list.
test["replys"].append({"content": "", "title": "", "publish_date": ""})


def store(measurements):
    """Serialize *measurements* as JSON into ``measurements.json``.

    Args:
        measurements: Any JSON-serializable object.

    The file is created in the current working directory and overwritten
    if it already exists.
    """
    import json

    with open('measurements.json', 'w') as f:
        # Dump the argument itself, not the module-level ``test`` dict.
        f.write(json.dumps(measurements))


if __name__ == "__main__":
    store(test)

阅读全文

1 0