《鲜活的数据-第2章 处理数据》有关代码

来源:互联网 发布:甘肃干部网络考试答案 编辑:程序博客网 时间:2024/05/18 13:47

2.1.3 自动收集数据

import urllib2

from BeautifulSoup import BeautifulSoup

# Fetch the DailyHistory page for ZHCC on 2017/9/8 and parse the HTML.
page = urllib2.urlopen("https://www.wunderground.com/history/airport/ZHCC/2017/9/8/DailyHistory.html")
soup = BeautifulSoup(page)

# Look at the first <img> tag found on the page.
images = soup.findAll('img')
first_image = images[0]
print(first_image)

# Collect every element whose class attribute is "wx-value".
wxvalue = soup.findAll(attrs={"class": "wx-value"})
print(wxvalue)
print(wxvalue[0])

# NOTE: `wxvalue[0].span` is None on this page, so `wxvalue[0].span.string`
# raises "AttributeError: 'NoneType' object has no attribute 'string'" and
# would abort the script. Read the text from the first child node instead.
print(wxvalue[0].contents[0].string)
import calendar

# Download the first "wx-value" reading for every day of 2016 at ZHCC and
# print one "YYYYMMDD,value" record per day.
year = 2016
for m in range(1, 13):
    # monthrange() returns the real length of the month, so 29 February is
    # included in leap years such as 2016 (a fixed cut-off at day 28 skips it).
    days_in_month = calendar.monthrange(year, m)[1]
    for d in range(1, days_in_month + 1):
        # Progress message (month and day are not zero-padded here)
        print("Getting data for " + str(year) + str(m) + str(d))

        # Open wunderground.com url
        url = "https://www.wunderground.com/history/airport/ZHCC/%d/%d/%d/DailyHistory.html" % (year, m, d)
        page = urllib2.urlopen(url)

        # Get temperature from page: text of the first child of the first
        # element whose class is "wx-value"
        soup = BeautifulSoup(page)
        dayTemp = soup.findAll(attrs={"class": "wx-value"})[0].contents[0].string

        # Build timestamp with zero-padded month and day (YYYYMMDD)
        timestamp = '%d%02d%02d' % (year, m, d)

        # Print timestamp and temperature. print appends its own newline
        # after the explicit '\n', so each record is followed by a blank line.
        print(timestamp + ',' + dayTemp + '\n')

终端输入并运行文件

python get-weather-data.py

2.2.3 用代码来格式化

1. CSV转为XML

import csv
# Convert wunder-data.txt (comma-separated: date, temperature) to XML on
# stdout. `reader` stays a module-level name; the file-writing variant of
# this script iterates it as well.
reader = csv.reader(
    open('wunder-data.txt', 'r'),
    delimiter=",",
)

print('<weather_data>')
for record in reader:
    # One <observation> element per CSV row: column 0 is the date,
    # column 1 the temperature.
    print('<observation>')
    print('<date>%s</date>' % record[0])
    print('<temperature>%s</temperature>' % record[1])
    print('</observation>')
print('</weather_data>')

终端输入并运行文件

python csv2xml.py >wunder-data1.xml

# Alternative to printing: write the same XML directly to wunder-data.xml.
# `reader` is the csv.reader created earlier in this script.
# The with-statement closes the output file even if a malformed row raises
# part-way through, which a trailing f.close() would not do.
with open('wunder-data.xml', 'w') as f:
    f.write('<weather_data>')
    for row in reader:
        # Column 0 is the date, column 1 the temperature.
        f.write('<observation>')
        f.write('<date>' + row[0] + '</date>')
        f.write('<temperature>' + row[1] + '</temperature>')
        f.write('</observation>')
    f.write('</weather_data>')

2. XML转为CSV

from BeautifulSoup import BeautifulStoneSoup
# Convert wunder-data.xml back to "date,temperature" lines on stdout.
# Read the whole document inside a with-statement so the file is closed
# as soon as its contents are in memory.
with open('wunder-data.xml', 'r') as f:
    xml = f.read()

soup = BeautifulStoneSoup(xml)
observations = soup.findAll('observation')
for o in observations:
    # Each <observation> holds a <date> and a <temperature> child.
    print(o.date.string + "," + o.temperature.string)

终端输入并运行文件

python xml2csv.py >wunder-data1.txt

3. CSV转为JSON

import csv
# Convert wunder-data.txt (comma-separated: date, temperature) to JSON on
# stdout. All rows are read first so the last one can be recognised from the
# data itself; a hard-coded count of 365 produces invalid JSON (a trailing or
# missing comma) for any other number of rows.
with open('wunder-data.txt', 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=",")
    rows = list(reader)

print('{ "observations": [')
last_index = len(rows) - 1
for index, row in enumerate(rows):
    print('{')
    print('"date": ' + '"' + row[0] + '", ')
    # The temperature is emitted unquoted, i.e. as a JSON number.
    print('"temperature": ' + row[1])
    # Every object except the last is followed by a comma.
    if index < last_index:
        print(" },")
    else:
        print(" }")
print("] }")

终端输入并运行文件

python csv2json.py >wunder-data1.json

4. 在循环中加入新的逻辑

import csv
# Append a third column to every "date,temperature" row of wunder-data.txt:
# '1' when the temperature is at or below 32, otherwise '0'.
# The with-statement closes the input file once all rows are processed.
with open('wunder-data.txt', 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=",")
    for row in reader:
        # int() raises ValueError if the temperature column is not an integer.
        is_freezing = '1' if int(row[1]) <= 32 else '0'
        print(row[0] + "," + row[1] + "," + is_freezing)

终端输入并运行文件

python freezingInfo.py >wunder-data-fz.txt

原创粉丝点击