Python 3 web crawler: scraping air quality data from a weather site


I recently posted a crawler for Beijing weather; this follow-up scrapes Beijing air quality data instead. Feedback is welcome. The approach is the same: bs4 plus urllib.request.
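The whole thing rests on one pattern: urlopen fetches a page, BeautifulSoup parses it, and find/find_all pull out the elements we need. A minimal sketch of that pattern against the same month index page used below (assuming, as the script does, that the month links sit inside the div with class "box p"); the full script follows.

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the month index page and walk the month links inside the "box p" div;
# the full script iterates over exactly these links.
html = urlopen("http://www.tianqihoubao.com/aqi/beijing-201708.html").read()
soup = BeautifulSoup(html, "lxml")
for li in soup.find("div", {"class": "box p"}).find_all("li"):
    if li.a is not None:  # some list items may not carry a link
        print(li.a["href"], li.a.get("title"))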

'''
Created on 2017-11-14

@author: chen
'''
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
def get_all_weather_url():
    """Yield the <a> tag of every month link on the Beijing AQI index page."""
    response = urlopen("http://www.tianqihoubao.com/aqi/beijing-201708.html")
    bs_obj = BeautifulSoup(response.read(), "lxml")
    months = bs_obj.find("div", {"class": "box p"})
    for month in months.find_all("li"):
        if month.a is not None:  # skip list items without a link
            yield month.a


# Build the full page URL for each month
def get_page_url_weather():
    for url in get_all_weather_url():
        half_url = url["href"]
        weather_url = "http://www.tianqihoubao.com/" + str(half_url)
        yield weather_url
def get_weather_data():
    """Yield one tuple of air-quality readings per table row on every month page."""
    url_set = set()
    for url in get_page_url_weather():
        if url in url_set:
            continue
        url_set.add(url)
        weather_content = urlopen(url).read()
        weather_page_obj = BeautifulSoup(weather_content, "lxml")
        tr_weather_page = weather_page_obj.table.find_all("tr")
        for tr_each in tr_weather_page:
            td_weather = tr_each.find_all("td")
            if len(td_weather) < 10:  # skip header or malformed rows without a full set of cells
                continue
            # strip whitespace and line breaks from every cell
            cells = [td.get_text(" ", strip=True).replace("\r\n", "").replace(" ", "")
                     for td in td_weather[:10]]
            date, quality, AQI_data, AQI_rank, PM2_5, PM10, SO2, NO2, CO, O3 = cells
            yield date, quality, AQI_data, AQI_rank, PM2_5, PM10, SO2, NO2, CO, O3
def main():
    # newline="" keeps the csv module from inserting blank lines on Windows;
    # consider adding encoding="utf-8-sig" if the file will be opened in Excel
    with open(r"C:\Users\chen\Desktop\北京每日空气质量更新.csv", "w", newline="") as file:
        writer = csv.writer(file)
        count = 1
        for row in get_weather_data():
            writer.writerow(row)
            print("Row " + str(count) + " written")
            count += 1
        print("All rows written")


if __name__ == '__main__':
    main()
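Once a run finishes, the output can be spot-checked with the same csv module. A small sketch, assuming the output path and column order used above:

import csv

# Column order written by the script:
# date, quality level, AQI, AQI rank, PM2.5, PM10, SO2, NO2, CO, O3
with open(r"C:\Users\chen\Desktop\北京每日空气质量更新.csv", newline="") as f:
    for row in csv.reader(f):
        print(row)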
