使用 Python 爬取飞机航班信息

首先感谢飞友网提供的网站数据。





解析中途经停的航班,以及班次的起飞日期(根据星期 1234567 计算)。

#coding:utf-8
"""Crawl flight timetable pages and dump one text file per flight number.

Python 2 script (relies on ``sgmllib``, ``urllib.urlopen`` and ``urlparse``).

Flow, as implemented below:
  1. Fetch the index page and collect city links with ``flightSearchParser``.
  2. For each city, fetch the city's page and its sibling ``E_<page>`` page,
     collect per-destination links with ``flightCityArriveAndLeaveParser``.
  3. Fetch every linked timetable page, extract table rows with
     ``flightTimesParser`` and turn them into ``flightAirObject`` records.
  4. Write each record's ``description`` to ``air/<flight code>.txt``.

NOTE(review): this listing was recovered from a copy in which several tokens
had been stripped (attribute names, ``u.read()`` calls and the index URL).
The reconstructed spots are marked with NOTE(review) comments below.
"""
__author__ = 'watsy'

from sgmllib import SGMLParser
import urllib
import urllib2
import datetime
import json
import os
from time import sleep
import time
import sys
import urlparse


class flightCityObject(object):
    """A city entry taken from the index page.

    Attributes:
        name: link text of the city anchor.
        url: href of the city anchor.
        leaveurl: sibling page whose file name is ``E_`` + the file name
            of ``url`` (see ``flightSearchParser.end_a``).
    """

    def __init__(self, name="", url="", leaveurl=""):
        self.name = name
        self.url = url
        self.leaveurl = leaveurl


class flightAirObject(object):
    """One timetable row, stored as plain strings.

    ``needQuery`` is set when the row needs a follow-up detail query: either
    the stop-over column is anything other than ``'-'`` or the weekday
    column contains a ``'.'``.
    """

    def __init__(self, air_code="", start_place="", start_time="",
                 end_place="", end_time="", air_type="", flightWeekend="",
                 hasCenterPlace="", hasFood="", zhundian=""):
        self.air_code = air_code
        self.start_place = start_place
        self.start_time = start_time
        self.end_place = end_place
        self.end_time = end_time
        self.air_type = air_type
        self.flightWeekend = flightWeekend
        self.hasFood = hasFood
        self.zhundianlv = zhundian

        # Detailed information has to be looked up separately for these rows.
        # Note that the default hasCenterPlace ("") also counts as "not '-'".
        self.needQuery = (hasCenterPlace != '-') or ('.' in flightWeekend)

    @property
    def description(self):
        """Comma-separated dump of the record (this is what gets written)."""
        return "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" % (
            self.air_code,
            self.start_place,
            self.start_time,
            self.end_place,
            self.end_time,
            self.air_type,
            self.flightWeekend,
            self.hasFood,
            self.zhundianlv,
            self.needQuery
        )


class flightSearchParser(SGMLParser):
    """Collect city links found inside ``<div class="cityli">`` elements.

    After ``feed()``, ``citys`` holds one ``flightCityObject`` per distinct
    link text, in document order.
    """

    def reset(self):
        self.ctx = ""      # text of the most recent data chunk seen in a city div
        # NOTE(review): the attribute name was stripped from the recovered
        # source; "flag" is a reconstruction. It is 1 while inside a
        # class="cityli" div and "" otherwise.
        self.flag = ""
        self.citys = []
        self.url = ""      # href of the most recent <a> seen in a city div
        SGMLParser.reset(self)

    def start_div(self, attrs):
        for k, v in attrs:
            if k == 'class' and v == 'cityli':
                self.flag = 1

    def start_a(self, attrs):
        if self.flag == 1:
            for k, v in attrs:
                if k == 'href':
                    self.url = v

    def end_a(self):
        if self.flag != 1:
            return
        # Keep only the first link for each city name.
        for city in self.citys:
            if city.name == self.ctx:
                return
        # Build the sibling page address: same location as self.url, with
        # "E_" prefixed to the last path component.
        url_path = urlparse.urlsplit(self.url)[2]
        page_name = "E_%s" % url_path.split('/')[-1]
        sibling_url = urlparse.urljoin(self.url, page_name)
        self.citys.append(flightCityObject(self.ctx, self.url, sibling_url))

    def end_div(self):
        # Any closing div ends the city section (nesting is not tracked).
        self.flag = ""

    def handle_data(self, data):
        if self.flag == 1:
            self.ctx = data


class flightCityArriveAndLeaveParser(SGMLParser):
    """Collect ``{'city': text, 'url': href}`` for links inside ``<li>`` tags.

    One entry is appended to ``url_city_list`` for every data chunk that
    occurs inside an ``<a href=...>`` nested in an ``<li>``.
    """

    def reset(self):
        self.li_url_flag = False
        self.a_url_flag = False
        self.a_url = ""
        self.url_city_list = []
        SGMLParser.reset(self)

    def start_li(self, attrs):
        self.li_url_flag = True

    def end_li(self):
        self.li_url_flag = False

    def start_a(self, attrs):
        if self.li_url_flag:
            for k, v in attrs:
                if k == 'href':
                    self.a_url_flag = True
                    self.a_url = v

    def end_a(self):
        self.a_url_flag = False

    def handle_data(self, data):
        if self.a_url_flag:
            self.url_city_list.append({'city': data, 'url': self.a_url})


class flightTimesParser(SGMLParser):
    """Extract timetable rows.

    A row is any ``<tr>`` whose ``bgcolor`` is ``#FFFFCC`` or ``#FFFFFF``.
    Each data chunk inside a ``<td>`` of such a row becomes one list item;
    completed non-empty rows are appended to ``flight_tr_list``.
    """

    ROW_COLORS = ('#FFFFCC', '#FFFFFF')

    def reset(self):
        self.flight_tr_flag = False
        self.flight_td_flag = False
        self.flight_td_list = []
        self.flight_tr_list = []
        SGMLParser.reset(self)

    def start_tr(self, attrs):
        for k, v in attrs:
            if k == 'bgcolor' and v in self.ROW_COLORS:
                self.flight_tr_flag = True

    def end_tr(self):
        if self.flight_td_list:
            self.flight_tr_list.append(self.flight_td_list)
        self.flight_tr_flag = False
        self.flight_td_list = []

    def start_td(self, attrs):
        if self.flight_tr_flag:
            self.flight_td_flag = True

    def end_td(self):
        self.flight_td_flag = False

    def handle_data(self, data):
        if self.flight_td_flag:
            self.flight_td_list.append(data)


def _fetch_html_as_utf8(url):
    """Download *url* and return its body re-encoded from gb2312 to UTF-8.

    The handle is closed even when the read fails. The original code also
    called ``html_content.replace('gb2312', 'utf-8')`` and discarded the
    result; since strings are immutable that call had no effect, so it is
    omitted here.

    Raises:
        IOError: propagated from ``urllib.urlopen`` / ``read``.
        UnicodeDecodeError: if the body is not valid gb2312.
    """
    # NOTE(review): the ".read()" call was stripped from the recovered
    # source ("html_content =    u.close()"); restored here.
    handle = urllib.urlopen(url)
    try:
        raw = handle.read()
    finally:
        handle.close()
    return raw.decode('gb2312').encode('utf-8')


def function_get_flight_html_content_flight(url):
    """Fetch one timetable page and return its rows (list of cell lists)."""
    parser = flightTimesParser()
    parser.feed(_fetch_html_as_utf8(url))
    return parser.flight_tr_list


def get_city_flight_times(url, index=None):
    """Collect all flights reachable from the city page at *url*.

    Args:
        url: city page listing per-destination timetable links.
        index: optional position of the city, used only in the progress
            line. (The original read module globals ``i`` and ``air`` here,
            which broke any call made outside the driver loop and logged
            ``air.url`` even when a different URL was being fetched.)

    Returns:
        dict mapping flight code (first cell of a row) to
        ``flightAirObject``; the first occurrence of a code wins.
    """
    html_content = _fetch_html_as_utf8(url)

    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    if index is None:
        print("time : [%s] - [%s]" % (stamp, url))
    else:
        print("time : [%s] - [%d %s]" % (stamp, index, url))

    cityParser = flightCityArriveAndLeaveParser()
    cityParser.feed(html_content)

    flight_airs = {}
    for url_city in cityParser.url_city_list:
        for row in function_get_flight_html_content_flight(url_city['url']):
            # 12-cell rows carry the fields at cells 1..9; every other row
            # length is read one cell further right (cells 2..10).
            shift = 0 if len(row) == 12 else 1
            if len(row) <= 9 + shift:
                # Too short to hold all the columns; the original raised
                # IndexError here and aborted the whole crawl.
                continue
            code = row[0]
            if code in flight_airs:
                continue
            # Within each pair the time cell precedes the place cell,
            # hence the swapped indices.
            flight_airs[code] = flightAirObject(
                code,
                row[2 + shift], row[1 + shift],
                row[4 + shift], row[3 + shift],
                row[5 + shift], row[6 + shift], row[7 + shift],
                row[8 + shift], row[9 + shift])
    return flight_airs


def write_dict_to_file(air_dict):
    """Write each value's ``description`` to ``<key>.txt`` in the cwd."""
    for air_key in air_dict:
        with open(air_key + '.txt', 'w') as wf:
            wf.write(air_dict[air_key].description)


# NOTE(review): the index-page URL literal was stripped from the recovered
# source (it read urllib.urlopen('')). Fill in the real address before
# running; it is deliberately not guessed here.
FLIGHT_INDEX_URL = ''

html_content = _fetch_html_as_utf8(FLIGHT_INDEX_URL)
airParser = flightSearchParser()
airParser.feed(html_content)

strPath = os.getcwd()
air_dir = os.path.join(strPath, 'air')
if not os.path.isdir(air_dir):
    # The original chdir'ed into ./air unconditionally and failed if absent.
    os.makedirs(air_dir)

# Output files are written relative to the cwd, so switch once for the whole
# run and always switch back, even if a fetch fails part-way through.
os.chdir(air_dir)
try:
    for i, air in enumerate(airParser.citys):
        write_dict_to_file(get_city_flight_times(air.url, i))
        write_dict_to_file(get_city_flight_times(air.leaveurl, i))
finally:
    os.chdir(strPath)