获取最新中国行政区划

来源:互联网 发布:程序员一年经验工资 编辑:程序博客网 时间:2024/04/29 06:48

废话少说,上代码:

# Download the newest administrative-division code list published under
# http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/ and save it as a text file with
# one "parent,code,name" line per entry.
# Written to run unchanged on both Python 2 and Python 3.
import datetime
from io import BytesIO

import lxml
import requests
from lxml import etree

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

# Seconds to wait for the server; without a timeout requests.get() may block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30


def _fetch_html_tree(url):
    """Download *url* and return its body parsed as an lxml HTML tree.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within REQUEST_TIMEOUT seconds.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # response.content is a byte string, hence BytesIO rather than StringIO.
    return etree.parse(BytesIO(response.content), etree.HTMLParser())


def _parent_code(code):
    """Return the parent code that is stored alongside *code*.

    A code ending in '0000' gets an empty parent, a code ending in '00' gets
    its first two characters followed by '0000', and any other code gets its
    first four characters followed by '00'.
    """
    if code.endswith('0000'):
        return ''
    if code.endswith('00'):
        return code[:2] + '0000'
    return code[:4] + '00'


def _date_tag(page_name):
    """Return the tag used in the output file name for *page_name*.

    For a page name such as 't20140116_501070.html' the tag is the text
    between the first character and the first underscore ('20140116').
    When that text is not made of digits only, today's date formatted as
    YYYY-MM-DD is used instead.
    """
    tag = page_name.split('_')[0][1:]
    if tag.isdigit():
        return tag
    return datetime.datetime.now().strftime('%Y-%m-%d')


def get_latest_url(index_url):
    """Return the absolute URL of the first entry listed on the index page.

    Args:
        index_url: URL of the index page; relative links are resolved
            against it.

    Returns:
        The absolute URL of the first ``li/a`` link inside the single
        ``<ul class="center_list_contlist">`` element, or None when the page
        does not contain exactly one such list or the list has no link.
    """
    tree = _fetch_html_tree(index_url)
    link_lists = tree.xpath('//ul[@class="center_list_contlist"]')
    if len(link_lists) != 1:
        return None
    hrefs = link_lists[0].xpath('li/a/@href')
    if not hrefs:
        return None
    return urljoin(index_url, hrefs[0])


def get_xingzhengquhua_text(latest_url, referer=None):
    """Scrape the code table at *latest_url* into 'parent,code,name' lines.

    Args:
        latest_url: URL of the page holding the code table.
        referer: accepted for backward compatibility; not used.

    Returns:
        The lines joined with newlines and encoded as UTF-8, or None when
        the page does not contain exactly one ``<div class="xilan_con">``
        with a nested ``div/div`` below it.
    """
    tree = _fetch_html_tree(latest_url)
    containers = tree.xpath('//div[@class="xilan_con"]')
    if len(containers) != 1:
        return None
    inner = containers[0].xpath('div/div')
    if not inner:
        return None

    rows = []
    for paragraph in inner[0].xpath('.//p'):
        try:
            code = paragraph.xpath('span[1]/text()')[0]
            # Names are padded with ideographic spaces (U+3000) on the page.
            name = paragraph.xpath('span[2]/text()')[0].strip(u'\u3000')
        except IndexError:
            # Paragraph without both text spans: not a data row, skip it.
            continue
        rows.append((_parent_code(code), code, name))

    lines = (u','.join(row) for row in rows)
    return u'\n'.join(lines).encode('utf-8')


def main():
    """Fetch the newest code list and write it to the current directory."""
    index_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
    latest_url = get_latest_url(index_url)
    print(latest_url)
    if not latest_url:
        print('Failed get latest data url')
        return

    text = get_xingzhengquhua_text(latest_url)
    page_name = latest_url.strip().split('/')[-1]
    print(page_name)
    if not text:
        print('Failed get xingzhengquehua data!')
        return

    out_path = 'latest-xingzhengquhua-%s.txt' % _date_tag(page_name)
    # The text is already UTF-8 encoded, so write it in binary mode.
    with open(out_path, 'wb') as out_file:
        out_file.write(text)


if __name__ == '__main__':
    main()
0 0
原创粉丝点击