Parsing a CSV file with Python to extract data


The Alexa list of websites is provided as a spreadsheet with two columns: the rank and the domain.

|   | A | B            |
|---|---|--------------|
| 1 | 1 | google.com   |
| 2 | 2 | facebook.com |
| 3 | 3 | youtube.com  |
| … | … | …            |
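Saved as CSV, each row of that spreadsheet is a plain "rank,domain" line, which Python's csv module splits into a two-item list. A minimal sketch (the sample rows are only an illustration, taken from the table above):

```python
import csv

# the same rows as in the table above, in the "rank,domain" layout of the spreadsheet
rows = ['1,google.com', '2,facebook.com', '3,youtube.com']

for rank, domain in csv.reader(rows):
    print rank, domain    # 1 google.com / 2 facebook.com / 3 youtube.com
```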

Extracting the data involves four steps:
1. Download the .zip file.
2. Extract the CSV file from the .zip archive.
3. Parse the CSV file.
4. Iterate over each row of the CSV file and extract the data we want (a minimal sketch of these steps follows below).
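As a rough orientation before the full code, here is a condensed sketch of those four steps using only the standard library, with no throttling, caching, or retries (same Python 2 environment as the article's code):

```python
import csv
import urllib2
from StringIO import StringIO
from zipfile import ZipFile

# 1. download the zip file
zipped_data = urllib2.urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip').read()
# 2. extract the csv file from the zip archive (ZipFile needs a file-like object)
zf = ZipFile(StringIO(zipped_data))
csv_filename = zf.namelist()[0]
# 3. parse the csv file
reader = csv.reader(zf.open(csv_filename))
# 4. iterate over each row and extract the domain
urls = ['http://' + website for _, website in reader]
```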

Here is the code that implements the functionality described above. First, the contents of the Downloader module:

```python
# -*- coding: utf-8 -*-
import random
import socket
import time
import urllib2
import urlparse
from datetime import datetime

DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


class Throttle:
    """Throttle downloading by sleeping between requests to the same domain."""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
                 num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # the URL is not yet available in the cache
                pass
        if result is None:
            # the result was not in the cache, so the page still needs to be downloaded
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save the result in the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry on 5xx server errors
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}
```
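A Downloader instance is callable, so fetching a page is just a matter of calling the object with a URL. A quick usage sketch (the URL and parameter values here are only illustrative, not from the original article):

```python
from downloader import Downloader

# wait 3 seconds between requests to the same domain, retry once on 5xx errors
D = Downloader(delay=3, user_agent='wswp', num_retries=1)

html = D('http://example.com')   # illustrative URL
print len(html)
```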
And here is the script that uses Downloader to fetch and parse the Alexa list:

```python
# -*- coding: utf-8 -*-
import csv
from StringIO import StringIO
from zipfile import ZipFile

from downloader import Downloader

D = Downloader()
zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
urls = []
with ZipFile(StringIO(zipped_data)) as zf:
    csv_filename = zf.namelist()[0]
    for _, website in csv.reader(zf.open(csv_filename)):
        urls.append('http://' + website)
```

You may have noticed that the downloaded zipped data is wrapped with StringIO before being passed to ZipFile. This is because ZipFile expects a file-like interface rather than a string. Next, we extract the list of files from the zip archive. Since this .zip file contains only a single file, we simply select the first one. We then iterate over the CSV file and append the domain found in the second column to the list of URLs.
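The code above targets Python 2 (urllib2, StringIO). The same idea carries over to Python 3, but ZipFile then needs a bytes buffer and csv.reader needs text, so the wrappers change. A rough Python 3 sketch, assuming the Alexa URL still serves the file:

```python
import csv
import io
from urllib.request import urlopen
from zipfile import ZipFile

zipped_data = urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip').read()
urls = []
with ZipFile(io.BytesIO(zipped_data)) as zf:
    csv_filename = zf.namelist()[0]
    # zf.open() returns a binary file object, so wrap it to give csv.reader text
    with zf.open(csv_filename) as f:
        for _, website in csv.reader(io.TextIOWrapper(f, encoding='utf-8')):
            urls.append('http://' + website)
```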
