爬虫学习5.1.2数据存储（无数据库）存储为CSV

来源：互联网 发布：淘宝网上有卖药的吗 编辑：程序博客网 时间：2024/05/26 17:49

1.写CSV文件

import csv

# Column names; written as the first row of the CSV file.
headers = ['ID', 'Username', 'Password', 'Age', 'Country']

# One tuple per record, values in the same order as `headers`.
rows = [
    (1001, "qiye", "qiye_pass", "24", "China"),
    (1002, "Marry", "Marry_pass", "21", "USA"),
    (1003, "Jack", "Jack_pass", "20", "USA"),
]

# newline='' hands line-ending control to the csv module. Without it, a
# platform that translates '\n' on write turns the writer's '\r\n' into
# '\r\r\n', which shows up as an empty line after every row.
with open('qiye.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)   # header row first
    f_csv.writerows(rows)     # then every data row in one call

2.写CSV文件（字典数据）

# The entries of `rows` may also be dictionaries keyed by column name.
import csv

# Column names; DictWriter uses them both as the header row and as the
# keys it looks up in each row dictionary.
headers = ['ID', 'Username', 'Password', 'Age', 'Country']

rows = [
    {'ID': 1001, 'Username': "qiye", 'Password': "qiye_pass", 'Age': "24", 'Country': "China"},
    {'ID': 1002, 'Username': "Marry", 'Password': "Marry_pass", 'Age': "21", 'Country': "USA"},
    {'ID': 1003, 'Username': "Jack", 'Password': "Jack_pass", 'Age': "20", 'Country': "USA"},
]

# newline='' hands line-ending control to the csv module. Without it, a
# platform that translates '\n' on write turns the writer's '\r\n' into
# '\r\r\n', which shows up as an empty line after every row.
with open('qiye2.csv', 'w', newline='') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()       # emits the `headers` list as the first row
    f_csv.writerows(rows)     # each dict is written in `headers` order

3.读取数据（列表读取）

import csv

# Read 'qiye.csv' and print the header row followed by every data row.
# Each row comes back from csv.reader as a list of strings.
#
# newline='' is the mode the csv module documents for files it reads, so
# that line breaks inside quoted fields are interpreted by the parser.
with open('qiye.csv', newline='') as f:
    f_csv = csv.reader(f)
    # The first row holds the column names; consume it before the loop so
    # the loop only sees data rows.
    headers = next(f_csv)
    print(headers)
    for row in f_csv:
        print(row)

4.读取数据（索引读取）

python的namedtuple详解：http://blog.csdn.net/kongxx/article/details/51553362

先用表头定义一个 namedtuple 类型，再把每一行数据填入该类型，之后即可通过字段名（如 row.Username）提取数据。

import csv
from collections import namedtuple

# Read 'qiye2.csv' and expose each row as a namedtuple, so that fields can
# be accessed by column name instead of by position.
#
# newline='' is the mode the csv module documents for files it reads.
with open('qiye2.csv', newline='') as f:
    f_csv = csv.reader(f)
    # The first row supplies the column names.
    headings = next(f_csv)
    # Build a tuple type whose field names are the column names.
    Row = namedtuple('Row', headings)
    for r in f_csv:
        # csv.reader yields an empty list for a blank line; Row(*[]) would
        # raise TypeError because no field values are supplied.
        if not r:
            continue
        # Unpack the row's values into one Row instance.
        row = Row(*r)
        print(row.Username, row.Password)
        print(row)

5.读取到一个字典序列中

import csv

# Read 'qiye.csv' as a sequence of dictionaries. DictReader takes the
# first row as the keys, so each later row maps column name -> value.
#
# newline='' is the mode the csv module documents for files it reads.
with open('qiye.csv', newline='') as f:
    f_csv = csv.DictReader(f)
    for row in f_csv:
        # .get() returns None rather than raising if a column is absent.
        print(row.get('Username'), row.get('Password'))

6.使用lxml解析 http://seputu.com 首页的标题、章节和链接等数据，并存储为CSV

# coding: utf-8
import csv
import re

import requests
from lxml import etree

# Fetch the http://seputu.com front page, extract every chapter link
# (section title, chapter title, URL, date) with lxml, and save the result
# to 'qiye.csv'.

# The HTTP header name is 'User-Agent' (hyphen). With an underscore the
# value is sent as an unrelated header and the library's default
# User-Agent is used instead. The value itself is spelled with the
# conventional 'Mozilla' / 'MSIE' tokens.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
request_headers = {'User-Agent': user_agent}

# A timeout keeps the script from blocking indefinitely on a stalled server.
r = requests.get('http://seputu.com', headers=request_headers, timeout=10)

# Parse the page with lxml.
html = etree.HTML(r.text)

# Every element with class="mulu" holds one section of the table of contents.
div_mulus = html.xpath('.//*[@class="mulu"]')

# Link titles are matched as "[<date>] <chapter title>"; compiled once,
# outside the loops.
pattern = re.compile(r'\s*\[(.*)\]\s+(.*)')

rows = []
for div_mulu in div_mulus:
    # Section heading text (the <h2> inside div.mulu-title).
    div_h2 = div_mulu.xpath('./div[@class="mulu-title"]/center/h2/text()')
    if not div_h2:
        # Blocks without a heading are skipped.
        continue
    h2_title = div_h2[0]

    a_s = div_mulu.xpath('./div[@class="box"]/ul/li/a')
    for a in a_s:
        # Read the href and title attributes; skip anchors missing either
        # one instead of failing on an empty xpath result.
        hrefs = a.xpath('./@href')
        titles = a.xpath('./@title')
        if not hrefs or not titles:
            continue
        href = hrefs[0]
        box_title = titles[0]

        match = pattern.search(box_title)
        if match is None:
            # Title does not follow the "[date] title" layout.
            continue
        date = match.group(1)
        real_title = match.group(2)

        content = (h2_title, real_title, href, date)
        print(content)
        rows.append(content)

headers = ['title', 'real_title', 'href', 'date']

# Values are kept as text and encoded once, here, at the file boundary.
# newline='' lets the csv module control line endings itself.
# NOTE: this is the same file name the first example writes, so that file
# is overwritten.
with open('qiye.csv', 'w', newline='', encoding='utf-8') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)

print('success!')

阅读全文

0 0