Python中的输入输出(IO)

来源:互联网 发布:网页设计和美工的区别 编辑:程序博客网 时间:2024/05/21 10:29

Python输入输出,包括了Python自带的输入输出和Numpy,Pandas,SQLite3,Pytables的IO操作,并对读写速度进行了比较.

python自带的IO操作

pickle模块可以序列化大部分Python对象
cPickle和pickle模块实现的功能相同

import pickleimport cPickle import numpy as np

 Pickle

data = np.random.randint(100000)# 写入数据pic_file = open('data/data.pkl', 'w')%time pickle.dump(data, pic_file)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns

Wall time: 87µs

# 加载数据pic_file = open('data/data.pkl', 'r')%time  pickle.load(pic_file)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms

Wall time: 86.1 µs

 cPickle

# 下面用cPickle,重复上面操作cpic_file = open('data/data.cpkl','w')% time cPickle.dump(data, cpic_file)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns

Wall time: 34.1 µs

cpic_file = open('data/data.cpkl','r')%time cPickle.load(cpic_file)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns

Wall time: 461 µs

cPickle模块的写入速度比Pickle的写入速度更快一些

# 本例中cPickle模块的写入比pickle模块更快(读取反而更慢)# 比较一下两个列表是否相同# np.allclose()np.allclose(np.array(a), np.array(b))
CPU times: user 0 ns, sys: 0 ns, total: 0 nsWall time: 14.8 µsTrue

 Pickle扩展

# pickle按照先进先出的原则,可以连续的写入和读取多个数据对象# 这存在一些问题,没有任何可用的元信息,所以可以以字典的形式存储多个对象c = np.random.rand(10)d = np.random.rand(10)filepath = open('data/dict.pkl','w')pickle.dump({'c':c,'d':d}, filepath)
filepath = open('data/dict.pkl','r')Dict = pickle.load(filepath)Dict[Dict.keys()[0]]
array([ 0.7759383 ,  0.80020534,  0.73876404,  0.19062144,  0.3167207 ,        0.17651691,  0.6717112 ,  0.42323898,  0.1798924 ,  0.30714039])

 SQL数据库

# python 可以使用任何类型的SQL数据库,自带SQLite3数据库import sqlite3 as sq3import numpy as np

 创建表和连接对象

query = 'CREATE TABLE numbs (Date date, No1 real, No2 real)'# 打开一个数据库连接con = sq3.connect(path + 'numbs.db')# execute方法执行查询,创建一个表con.execute(query)
<sqlite3.Cursor at 0x7f89635b9ab0>
# 使查询生效con.commit()

 逐行写入

# 向表中写入数据import datetime as dtdata = np.random.standard_normal((1000,2)).round(5)for row in data:    con.execute('INSERT INTO numbs VALUES(?,?,?)',(dt.datetime.now(),row[0],row[1]))con.commit()  # 使写入操作生效

 批量读取

# fetchmany批量读取数据con.execute('SELECT * FROM numbs').fetchmany(10)
[(u'2017-07-22 18:12:14.619985', -0.44211, -0.60524), (u'2017-07-22 18:12:14.620376', 0.62257, -1.86698), (u'2017-07-22 18:12:14.620456', 0.09199, -0.88587), (u'2017-07-22 18:12:14.620527', 0.21104, -0.63922), (u'2017-07-22 18:12:14.620593', -0.56373, 1.73532), (u'2017-07-22 18:12:14.620656', 1.0311, 0.39597), (u'2017-07-22 18:12:14.620719', -0.02782, -0.49443), (u'2017-07-22 18:12:14.620784', -0.44697, -0.10876), (u'2017-07-22 18:12:14.620845', -0.47572, -0.21467), (u'2017-07-22 18:12:14.620908', 0.47313, 0.62077)]

 逐行读取

# 单行读取数据readline = con.execute('select * from numbs')for i in range(3):    print readline.fetchone()
(u'2017-07-22 18:12:14.619985', -0.44211, -0.60524)(u'2017-07-22 18:12:14.620376', 0.62257, -1.86698)(u'2017-07-22 18:12:14.620456', 0.09199, -0.88587)
con.close() #关闭连接

 Numpy替代SQL

# Numpy,SQL的高效替代品,# datetime64[D],生成的时间序列间隔为1天import numpy as npdates = np.arange('2017-01-01','2020-01-01',dtype='datetime64[D]')len(dates)
1095

 numpy的自定义数组结构

# 使用numpy中的自定义类型数据来取代SQL中的表dty = np.dtype([('date','datetime64[D]'),('No1','f'),('No2','f')])data = np.zeros(1095,dtype = dty)# 填充数组data['date'] = datesa = np.random.standard_normal((1095,2))data['No1'] = a[:,0]data['No2'] = a[:,1]
# 数据写入磁盘,numpy数组经过了高度优化%time np.save(path+'array',data)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns

Wall time: 814 µs

# 读取数据%time v = np.load(path+'array.npy')
CPU times: user 4 ms, sys: 0 ns, total: 4 ms

Wall time: 973 µs

 Pandas读写vs SQL读写

一个100000行5列的数据
SQL写入耗时:1s,读取耗时:110ms,读取到一个数组对象耗时:167ms,执行一次查询:67.9ms
pandas读取同一个SQL表耗时:600ms,执行一个相同的查询耗时:5.33ms,复杂的查询:7.53ms
HDF5格式写入耗时:9.9ms,读取耗时:17.1ms
CSV格式写入耗时:332ms,读取耗时:86ms
excel格式写入耗时:17.6s,读取耗时:9.82s
Pandas的excel读写是最慢的,HDF5写入最快,DataFrame在内存中执行查询最快
 

# 对比Pandas和SQL的读写操作import pandas as pdimport numpy as npimport sqlite3 as sq3data = np.random.standard_normal((100000,5)).round(5)

 SQL

# 创建一个表,表名,表属性query = 'create table numbers (No1 real,No2 real,No3 real,No4 real,No5 real)'con = sq3.connect('data/numbers.db')con.execute(query)
<sqlite3.Cursor at 0x7f6e57bcdc00>

 SQL写入

# 向表中批量写入一个ndarray对象% time con.executemany('insert into numbers values (?,?,?,?,?)',data)con.commit()
CPU times: user 932 ms, sys: 8 ms, total: 940 ms

Wall time: 1 s
 

 SQL读取为一个列表对象

# 读取数据到一个列表对象%time temp = con.execute('select * from numbers').fetchall()print temp[0]
CPU times: user 112 ms, sys: 0 ns, total: 112 ms

Wall time: 110ms

(-0.01641, -1.84343, 2.18124, 0.36241, -1.36054)

 SQL读取到一个数组对象

# 也可以读入到一个ndarray对象%time array = np.array(con.execute('select * from numbers').fetchall())
CPU times: user 160 ms, sys: 8 ms, total: 168 ms 

Wall time: 167 ms

SQL简单的查询

# SQL查询 ,No1>0 No2 < 0,部分数据%time pdata1 = np.array(con.execute('select * from numbers where No1>0 and No2<0').fetchall())
CPU times: user 68 ms, sys: 0 ns, total: 68 ms

Wall time: 67.9 ms

pdata1[:5]
array([[ 0.35254, -0.81509,  0.58649, -0.73143, -1.39644],       [ 1.86272, -0.22444, -0.0925 ,  0.29582,  1.18061],       [ 0.98774, -0.65247,  0.68095, -0.69551,  1.5109 ],       [ 1.18558, -1.19879,  0.73174,  1.87872,  0.62825],       [ 0.35481, -1.20635,  0.5834 , -0.25098,  0.21274]])

 Pandas读取SQL表

# Pandas读取表,返回一个DataFrame对象import pandas.io.sql as pds
%time data1 = pds.read_sql('select * from numbers',con)
CPU times: user 232 ms, sys: 8 ms, total: 240 ms

Wall time: 600ms

print data1.head()
       No1      No2      No3      No4      No50 -0.33463 -0.32737 -0.58860  0.84526 -0.011281  0.35254 -0.81509  0.58649 -0.73143 -1.396442 -1.82652  1.20503 -0.08089 -1.38706  0.119733 -0.32468  0.83964  1.04378 -1.12341  0.480014 -0.21380 -0.14158 -1.02987  0.25325 -0.75824

 Pandas简单查询

# 查询,No1>0 No2 < 0,部分数据%time data2 = data1[(data1['No1']>0) & (data1['No2']<0)]
CPU times: user 4 ms, sys: 0 ns, total: 4 msWall time: 5.33 ms

Wall time: 5.33ms

print data2.head()
        No1      No2      No3      No4      No51   0.35254 -0.81509  0.58649 -0.73143 -1.396449   1.86272 -0.22444 -0.09250  0.29582  1.1806110  0.98774 -0.65247  0.68095 -0.69551  1.5109011  1.18558 -1.19879  0.73174  1.87872  0.6282515  0.35481 -1.20635  0.58340 -0.25098  0.21274

Pandas对SQL表的读取速度比SQL慢了6倍左右,在内存中的查询速度比SQL更快了12倍

# pandas可以对DataFrame进行更加复杂的查询(除了结构复杂的关系型数据库)%time data3 = data1[['No1','No2']][((data1['No1']>0.5) | (data1['No1']<-0.5)) & ((data1['No2']<-1) | (data1['No2']>1))]
CPU times: user 8 ms, sys: 0 ns, total: 8 ms

Wall time: 7.53 ms

import matplotlib.pyplot as plt%matplotlib inlineplt.scatter(data3['No1'],data3['No2'])

这里写图片描述

 HDF5文件
 
HDF5文件,可以存储DataFrame对象

import pandas as pdimport numpy as npdata = pd.DataFrame(np.random.standard_normal((100000,5)).round(5))

 写入HDF5文件

# 写入磁盘保存为HDF5格式h5s = pd.HDFStore('data/numbers.h5s','w')   %time h5s['data'] = datah5s.close()
CPU times: user 8 ms, sys: 4 ms, total: 12 ms

Wall time: 9.9ms

 读取HDF5

h5s = pd.HDFStore('data/numbers.h5s','r')%time temp = h5s['data']h5s.close()
CPU times: user 12 ms, sys: 4 ms, total: 16 ms

Wall time: 17.1ms

print temp.head()
         0        1        2        3        40 -0.49150  0.56155 -0.69365  0.94050 -0.129251 -0.09620  0.76851 -0.48442 -0.84382 -0.719262 -1.23829 -0.06890  0.38437 -0.20510  0.992783 -0.12078  1.65159 -0.20019 -0.86031  0.670924  0.14573  1.40608 -0.32161  0.57961  0.56956

 
Pandas,可以将DataFrame对象保存为csv,excel格式

 DataFrame >>> CSV

# 仍以HDF5中的data数据为例% time data.to_csv('data/data.csv')
CPU times: user 312 ms, sys: 4 ms, total: 316 ms

Wall time: 332ms

 读取csv

% time data_csv = pd.read_csv('data/data.csv')
CPU times: user 76 ms, sys: 8 ms, total: 84 ms

Wall time: 86ms

 DataFrame >>> excel

# DataFrame保存为excel%time data.to_excel('data/data.xlsx')
CPU times: user 17.5 s, sys: 160 ms, total: 17.7 s

Wall time: 17.6s

%time a = pd.read_excel('data/data.xlsx')
CPU times: user 9.82 s, sys: 12 ms, total: 9.83 s

Wall time: 9.82s

 PyTables

 
PyTable对IO操作进行了专门的优化,与pandas类似,可以创建许多表,支持压缩和索引,可以更高效的存储Numpy数组,有自己独特的与数组结构类似的结构
 

import numpy as npimport tables as tbimport datetime as dtimport matplotlib.pyplot as plt
# pytable提供基于文件的数据格式h5 = tb.open_file('data/data.h5','w')# 生成一个10万行数据的表rows = 100000# 一个时间列,两个int列,两个float列#两个参数表示字符长的长度和,列位置row_des = {    'Date': tb.StringCol(26, pos=1),    #两个参数表示字符长的长度和,列位置    'No1' : tb.IntCol(pos=2),    'No2' : tb.IntCol(pos=3),    'No3' : tb.Float64Col(pos=4),    'No4' : tb.Float64Col(pos=5)}# 建表选择无压缩filters = tb.Filters(complevel=0)  # 无压缩tab = h5.create_table(where='/',name='ints_floats',description=row_des,                       title='Int and Floats',expectedrows=rows,filters=filters)
tab
/ints_floats (Table(0,)) 'Int and Floats'  description := {  "Date": StringCol(itemsize=26, shape=(), dflt='', pos=0),  "No1": Int32Col(shape=(), dflt=0, pos=1),  "No2": Int32Col(shape=(), dflt=0, pos=2),  "No3": Float64Col(shape=(), dflt=0.0, pos=3),  "No4": Float64Col(shape=(), dflt=0.0, pos=4)}  byteorder := 'little'  chunkshape := (1310,)
# 生成数据int_data = np.random.randint(0,1000, size=(rows,2))float_data = np.random.standard_normal((rows, 2)).round(5)
# 逐行写入数据line = tab.rowfor i in range(rows):    line['Date'] = dt.datetime.now()    line['No1'] = int_data[i,0]    line['No2'] = int_data[i,1]    line['No3'] = float_data[i,0]    line['No4'] = float_data[i,1]    line.append()tab.flush()  # 相当于SQL中的commit(),提交修改
tab
/ints_floats (Table(100000,)) 'Int and Floats'  description := {  "Date": StringCol(itemsize=26, shape=(), dflt='', pos=0),  "No1": Int32Col(shape=(), dflt=0, pos=1),  "No2": Int32Col(shape=(), dflt=0, pos=2),  "No3": Float64Col(shape=(), dflt=0.0, pos=3),  "No4": Float64Col(shape=(), dflt=0.0, pos=4)}  byteorder := 'little'  chunkshape := (1310,)

 数组结构批量写入数据

# 使用Numpy的自定义数组结构,批量写入数据dty = np.dtype([('Date','S26'),('No1','<i4'),('No2','<i4'),('No3','<f8'),('No4','<f8')])sarray = np.zeros(rows, dtype=dty)sarray['Date'] = dt.datetime.now()sarray['No1'] = int_data[:, 0]sarray['No2'] = int_data[:, 1]sarray['No3'] = float_data[:, 0]sarray['No4'] = float_data[:, 1]
# 数组取代行描述,description%time h5.create_table('/','init_floats_from_array',sarray,title='int and floats',expectedrows= rows, filters=filters)
CPU times: user 4 ms, sys: 4 ms, total: 8 ms

Wall time: 70ms

/init_floats_from_array (Table(100000,)) 'int and floats'  description := {  "Date": StringCol(itemsize=26, shape=(), dflt='', pos=0),  "No1": Int32Col(shape=(), dflt=0, pos=1),  "No2": Int32Col(shape=(), dflt=0, pos=2),  "No3": Float64Col(shape=(), dflt=0.0, pos=3),  "No4": Float64Col(shape=(), dflt=0.0, pos=4)}  byteorder := 'little'  chunkshape := (1310,)

 table的切片

# 表对象的切片操作和Numpy数组的操作类似tab[:3]
array([('2017-07-23 11:15:08.680526', 807, 331, -0.07979,  0.61129),       ('2017-07-23 11:15:08.680671', 979, 621,  0.28283, -0.39205),       ('2017-07-23 11:15:08.680694', 291, 433,  0.74783, -0.57023)],      dtype=[('Date', 'S26'), ('No1', '<i4'), ('No2', '<i4'), ('No3', '<f8'), ('No4', '<f8')])
tab[:3]['No1']
array([807, 979, 291], dtype=int32)
# numpy的内建函数,也可以作用在表切片对象上np.sum(tab[:]['No3'])
-413.30854999999997
np.sqrt(np.sum(tab[:]['No1']))
7067.9572720836395
# 绘图%matplotlib inlineplt.hist(tab[:]['No3'], bins=30)
(array([  4.00000000e+00,   1.00000000e+01,   2.10000000e+01,          5.50000000e+01,   1.37000000e+02,   3.19000000e+02,          6.58000000e+02,   1.04800000e+03,   1.87500000e+03,          3.16400000e+03,   4.44700000e+03,   6.41900000e+03,          8.08100000e+03,   9.68300000e+03,   1.07590000e+04,          1.08710000e+04,   1.01330000e+04,   9.21100000e+03,          7.43500000e+03,   5.68500000e+03,   4.01800000e+03,          2.56400000e+03,   1.59400000e+03,   9.56000000e+02,          4.61000000e+02,   2.38000000e+02,   1.01000000e+02,          3.50000000e+01,   1.10000000e+01,   7.00000000e+00]), array([-4.20318 , -3.929048, -3.654916, -3.380784, -3.106652, -2.83252 ,        -2.558388, -2.284256, -2.010124, -1.735992, -1.46186 , -1.187728,        -0.913596, -0.639464, -0.365332, -0.0912  ,  0.182932,  0.457064,         0.731196,  1.005328,  1.27946 ,  1.553592,  1.827724,  2.101856,         2.375988,  2.65012 ,  2.924252,  3.198384,  3.472516,  3.746648,         4.02078 ]), <a list of 30 Patch objects>)

这里写图片描述

 Pytable查询

# 表查询# 查询返回,No3列大于0.5或小于-0.5,和,No4列小于-1或者大于1的值%time array = np.array([(row['No3'],row['No4']) for row in \                  tab.where('((No3<-0.5) | (No3>0.5)) &((No4<-1) | (No4>1))')])[:400]%matplotlib inlineplt.plot(array.T[0], array.T[1],'go')
CPU times: user 24 ms, sys: 0 ns, total: 24 ms

Wall time: 22.5ms

这里写图片描述

table的内建方法

values = tab.cols.No3[:]print 'No3 max',values.max()print 'No3 min',values.min()print 'No3 ave',values.mean()print 'No3 std',values.std()
No3 max 4.02078No3 min -4.20318No3 ave -0.0041330855No3 std 1.00075064734
# 查询,No1列大于500和No2列小于500的数据%time result = [(row['No1'], row['No2']) for row in tab.where('(No1 > 500) & (No2 < 500)')]print result[:3]
CPU times: user 16 ms, sys: 4 ms, total: 20 ms

Wall time: 17.2ms
[(807, 331), (658, 420), (809, 29)]

 压缩表

PyTables的优势就是压缩表,不仅节省空间,而且提升IO性能.当IO成为瓶颈而CPU可以进行快速解压时,压缩表对速度有所提升.

File.create_table(where, name, description=None, title='', filters=None,                   expectedrows=10000, chunkshape=None,                   byteorder=None, createparents=False, obj=None)

 建表

h5c = tb.open_file('data/tab.h5c','w')filters = tb.Filters(complevel=4,complib='blosc')tabc = h5c.create_table('/','compress',sarray,title='int and floats',expectedrows= rows, filters=filters)
%time result2 = np.array([(row['No3'],row['No4']) for row in \                   tabc.where('((No3 <-0.5) | (No3 > 0.5)) & ((No4 <-1) | (No4 > 1))')])[:400]
CPU times: user 32 ms, sys: 0 ns, total: 32 msWall time: 35.3 ms

压缩表和未压缩表的大小

ll $path*
-rw-rw-r-- 1 jinghui 8002392 7月  23 12:34 data/data.h5-rw-rw-r-- 1 jinghui 5053596 7月  23 13:57 data/tab.h5c

 
HDF5数据存储,是结构化数值和金融数据的强大替代方案,它可以代替关系数据库.
在使用PyTables时单独访问或者结合Pandas,都可以得到硬件所支持的最高IO性能.
 

PyTables内存外计算

h5 = tb.open_file('data/array.h5','w')cols = 100   # 列为固定100ear = h5.create_earray(h5.root,'ear',atom=tb.Float64Atom(),shape=(0,cols))# ear,第一维可以扩展,第二维为100# 块填充rand = np.random.standard_normal((100,100))for i in range(1000):    ear.append(rand)ear.flush()
ear
/ear (EArray(100000, 100)) ''  atom := Float64Atom(shape=(), dflt=0.0)  maindim := 0  flavor := 'numpy'  byteorder := 'little'  chunkshape := (81, 100)
ear.size_on_disk
80028000L
ear在磁盘上大约占80MB,在内存外进行计算,还需要一个目标EArray对象.PyTables的Expr模块可以高效地处理数值表达式,它基于numexpr.

内存外计算

output = h5.create_earray(h5.root,'output',atom=tb.Float64Atom(),shape=(0,cols))expr = tb.Expr('2 * cos(ear) + sqrt(abs(ear))')expr.set_output(output, append_mode=True)%time expr.eval()
CPU times: user 244 ms, sys: 116 ms, total: 360 ms

Wall time: 495ms

/output (EArray(100000, 100)) ''  atom := Float64Atom(shape=(), dflt=0.0)  maindim := 0  flavor := 'numpy'  byteorder := 'little'  chunkshape := (81, 100)
# 第一行,前10个元素output[0,:10]
array([ 1.09110624,  2.16123354,  2.27178116,  1.62730959,  2.27173622,        0.51966769,  2.45945157,  1.39683809,  0.26411995,  0.72637963])

在内存中计算

# numexpr在内存中计算imarray = ear.read()  # 数据全部读到内存当中# 导入numexpr模块import numexpr as neexpr = '2 * cos(imarray) + sqrt(abs(imarray))'ne.set_num_threads(16)   # 设置线程数%time ne.evaluate(expr)[0,:10]
CPU times: user 164 ms, sys: 40 ms, total: 204 ms

Wall time: 72.1ms

array([ 1.09110624,  2.16123354,  2.27178116,  1.62730959,  2.27173622,        0.51966769,  2.45945157,  1.39683809,  0.26411995,  0.72637963])
在内存中计算比在内存外快了大概423ms,加速了7倍
原创粉丝点击