pandas学习笔记

来源：互联网发布：java枚举单例模式编辑：程序博客网时间：2024/05/01 16:01

# coding: utf-8
# 导入模块
import pandas as pd
frame.columns       # 返回所有的列名
frame.index         # 返回所有的索引
frame.T             # frame的转置
frame.head(num)     # frame前num行的数据，数据量大时使用，num省略时默认为5
frame.tail(num)     # frame后num行的数据
frame.info()        # 关于frame的信息，极其重要！！！
# 将列名中的xx全部替换为yy
frame.rename(columns=lambda x: x.replace('xx', 'yy'), inplace=True)
# 检索dataframe（以下的“第i+1行”均假设使用默认的整数索引）
frame.loc[i]        # 第i+1行所有值，Series
frame.loc[i][j]     # 第i+1行第j+1列的值
frame['key']        # 名为'key'的那一列的所有值，Series
frame[i:]           # 第i+1行及之后的所有行，dataframe，切片操作同列表
frame['key'][i]     # 名为'key'的那一列的第i+1行的值
# 数据转换为列表
frame.loc[i].tolist()   # 将第i+1行转为列表
# 删除列
frame.pop('key')        # 在原来的frame基础上直接删除名为'key'的那一列

DataFrame的连接方式

frame_x, frame_y分别为需要连接的两个dataframe
how是决定了用何种方法连接，常见的有left（左外连接）、right（右外连接）、inner（交集，默认）、outer（并集）
on是决定连接的key，在这里两个frame依靠’key’这列合并，多列合并需要传入列表，可省略

# merge方法
new_frame = pd.merge(frame_x, frame_y, how='left', on='key')
# join方法,相比于merge更简便
frame_x.join(frame_y, on='key', how='left')
# concat方法，全连接，不去重，可以理解为两个frame按轴堆叠；注意待连接的frame必须放在一个列表中传入
new_frame = pd.concat([frame_x, frame_y], join='outer', axis=1)

读取存储DataFrame为csv格式

# 读取csv文件，csv_file为需要读取的csv文件名
# sep参数表示读取时的分隔符，默认为逗号，可忽略；encoding可省略，看读出来的数据是否乱码（xxx为实际编码，如'utf-8'、'gbk'）；engine可忽略，若出现报错，可尝试加上这个参数；
frame_read = pd.read_csv(csv_file, sep=',', encoding=xxx, engine='python')
# 保存csv文件，csv_file为保存的csv文件名，基本同读取。frame_save为dataframe
frame_save.to_csv(csv_file)

转为dataframe

# array转dataframe，columns可省略
import numpy as np
arr = np.arange(5)
df = pd.DataFrame(arr, columns=['key'])
# 字典转dataframe，columns为字典的key的集合
# 注意：字典的值应为列表等序列；若全部为标量，则必须同时传入index，否则会报ValueError
# 另外不要用dict作变量名，以免覆盖内置的dict
data = {
    'key1': ['value1'],
    'key2': ['value2'],
}
df = pd.DataFrame(data, columns=['key1', 'key2'])

下面部分的参考资料

数据结构

Series

一维数组对象

obj = Series([1, 2, 3, 4])print(obj)# return # 0 1# 1 2 # 2 3# 3 4

表示形式为：索引在左，值在右

print(obj.values)# return array([1, 2, 3, 4])print(obj.index)# return Int64Index([0, 1, 2, 3])

创建带有索引的Series

obj2 = Series([1, 2, 3, 4], index=['a', 'b', 'c','d'])print(obj2)# return # a 1# b 2# c 3# d 4print(obj2.index)# return Index([a, b, c, d], dtype=object)

检索

print(obj2['a'])# return 1print(obj2[['a', 'b', 'c']])# return # a 1# b 2# c 3

过滤数据

# 选出obj2中大于0的数，返回索引和值obj2[obj2 > 0]# 对obj2所有元素*2obj2 * 2# 对obj2所有元素取e的元素次方np.exp(obj2)

判断索引是否存在

print('b' in obj2)      # return True
print('e' in obj2)      # return False

字典传递

# 将字典完全转换为Seriesdict_ser = {'a': 1, 'b': 2, 'c': 3, 'd': 4}obj3 = Series(dict_ser)# 通过列表创建索引dict_index = ['e', 'b', 'c', 'd']obj4 = Series(dict_ser, index=dict_index)print(obj4)# return# e NaN# b 2# c 3# d 4

数据丢失判断

print(pd.isnull(obj3))# return# a False# b False# c False# d Falseprint(pd.notnull(obj4))# return# e False# b True# c True# d True

Series合并

Series对象和它的索引都有一个name属性

obj5 = obj3 + obj4# obj5# a NaN# b 4# c 6# d 8# e NaNobj4.name = 'value'obj4.index.name = 'word'# 这样在输出obj4的时候有name属性的输出# Series的索引可以通过赋值来更改obj4.index = ['a1', 'b1', 'c1', 'd1']

DataFrame

类似一个电子表格的数据结构，可以看作为Series的字典（每一个Series共享一个索引）

data = {'key1': ['a', 'b', 'c', 'd'],        'key2': [1, 2, 3, 4],        'key3': [1.1, 2.2, 3.3 ,4.4]}df = DataFrame(data)print(df)# return #     key1    key2    key3# 0    a        1      1.1# 1    b        2      2.2# 2    c        3      3.3# 3    d        4      4.4df = DataFrame(data, columns=['value1', 'value2', 'value3'])# 与上面的df区别在于，将对应的key换为value# 同样可以给DataFrame自定义索引df = DataFrame(data, columns=['value1', 'value2', 'value3', 'value4'], index=['one', 'two', 'three', 'four'])# 与上面的df区别在于，索引变为自定义的索引，value4由于缺少值，所以value4那一列值全为NaN

检索具体值

# 方法一print(df['value1'])# return# one   a# two   b# three c# four  d# 方法二print(df.value2)# return# one   1# two   2# three 3# four  4

检索行

print(df.loc['three'])  # 旧版本pandas中写作df.ix['three']，ix已在pandas 1.0中移除
# return
# value1    c
# value2    3
# value3    3.3
# Name: three

修改列

df['value4'] = 1
# 将df的value4列全部赋值为1
df['value4'] = np.arange(4.)
# 将df的value4列从0赋值到3（数组长度必须与df的行数一致，否则会报错）
val = Series(['A', 'B', 'C'], index=['two', 'one', 'three'])
df['value4'] = val
# 将val这个Series按index精确对齐后赋值给df的value4列，val中没有的索引（此处为four）填充为NaN
df['value5'] = df.value1 == 'b'
# 新增value5这一列，如果df对应的value1这一列为'b'，则值为True，否则为False
print(df.columns)
# return Index([value1, value2, value3, value4, value5], dtype=object)
del df['value5']
print(df.columns)
# return Index([value1, value2, value3, value4], dtype=object)

传递字典

data_dict = {'key1': {'value1': 1, 'value2': 2},
             'key2': {'value1': 3, 'value2': 4, 'value3': 5}}
df = DataFrame(data_dict)
print(df)
# return
#           key1      key2
# value1      1         3
# value2      2         4
# value3      NaN       5
# 转置DataFrame
print(df.T)
df = DataFrame(data_dict, index=['a', 'b', 'c'])
print(df)
# 显式指定index时，会按标签从内层字典中取值；内层字典中没有a、b、c这些键，因此结果全为NaN
# 若想把行索引value1,value2,value3改名为a,b,c，应在创建后使用 df.index = ['a', 'b', 'c']

DataFrame的部分函数

# name属性df.index.name = 'name1'df.columns.name = 'name2'# 返回DataFrame的数据df.values# return array([...])

索引对象

索引对象是不可变的
重新索引需要使用reindex

obj = Series([1, 2, 3, 4], index=['b', 'c', 'a', 'd'])# 重新索引，对应索引如果不存在则为缺失值NaNobj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])# 也可以将NaN替换为其他数字，此处替换为0obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)# 使用ffillobj3 = Series(['a', 'b', 'c'], index=[0, 2, 4])obj3.reindex(range(6), method='ffill')# return# 0 a# 1 a# 2 b# 3 b# 4 c# 5 c

reindex的method参数：
参数 | 描述
ffill/pad | 前向（或进位）填充
bfill/backfill | 后向（或进位）填充

# 使用columns关键字重新索引
columns = ['key1', 'key2', 'key3']
df.reindex(columns=columns)
# 新增一列key3，填充值为NaN
# 也可以同时对columns和index两个重新索引
df.reindex(index=[...], columns=[...])
# 使用ix可以将重新索引做得更简单（ix已在pandas 1.0中移除，新版本请使用上面的reindex写法）
df.ix[['a', 'b', 'c', 'd'], columns]

删除条目

# Seriesobj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])# 删除c所在的索引行new_obj = obj.drop('c')# DataFramedf = DataFrame(np.arange(16).reshape(4, 4),              index=['a', 'b', 'c', 'd'],              columns=['one', 'two', 'three', 'four'])# 删除a、c所在的索引行df.drop(['a', 'c'])# 删除two、four所在列df.drop(['two', 'four'], axis=1)

索引、挑选、过滤

# Series
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])
# return 1.0
print(obj[1])
# return 1.0
print(obj[2:4])
print(obj[['b', 'a', 'd']])
print(obj[[1, 3]])
print(obj[obj < 2])
# 如果使用标签切片，则会将结束点也包括在内
print(obj['b':'c'])
# 也可以有如下操作赋值
obj['b':'c'] = 5
# DataFrame
df = DataFrame(np.arange(16).reshape(4, 4),
              index=['a', 'b', 'c', 'd'],
              columns=['one', 'two', 'three', 'four'])
print(df['two'])
# return 'two'所在列的索引和对应的value
print(df[['three', 'one']])
# return 'three'和'one'所在列的索引和对应的value
print(df[:2])
print(df[df['three'] > 5])
print(df < 5)
# 返回一个由布尔型构成的DataFrame
df[df < 5] = 0
# 对df中小于5的元素重新赋值为0
# 注：ix已在pandas 1.0中移除，新版本中按标签检索用loc，按位置检索用iloc
print(df.ix['b', ['two', 'three']])
# return 'b'所在行，对应'two'、'three'的值及索引
print(df.ix[['b', 'c'], [3, 0, 1]])
# return ...  columns同样可以使用列数来代替
print(df.ix[2])
# return 第3行（位置为2）的列名和value
print(df.ix[:'c', 'three'])
# return 从开头到c索引（包含c）的three列
print(df.ix[df.three > 5, :3])
# return dataframe, three列中大于5的行，且只取前3列

算术方法

df1 + df2对应索引和列中的值进行相加

阅读全文

0 0