python3.5——Pandas模块使用(下)——缺失值处理和层次索引

来源:互联网 发布:域名如何别名解析 编辑:程序博客网 时间:2024/06/05 00:10

1、pandas缺失值处理




import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Demo: detecting and dropping missing values (NaN) in a DataFrame.
# Row 2 is missing in every column; the other rows have at most one gap.
df3 = DataFrame(
    [
        ["Tom", np.nan, 456.67, "M"],
        ["Merry", 34, 345.56, np.nan],
        [np.nan, np.nan, np.nan, np.nan],
        ["John", 23, np.nan, "M"],
        ["Joe", 18, 385.12, "F"],
    ],
    columns=["name", "age", "salary", "gender"],
)
print(df3)

print("=======判断NaN值=======")
print(df3.isnull())      # element-wise mask: True where the value is missing

print("=======判断非NaN值=======")
print(df3.notnull())     # inverse mask: True where the value is present

print("=======删除包含NaN值的行=======")
print(df3.dropna())      # drop every row that contains at least one NaN

print("=======删除全部为NaN值的行=======")
print(df3.dropna(how="all"))    # drop only rows in which every value is NaN

# Set the value at row 2, column 0 by position. The original used
# DataFrame.ix, which was deprecated in pandas 0.20 and removed in 1.0;
# .iloc is the purely positional replacement.
df3.iloc[2, 0] = "Gerry"
print(df3)

print("=======删除包含NaN值的列=======")
print(df3.dropna(axis=1))       # drop every column that contains a NaN

# Output (as recorded in the original article):
#     name   age  salary gender
# 0    Tom   NaN  456.67      M
# 1  Merry  34.0  345.56    NaN
# 2    NaN   NaN     NaN    NaN
# 3   John  23.0     NaN      M
# 4    Joe  18.0  385.12      F
# =======判断NaN值=======
#     name    age salary gender
# 0  False   True  False  False
# 1  False  False  False   True
# 2   True   True   True   True
# 3  False  False   True  False
# 4  False  False  False  False
# =======判断非NaN值=======
#     name    age salary gender
# 0   True  False   True   True
# 1   True   True   True  False
# 2  False  False  False  False
# 3   True   True  False   True
# 4   True   True   True   True
# =======删除包含NaN值的行=======
#   name   age  salary gender
# 4  Joe  18.0  385.12      F
# =======删除全部为NaN值的行=======
#     name   age  salary gender
# 0    Tom   NaN  456.67      M
# 1  Merry  34.0  345.56    NaN
# 3   John  23.0     NaN      M
# 4    Joe  18.0  385.12      F
#     name   age  salary gender
# 0    Tom   NaN  456.67      M
# 1  Merry  34.0  345.56    NaN
# 2  Gerry   NaN     NaN    NaN
# 3   John  23.0     NaN      M
# 4    Joe  18.0  385.12      F
# =======删除包含NaN值的列=======
#     name
# 0    Tom
# 1  Merry
# 2  Gerry
# 3   John
# 4    Joe

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Demo: filling missing values with DataFrame.fillna.
df4 = DataFrame(np.random.randn(7, 3))
print(df4)

# Blank out part of columns 1 and 2. The original used DataFrame.ix
# (removed in pandas 1.0); .loc is the label-based replacement. Label
# slices include the end label, so `:4` covers rows 0 through 4 and
# `:2` covers rows 0 through 2.
df4.loc[:4, 1] = np.nan
df4.loc[:2, 2] = np.nan
print(df4)

print(df4.fillna(0))                  # replace every NaN with the scalar 0
print(df4.fillna({1: 0.5, 2: -1}))    # per-column fill values, keyed by column label

# Output (as recorded in the original article; the random values differ
# on every run):
#           0         1         2
# 0 -0.737618 -0.530302 -2.716457
# 1  0.810339  0.063028 -0.341343
# 2  0.070564  0.347308 -0.121137
# 3 -0.501875 -1.573071 -0.816077
# 4 -2.159196 -0.659185 -0.885185
# 5  0.175086 -0.954109 -0.758657
# 6  0.395744 -0.875943  0.950323
#           0         1         2
# 0 -0.737618       NaN       NaN
# 1  0.810339       NaN       NaN
# 2  0.070564       NaN       NaN
# 3 -0.501875       NaN -0.816077
# 4 -2.159196       NaN -0.885185
# 5  0.175086 -0.954109 -0.758657
# 6  0.395744 -0.875943  0.950323
#           0         1         2
# 0 -0.737618  0.000000  0.000000
# 1  0.810339  0.000000  0.000000
# 2  0.070564  0.000000  0.000000
# 3 -0.501875  0.000000 -0.816077
# 4 -2.159196  0.000000 -0.885185
# 5  0.175086 -0.954109 -0.758657
# 6  0.395744 -0.875943  0.950323
#           0         1         2
# 0 -0.737618  0.500000 -1.000000
# 1  0.810339  0.500000 -1.000000
# 2  0.070564  0.500000 -1.000000
# 3 -0.501875  0.500000 -0.816077
# 4 -2.159196  0.500000 -0.885185
# 5  0.175086 -0.954109 -0.758657
# 6  0.395744 -0.875943  0.950323

2、pandas常用数学统计方法




import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Demo: common descriptive-statistics methods in pandas.
# Each row holds one set of scores; the columns are the three subjects.
arr = np.array([
    [98.5, 89.5, 88.5],
    [98.5, 85.5, 88],
    [70, 85, 60],
    [80, 85, 82],
])
df1 = DataFrame(arr, columns=["语文", "数学", "英语"])
print(df1)

print("=======针对列计算总统计值=======")
print(df1.describe())       # per-column count/mean/std/min/quartiles/max

print("=======默认计算各列非NaN值个数=======")
print(df1.count())          # default axis=0: non-NaN count for each column

print("=======计算各行非NaN值个数=======")
print(df1.count(axis=1))    # axis=1: non-NaN count for each row

# Output (as recorded in the original article):
#      语文    数学    英语
# 0  98.5  89.5  88.5
# 1  98.5  85.5  88.0
# 2  70.0  85.0  60.0
# 3  80.0  85.0  82.0
# =======针对列计算总统计值=======
#               语文         数学         英语
# count   4.000000   4.000000   4.000000
# mean   86.750000  86.250000  79.625000
# std    14.168627   2.179449  13.412525
# min    70.000000  85.000000  60.000000
# 25%    77.500000  85.000000  76.500000
# 50%    89.250000  85.250000  85.000000
# 75%    98.500000  86.500000  88.125000
# max    98.500000  89.500000  88.500000
# =======默认计算各列非NaN值个数=======
# 语文    4
# 数学    4
# 英语    4
# dtype: int64
# =======计算各行非NaN值个数=======
# 0    3
# 1    3
# 2    3
# 3    3
# dtype: int64



import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 2. Correlation coefficients and covariance in pandas.
df2 = DataFrame({
    "GDP": [12, 23, 34, 45, 56],
    "air_temperature": [23, 25, 26, 27, 30],
    "year": ["2001", "2002", "2003", "2004", "2005"],
})
print(df2)

# "year" holds strings. Older pandas silently skipped non-numeric columns
# in corr()/cov(); pandas 2.0 changed the numeric_only default to False,
# so the numeric columns are selected explicitly to keep the result
# independent of the installed version.
df2_numeric = df2[["GDP", "air_temperature"]]

print("=========相关系数========")
print(df2_numeric.corr())       # pairwise correlation matrix

print("=========协方差========")
print(df2_numeric.cov())        # pairwise covariance matrix

print("=========两个量之间的相关系数========")
print(df2["GDP"].corr(df2["air_temperature"]))   # correlation of two Series

print("=========两个量之间协方差========")
print(df2["GDP"].cov(df2["air_temperature"]))    # covariance of two Series

# Output (as recorded in the original article):
#    GDP  air_temperature  year
# 0   12               23  2001
# 1   23               25  2002
# 2   34               26  2003
# 3   45               27  2004
# 4   56               30  2005
# =========相关系数========
#                       GDP  air_temperature
# GDP              1.000000         0.977356
# air_temperature  0.977356         1.000000
# =========协方差========
#                    GDP  air_temperature
# GDP              302.5             44.0
# air_temperature   44.0              6.7
# =========两个量之间的相关系数========
# 0.97735555485
# =========两个量之间协方差========
# 44.0





import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 3. Unique values, value counts and membership tests in pandas.
df3 = DataFrame({
    "order_id": ["1001", "1002", "1003", "1004", "1005"],
    "member_id": ["m01", "m01", "m02", "m01", "m02"],
    "order_amt": [345, 312.2, 123, 250.2, 235],
})
print(df3)

print("=========去重后的数组=========")
print(df3["member_id"].unique())          # distinct values, in order of first appearance

print("=========值出现的频率=========")
print(df3["member_id"].value_counts())    # occurrences of each distinct value

print("=========成员资格=========")
# NOTE: from here on df3 is rebound to the "member_id" Series, not the frame.
df3 = df3["member_id"]
mask = df3.isin(["m01"])    # boolean Series: True where the member id is "m01"
print(mask)
print(df3[mask])            # boolean indexing keeps only the matching entries

# Output (as recorded in the original article; older pandas sorted the
# dict keys, so column order may differ on current versions):
#   member_id  order_amt order_id
# 0       m01      345.0     1001
# 1       m01      312.2     1002
# 2       m02      123.0     1003
# 3       m01      250.2     1004
# 4       m02      235.0     1005
# =========去重后的数组=========
# ['m01' 'm02']
# =========值出现的频率=========
# m01    3
# m02    2
# Name: member_id, dtype: int64
# =========成员资格=========
# 0     True
# 1     True
# 2    False
# 3     True
# 4    False
# Name: member_id, dtype: bool
# 0    m01
# 1    m01
# 3    m01
# Name: member_id, dtype: object

3、pandas层次索引





import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 3. Hierarchical (multi-level) indexing in pandas.

# Passing a list of two label lists as the index builds a two-level
# MultiIndex (year, fruit name).
data = Series(
    [998.4, 6455, 5432, 9765, 5432],
    index=[
        ["2001", "2001", "2001", "2002", "2002"],
        ["苹果", "香蕉", "西瓜", "苹果", "西瓜"],
    ],
)
print(data)

df4 = DataFrame({
    "year": [2001, 2001, 2002, 2002, 2003],
    "fruit": ["apple", "banana", "apple", "banana", "apple"],
    "production": [2345, 5632, 3245, 6432, 4532],
    "profits": [245.6, 432.7, 534.1, 354, 467.8],
})
print(df4)

print("=======层次化索引=======")
df4 = df4.set_index(["year", "fruit"])    # move both columns into a two-level row index
print(df4)

print("=======依照索引取值=======")
# Select one row by its full (year, fruit) key. The original used
# DataFrame.ix, which was removed in pandas 1.0; .loc with a tuple key
# is the label-based replacement.
print(df4.loc[(2002, "apple"), :])

print("=======依照层次化索引统计数据=======")
# The `level=` argument of sum/mean/min was deprecated in pandas 1.3 and
# removed in 2.0; groupby(level=...) is the equivalent aggregation.
print(df4.groupby(level="year").sum())
print(df4.groupby(level="fruit").mean())
print(df4.groupby(level=["year", "fruit"]).min())

# Output (as recorded in the original article; column order and number
# formatting may differ on current pandas versions):
# 2001  苹果     998.4
#       香蕉    6455.0
#       西瓜    5432.0
# 2002  苹果    9765.0
#       西瓜    5432.0
# dtype: float64
#     fruit  production  profits  year
# 0   apple        2345    245.6  2001
# 1  banana        5632    432.7  2001
# 2   apple        3245    534.1  2002
# 3  banana        6432    354.0  2002
# 4   apple        4532    467.8  2003
# =======层次化索引=======
#              production  profits
# year fruit
# 2001 apple         2345    245.6
#      banana        5632    432.7
# 2002 apple         3245    534.1
#      banana        6432    354.0
# 2003 apple         4532    467.8
# =======依照索引取值=======
# production    3245.0
# profits        534.1
# Name: (2002, apple), dtype: float64
# =======依照层次化索引统计数据=======
#       production  profits
# year
# 2001        7977    678.3
# 2002        9677    888.1
# 2003        4532    467.8
#         production     profits
# fruit
# apple         3374  415.833333
# banana        6032  393.350000
#              production  profits
# year fruit
# 2001 apple         2345    245.6
#      banana        5632    432.7
# 2002 apple         3245    534.1
#      banana        6432    354.0
# 2003 apple         4532    467.8


阅读全文
0 0