python pandas

来源:互联网 发布:ctf数据库截断 编辑:程序博客网 时间:2024/05/15 09:01
import os,sys,pdb,pickle
import numpy as np
import pandas as pd

def find_null_records(frame, column='result', token=r'null'):
    """Return the rows of *frame* whose *column* text matches *token*.

    ``token`` is interpreted as a regular expression by ``str.contains``.
    Missing values in *column* are treated as "no match" (``na=False``) so
    the mask is purely boolean and can always be used for row selection.
    """
    mask = frame[column].str.contains(token, na=False)
    return frame[mask]


def find_duplicated_rows(frame):
    """Return the rows of *frame* that repeat an earlier, identical row."""
    return frame[frame.duplicated()]


def count_by_group(frame, keys=('source', 'result')):
    """Return the number of rows in each *keys* group, smallest first.

    The result is a Series indexed by the group keys; it only holds the
    group sizes, so sorting it orders the groups by size.
    """
    sizes = frame.groupby(list(keys)).size()
    # Series.sort() (in-place) is gone from current pandas; sort_values()
    # returns the sorted Series instead.
    return sizes.sort_values()


def report_missing_and_duplicates(src='miss.txt', dst='duplicated.txt',
                                  encoding='gb2312'):
    """Print null/duplicated records of *src* and save group sizes to *dst*.

    Reads the CSV file *src*, prints the rows whose ``result`` column
    contains ``null`` and the fully duplicated rows, then writes the size of
    every (``source``, ``result``) group, sorted ascending, to *dst*.

    Returns the sorted group-size Series that was written.
    """
    df = pd.read_csv(src, encoding=encoding)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('record with null\n{0}'.format(find_null_records(df)))
    print('duplicated \n{0}'.format(find_duplicated_rows(df)))

    # Group by the "source" and "result" columns and get each group's size,
    # ordered by that size.
    grp = count_by_group(df)

    # Save to CSV. header=False keeps the header-less layout that
    # Series.to_csv produced before its default changed.
    grp.to_csv(dst, encoding=encoding, header=False)
    return grp


if __name__ == '__main__':
    report_missing_and_duplicates()



===================================================================

import os,sys,pdb,pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def split_by_score(frame, good_above=850, bad_below=700):
    """Split *frame* into ``(good, bad)`` rows by its ``score`` column.

    ``good`` holds the rows with ``score`` strictly greater than
    *good_above*; ``bad`` holds the rows with ``score`` strictly less than
    *bad_below*. Rows in between belong to neither result.
    """
    scores = frame['score']
    return frame[scores > good_above], frame[scores < bad_below]


def plot_feature_histograms(good, bad, features, verbose=False):
    """Draw one row of (good, bad) histograms per column in *features*.

    The left subplot of each row shows the *good* values and the right one
    the *bad* values, both using the same bins. With *verbose* set, the
    ``describe()`` summary of each column is printed as well.
    """
    # The bin edges do not depend on the feature, so build them once.
    bins = np.arange(0.1, 0.2, 0.001)

    plt.figure()
    for row, name in enumerate(features):
        good_col = good[name]
        bad_col = bad[name]

        if verbose:
            print('good ================')
            print(good_col.describe())
            print('bad ================')
            print(bad_col.describe())

        plt.subplot(len(features), 2, 2 * row + 1)
        plt.hist(good_col.values, bins)
        plt.subplot(len(features), 2, 2 * row + 2)
        plt.hist(bad_col.values, bins)
        # title() applies to the current axes, i.e. the right-hand histogram.
        plt.title(name)
    plt.show()


def plot_feature_scatter(good, bad, x='4', y='6'):
    """Scatter column *y* against column *x*: bad rows red, good rows blue."""
    plt.figure()
    plt.plot(bad[x].values, bad[y].values, 'or')
    plt.plot(good[x].values, good[y].values, 'xb')
    plt.show()


def analyze_scores(path='dx.txt', verbose=False):
    """Load the score table at *path*, print summaries and show the plots.

    The file is read without a header row into the columns ``score``,
    ``1`` .. ``6`` and ``nan`` (the last one is presumably a filler for a
    trailing empty field -- confirm against the data file).

    With *verbose* set, extra per-feature statistics and the mean/std of the
    rows scoring above 900 and below 700 are printed.

    Returns the ``(good, bad)`` frames used for plotting.
    """
    features = [str(k) for k in range(1, 7)]
    df = pd.read_csv(path, header=None, names=['score'] + features + ['nan'])
    print(df.head())
    print(df.describe())

    # The score thresholds are the same for every plot; select rows once.
    good, bad = split_by_score(df)

    plot_feature_histograms(good, bad, features, verbose=verbose)
    plot_feature_scatter(good, bad)

    if verbose:
        top, bottom = split_by_score(df, good_above=900, bad_below=700)
        print('>900')
        print([top.mean(), top.std()])
        print('<700')
        print([bottom.mean(), bottom.std()])

    return good, bad


if __name__ == '__main__':
    analyze_scores()



0 0