【kaggle】Titanic
来源:互联网 发布:淘宝退货退款原因选项 编辑:程序博客网 时间:2024/04/27 09:51
数据集:train.csv
https://www.kaggle.com/c/titanic
Getting Started With Python:
# -*- coding: utf-8 -*-
######################
# by kevinelstri
# Dataset: train.csv
# Getting Started With Python
#
# Written for Python 3: print() calls, next(), text-mode CSV reading and the
# builtin float (the np.float alias no longer exists in current NumPy).
######################
import csv

import numpy as np

######################
# Titanic
######################

# ---------------------
# Read the CSV file with the csv module and turn it into a matrix
# ---------------------
# The with-block closes the file as soon as every row has been consumed;
# newline='' is the mode the csv module asks for when reading text files.
with open('../Data/train.csv', newline='', encoding='utf-8') as csv_file:
    csv_file_object = csv.reader(csv_file)
    header = next(csv_file_object)  # first row holds the column names
    print(header)
    data = list(csv_file_object)  # remaining rows, one list of strings each
data = np.array(data)  # 2-D array of strings (rows x columns)

# ----------------------
# Work with the array to get familiar with the indexing syntax
# ----------------------
print(data[0])
print(data[0, 3])  # row 0, column index 3
print(data[0::, 3])  # "0::" selects every row; values are still strings
print(data[0::, 2].astype(float))  # convert the string column to float

# Column index 1 is treated as the 0/1 survival flag: its size is the number
# of passengers and its sum is the number of survivors.
survived_flags = data[0::, 1].astype(float)
number_passengers = np.size(survived_flags)  # total number of passengers
number_survived = np.sum(survived_flags)  # number of survivors
proportion_survived = number_survived / number_passengers  # survival rate
print('number_passengers=', number_passengers)
print('number_survived=', number_survived)
print('proportion_survived=', proportion_survived)

# ----------------------
# Application: survival rate for women and for men
# ----------------------
# Boolean row masks built from column index 4 (compared against "female").
women_only_state = data[0::, 4] == "female"
men_only_state = data[0::, 4] != "female"
women_onboard = data[women_only_state, 1].astype(float)  # flags of the women
men_onboard = data[men_only_state, 1].astype(float)  # flags of the men
proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)
proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)
print('proportion_women_survived=', proportion_women_survived)
print('proportion_men_survived=', proportion_men_survived)
Getting Started With Pandas:
# -*- coding: utf-8 -*-
##############
# by kevinelstri
# Dataset: train.csv
# Getting Started With Pandas
#
# Written for Python 3: print() calls, next() and text-mode CSV reading.
##############
import csv

import numpy as np
import pandas as pd

# ----------------------
# Read the data with the csv module
# ----------------------
# The with-block closes the file as soon as every row has been consumed;
# newline='' is the mode the csv module asks for when reading text files.
with open('../Data/train.csv', newline='', encoding='utf-8') as csv_file:
    csv_file_object = csv.reader(csv_file)
    header = next(csv_file_object)  # first row holds the column names
    # print(header)
    data = list(csv_file_object)
data = np.array(data)
# print(data)

# ----------------------
# Read the data with pandas
# ----------------------
df = pd.read_csv('../Data/train.csv')
print(df.head(3))
print(df)
print(df.tail(3))
print(df.dtypes)
df.info()  # info() prints its report itself and returns None, so no print()
print(df.describe())

print(df['Age'][0:10])
print(type(df['Age']))
print(df['Age'].mean())

print(df[['Age', 'Sex', 'Pclass']][0:10])
print(df[df['Age'] > 50])
print(df[df['Age'] > 50][['Name', 'Sex', 'Age']])
print(df[df['Age'].isnull()][['Name', 'Sex', 'Age']])

# Combine boolean conditions with &: number of male passengers per class.
for i in range(1, 4):
    print(i, len(df[(df['Sex'] == 'male') & (df['Pclass'] == i)]))

# Optional histogram of the ages:
# import pylab as p
# df['Age'].hist()
# p.show()

# ----------------------
# Data cleaning
# ----------------------
df['Gender'] = 1  # add a Gender column filled with 1
print(df.head(3))
# Overwrite Gender with the upper-cased first letter of Sex.
df['Gender'] = df['Sex'].map(lambda x: x[0].upper())
print(df.head(3))
# Final encoding: female -> 0, male -> 1.
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
print(df[['Name', 'Age', 'Gender']][0:10])

# ----------------------
# Handle the missing Age values
# The average passenger age is 29.6991176, so missing ages could be set to
# something close to the average; the median might be a better choice.
# Build a reference table holding the median age per (Gender, Pclass) pair.
# ----------------------
median_ages = np.zeros((2, 3))  # 2 Gender values x 3 Pclass values
print(median_ages)
print('--------------------------------------------------------------------------------')

##############
# dropna() discards the rows where Age is NaN, which is equivalent to
# filtering with df['Age'].notnull(); median() is the middle value of the
# remaining ages.
##############
# Worked example for Gender == 1 and Pclass == 1. The results are stored
# under descriptive names so the builtins sum() and len() stay usable.
first_class_men = (df['Gender'] == 1) & (df['Pclass'] == 1)
known_ages = df[first_class_men & df['Age'].notnull()]['Age']
age_sum = df[first_class_men]['Age'].sum()  # Series.sum() skips NaN
age_count = len(known_ages)  # number of non-null ages in the group
print('sum=', age_sum)
print('len=', age_count)
print('avg=', age_sum / age_count)  # average age within the group
print(known_ages.median())  # same selection as one cell of the loop below

for i in range(0, 2):
    for j in range(0, 3):
        # Gender takes the values 0, 1 and Pclass the values 1, 2, 3.
        group = (df['Gender'] == i) & (df['Pclass'] == j + 1)
        median_ages[i, j] = df[group]['Age'].dropna().median()
print(median_ages)
print('---------------------------------------------------------------------------------')

####
# Create an AgeFill column as a copy of Age and fill in the missing values
####
df['AgeFill'] = df['Age']
print(df[['Name', 'Age', 'AgeFill']][0:20])
print()
# Rows whose Age is missing, before filling.
print(df[df['Age'].isnull()][['Name', 'Age', 'AgeFill']].head(10))
print()
for i in range(0, 2):
    for j in range(0, 3):
        # Write the group median into AgeFill wherever Age is missing.
        missing_in_group = (
            (df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j + 1)
        )
        df.loc[missing_in_group, 'AgeFill'] = median_ages[i, j]
print(df[df['Age'].isnull()][['Name', 'Age', 'AgeFill']].head(10))
print('---------------------------------------------------------------------------------')

# Keep a 0/1 marker of which ages were originally missing.
df['AgeIsNull'] = pd.isnull(df.Age).astype(int)
print(df[['Name', 'Age', 'AgeFill', 'AgeIsNull']].head(10))

# ----------------------
# Feature engineering
# ----------------------
df['FamilySize'] = df['SibSp'] + df['Parch']
print(df[['SibSp', 'Parch', 'FamilySize']])
df['Age*Class'] = df.AgeFill * df.Pclass
print(df[['SibSp', 'Parch', 'FamilySize', 'Age*Class', 'Survived']])
print(df[['SibSp', 'Parch', 'FamilySize', 'Age*Class', 'Survived', 'Sex']])

# Optional histogram of the new feature:
# import pylab as p
# df['Age*Class'].hist()
# p.show()

####
# Final preparation:
# (1) determine which remaining columns are not numeric
# (2) send the pandas.DataFrame back to a numpy.array
####
print(df.dtypes)
print()
print(df.dtypes[df.dtypes.map(lambda x: x == 'object')])

# Each step is assigned back to df: drop() and dropna() return new frames, so
# without the assignment the array built below would still contain the string
# columns, the Age column and its NaN values.
df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)
print(df)  # unused columns removed
print()
df = df.drop(['Age'], axis=1)
print(df)  # Age is superseded by AgeFill
print()
df = df.dropna()
print(df)  # rows that still contain a missing value removed

# Convert to a numpy array; pandas returns one through the .values attribute.
train_data = df.values
print(train_data)
0 0
- 【kaggle】Titanic
- Kaggle: Titanic
- kaggle:titanic
- kaggle-Titanic
- Titanic Kaggle 竞赛系列
- Kaggle之Titanic 沉没
- 分类-kaggle-titanic
- Kaggle入门 (Titanic XGBoost)
- kaggle——Titanic
- kaggle Titanic泰坦尼克
- Titanic Project Kaggle Competition
- Kaggle之Titanic
- Kaggle Titanic 竞赛
- Kaggle-titanic-v0
- 机器学习-Kaggle竞赛-Titanic
- 20151007kaggle Titanic心得.md
- Kaggle Titanic Competition-第一部分
- Kaggle Titanic Competition-第二部分
- 给Java新手的一些建议----Java知识点归纳(J2EE and Web 部分)
- Web安全防护基础篇:HTML Injection - Reflected (GET)
- linux命令
- 利用@media screen实现网页布局的自适应,@media screen and
- KMP模板
- 【kaggle】Titanic
- 直播平台
- intellij idea svn使用一 导入、更新、提交、解决冲突
- 我的PostgreSQL技术笔记
- ubuntu14.04的HADOOP安装,详细
- 问题9:查询优化器工作原理
- ITK&&VTK读取DICOM数据并渲染
- STL容器中list与迭代器iterator的模拟实现
- JRE和JDK相关