titanic+tensorflow
来源:互联网 发布:informix数据库操作 编辑:程序博客网 时间:2024/04/27 15:17
安装pandas sklearn http://blog.csdn.net/Yakumoyukarilan/article/details/51340358
linux delete 文件夹
sudo rm -rf /tmp/pip_build_root/pandas
pandas:Python Data Analysis Library 或 pandas 是基于NumPy的一种工具,该工具是为了解决数据分析任务而创建的。Pandas 纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。你很快就会发现,它是使Python成为强大而高效的数据分析环境的重要因素之一。
titanic实践参考:-----非常不错
http://blog.csdn.net/han_xiaoyang/article/details/49797143
书上的例子:
源码:
# -*- coding:utf-8 -*-
'''
Titanic survival prediction with a softmax (logistic-regression) classifier
built on TensorFlow 1.x.

Reads train.csv, trains on 80% of the rows with per-record SGD, reports
accuracy on the held-out 20%, then predicts on test.csv and writes a Kaggle
submission file.

Created on 2017-08-10
'''
import pandas as pd  # data analysis
import numpy as np

# read_csv returns a 2-D DataFrame.
data = pd.read_csv('train.csv')
data.info()  # print a summary of columns / dtypes / non-null counts

# Encode Sex as 1 (male) / 0 (female), then fill every remaining NaN with 0.
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)
data = data.fillna(0)

# Feature matrix: six numeric columns.
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
dataset_X = dataset_X.to_numpy()  # as_matrix() was removed in pandas 1.0

# Two-class one-hot labels: [Deceased, Survived].
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.to_numpy()

# Shuffle and split the labelled data: 80% train / 20% validation.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(
    dataset_X, dataset_Y, test_size=0.2, random_state=42)

# ---- Build the computation graph ----
import tensorflow as tf

# Placeholders; the leading None lets any number of records be fed at once.
X = tf.placeholder(tf.float32, shape=[None, 6])
Y = tf.placeholder(tf.float32, shape=[None, 2])

# Model parameters.
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
b = tf.Variable(tf.zeros([2]), name='bias')

# Forward pass: softmax over a single linear layer.
y_pred = tf.nn.softmax(tf.matmul(X, W) + b)

# Cross-entropy cost; the 1e-10 guards against log(0).
cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10),
                               reduction_indices=1)
cost = tf.reduce_mean(cross_entropy)

# Gradient descent; TF builds the backprop part of the graph automatically.
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

# ---- Training loop ----
with tf.Session() as sess:
    # initialize_all_variables() was removed in TF 1.x.
    sess.run(tf.global_variables_initializer())
    for epoch in range(10):
        total_loss = 0.
        # One record per step: stochastic gradient descent.
        for i in range(len(X_train)):
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))
    print('Training complete!')

    # Evaluate on the validation split.
    pred = sess.run(y_pred, feed_dict={X: X_validation})
    # argmax over axis 1 converts probabilities / one-hot rows to class ids.
    correct = np.equal(np.argmax(pred, 1), np.argmax(Y_validation, 1))
    # Mean of the 0/1 correctness flags is the accuracy.
    accuracy = np.mean(correct.astype(np.float32))
    print('Accuracy on validation set: %.9f' % accuracy)

    # Predict on the Kaggle test set and write the submission file.
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    predictions = np.argmax(sess.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions,
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)
提交kaggle的结果:
加入了:存储和加载模型参数
# -*- coding:utf-8 -*-
'''
Titanic softmax classifier (TensorFlow 1.x) with model checkpointing:
trains in one session, saves the parameters with tf.train.Saver, then
restores them in a second session to predict on test.csv.

Created on 2017-08-10
'''
import pandas as pd  # data analysis
import numpy as np

# read_csv returns a 2-D DataFrame.
data = pd.read_csv('train.csv')
data.info()  # print a summary of columns / dtypes / non-null counts

# Encode Sex as 1 (male) / 0 (female), then fill every remaining NaN with 0.
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)
data = data.fillna(0)

# Feature matrix: six numeric columns.
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
dataset_X = dataset_X.to_numpy()  # as_matrix() was removed in pandas 1.0

# Two-class one-hot labels: [Deceased, Survived].
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.to_numpy()

# Shuffle and split the labelled data: 80% train / 20% validation.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(
    dataset_X, dataset_Y, test_size=0.2, random_state=42)

# ---- Build the computation graph ----
import tensorflow as tf

# Placeholders; the leading None lets any number of records be fed at once.
X = tf.placeholder(tf.float32, shape=[None, 6])
Y = tf.placeholder(tf.float32, shape=[None, 2])

# Model parameters.
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
b = tf.Variable(tf.zeros([2]), name='bias')

# Saver captures the variables defined above for save/restore.
saver = tf.train.Saver()

# Forward pass: softmax over a single linear layer.
y_pred = tf.nn.softmax(tf.matmul(X, W) + b)

# Cross-entropy cost; the 1e-10 guards against log(0).
cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10),
                               reduction_indices=1)
cost = tf.reduce_mean(cross_entropy)

# Gradient descent; TF builds the backprop part of the graph automatically.
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

# ---- Training session ----
with tf.Session() as sess1:
    # initialize_all_variables() was removed in TF 1.x.
    sess1.run(tf.global_variables_initializer())
    for epoch in range(10):
        total_loss = 0.
        # One record per step: stochastic gradient descent.
        for i in range(len(X_train)):
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess1.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))
    print('Training complete!')

    # Evaluate on the validation split.
    pred = sess1.run(y_pred, feed_dict={X: X_validation})
    correct = np.equal(np.argmax(pred, 1), np.argmax(Y_validation, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print('Accuracy on validation set: %.9f' % accuracy)

    # Persist the trained parameters to disk.
    save_path = saver.save(sess1, "model.ckpt")

# ---- Prediction session: restore the checkpoint and predict ----
with tf.Session() as sess2:
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    # Reload the saved parameters before running the prediction op.
    saver.restore(sess2, "model.ckpt")
    predictions = np.argmax(sess2.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions,
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)
数据挖掘的技巧:
1.数据可视化
2. 特征工程
数据清理, 数据预处理, 特征选择, 多种算法模型
3.4 TensorBoard可视化
3.4.1 记录事件数据
------这一部分很有用,对于代码调试来说!!!!
http://blog.csdn.net/chendadayan/article/details/52919267---这个博客也重点参考,可显示tensorboard.
问题:namespace hierarchy: finding similar subgraphs...
1.记录事件数据 ---已完成
2.启动tensorBoard服务----没问题---图片也已经看到了
从控制台中,应该能够看到一些日志信息打印出来,然后是消息“Starting TensorBoard on port 6006”。刚才所做的是启动一个使用来自“my_graph”目录下的数据的TensorBoard服务器。默认情况下,TensorBoard服务器启动后会自动监听端口6006—要访问TensorBoard,可打开浏览器并在地址栏输入http://localhost:6006,然后将看到一个橙白主题的欢迎页面:
请不要为警告消息“No scalar data was found”紧张,这仅仅表示我们尚未为TensorBoard保存任何概括统计量,从而使其无法正常显示。通常,这个页面会显示利用SummaryWriter对象要求TensorFlow所保存的信息。由于尚未保存任何其他统计量,所以无内容可供显示。尽管如此,这并不妨碍我们欣赏自己定义的美丽的数据流图。单击页面顶部的“Graph”链接,将看到类似下图的页面:
jikexueyuan 教程:http://wiki.jikexueyuan.com/project/tensorflow-zh/how_tos/summaries_and_tensorboard.html#tensorboard--logdir=/Users/jackyue/data/tf/第1课/graphs--port=7701
用这种命令,不要用书上的
3.5 数据读取
3.5.1 数据文件格式
3.6 SkFlow TFlearn TF-Slim
加入Tensorboard部分后的代码:
'''
Titanic softmax classifier (TensorFlow 1.x) with TensorBoard logging:
histogram summaries for the parameters, scalar summaries for cost and
validation accuracy, written to /tmp/log for `tensorboard --logdir=/tmp/log`.
Also imputes missing ages with the column mean instead of 0.
'''
import pandas as pd  # data analysis
import numpy as np

# read_csv returns a 2-D DataFrame.
data = pd.read_csv('train.csv')
data.info()  # print a summary of columns / dtypes / non-null counts

# Encode Sex as 1 (male) / 0 (female).
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)

# Impute missing ages with the column mean (about 29.69) instead of 0.
mean_age = data['Age'].mean()
# .loc avoids pandas' chained-assignment trap (data['Age'][mask] = ...),
# which may silently write to a temporary copy instead of `data`.
data.loc[data['Age'].isnull(), 'Age'] = mean_age
data = data.fillna(0)

# Feature matrix: six numeric columns.
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
dataset_X = dataset_X.to_numpy()  # as_matrix() was removed in pandas 1.0

# Two-class one-hot labels: [Deceased, Survived].
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.to_numpy()

# Shuffle and split the labelled data: 80% train / 20% validation.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(
    dataset_X, dataset_Y, test_size=0.2, random_state=42)

# ---- Build the computation graph, grouped into name scopes for the
# TensorBoard graph view ----
import tensorflow as tf

with tf.name_scope('input'):
    # Placeholders; the leading None lets any batch size be fed.
    X = tf.placeholder(tf.float32, shape=[None, 6])
    Y = tf.placeholder(tf.float32, shape=[None, 2])

with tf.name_scope('classifier'):
    W = tf.Variable(tf.random_normal([6, 2]), name='weights')
    b = tf.Variable(tf.zeros([2]), name='bias')
    # Saver captures the variables defined above for save/restore.
    saver = tf.train.Saver()
    # Forward pass: softmax over a single linear layer.
    y_pred = tf.nn.softmax(tf.matmul(X, W) + b)
    # Histogram summaries for the parameters; the pre-1.0
    # tf.histogram_summary() was removed in TF 1.0.
    tf.summary.histogram('weights', W)
    tf.summary.histogram('bias', b)

with tf.name_scope('cost'):
    # Cross-entropy cost; the 1e-10 guards against log(0).
    cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10),
                                   reduction_indices=1)
    cost = tf.reduce_mean(cross_entropy)
    tf.summary.scalar('cost', cost)  # tf.scalar_summary() removed in TF 1.0

with tf.name_scope('accuracy'):
    # In-graph accuracy so it can be logged as a scalar summary.
    correct = tf.equal(tf.argmax(Y, 1), tf.argmax(y_pred, 1))
    acc_op = tf.reduce_mean(tf.cast(correct, tf.float32))
    tf.summary.scalar('accuracy', acc_op)

# Gradient descent; TF builds the backprop part of the graph automatically.
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

# ---- Training session ----
with tf.Session() as sess1:
    # Event-file writer; view with `tensorboard --logdir=/tmp/log`.
    # tf.merge_all_summaries() / tf.train.SummaryWriter() were removed
    # in TF 1.0 in favour of the tf.summary.* API.
    sum_ops = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter('/tmp/log', sess1.graph)
    sess1.run(tf.global_variables_initializer())
    for epoch in range(10):
        total_loss = 0.
        # One record per step: stochastic gradient descent.
        for i in range(len(X_train)):
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess1.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))
        # One summary record per epoch, evaluated on the validation set
        # (add_summary is keyed on the epoch counter, so per-epoch logging
        # is assumed here — the scraped original lost its indentation).
        summary, accuracy = sess1.run(
            [sum_ops, acc_op],
            feed_dict={X: X_validation, Y: Y_validation})
        summary_writer.add_summary(summary, epoch)
        print('Accuracy on validation set: %.9f' % accuracy)
    # Persist the trained parameters to disk.
    save_path = saver.save(sess1, "model.ckpt")
    print('Training complete!')

# ---- Prediction session: restore the checkpoint and predict ----
with tf.Session() as sess2:
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    # Reload the saved parameters before running the prediction op.
    saver.restore(sess2, "model.ckpt")
    predictions = np.argmax(sess2.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions,
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)
- titanic+tensorflow
- TensorFlow实现Titanic比赛
- Kaggle入门 (Titanic TensorFlow Softmax)
- TensorFlow学习笔记二Titanic题目实战
- Titanic
- POJ2354-Titanic
- titanic prediction
- 【kaggle】Titanic
- Kaggle: Titanic
- kaggle:titanic
- kaggle-Titanic
- the lose of Titanic
- 重看《Titanic》
- POJ 2354 Titanic
- poj 2354 Titanic
- Ural 1030. Titanic
- Titanic : ML from Disaster
- POJ 2354 Titanic
- 递归解决八皇后问题-小昝
- eclipse错误: 找不到或无法加载主类 com.lan.generics.GenericTest3
- Hadoop集群搭建
- 跳棋
- 二叉树前、中、后非递归遍历
- titanic+tensorflow
- 007
- 单片机常用名词解释
- Hadoop应用
- hdu 1757 A Simple Math Problem
- java实现DES的加密解密
- 职场昏暗,我心依旧---众智云
- LINUX中SVN冲突解决办法
- 【分布式事务】概述