Titanic + TensorFlow


Installing pandas and sklearn: http://blog.csdn.net/Yakumoyukarilan/article/details/51340358

Deleting a directory on Linux:

sudo rm -rf /tmp/pip_build_root/pandas

pandas (Python Data Analysis Library) is a tool built on top of NumPy, created to solve data analysis tasks. pandas pulls in a large number of libraries and some standard data models, and provides the tools needed to operate efficiently on large datasets. It offers a wealth of functions and methods that let us process data quickly and conveniently; you will soon find it is one of the key reasons Python is such a powerful and efficient data analysis environment.
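
As a quick taste, here is a minimal sketch of the pandas idioms used throughout this post (the tiny hand-made frame is illustrative; the real code below works on the Kaggle Titanic CSV):

import pandas as pd

# a tiny frame with a missing value in each column
df = pd.DataFrame({'Sex': ['male', 'female', None], 'Age': [22.0, None, 30.0]})
df['Sex'] = df['Sex'].apply(lambda s: 1 if s == 'male' else 0)  # encode text as 0/1
df = df.fillna(0)   # replace missing values with 0
print(df.values)    # the underlying NumPy array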


Titanic practice reference ----- very good:

http://blog.csdn.net/han_xiaoyang/article/details/49797143


Example from the book:

Source code:


# -*- coding:utf-8 -*-
'''
Created on 2017-08-10

@author: 
'''
import pandas as pd  # data analysis
data = pd.read_csv('train.csv')  # the result is a DataFrame object
# a DataFrame is a two-dimensional data structure
data.info()  # inspect the data
# select a subset of feature columns for classification, and fill all missing fields with 0
# encode the Sex field numerically
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)
data = data.fillna(0)
dataset_X = data[['Sex','Age','Pclass','SibSp','Parch','Fare']]
dataset_X = dataset_X.values  # convert to an ndarray (as_matrix() is deprecated)
# print(dataset_X)
# the two classes are survived and deceased, i.e. 'Survived' and 'Deceased'
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased','Survived']]
dataset_Y = dataset_Y.values
# print(dataset_Y)
# scikit-learn provides train_test_split, which shuffles the dataset and splits it by ratio
from sklearn.model_selection import train_test_split
# split the labelled data into a training set and a validation set; the validation set is 20%
X_train, X_validation, Y_train, Y_validation = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=42)
# print(Y_validation)

# next, build the computation graph with TensorFlow
# declare placeholders for the input data
# the first element of shape is None, meaning any number of records can be fed at once
import tensorflow as tf
X = tf.placeholder(tf.float32, shape=[None, 6])
Y = tf.placeholder(tf.float32, shape=[None, 2])

# declare the parameter variables
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
b = tf.Variable(tf.zeros([2]), name='bias')

# build the forward-propagation part of the graph
y_pred = tf.nn.softmax(tf.matmul(X, W) + b)

# declare the cost function: cross-entropy, with a small epsilon for numerical stability
cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10), reduction_indices=1)
cost = tf.reduce_mean(cross_entropy)

# minimize the cost with gradient descent; TensorFlow builds the backward pass automatically
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
# graph construction is complete

# build the training loop
with tf.Session() as sess:
    # initialize all variables
    tf.initialize_all_variables().run()

    # training loop
    for epoch in range(10):
        total_loss = 0.
        for i in range(len(X_train)):
            # prepare feed data and run; feeding one record at a time amounts to stochastic gradient descent
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        # display loss per epoch
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))
    print('Training complete!')
    
    import numpy as np
    # evaluate the model on the validation set
    pred = sess.run(y_pred, feed_dict={X: X_validation})
    # argmax finds the index of the maximum along axis 1
    correct = np.equal(np.argmax(pred, 1), np.argmax(Y_validation, 1))
    # astype casts the booleans to 1.0/0.0, so their mean is the fraction of correct predictions
    accuracy = np.mean(correct.astype(np.float32))
    print('Accuracy on validation set: %.9f' % accuracy)

    # predict on the test data
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    predictions = np.argmax(sess.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)


The Kaggle submission result:


Added: saving and loading model parameters.

# -*- coding:utf-8 -*-
'''
Created on 2017-08-10

@author: 
'''
import pandas as pd  # data analysis
data = pd.read_csv('train.csv')  # the result is a DataFrame object
# a DataFrame is a two-dimensional data structure
data.info()  # inspect the data
# select a subset of feature columns for classification, and fill all missing fields with 0
# encode the Sex field numerically
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)
data = data.fillna(0)
dataset_X = data[['Sex','Age','Pclass','SibSp','Parch','Fare']]
dataset_X = dataset_X.values  # convert to an ndarray (as_matrix() is deprecated)
# print(dataset_X)
# the two classes are survived and deceased, i.e. 'Survived' and 'Deceased'
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased','Survived']]
dataset_Y = dataset_Y.values
# print(dataset_Y)
# scikit-learn provides train_test_split, which shuffles the dataset and splits it by ratio
from sklearn.model_selection import train_test_split
# split the labelled data into a training set and a validation set; the validation set is 20%
X_train, X_validation, Y_train, Y_validation = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=42)
# print(Y_validation)

# next, build the computation graph with TensorFlow
# declare placeholders for the input data
# the first element of shape is None, meaning any number of records can be fed at once
import tensorflow as tf
X = tf.placeholder(tf.float32, shape=[None, 6])
Y = tf.placeholder(tf.float32, shape=[None, 2])

# declare the parameter variables
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
b = tf.Variable(tf.zeros([2]), name='bias')

# use a Saver to save and restore the model
saver = tf.train.Saver()

# build the forward-propagation part of the graph
y_pred = tf.nn.softmax(tf.matmul(X, W) + b)

# declare the cost function: cross-entropy, with a small epsilon for numerical stability
cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10), reduction_indices=1)
cost = tf.reduce_mean(cross_entropy)

# minimize the cost with gradient descent; TensorFlow builds the backward pass automatically
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
# graph construction is complete

# build the training loop
with tf.Session() as sess1:
    # initialize all variables
    tf.initialize_all_variables().run()

    # training loop
    for epoch in range(10):
        total_loss = 0.
        for i in range(len(X_train)):
            # prepare feed data and run; feeding one record at a time amounts to stochastic gradient descent
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess1.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        # display loss per epoch
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))
    print('Training complete!')
     
    import numpy as np
    # evaluate the model on the validation set
    pred = sess1.run(y_pred, feed_dict={X: X_validation})
    # argmax finds the index of the maximum along axis 1
    correct = np.equal(np.argmax(pred, 1), np.argmax(Y_validation, 1))
    # astype casts the booleans to 1.0/0.0, so their mean is the fraction of correct predictions
    accuracy = np.mean(correct.astype(np.float32))
    print('Accuracy on validation set: %.9f' % accuracy)
    # save the trained parameters to a checkpoint file
    save_path = saver.save(sess1, "model.ckpt")

with tf.Session() as sess2:
    # predict on the test data
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    saver.restore(sess2, "model.ckpt")
    predictions = np.argmax(sess2.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)


Data mining tips

1. Data visualization

2. Feature engineering

Data cleaning, data preprocessing, feature selection, and trying multiple algorithm models (see the sketch below).
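
For example, a minimal feature-engineering sketch on the Titanic data (these particular choices, mean-imputing Age and one-hot encoding Embarked, are illustrative, not a recipe from the book):

import pandas as pd

data = pd.read_csv('train.csv')
# mean-impute Age instead of filling it with 0
data.loc[data['Age'].isnull(), 'Age'] = data['Age'].mean()
# one-hot encode the port of embarkation into extra feature columns
embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')
data = pd.concat([data, embarked], axis=1)
print(data.filter(like='Embarked_').head())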


3.4 TensorBoard Visualization

3.4.1 Recording Event Data

------This part is very useful for debugging code!

http://blog.csdn.net/chendadayan/article/details/52919267 --- this blog is also a key reference; it shows TensorBoard being displayed.

Issue: "namespace hierarchy: finding similar subgraphs..."

1. Recording event data --- done


2. Starting the TensorBoard server --- no problems; the graph image is visible too


In the console you should see some log messages printed, followed by the message "Starting TensorBoard on port 6006". What we just did was start a TensorBoard server that uses the data from the "my_graph" directory. By default the TensorBoard server listens on port 6006; to access TensorBoard, open a browser and enter http://localhost:6006 in the address bar, and you will see an orange-and-white themed welcome page:

Don't be alarmed by the warning "No scalar data was found"; it simply means we have not yet saved any summary statistics for TensorBoard, so it has nothing to display. Normally this page shows the information we asked TensorFlow to save via a SummaryWriter object. Since no other statistics have been saved, there is nothing to show. That doesn't stop us from admiring the beautiful data-flow graph we defined: click the "Graph" link at the top of the page and you will see a page similar to the figure below:
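
For reference, a minimal event-recording sketch (using the same pre-1.0 tf.train.SummaryWriter API as the code further down; the toy constants are illustrative) that writes a graph into "my_graph" for TensorBoard to render:

import tensorflow as tf

a = tf.constant(5, name='input_a')
b = tf.constant(3, name='input_b')
c = tf.mul(a, b, name='mul_c')  # tf.mul in TF 0.x; tf.multiply from TF 1.0 on

with tf.Session() as sess:
    # writing the session's graph is all the Graph tab needs
    writer = tf.train.SummaryWriter('./my_graph', sess.graph)
    print(sess.run(c))
    writer.close()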

jikexueyuan tutorial: http://wiki.jikexueyuan.com/project/tensorflow-zh/how_tos/summaries_and_tensorboard.html


# tensorboard --logdir=/Users/jackyue/data/tf/第1课/graphs --port=7701

Use this form of the command, not the one in the book.



3.5 Data Reading

3.5.1 Data File Formats
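
The notes for this section are bare, so here is only a minimal sketch of TensorFlow's queue-based CSV reading (the pre-tf.data reader API; it assumes a simple two-column numeric file, not the Titanic CSV itself):

import tensorflow as tf

# a queue of input file names; cycles over the list by default
filename_queue = tf.train.string_input_producer(['data.csv'])
reader = tf.TextLineReader(skip_header_lines=1)
_, line = reader.read(filename_queue)
# one default per column fixes each column's dtype (two floats here)
col1, col2 = tf.decode_csv(line, record_defaults=[[0.0], [0.0]])

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run([col1, col2]))  # one parsed record
    coord.request_stop()
    coord.join(threads)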


3.6 SkFlow, TFLearn, TF-Slim
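
These are higher-level front ends built on TensorFlow. As one illustration, a minimal sketch of the same softmax classifier in TFLearn (assuming TFLearn on TF 1.x, with X_train/Y_train prepared as above):

import tflearn

net = tflearn.input_data(shape=[None, 6])                    # 6 input features
net = tflearn.fully_connected(net, 2, activation='softmax')  # softmax over 2 classes
net = tflearn.regression(net, optimizer='sgd', learning_rate=0.001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X_train, Y_train, n_epoch=10,
          validation_set=(X_validation, Y_validation))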


Code after adding the TensorBoard part:

import pandas as pd  # data analysis
import numpy as np
data = pd.read_csv('train.csv')  # the result is a DataFrame object
# a DataFrame is a two-dimensional data structure
data.info()  # inspect the data
# select a subset of feature columns for classification, and fill all missing fields with 0
# encode the Sex field numerically
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)
# fill missing ages with the column mean
mean_age = data['Age'].mean()  # 29.69
# print(mean_age)
data.loc[data['Age'].isnull(), 'Age'] = mean_age  # .loc avoids the chained-assignment pitfall
data = data.fillna(0)
dataset_X = data[['Sex','Age','Pclass','SibSp','Parch','Fare']]
dataset_X = dataset_X.values  # convert to an ndarray (as_matrix() is deprecated)
# print(dataset_X)
# the two classes are survived and deceased, i.e. 'Survived' and 'Deceased'
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased','Survived']]
dataset_Y = dataset_Y.values
# print(dataset_Y)
# scikit-learn provides train_test_split, which shuffles the dataset and splits it by ratio
from sklearn.model_selection import train_test_split
# split the labelled data into a training set and a validation set; the validation set is 20%
X_train, X_validation, Y_train, Y_validation = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=42)
# print(Y_validation)

# next, build the computation graph with TensorFlow
# declare placeholders for the input data
# the first element of shape is None, meaning any number of records can be fed at once
import tensorflow as tf
with tf.name_scope('input'):
    X = tf.placeholder(tf.float32, shape=[None, 6])
    Y = tf.placeholder(tf.float32, shape=[None, 2])

with tf.name_scope('classifier'):
    # declare the parameter variables
    W = tf.Variable(tf.random_normal([6, 2]), name='weights')
    b = tf.Variable(tf.zeros([2]), name='bias')
    # use a Saver to save and restore the model
    saver = tf.train.Saver()

    # build the forward-propagation part of the graph
    y_pred = tf.nn.softmax(tf.matmul(X, W) + b)
    # add histogram summaries for the weights, viewable in TensorBoard
    tf.histogram_summary('weights', W)
    tf.histogram_summary('bias', b)
    
with tf.name_scope('cost'):
    # declare the cost function: cross-entropy, with a small epsilon for numerical stability
    cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10), reduction_indices=1)
    cost = tf.reduce_mean(cross_entropy)
    tf.scalar_summary('cost', cost)

# accuracy on the validation set
with tf.name_scope('accuracy'):
    correct = tf.equal(tf.argmax(Y, 1), tf.argmax(y_pred, 1))
    acc_op = tf.reduce_mean(tf.cast(correct, tf.float32))
    tf.scalar_summary('accuracy', acc_op)

# minimize the cost with gradient descent; TensorFlow builds the backward pass automatically
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
# graph construction is complete

# build the training loop
with tf.Session() as sess1:
    # create a log writer; run 'tensorboard --logdir=/tmp/log'
    sum_ops = tf.merge_all_summaries()
    summary_writer = tf.train.SummaryWriter('/tmp/log', sess1.graph)
    # initialize all variables
    tf.initialize_all_variables().run()
     
    # training loop
    for epoch in range(10):
        total_loss = 0.
        for i in range(len(X_train)):
            # prepare feed data and run; feeding one record at a time amounts to stochastic gradient descent
            feed_dict = {X: [X_train[i]], Y: [Y_train[i]]}
            _, loss = sess1.run([train_op, cost], feed_dict=feed_dict)
            total_loss += loss
        # display loss per epoch
        print('Epoch: %04d, total loss = %-9f' % (epoch + 1, total_loss))

        # once per epoch: evaluate on the validation set, write the summaries, and checkpoint the model
        summary, accuracy = sess1.run([sum_ops, acc_op], feed_dict={X: X_validation, Y: Y_validation})
        summary_writer.add_summary(summary, epoch)
        print('Accuracy on validation set: %.9f' % accuracy)
        save_path = saver.save(sess1, "model.ckpt")
    print('Training complete!')

with tf.Session() as sess2:
    # predict on the test data
    testdata = pd.read_csv('test.csv')
    testdata = testdata.fillna(0)
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    saver.restore(sess2, "model.ckpt")
    predictions = np.argmax(sess2.run(y_pred, feed_dict={X: X_test}), 1)
    print(predictions)
    submission = pd.DataFrame({
        'PassengerId': testdata['PassengerId'],
        'Survived': predictions
    })
    submission.to_csv('Titanic-submission-miao.csv', index=False)