TF Learn入门 —— 稍复杂使用举例

来源:互联网 发布:硕鼠有mac版吗 编辑:程序博客网 时间:2024/06/06 04:41
使用 TensorFlow 的 TF.Learn API 解决二元分类问题:根据人口普查中的个人信息(特征),包括年龄、性别、教育程度和职业等,预测该人的年收入是否超过 5 万美元(目标标签)。我们将训练一个 logistic regression(逻辑回归)模型,其输出值介于 0 和 1 之间,表示该人年收入超过 5 万美元的概率。

读取普查数据

下载数据

import tempfile
import urllib

# Download the Adult census training and test sets into temporary files.
# NamedTemporaryFile removes the file when the object is closed, so keep
# train_file / test_file alive for as long as the data is needed.
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()

# NOTE: urllib.urlretrieve is the Python 2 spelling; on Python 3 the same
# function lives at urllib.request.urlretrieve.
urllib.urlretrieve(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data',
    train_file.name)
urllib.urlretrieve(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test',
    test_file.name)

读入数据

import pandas as pd

# Column names for the header-less CSV files, in file order.
# 'hours_per_week' must match the spelling used in CONTINUOUS_COLUMNS,
# otherwise looking the column up later raises KeyError.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket',
]

# skipinitialspace strips the blank that follows each comma in the raw data.
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
# The first line of the test file is skipped (presumably a non-data header
# line in adult.test -- verify against the downloaded file).
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True,
                      skiprows=1)

构建标签列

# Name of the derived binary target column: 1 if income is above 50K, else 0.
LABEL_COLUMN = 'label'

# The dataset spells the positive class '>50K' (capital K; the test file
# uses '>50K.'), so the substring test must use an upper-case K -- a
# lower-case '>50k' never matches and would label every row 0.
df_train[LABEL_COLUMN] = (
    df_train['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
df_test[LABEL_COLUMN] = (
    df_test['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)

检查数据:区分类别型列与连续型列

# Columns whose values are discrete strings (fed to the model as sparse
# categorical features).
CATEGORICAL_COLUMNS = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'gender', 'native_country',
]

# Columns whose values are numeric (fed to the model as dense tensors).
CONTINUOUS_COLUMNS = [
    'age', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week',
]

将数据转化为Tensors

import tensorflow as tf


def input_fn(df):
    """Turn a census DataFrame into the (features, label) pair for TF.Learn.

    Args:
        df: DataFrame holding every column named in CONTINUOUS_COLUMNS and
            CATEGORICAL_COLUMNS plus the LABEL_COLUMN column.

    Returns:
        A tuple (feature_cols, label): feature_cols maps each column name
        to a constant tensor (continuous columns) or a SparseTensor
        (categorical columns); label is a constant tensor of the labels.
    """
    # Continuous columns become one dense constant tensor each.
    continuous_cols = {
        k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS
    }

    # Categorical columns become SparseTensors with one value per row at
    # position [row, 0].
    categorical_cols = {}
    for k in CATEGORICAL_COLUMNS:
        num_rows = df[k].size
        # Arguments are passed positionally (indices, values, shape): the
        # third keyword was renamed from `shape` to `dense_shape` across
        # TensorFlow releases, and the positional form works with both.
        categorical_cols[k] = tf.SparseTensor(
            [[i, 0] for i in range(num_rows)],
            df[k].values,
            [num_rows, 1])

    # Merge without `items() + items()`, which only works on Python 2.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)

    label = tf.constant(df[LABEL_COLUMN].values)
    return feature_cols, label


def train_input_fn():
    """Input function over the training DataFrame."""
    return input_fn(df_train)


def eval_input_fn():
    """Input function over the test DataFrame."""
    return input_fn(df_test)

根据模型选择和处理特征

# --- Base categorical features ---
# Use an explicit key list when every possible value is known up front.
gender = tf.contrib.layers.sparse_column_with_keys(
    column_name='gender', keys=['Female', 'Male'])
# Use a hash bucket when the set of possible values is not known up front.
education = tf.contrib.layers.sparse_column_with_hash_bucket(
    'education', hash_bucket_size=1000)
# The remaining categorical columns are referenced by the crossed columns
# below and by the model's feature_columns list, so they must be defined.
occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
    'occupation', hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
    'native_country', hash_bucket_size=1000)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
    'workclass', hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket(
    'marital_status', hash_bucket_size=100)
race = tf.contrib.layers.sparse_column_with_hash_bucket(
    'race', hash_bucket_size=100)

# --- Base continuous feature ---
age = tf.contrib.layers.real_valued_column('age')

# --- Bucketize the continuous feature ---
# Boundaries must be increasing; the original list had 4 where 40 belongs.
age_buckets = tf.contrib.layers.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# --- Crossed (combined) features ---
education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation], hash_bucket_size=int(1e4))
age_buckets_x_education_x_occupation = tf.contrib.layers.crossed_column(
    [age_buckets, education, occupation], hash_bucket_size=int(1e6))

定义模型

# Fresh temporary directory that the estimator is given as its model_dir.
model_dir = tempfile.mkdtemp()

# Linear (logistic-regression) classifier over the base, bucketized and
# crossed feature columns.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender, native_country, education, occupation, workclass,
        marital_status, race, age_buckets, education_x_occupation,
        age_buckets_x_education_x_occupation,
    ],
    model_dir=model_dir)

训练和评估模型

# Train for 200 steps on the training input function.
m.fit(input_fn=train_input_fn, steps=200)

# Evaluate once over the test input function.
results = m.evaluate(input_fn=eval_input_fn, steps=1)

# Print every reported metric in key order. The call form with a single
# pre-formatted string prints identically on Python 2 and is valid on
# Python 3.
for key in sorted(results):
    print('%s: %s' % (key, results[key]))

预防过度拟合

# Same linear classifier, now with an FTRL optimizer that applies both L1
# and L2 regularization to counter over-fitting.
regularized_optimizer = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=1.0,
    l2_regularization_strength=1.0)

m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        race,
        age_buckets,
        education_x_occupation,
        age_buckets_x_education_x_occupation,
    ],
    optimizer=regularized_optimizer,
    model_dir=model_dir)






原创粉丝点击