TensorFlow Exercise 2: Classifying Comments



This post is a follow-up to the previous one:

  1. It uses a larger dataset and shows how to handle data that cannot be loaded into memory all at once. If you have plenty of RAM, ignore this point.
  2. It covers saving and reusing the trained model.
  3. The model itself is unchanged: still a simple feedforward neural network (update: a CNN model has been added).
  4. If you want to run the code in this post, a GPU build or a powerful VPS is recommended; training on my small laptop was painfully slow.
  5. Follow-up exercises that work with Chinese text: "TensorFlow练习13: 制作一个简单的聊天机器人" (building a simple chatbot) and "TensorFlow练习7: 基于RNN生成古诗词" (generating classical Chinese poems with an RNN).

Before getting to the main content, here is a flowchart of the basic development workflow of a machine learning model:

[Flowchart: basic development workflow of a machine learning model]

The dataset

Dataset used: http://help.sentiment140.com/for-students/ (sentiment analysis)

The dataset contains 1.6 million tweets labeled as negative, neutral, or positive. I am not aware of a comparable ready-made Weibo dataset.

Data format: CSV files with emoticons removed, with the following fields:

  • 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
  • 1 – the id of the tweet (2087)
  • 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
  • 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
  • 4 – the user that tweeted (robotickilldozr)
  • 5 – the text of the tweet (Lyx is cool)

training.1600000.processed.noemoticon.csv(238M)
testdata.manual.2009.06.14.csv(74K)
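
As a quick sanity check before preprocessing (this snippet is not part of the original workflow), you can peek at the files with pandas; the column names below are just labels I chose for the six fields listed above:

import pandas as pd

# hypothetical column names for the six Sentiment140 fields described above
cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv('testdata.manual.2009.06.14.csv', names=cols, header=None, encoding='latin-1')

print(df['polarity'].value_counts())    # 0 = negative, 2 = neutral, 4 = positive
print(df[['polarity', 'text']].head())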

Data preprocessing

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import pickle
import numpy as np
import pandas as pd
from collections import OrderedDict

org_train_file = 'training.1600000.processed.noemoticon.csv'
org_test_file = 'testdata.manual.2009.06.14.csv'

# extract the useful fields from the original file
def usefull_filed(org_file, output_file):
    output = open(output_file, 'w')
    with open(org_file, buffering=10000, encoding='latin-1') as f:
        try:
            for line in f:   # "4","2193601966","Tue Jun 16 08:40:49 PDT 2009","NO_QUERY","AmandaMarie1028","Just woke up. Having no school is the best feeling ever "
                line = line.replace('"', '')
                clf = line.split(',')[0]   # 4
                if clf == '0':
                    clf = [0, 0, 1]   # negative
                elif clf == '2':
                    clf = [0, 1, 0]   # neutral
                elif clf == '4':
                    clf = [1, 0, 0]   # positive

                tweet = line.split(',')[-1]
                outputline = str(clf) + ':%:%:%:' + tweet
                output.write(outputline)  # [0, 0, 1]:%:%:%: that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
        except Exception as e:
            print(e)
    output.close()  # done; the processed training file is about 127.5 MB

usefull_filed(org_train_file, 'training.csv')
usefull_filed(org_test_file, 'tesing.csv')

# build the vocabulary (lexicon)
def create_lexicon(train_file):
    lex = []
    lemmatizer = WordNetLemmatizer()
    with open(train_file, buffering=10000, encoding='latin-1') as f:
        try:
            count_word = {}  # word frequency counts
            for line in f:
                tweet = line.split(':%:%:%:')[1]
                words = word_tokenize(tweet.lower())
                for word in words:
                    word = lemmatizer.lemmatize(word)
                    if word not in count_word:
                        count_word[word] = 1
                    else:
                        count_word[word] += 1

            count_word = OrderedDict(sorted(count_word.items(), key=lambda t: t[1]))
            for word in count_word:
                if count_word[word] < 100000 and count_word[word] > 100:  # filter out overly common and overly rare words
                    lex.append(word)
        except Exception as e:
            print(e)
    return lex

lex = create_lexicon('training.csv')

with open('lexcion.pickle', 'wb') as f:
    pickle.dump(lex, f)


"""
# convert each line (string) to a feature vector
def string_to_vector(input_file, output_file, lex):
    output_f = open(output_file, 'w')
    lemmatizer = WordNetLemmatizer()
    with open(input_file, buffering=10000, encoding='latin-1') as f:
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]

            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1  # a word may appear more than once in a sentence; += 1 would also work, it makes little difference
            features = list(features)
            output_f.write(str(label) + ":" + str(features) + '\n')
    output_f.close()


f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()

# The lexicon has about 112k entries, so training.vec would be roughly 112k * 1,600,000, about 170 GB.
# That is far too large, so feature vectors are built on the fly during training instead.
# string_to_vector('training.csv', 'training.vec', lex)
# string_to_vector('tesing.csv', 'tesing.vec', lex)
"""


The code above converts the raw data into training.csv and tesing.csv, which contain only the label and the tweet text. The lexcion.pickle file stores the vocabulary.
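
For reference, each line of training.csv can be split back into its one-hot label and tweet text using the ':%:%:%:' separator introduced above. A minimal sketch (eval is used here only because the preprocessing step wrote the label with str()):

import pickle

# load the saved vocabulary
with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)
print('lexicon size:', len(lex))

# parse the first line of training.csv back into label and tweet
with open('training.csv', encoding='latin-1') as f:
    line = f.readline()
label_str, tweet = line.split(':%:%:%:')
label = eval(label_str)          # e.g. [0, 0, 1] for a negative tweet
print(label, tweet.strip())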

If the data file is too large to load into memory at once, you can import the data into a database.
Dask can also handle large CSV files.
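
As an illustration of the Dask route (my own sketch, not code from the original post), dask.dataframe can read the 238 MB CSV in partitions and run simple aggregations without loading everything into memory; the column names are again hypothetical labels for the six fields:

import dask.dataframe as dd

cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = dd.read_csv('training.1600000.processed.noemoticon.csv',
                 names=cols, header=None, encoding='latin-1',
                 blocksize='25MB')   # read the file in roughly 25 MB partitions

# computations are lazy; compute() streams over the partitions
print(df.groupby('polarity').size().compute())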

Starting the long training run

import os
import random

import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()


def get_random_line(file, point):
    file.seek(point)
    file.readline()
    return file.readline()

# pick n random lines from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines


def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y

test_x, test_y = get_test_dataset('tesing.csv')


#######################################################################

n_input_layer = len(lex)   # input layer size

n_layer_1 = 2000   # hidden layer
n_layer_2 = 2000   # hidden layer ("hidden" sounds mysterious, but it is just any layer between the input and output layers)

n_output_layer = 3   # output layer (3 classes)


def neural_network(data):
    # weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)   # activation function
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)   # activation function
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output


X = tf.placeholder('float')
Y = tf.placeholder('float')
batch_size = 90

def train_neural_network(X, Y):
    predict = neural_network(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())  # tf.global_variables_initializer() in newer TF versions

        lemmatizer = WordNetLemmatizer()
        saver = tf.train.Saver()
        i = 0
        pre_accuracy = 0
        while True:   # train indefinitely
            batch_x = []
            batch_y = []

            # if a model.ckpt file already exists:
            #     saver.restore(session, 'model.ckpt')   # restore the saved session

            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1  # a word may appear more than once; += 1 would also work, it makes little difference
                    batch_x.append(list(features))
                    batch_y.append(eval(label))

                session.run([optimizer, cost_func], feed_dict={X: batch_x, Y: batch_y})
            except Exception as e:
                print(e)

            # evaluate accuracy on the test set every ~100 batches
            if i > 100:
                correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
                accuracy = accuracy.eval({X: test_x, Y: test_y})
                if accuracy > pre_accuracy:  # keep only the model with the best accuracy so far
                    print('accuracy: ', accuracy)
                    pre_accuracy = accuracy
                    saver.save(session, 'model.ckpt')   # save the session
                i = 0
            i += 1


train_neural_network(X, Y)



The program above uses about 600 MB of memory, peaking at around 1 GB.

Running it:

[Screenshot: output of the training run]

The trained model is saved as model.ckpt.

Using the trained model
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()

n_input_layer = len(lex)   # input layer size

n_layer_1 = 2000   # hidden layer
n_layer_2 = 2000   # hidden layer

n_output_layer = 3   # output layer

def neural_network(data):
    # weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w·x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)   # activation function
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)   # activation function
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output


X = tf.placeholder('float')

def prediction(tweet_text):
    predict = neural_network(X)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()
        saver.restore(session, 'model.ckpt')   # restore the trained weights

        lemmatizer = WordNetLemmatizer()
        words = word_tokenize(tweet_text.lower())
        words = [lemmatizer.lemmatize(word) for word in words]

        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1

        # print(predict.eval(feed_dict={X: [features]}))   # [[val1, val2, val3]]
        res = session.run(tf.argmax(predict.eval(feed_dict={X: [features]}), 1))
        return res


prediction("I am very happe")


The code above uses a simple feedforward model; below is a CNN model.

# https://github.com/Lab41/sunny-side-up
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()

def get_random_line(file, point):
    file.seek(point)
    file.readline()
    return file.readline()

# pick n random lines from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines

def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y

test_x, test_y = get_test_dataset('tesing.csv')

##############################################################################

input_size = len(lex)
num_classes = 3

X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])

dropout_keep_prob = tf.placeholder(tf.float32)

batch_size = 90

def neural_network():
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_size = 128
        W = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(W, X)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
    # convolution + maxpool layer
    num_filters = 128
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)   # concatenate along the channel axis (tf.concat(values, axis) since TF 1.0)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)
    return output

def train_neural_network():
    output = neural_network()

    optimizer = tf.train.AdamOptimizer(1e-3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        lemmatizer = WordNetLemmatizer()
        i = 0
        while True:
            batch_x = []
            batch_y = []

            # if a model.ckpt file already exists:
            #     saver.restore(sess, 'model.ckpt')   # restore the saved session

            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1  # a word may appear more than once; += 1 would also work, it makes little difference
                    batch_x.append(list(features))
                    batch_y.append(eval(label))

                _, loss_ = sess.run([train_op, loss], feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(loss_)
            except Exception as e:
                print(e)

            if i % 10 == 0:
                predictions = tf.argmax(output, 1)
                correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
                accur = sess.run(accuracy, feed_dict={X: test_x[0:50], Y: test_y[0:50], dropout_keep_prob: 1.0})
                print('accuracy:', accur)

            i += 1


train_neural_network()


With the CNN model, accuracy improves noticeably.
