Combining CNN and RNN for Text Classification


Main code reference:

  1. https://github.com/jiegzhan/multi-class-text-classification-cnn-rnn

The main pipeline for combining a CNN and an RNN for text classification is:

  1. data ---> batch iter ---> cnn input ---> embedding ---> convolution ---> pooling ---> rnn input ---> lstm cell ---> softmax


An earlier post already covered how to turn the text data into a batch iterator (a minimal sketch of such an iterator is included right below for reference). The rest of this post lists the CNN-RNN text classification code:
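The batch_iter helper imported by the run script further down is not reproduced in this post. The following is only a rough sketch of what such an iterator typically looks like, inferred from how it is called; the shuffling behaviour and the full-batch restriction are assumptions, not the original implementation:

    import random

    def batch_iter(data, batch_size, num_epochs, shuffle=True):
        """Yield mini-batches of (x, y) pairs for the given number of epochs."""
        data = list(data)
        # the model keeps a fixed-size RNN initial state (config.batch_size),
        # so only full batches are yielded here
        num_batches = len(data) // batch_size
        for _ in range(num_epochs):
            if shuffle:
                random.shuffle(data)  # reshuffle at the start of every epoch
            for batch_num in range(num_batches):
                start = batch_num * batch_size
                yield data[start:start + batch_size]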



Basic configuration:

    class TCNNRNNConfig(object):
        # Model parameters
        embedding_dim = 64          # word embedding dimension
        seq_length = 300            # sequence length
        num_classes = 2             # number of classes
        num_filters = 256           # number of convolution filters
        kernel_size = 5             # convolution kernel size
        vocab_size = 130000         # vocabulary size
        max_pool_size = 4           # max-pooling window size
        hidden_dim = 128            # fully connected layer units
        dropout_keep_prob = 0.8     # dropout keep probability
        learning_rate = 1e-3        # learning rate
        hidden_unit = 256           # number of LSTM/GRU units
        batch_size = 128            # batch size
        num_epochs = 20             # total number of epochs
        print_per_batch = 100       # print a result every N batches
        multi_kernel_size = '3,4,5' # parallel convolution kernel sizes
        l2_reg_lambda = 0.0         # L2 regularization weight
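To see how these numbers fit together in the pipeline sketched above, the snippet below (an illustration only, not part of the project) traces the per-example tensor shapes that the model code in the next section produces:

    import numpy as np

    # Shapes implied by the configuration (per example, channel dimension omitted):
    #   embedding lookup:        seq_length x embedding_dim  = 300 x 64
    #   conv, each filter size:  seq_length x num_filters    = 300 x 256 (zero-padded, 'VALID' conv)
    #   max-pool (stride 4):     reduced x num_filters       = 75 x 256
    #   concat over '3,4,5':     reduced x 3*num_filters     = 75 x 768
    #   RNN: `reduced` time steps, each a 768-dim vector, fed to a GRU with hidden_unit = 256
    reduced = int(np.ceil(300 * 1.0 / 4))           # seq_length / max_pool_size -> 75 time steps
    rnn_input_dim = len('3,4,5'.split(',')) * 256   # number of filter sizes * num_filters -> 768
    print(reduced, rnn_input_dim)                   # 75 768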

Model code:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import tensorflow as tf
    import numpy as np


    class TextCnnRnn(object):
        def __init__(self, config):
            self.config = config
            self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name="input_x")
            self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name="input_y")
            self.keep_prob = tf.placeholder(tf.float32, None, name='keep_prob')
            # zero padding fed around the embedded sequence before the convolutions
            self.pad = tf.placeholder(tf.float32, [None, 1, self.config.embedding_dim, 1], name='pad')
            self.l2_loss = tf.constant(0.0)
            # real (unpadded) length of each sequence, already divided by max_pool_size
            self.real_len = tf.placeholder(tf.int32, [None], name='real_len')
            self.filter_sizes = list(map(int, self.config.multi_kernel_size.split(",")))
            self.cnnrnn()

        def input_embedding(self):
            """Word embedding"""
            with tf.device('/cpu:0'):
                embedding = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
                _input = tf.nn.embedding_lookup(embedding, self.input_x)
                _input_expanded = tf.expand_dims(_input, -1)
            return _input_expanded

        def cnnrnn(self):
            emb = self.input_embedding()
            pooled_concat = []
            reduced = np.int32(np.ceil(self.config.seq_length * 1.0 / self.config.max_pool_size))
            for i, filter_size in enumerate(self.filter_sizes):
                with tf.name_scope('conv-maxpool-%s' % filter_size):
                    # Zero paddings so that the convolution output has dimension
                    # batch x sequence_length x emb_size x channel
                    num_prio = (filter_size - 1) // 2
                    num_post = (filter_size - 1) - num_prio
                    pad_prio = tf.concat([self.pad] * num_prio, 1)
                    pad_post = tf.concat([self.pad] * num_post, 1)
                    emb_pad = tf.concat([pad_prio, emb, pad_post], 1)
                    filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
                    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                    b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name='b')
                    conv = tf.nn.conv2d(emb_pad, W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                    # Max-pooling over the outputs
                    pooled = tf.nn.max_pool(h, ksize=[1, self.config.max_pool_size, 1, 1],
                                            strides=[1, self.config.max_pool_size, 1, 1],
                                            padding='SAME', name='pool')
                    pooled = tf.reshape(pooled, [-1, reduced, self.config.num_filters])
                    pooled_concat.append(pooled)

            pooled_concat = tf.concat(pooled_concat, 2)
            pooled_concat = tf.nn.dropout(pooled_concat, self.keep_prob)

            # an LSTM cell could be used instead:
            # lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.config.hidden_unit)
            lstm_cell = tf.contrib.rnn.GRUCell(num_units=self.config.hidden_unit)
            lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)
            self._initial_state = lstm_cell.zero_state(self.config.batch_size, tf.float32)
            # split the pooled features into `reduced` time steps for the RNN
            inputs = [tf.squeeze(input_, [1])
                      for input_ in tf.split(pooled_concat, num_or_size_splits=int(reduced), axis=1)]
            outputs, state = tf.nn.static_rnn(lstm_cell, inputs, initial_state=self._initial_state,
                                              sequence_length=self.real_len)

            # Collect the appropriate last output per example into `output`
            # (dimension = batch x hidden_unit)
            output = outputs[0]
            with tf.variable_scope('Output'):
                tf.get_variable_scope().reuse_variables()
                one = tf.ones([1, self.config.hidden_unit], tf.float32)
                for i in range(1, len(outputs)):
                    ind = self.real_len < (i + 1)
                    ind = tf.to_float(ind)
                    ind = tf.expand_dims(ind, -1)
                    mat = tf.matmul(ind, one)
                    output = tf.add(tf.multiply(output, mat), tf.multiply(outputs[i], 1.0 - mat))

            with tf.name_scope('score'):
                self.W = tf.Variable(tf.truncated_normal([self.config.hidden_unit, self.config.num_classes],
                                                         stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name='b')
                self.l2_loss += tf.nn.l2_loss(self.W)
                self.l2_loss += tf.nn.l2_loss(b)
                self.scores = tf.nn.xw_plus_b(output, self.W, b, name='scores')
                self.pred_y = tf.nn.softmax(self.scores, name="pred_y")
                tf.add_to_collection('pred_network', self.pred_y)
                self.predictions = tf.argmax(self.scores, 1, name='predictions')

            with tf.name_scope('loss'):
                losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                                 logits=self.scores)  # only named arguments accepted
                self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda * self.l2_loss

            with tf.name_scope("optimize"):
                # optimizer
                optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
                self.optim = optimizer.minimize(self.loss)

            with tf.name_scope('accuracy'):
                correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
                self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')

            with tf.name_scope('num_correct'):
                correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
                self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))
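The loop inside the 'Output' variable scope is the easiest part of the model to misread: static_rnn returns one output per time step, and the loop uses real_len to keep, for each example, the output of its last valid step. A tiny NumPy-only illustration of the same masking arithmetic (dummy values, not part of the model):

    import numpy as np

    # 4 time steps, batch of 2, hidden size 3; outputs[t] is filled with the step index t
    outputs = [np.full((2, 3), t, dtype=np.float32) for t in range(4)]
    real_len = np.array([2, 4])  # first example has 2 valid steps, second has 4
    output = outputs[0]
    for i in range(1, len(outputs)):
        # ind becomes 1.0 once step i + 1 is past the example's real length, so from then on
        # `output` keeps its current value instead of being overwritten by outputs[i]
        ind = (real_len < (i + 1)).astype(np.float32)[:, None]
        output = output * ind + outputs[i] * (1.0 - ind)
    print(output)  # row 0 -> all 1.0 (last valid step for length 2), row 1 -> all 3.0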


Run code:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from cnn_rnn_model import TextCnnRnn
    from configuration import TCNNRNNConfig
    from data_utils_cut import preocess_file, batch_iter
    import time
    import tensorflow as tf
    import os
    import numpy as np
    from datetime import timedelta

    trainpath = "/Users/shuubiasahi/Desktop/tensorflow/adx/"


    def run_epoch(cnnrnnmodel=True):
        # load the data
        print('Loading data...')
        start_time = time.time()
        x_train, y_train, words = preocess_file(data_path=trainpath + "cnn.txt")
        if cnnrnnmodel:
            print('Using CNNRNN model...')
            config = TCNNRNNConfig()
            config.vocab_size = len(words)
            print("vocab_size is:", config.vocab_size)
            model = TextCnnRnn(config)
        tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/boardlog'

        end_time = time.time()
        time_dif = end_time - start_time
        time_dif = timedelta(seconds=int(round(time_dif)))
        print('Time usage:', time_dif)

        print('Constructing TensorFlow Graph...')
        session = tf.Session()
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        # configure tensorboard
        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("accuracy", model.acc)
        if not os.path.exists(tensorboard_dir):
            os.makedirs(tensorboard_dir)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)
        writer.add_graph(session.graph)

        # generate the batches
        print('Generating batch...')
        batch_train = batch_iter(list(zip(x_train, y_train)),
                                 config.batch_size, config.num_epochs)

        def feed_data(batch):
            """Prepare the data to feed into the model"""
            x_batch, y_batch = zip(*batch)
            feed_dict = {
                model.input_x: x_batch,
                model.input_y: y_batch,
                model.real_len: real_len(x_batch)
            }
            return feed_dict, len(x_batch)

        def real_len(batches):
            # position of the first padding zero in each sequence, scaled down by the pooling size
            return [np.ceil(np.argmin(batch + [0]) * 1.0 / config.max_pool_size) for batch in batches]

        def evaluate(x_, y_):
            """
            Model evaluation.
            Running all the data at once causes OOM, so evaluate in batches and aggregate.
            """
            batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
            total_loss = 0.0
            total_acc = 0.0
            cnt = 0
            for batch in batch_eval:
                feed_dict, cur_batch_len = feed_data(batch)
                feed_dict[model.keep_prob] = 1.0
                loss, acc = session.run([model.loss, model.acc], feed_dict=feed_dict)
                total_loss += loss * cur_batch_len
                total_acc += acc * cur_batch_len
                cnt += cur_batch_len
            return total_loss / cnt, total_acc / cnt

        # training and evaluation
        print('Training and evaluating...')
        start_time = time.time()
        print_per_batch = config.print_per_batch
        for i, batch in enumerate(batch_train):
            feed_dict, lenbatch = feed_data(batch)
            feed_dict[model.keep_prob] = config.dropout_keep_prob
            feed_dict[model.pad] = np.zeros([lenbatch, 1, config.embedding_dim, 1])
            if i % 5 == 0:  # write the training summaries to tensorboard every 5 batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, i)
            if i % print_per_batch == print_per_batch - 1:  # report training performance every print_per_batch batches
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                # loss, acc = evaluate(x_val, y_val)  # no validation set for now
                end_time = time.time()
                time_dif = end_time - start_time
                time_dif = timedelta(seconds=int(round(time_dif)))
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Time: {3}'
                print(msg.format(i + 1, loss_train, acc_train, time_dif))
            if i % 500 == 0 and i > 0:
                # freeze the graph every 500 steps and write it out for later inference
                graph = tf.graph_util.convert_variables_to_constants(
                    session, session.graph_def,
                    ["keep_prob", "real_len", "pad", "input_x", "score/pred_y"])
                if cnnrnnmodel:
                    tf.train.write_graph(graph, ".", trainpath + "graphcnnrnn.model",
                                         as_text=False)
                print("Model saved at step {0}".format(i))
            session.run(model.optim, feed_dict=feed_dict)  # run the optimizer

        # finally, evaluate on the test set
        session.close()


    if __name__ == '__main__':
        run_epoch()
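The frozen graph written by run_epoch can later be reloaded for prediction without the TextCnnRnn class. The sketch below is my own assumption of how that would look, not code from the original project; the node names come from the freeze list above, and because the RNN initial state is built with config.batch_size, the frozen graph expects batches of exactly 128 examples:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import numpy as np
    import tensorflow as tf

    # trainpath + "graphcnnrnn.model" from the training script
    graph_path = "/Users/shuubiasahi/Desktop/tensorflow/adx/graphcnnrnn.model"

    graph_def = tf.GraphDef()
    with tf.gfile.GFile(graph_path, "rb") as f:
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="")
        with tf.Session(graph=graph) as sess:
            # dummy batch: 128 id-encoded sequences of length 300 (batch size must match config.batch_size)
            x = np.random.randint(1, 1000, size=(128, 300))
            # same real-length rule as in run_epoch: first padding zero, divided by max_pool_size
            lengths = [np.ceil(np.argmin(list(row) + [0]) * 1.0 / 4) for row in x]
            feed = {
                "input_x:0": x,
                "keep_prob:0": 1.0,
                "pad:0": np.zeros([128, 1, 64, 1], dtype=np.float32),
                "real_len:0": lengths,
            }
            probs = sess.run("score/pred_y:0", feed_dict=feed)
            print(probs.shape)  # (128, num_classes)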



A quick look at the results:

    Using CNNRNN model...
    vocab_size is: 160238
    Time usage: 0:00:35
    Constructing TensorFlow Graph...
    2017-10-30 23:22:18.426329: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-10-30 23:22:18.426342: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
    2017-10-30 23:22:18.426346: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-10-30 23:22:18.426351: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
    Generating batch...
    Training and evaluating...
    Iter: 100, Train Loss: 0.66, Train Acc: 71.09%, Time: 0:02:47
    Iter: 200, Train Loss: 0.65, Train Acc: 61.72%, Time: 0:05:38


After a few hundred iterations the results are actually worse than with a plain CNN or Bi-LSTM. Possibly the text's own features are already distinctive enough, so stacking the two models hurts rather than helps: the CNN acts as a kind of super n-gram extractor, while a Bi-LSTM captures context in both directions, and in the text classification projects I have seen on GitHub, plain CNN or Bi-LSTM models gave the best results. Because of my machine I did not train for many more steps; I will try it on a GPU some day.