TensorFlow Exercise 1: Using a Neural Network for Classification


TensorFlow can be used for many deep-learning tasks such as speech recognition and image recognition, and it runs on anything from a phone to thousands of servers. A while ago I was experimenting with sentiment classification and used a neural network to classify the data; the results were decent, reaching over 80% accuracy.
Dataset source: a Chinese review dataset. Chinese corpora like this are not easy to come by, so thanks to the author!
pos data
neg data

Data processing:

import random
import pandas as pd   # needed for read_excel / concat (missing from the original listing)

def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None, index_col=None)
    pos = pd.read_excel('data/pos.xls', header=None, index_col=None)  # read the training corpus
    pos['mark'] = 1
    neg['mark'] = 0                                   # attach labels to the corpus
    pn = pd.concat([pos, neg], ignore_index=True)     # merge the positive and negative corpora
    print(len(pn[0].values), len(pn['mark'].values))
    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            f.write(x + '\n')
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')

loadfile()  # load and merge the data

# -------------------------------------------------
# word segmentation and stopword removal
import jieba
import numpy as np

with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = []
    for line in f.readlines():
        stopwords.append(line.strip())

def split_word():
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        lines_1 = []
        for line in lines:
            line = ' '.join(jieba.cut(line.strip()))   # segment each review with jieba
            lines_1.append(line)
        with open('data/split_data.txt', 'w', encoding='utf-8') as f1:
            for line in lines_1:
                f1.write(line + '\n')

split_word()  # produce data/split_data.txt before the cleaning step below

with open('data/split_data.txt', 'r', encoding='utf-8') as f:
    line_list = []
    for line in f.readlines():
        line = line.strip().split(' ')
        line_1 = []
        for word in line:
            if word not in stopwords:                  # drop stopwords
                line_1.append(word)
        line_list.append(line_1)
    with open('data_clean.txt', 'w', encoding='utf-8') as f1:
        for line in line_list:
            f1.write(" ".join([num for num in line]) + "\n")
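To see what one cleaned line looks like, here is a minimal sketch of the same segment-then-filter step applied to a single made-up review (the sentence and the stopword subset are illustrative, not taken from the dataset):

import jieba

stopwords = {"的", "了", "很"}           # illustrative subset of data/stopwords
sentence = "这本书的内容很不错"           # made-up positive review

tokens = jieba.cut(sentence.strip())      # segment into words
cleaned = [w for w in tokens if w not in stopwords]
print(" ".join(cleaned))                  # roughly "这 本书 内容 不错"; the exact cut may differ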

Stopword list (stopwords):

"..>>/...8二<@]、,“”。-&《》…?^_()#啊此这呢哦仅*+=0123456789@$【】[]矣兮~><{}了个呵的」「&#;%..:—TWILIGHT,\;.....

Building the vocabulary:

# coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime

"""preprocessing"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"   # end of dialogue
UNK = "__UNK__"   # marks tokens that are not in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data_clean.txt'

def set_dataset_path(path):
    dataset_path = path

if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()

# generate the vocabulary file
def gen_vocabulary_file(input_file, output_file, vocab_size, input_file2=None):
    f = open(input_file, encoding='utf-8')
    train_set_x = []
    for line in f.readlines():
        x = line.strip()
        train_set_x.append(x)
    f.close()

    vocabulary = {}
    counter = 0
    for line in train_set_x:
        counter += 1
        tokens = line.strip().split(' ')
        for word in tokens:
            if word in vocabulary:      # already in the vocabulary: increase its count
                vocabulary[word] += 1
            else:                       # otherwise start counting at 1
                vocabulary[word] = 1

    # reserved tokens first, then words sorted by descending frequency
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # keep only the vocab_size most frequent entries
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]
    print(input_file, " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w", encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')

print("vocabulary start convert...:")
gen_vocabulary_file(dataset_path_1, "train_set_vocabulary", 20000)
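A quick sanity check is to read the file back and confirm that the reserved tokens sit at ids 0-3 (a minimal sketch, assuming train_set_vocabulary was just written by the script above):

with open("train_set_vocabulary", encoding="utf-8") as f:
    vocab_list = [line.strip() for line in f]

# the first four entries should be the reserved tokens, in this order
assert vocab_list[:4] == ["__PAD__", "__GO__", "__EOS__", "__UNK__"]
print("vocabulary size:", len(vocab_list))
print("most frequent words:", vocab_list[4:10])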

Converting sentences to ids:

# coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime

"""preprocessing"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"   # end of dialogue
UNK = "__UNK__"   # marks tokens that are not in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data_clean.txt'

# convert each sentence string into a vector of word ids
def convert_to_vector(input_file, vocabulary_file, output_file):
    starttime = datetime.datetime.now()
    tmp_vocab = []
    with open(vocabulary_file, "r", encoding='utf-8') as f:
        tmp_vocab.extend(f.readlines())                 # load the vocabulary into tmp_vocab
    tmp_vocab = [line.strip() for line in tmp_vocab]    # strip trailing newlines
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    # e.g. {'硕': 3142, 'v': 577, 'I': 4789, '拖': 1333, '疤': 2201, ...}

    output_f = open(output_file, 'w', encoding='utf-8')
    train_set_x = []
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            x = line.strip().split(' ')
            train_set_x.append(x)

    for line in train_set_x:
        line_vec = []
        for words in line:
            # if the word is in vocab use vocab[word], otherwise fall back to UNK_ID (3)
            line_vec.append(vocab.get(words, UNK_ID))
        # join the ids with spaces, one sentence per line
        output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
    output_f.close()

    endtime = datetime.datetime.now()
    print("elapsed time: %d seconds" % (endtime - starttime).seconds)

convert_to_vector(dataset_path_1, vocabulary_file="train_set_vocabulary", output_file="train_set_encode")
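The mapping itself is just a dictionary lookup with UNK_ID as the fallback; a toy example with a made-up six-entry vocabulary:

UNK_ID = 3
tmp_vocab = ["__PAD__", "__GO__", "__EOS__", "__UNK__", "内容", "不错"]   # toy vocabulary
vocab = {word: idx for idx, word in enumerate(tmp_vocab)}

line = "内容 不错 极了".split(" ")                  # "极了" is not in the toy vocabulary
line_vec = [vocab.get(w, UNK_ID) for w in line]
print(" ".join(str(n) for n in line_vec))           # -> "4 5 3"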

I manually held out 10% of the data as a test set.
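The splitting step itself is not shown in the post; here is a minimal sketch of one way to do it. The output file names are chosen to match what data_process.py below expects; the 90/10 split ratio and the fixed random seed are my own choices:

import random

with open('train_set_encode', encoding='utf-8') as f:
    encoded = f.readlines()
with open('data/label.txt', encoding='utf-8') as f:
    labels = f.readlines()

pairs = list(zip(encoded, labels))
random.seed(1)
random.shuffle(pairs)                      # shuffle so positive and negative reviews are mixed

split = int(len(pairs) * 0.9)              # keep 90% for training, hold out 10% for test
train_pairs, test_pairs = pairs[:split], pairs[split:]

for name, subset in [('train', train_pairs), ('test', test_pairs)]:
    with open('data/%s_set_encode' % name, 'w', encoding='utf-8') as fx, \
         open('data/%s_label.txt' % name, 'w', encoding='utf-8') as fy:
        for x, y in subset:
            fx.write(x)
            fy.write(y)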
Next, build the classification model.
MLP model: mlp_model.py

# coding=utf-8
import tensorflow as tf
import numpy as np


class MLP_Model(object):
    def __init__(self, config, is_training=True):
        self.keep_prob = config.keep_prob
        self.batch_size = tf.Variable(0, dtype=tf.int32, trainable=False)
        self.is_training = is_training

        num_step = config.num_step
        self.input_data = tf.placeholder(tf.int32, [None, num_step])
        self.target = tf.placeholder(tf.int64, [None])

        class_num = config.class_num
        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        max_len = config.max_len
        embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        self.new_batch_size = tf.placeholder(tf.int32, shape=[], name="new_batch_size")
        self._batch_size_update = tf.assign(self.batch_size, self.new_batch_size)

        # store layer weights & biases
        weights = {
            'h1': tf.Variable(tf.random_normal([embed_dim, hidden_neural_size])),
            'h2': tf.Variable(tf.random_normal([hidden_neural_size, hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([hidden_neural_size, class_num]))
        }
        biases = {
            'b1': tf.Variable(tf.random_normal([hidden_neural_size])),
            'b2': tf.Variable(tf.random_normal([hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([class_num]))
        }

        # build the MLP network: two hidden layers per time step, then mean pooling
        def multilayer_perceptron(_X, _weights, _biases):
            layer_1 = []
            layer_2 = []
            for i in range(max_len):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                layer_1.append(tf.nn.relu(
                    tf.add(tf.matmul(_X[i], _weights['h1']), _biases['b1'])))      # first hidden layer, ReLU
                layer_2.append(tf.nn.relu(
                    tf.add(tf.matmul(layer_1[i], _weights['h2']), _biases['b2']))) # second hidden layer, ReLU
            with tf.name_scope("mean_pooling_layer"):
                out_put = tf.reduce_mean(layer_2, 0)
            return tf.matmul(out_put, _weights['out']) + _biases['out']

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs_emb = tf.transpose(inputs, [1, 0, 2])        # [num_step, batch, embed_dim]
            inputs_emb = tf.reshape(inputs_emb, [-1, embed_dim])
            inputs_emb = tf.split(inputs_emb, num_step, 0)      # list of num_step tensors (TF 1.x argument order)

        if self.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.keep_prob)

        with tf.variable_scope("mlp_layer"):
            self.logits = multilayer_perceptron(inputs_emb, weights, biases)

        with tf.name_scope("loss"):
            # TF 1.x requires the labels/logits keyword arguments here
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits + 1e-10, labels=self.target)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.logits, 1)
            correct_prediction = tf.equal(self.prediction, self.target)
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

        # add summaries
        loss_summary = tf.summary.scalar("loss", self.cost)
        accuracy_summary = tf.summary.scalar("accuracy_summary", self.accuracy)

        if not self.is_training:
            self.saver = tf.train.Saver(tf.global_variables())
            return

        self.globle_step = tf.Variable(0, name="globle_step", trainable=False)
        self.lr = tf.Variable(0.0, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          config.max_grad_norm)

        # keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)

        self.summary = tf.summary.merge([loss_summary, accuracy_summary, self.grad_summaries_merged])

        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.global_step = tf.Variable(0, trainable=False)
        self.saver = tf.train.Saver(tf.global_variables())

    def assign_new_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})

    def assign_new_batch_size(self, session, batch_size_value):
        session.run(self._batch_size_update, feed_dict={self.new_batch_size: batch_size_value})
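In short, the model embeds each token, runs the same two ReLU layers on every time step, mean-pools over the sequence, and projects the pooled vector to two logits. For readers on newer TensorFlow, a rough tf.keras equivalent of that architecture might look like the sketch below; this is an assumption for illustration, not the code used in this post:

import tensorflow as tf

def build_mlp(vocabulary_size=40000, embed_dim=128, hidden=128, class_num=2):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocabulary_size, embed_dim),
        # the two hidden layers are applied to every time step, as in multilayer_perceptron
        tf.keras.layers.Dense(hidden, activation='relu'),
        tf.keras.layers.Dense(hidden, activation='relu'),
        tf.keras.layers.GlobalAveragePooling1D(),   # mean pooling over the time steps
        tf.keras.layers.Dense(class_num)            # output logits
    ])
    model.compile(optimizer='sgd',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model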

Training the model: mlp.py

import os
import time
import tensorflow as tf
from mlp_model import MLP_Model
import data_process

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('batch_size', 64, 'the batch_size of the training procedure')
flags.DEFINE_float('lr', 0.1, 'the learning rate')
flags.DEFINE_float('lr_decay', 0.6, 'the learning rate decay')
flags.DEFINE_integer('vocabulary_size', 40000, 'vocabulary_size')
# emotion embedding
flags.DEFINE_integer("emotion_nums", 2, 'emotion_nums')          # positive, negative
flags.DEFINE_integer("emotion_embed_dim", 128, 'emotion embedding_dim')
flags.DEFINE_integer('emdedding_dim', 128, 'embedding dim')
flags.DEFINE_integer('hidden_neural_size', 128, 'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num', 3, 'LSTM hidden layer num')
flags.DEFINE_string('dataset_path', 'data/subj0.pkl', 'dataset path')
flags.DEFINE_integer('max_len', 100, 'max_len of training sentence')
flags.DEFINE_integer('valid_num', 100, 'epoch num of validation')
flags.DEFINE_integer('checkpoint_num', 1000, 'epoch num of checkpoint')
flags.DEFINE_float('init_scale', 0.1, 'init scale')
flags.DEFINE_integer('class_num', 2, 'class num')
flags.DEFINE_float('keep_prob', 0.5, 'dropout rate')
flags.DEFINE_integer('num_epoch', 81, 'num epoch')
flags.DEFINE_integer('max_decay_epoch', 30, 'num epoch')
flags.DEFINE_integer('max_grad_norm', 5, 'max_grad_norm')
flags.DEFINE_string('out_dir', os.path.abspath(os.path.join(os.path.curdir, "review_runs2_81")), 'output directory')
flags.DEFINE_integer('check_point_every', 10, 'checkpoint every num epoch')


class Config(object):
    hidden_neural_size = FLAGS.hidden_neural_size
    vocabulary_size = FLAGS.vocabulary_size
    embed_dim = FLAGS.emdedding_dim
    emotion_nums = FLAGS.emotion_nums
    emotion_embed_dim = FLAGS.emotion_embed_dim
    hidden_layer_num = FLAGS.hidden_layer_num
    class_num = FLAGS.class_num
    keep_prob = FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num = FLAGS.valid_num
    out_dir = FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every


def evaluate(model, session, data, global_steps=None, summary_writer=None):
    correct_num = 0
    total_num = len(data[0])
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        fetches = model.correct_num
        feed_dict = {}
        feed_dict[model.input_data] = x
        feed_dict[model.target] = y
        model.assign_new_batch_size(session, len(x))
        count = session.run(fetches, feed_dict)
        correct_num += count

    accuracy = float(correct_num) / total_num
    dev_summary = tf.summary.scalar('dev_accuracy', accuracy)
    dev_summary = session.run(dev_summary)
    if summary_writer:
        summary_writer.add_summary(dev_summary, global_steps)
        summary_writer.flush()
    return accuracy


def run_epoch(model, session, data, global_steps, valid_model, valid_data,
              train_summary_writer, valid_summary_writer=None):
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        feed_dict = {}
        feed_dict[model.input_data] = x
        feed_dict[model.target] = y
        model.assign_new_batch_size(session, len(x))
        fetches = [model.cost, model.accuracy, model.train_op, model.summary]
        cost, accuracy, _, summary = session.run(fetches, feed_dict)
        train_summary_writer.add_summary(summary, global_steps)
        train_summary_writer.flush()

        model.is_training = False
        valid_accuracy = evaluate(valid_model, session, valid_data, global_steps, valid_summary_writer)
        if global_steps % 100 == 0:
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %i step, train cost is: %f and the train accuracy is %f and the valid accuracy is %f"
                  % (global_steps, cost, accuracy, valid_accuracy))
        global_steps += 1
    return global_steps


def train_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0

    train_data, valid_data, test_data = data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)
    print("begin training")

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = MLP_Model(config=config, is_training=True)

        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)

        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)

        # add checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())

        for i in range(config.num_epoch):
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %d epoch training..." % (i + 1))
            lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     model, valid_data, train_summary_writer, dev_summary_writer)

            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        print("program end!")


def main(_):
    train_step()


if __name__ == "__main__":
    tf.app.run()

Start training with: python mlp.py
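Since the hyperparameters are declared with tf.app.flags, the defaults can also be overridden on the command line instead of editing the script, for example:

python mlp.py --batch_size 32 --num_epoch 40 --lr 0.05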

Evaluation: evalute.py

import os
import time
import numpy as np
import tensorflow as tf
from mlp_model import MLP_Model
import data_process

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('batch_size', 64, 'the batch_size of the training procedure')
flags.DEFINE_float('lr', 0.1, 'the learning rate')
flags.DEFINE_float('lr_decay', 0.6, 'the learning rate decay')
flags.DEFINE_integer('vocabulary_size', 40000, 'vocabulary_size')
# emotion embedding
flags.DEFINE_integer("emotion_nums", 2, 'emotion_nums')          # positive, negative
flags.DEFINE_integer("emotion_embed_dim", 128, 'emotion embedding_dim')
flags.DEFINE_integer('emdedding_dim', 128, 'embedding dim')
flags.DEFINE_integer('hidden_neural_size', 128, 'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num', 3, 'LSTM hidden layer num')
flags.DEFINE_string('dataset_path', 'data/subj0.pkl', 'dataset path')
flags.DEFINE_integer('max_len', 100, 'max_len of training sentence')
flags.DEFINE_integer('valid_num', 100, 'epoch num of validation')
flags.DEFINE_integer('checkpoint_num', 1000, 'epoch num of checkpoint')
flags.DEFINE_float('init_scale', 0.1, 'init scale')
flags.DEFINE_integer('class_num', 2, 'class num')
flags.DEFINE_float('keep_prob', 0.5, 'dropout rate')
flags.DEFINE_integer('num_epoch', 81, 'num epoch')
flags.DEFINE_integer('max_decay_epoch', 30, 'num epoch')
flags.DEFINE_integer('max_grad_norm', 5, 'max_grad_norm')
flags.DEFINE_string('out_dir', os.path.abspath(os.path.join(os.path.curdir, "review_runs2_81")), 'output directory')
flags.DEFINE_integer('check_point_every', 10, 'checkpoint every num epoch')


class Config(object):
    hidden_neural_size = FLAGS.hidden_neural_size
    vocabulary_size = FLAGS.vocabulary_size
    embed_dim = FLAGS.emdedding_dim
    emotion_nums = FLAGS.emotion_nums
    emotion_embed_dim = FLAGS.emotion_embed_dim
    hidden_layer_num = FLAGS.hidden_layer_num
    class_num = FLAGS.class_num
    keep_prob = FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num = FLAGS.valid_num
    out_dir = FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every


def evaluate(model, session, data, global_steps=None, summary_writer=None):
    accuracy = []
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        label = model.prediction
        feed_dict = {}
        feed_dict[model.input_data] = x
        model.assign_new_batch_size(session, len(x))
        pre = session.run(label, feed_dict)
        correct_num = 0
        for i in range(len(pre)):
            if pre[i] == y[i]:
                correct_num += 1
        accuracy.append(correct_num / len(pre))   # per-batch accuracy
    return accuracy


def test_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0
    train_data, valid_data, test_data = data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)
    print("begin testing....")
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            test_model = MLP_Model(config=eval_config, is_training=False)
            curdir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
            ckpt = tf.train.get_checkpoint_state(curdir)
            if ckpt is not None:
                print(ckpt.model_checkpoint_path)
                test_model.saver.restore(session, ckpt.model_checkpoint_path)
            else:
                print("checkpoint path does not exist, exiting!")
                tf.global_variables_initializer().run()
                return

        accs = evaluate(test_model, session, test_data)
        accuracy = np.mean(accs)
        print("accuracy: %f" % accuracy)


test_step()
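One caveat: evaluate() averages per-batch accuracies, so a smaller final batch is weighted the same as a full one. Counting correct predictions over the whole test set avoids that small bias; a sketch of such a replacement, assuming the same imports as evalute.py:

def evaluate_overall(model, session, data, batch_size=64):
    correct, total = 0, 0
    for x, y, mask_x in data_process.batch_iter(data, batch_size=batch_size):
        model.assign_new_batch_size(session, len(x))
        pre = session.run(model.prediction, {model.input_data: x})
        correct += int(np.sum(pre == y))   # correct predictions in this batch
        total += len(pre)
    return correct / total                 # overall accuracy, weighted by batch size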

Data processing module, data_process.py:

# coding=utf-8
import numpy as np
import random
import os
from io import open
import string
import datetime

"""preprocessing"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"   # end of dialogue
UNK = "__UNK__"   # marks tokens that are not in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data/train_label.txt'
dataset_path_2 = "data/test_label.txt"


def set_dataset_path(path):
    dataset_path = path


if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()
if not os.path.exists(dataset_path_2):
    print('test dataset is null')
    exit()


def load_data(max_len, batch_size, n_words=40000, valid_portion=0.2, sort_by_len=False):
    f = open(dataset_path_1, 'rb')
    f1 = open(dataset_path_2, 'rb')
    f2 = open('data/train_set_encode', 'rb')
    f3 = open('data/test_set_encode', 'rb')
    print('load training label from %s\nload test label from %s' % (dataset_path_1, dataset_path_2))

    train_set_x = []
    train_set_y = []
    test_set_x = []
    test_set_y = []

    # load the labels
    for line in f.readlines():
        y = int(line.strip())
        train_set_y.append(y)
    for line1 in f1.readlines():
        y = int(line1.strip())
        test_set_y.append(y)

    # load the encoded sentences
    for line in f2.readlines():
        line = line.decode('utf-8').strip().split(' ')
        train_set_x.append(line)
    for line in f3.readlines():
        line = line.decode('utf-8').strip().split(' ')
        test_set_x.append(line)
    f.close()
    f1.close()
    f2.close()
    f3.close()

    # string matrix --> int matrix
    def string_to_int(input):
        output = []
        for line in input:
            line_vec = []
            for word in line:
                num = int(word)
                line_vec.append(num)
            output.append(line_vec)
        return output

    train_set_x = string_to_int(train_set_x)
    test_set_x = string_to_int(test_set_x)

    # split the training data into train/valid sets
    n_samples = len(train_set_x)
    sidx = np.random.permutation(n_samples)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    test_set = (test_set_x, test_set_y)

    # map out-of-vocabulary ids to UNK_ID
    def remove_unk(x):
        return [[UNK_ID if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set
    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    test_set = (test_set_x, test_set_y)

    new_train_set_x = np.zeros([len(train_set[0]), max_len])
    new_train_set_y = np.zeros(len(train_set[0]))
    new_valid_set_x = np.zeros([len(valid_set[0]), max_len])
    new_valid_set_y = np.zeros(len(valid_set[0]))
    new_test_set_x = np.zeros([len(test_set[0]), max_len])
    new_test_set_y = np.zeros(len(test_set[0]))
    mask_train_x = np.zeros([max_len, len(train_set[0])])
    mask_valid_x = np.zeros([max_len, len(valid_set[0])])
    mask_test_x = np.zeros([max_len, len(test_set[0])])

    # pad every sequence to max_len and build the corresponding masks
    def padding_and_generate_mask(x, y, new_x, new_y, new_mask_x):
        for i, (x, y) in enumerate(zip(x, y)):
            if len(x) <= max_len:
                new_x[i, 0:len(x)] = x
                new_mask_x[0:len(x), i] = 1
                new_y[i] = y
            else:
                new_x[i] = x[0:max_len]
                new_mask_x[:, i] = 1
                new_y[i] = y
        new_set = (new_x, new_y, new_mask_x)
        del new_x, new_y
        return new_set

    train_set = padding_and_generate_mask(train_set[0], train_set[1], new_train_set_x, new_train_set_y, mask_train_x)
    valid_set = padding_and_generate_mask(valid_set[0], valid_set[1], new_valid_set_x, new_valid_set_y, mask_valid_x)
    test_set = padding_and_generate_mask(test_set[0], test_set[1], new_test_set_x, new_test_set_y, mask_test_x)

    return train_set, valid_set, test_set


# return batches of the dataset
def batch_iter(data, batch_size):
    # unpack the data, labels and mask
    x, y, mask_x = data
    x = np.array(x)
    y = np.array(y)
    data_size = len(x)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for batch_index in range(num_batches_per_epoch):
        start_index = batch_index * batch_size
        end_index = min((batch_index + 1) * batch_size, data_size)
        return_x = x[start_index:end_index]
        return_y = y[start_index:end_index]
        return_mask_x = mask_x[:, start_index:end_index]
        yield (return_x, return_y, return_mask_x)
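A quick way to check the shapes coming out of load_data and batch_iter (a sketch, assuming the four data files above are in place; max_len=100 and batch_size=64 match the flag defaults):

import data_process

train_set, valid_set, test_set = data_process.load_data(max_len=100, batch_size=64)
x, y, mask_x = next(data_process.batch_iter(train_set, batch_size=64))
print(x.shape)       # (64, 100)  - one batch of padded id sequences
print(y.shape)       # (64,)      - labels
print(mask_x.shape)  # (100, 64)  - mask, time-major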

The final accuracy is around 86%, which is a decent result.
