首先要对数据进行预处理,这里使用了 nltk 工具包:先把每一句话分词并转化为 list,对 list 中的每一个词进行规范化处理(词形还原);然后统计整个数据集得到单词列表,并把每一句话用一个与单词列表长度相同的一维数组来表示。简单来说,如果单词列表中第 i 个位置的单词在句子中出现,则特征数组的第 i 个值为 1,否则为 0。代码如下:

def process_file(f):
    """Tokenize a text file into one flat list of lowercased tokens.

    Args:
        f: Path of the text file to read.

    Returns:
        A list holding the tokens of every line, in file order.
    """
    ltex = []
    with open(f, 'r') as data:
        # Stream the file line by line instead of materialising readlines().
        for line in data:
            ltex += word_tokenize(line.lower())
    return ltex


def process_line(ltx, f, clf):
    """Turn every line of a file into a [feature_vector, label] sample.

    Position i of the feature vector is 1 when lexicon word ``ltx[i]``
    occurs in the line (after lowercasing, tokenizing and lemmatizing),
    otherwise 0.

    Args:
        ltx: Lexicon, a list of lemmatized lowercase words (expected to be
            free of duplicates, as produced by ``process_data``).
        f: Path of the text file whose lines are converted.
        clf: Label attached to every sample of this file; the same object
            is stored in each sample.

    Returns:
        A list of ``[features, clf]`` pairs, one per line of the file.
    """
    # One lemmatizer for the whole file; it does not depend on the line.
    lemmatizer = WordNetLemmatizer()
    data_set = []
    with open(f, 'r') as data:
        for line in data:
            # Lowercase exactly like process_file does, otherwise capitalised
            # words can never match the (lowercased) lexicon entries.
            words = word_tokenize(line.lower())
            # A set gives O(1) membership tests for the lexicon scan below.
            present = set(lemmatizer.lemmatize(word) for word in words)
            features = np.zeros(len(ltx))
            for position, word in enumerate(ltx):
                if word in present:
                    features[position] = 1
            data_set.append([features, clf])
    return data_set


def process_data():
    """Build the bag-of-words data set from pos.txt / neg.txt and pickle it.

    Reads 'pos.txt' and 'neg.txt' from the working directory, builds a
    lexicon from the lemmatized tokens whose total count is strictly between
    10 and 20000, encodes every line with ``process_line`` (label ``[1, 0]``
    for pos.txt, ``[0, 1]`` for neg.txt), shuffles the samples and writes
    them to 'data.pickle'.
    """
    pos = 'pos.txt'
    neg = 'neg.txt'

    # Frequency table over all lemmatized tokens of both files.
    lemmatizer = WordNetLemmatizer()
    tokens = process_file(pos) + process_file(neg)
    word_count = Counter(lemmatizer.lemmatize(word) for word in tokens)

    # Drop very common and very rare words.
    ltx = [word for word, count in word_count.items() if 10 < count < 20000]

    # Encode every sentence as a lexicon-sized vector.
    data_set = process_line(ltx, neg, [0, 1])
    data_set += process_line(ltx, pos, [1, 0])

    random.shuffle(data_set)

    # Pickle streams are binary data, so the file is opened in binary mode.
    with open('data.pickle', 'wb') as f:
        pickle.dump(data_set, f)


def neural_network(data, n_input_layer):
    """Build a feed-forward network with two ReLU hidden layers.

    Layer widths come from the module-level names ``n_layer_1``,
    ``n_layer_2`` and ``n_output_layer``; all weights and biases are
    initialised with ``tf.random_normal``.

    Args:
        data: Input tensor that is multiplied with the first weight matrix.
        n_input_layer: Number of input features (rows of the first weight
            matrix).

    Returns:
        A tuple ``(layer_output, layer_1_w_b)``: the un-activated output of
        the last layer and the dict (keys ``'w_'`` and ``'b_'``) holding the
        first layer's variables.
    """
    def _make_params(n_in, n_out):
        # Weight matrix first, then bias -- same creation order per layer.
        return {
            'w_': tf.Variable(tf.random_normal([n_in, n_out])),
            'b_': tf.Variable(tf.random_normal([n_out])),
        }

    # First hidden layer, second hidden layer, output layer.
    layer_1_w_b = _make_params(n_input_layer, n_layer_1)
    layer_2_w_b = _make_params(n_layer_1, n_layer_2)
    layer_output_w_b = _make_params(n_layer_2, n_output_layer)

    # Each layer computes x * w + b; only the hidden layers apply ReLU.
    pre_activation_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(pre_activation_1)

    pre_activation_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(pre_activation_2)

    layer_output = tf.add(
        tf.matmul(layer_2, layer_output_w_b['w_']),
        layer_output_w_b['b_'],
    )
    return layer_output, layer_1_w_b


# Training / evaluation fragment. It relies on names defined elsewhere:
# data_set, train_data, test_data, batch_size, n_input_layer, neural_network.

# X receives batches of feature vectors (width = length of one feature
# vector in data_set); Y receives the matching labels.
X = tf.placeholder('float', [None, len(data_set[0][0])])
Y = tf.placeholder('float')
predict, _ = neural_network(X, n_input_layer)

# Cost: mean softmax cross-entropy between the network output and Y.
cost_function = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
# Optimiser op; it has to be run for the weights to change.
train_step = tf.train.AdamOptimizer().minimize(cost_function)

epochs = 2
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Column 0 holds the feature vectors, column 1 the labels.
    x_data = train_data[:, 0]
    y_data = train_data[:, 1]
    for epoch in range(epochs):
        epoch_loss = 0
        for start in range(0, len(x_data), batch_size):
            end = start + batch_size
            x_input = x_data[start:end]
            y_input = y_data[start:end]
            # Run the optimiser together with the cost. Fetching only
            # cost_function would evaluate the loss without ever updating
            # the weights, i.e. the model would never be trained.
            _, cost = sess.run(
                [train_step, cost_function],
                feed_dict={X: list(x_input), Y: list(y_input)})
            epoch_loss += cost
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('epoch_loss_%d :%f' % (epoch, epoch_loss))

    # Accuracy on the held-out data: share of rows whose arg-max prediction
    # equals the arg-max of the label.
    x_test = test_data[:, 0]
    y_test = test_data[:, 1]
    correct = tf.equal(tf.arg_max(predict, 1), tf.arg_max(Y, 1))
    accuracy_op = tf.reduce_mean(tf.cast(correct, 'float'))
    accuracy_output = sess.run(
        accuracy_op, feed_dict={X: list(x_test), Y: list(y_test)})
    print('准确率 :%f' % accuracy_output)


def _as_text(value):
    """Return ``value`` as text, decoding byte strings as UTF-8 (errors ignored)."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'ignore')
    return value


def useful_field(file_name, save_name):
    """Reduce a CSV file to ``<label>----<tweet>`` lines.

    The first CSV field is the class code and the last field is the tweet.
    Codes '0', '2' and '4' are rewritten as the one-hot lists ``[0, 0, 1]``,
    ``[0, 1, 0]`` and ``[1, 0, 0]``; any other value is written unchanged.
    Double quotes are removed from both fields and blank lines are skipped.

    Args:
        file_name: Path of the CSV file to read.
        save_name: Path of the output file (created or truncated).
    """
    import csv  # local import: the file's import block is not visible here

    one_hot = {'0': [0, 0, 1], '2': [0, 1, 0], '4': [1, 0, 0]}
    # Both handles live in one `with`, so the output is closed on errors too.
    with open(file_name, 'r') as f, open(save_name, 'w+') as output:
        for line in f:
            # Parse with csv so a comma inside a quoted tweet stays part of
            # the tweet; a plain split(',') kept only the text after the
            # last comma.
            row = next(csv.reader([line]), None)
            if not row:
                continue
            clf = row[0].replace('"', '')
            clf = one_hot.get(clf, clf)
            tweet = row[-1].replace('"', '')
            output.write(str(clf) + '----' + tweet + '\n')


def create_lexicon(train_file, output_name):
    """Build a lexicon from a ``<label>----<tweet>`` file and pickle it.

    Every tweet is tokenized and lemmatized; words whose total count is
    strictly between 10 and 10000 are kept, ordered by ascending count.

    Args:
        train_file: Path of the ``<label>----<tweet>`` file.
        output_name: Path of the pickle file receiving the lexicon list.

    Raises:
        IndexError: If a line does not contain the '----' separator.
    """
    lemmatizer = WordNetLemmatizer()
    count_word = {}
    with open(train_file, 'r') as f:
        for line in f:
            # maxsplit=1 keeps tweets that themselves contain '----' intact.
            tweet = _as_text(line.split('----', 1)[1])
            for word in word_tokenize(tweet):
                # Tokens are already text here; decoding them a second time
                # raised UnicodeEncodeError for non-ASCII words on Python 2.
                word = lemmatizer.lemmatize(word)
                count_word[word] = count_word.get(word, 0) + 1

    # Sorting by count fixes the order of the lexicon entries.
    ranked = sorted(count_word.items(), key=lambda t: t[1])
    lex = [word for word, count in ranked if 10 < count < 10000]

    # Pickle streams are binary data, so the file is opened in binary mode.
    with open(output_name, 'wb') as f:
        pickle.dump(lex, f)


def load_data(train_name, lex):
    """Load labels and bag-of-words vectors from a ``<label>----<tweet>`` file.

    Args:
        train_name: Path of the ``<label>----<tweet>`` file.
        lex: Lexicon list (expected to be free of duplicates); position i of
            a feature vector is 1 when ``lex[i]`` occurs in the tweet.

    Returns:
        A tuple ``(x_data, y_data)`` where ``x_data`` is the list of parsed
        labels and ``y_data`` the list of feature vectors.

    Raises:
        IndexError: If a line does not contain the '----' separator.
        ValueError, SyntaxError: If a label is not a Python literal.
    """
    import ast  # local import: the file's import block is not visible here

    lemmatizer = WordNetLemmatizer()
    x_data = []
    y_data = []
    with open(train_name, 'r') as train_file:
        for line in train_file:
            parts = line.split('----', 1)
            # literal_eval only accepts literals such as "[0, 0, 1]"; eval()
            # would execute arbitrary expressions found in the file.
            label = ast.literal_eval(parts[0])
            # Decode like create_lexicon so tokens compare equal to the
            # lexicon entries.
            tweet = _as_text(parts[1])
            present = set(
                lemmatizer.lemmatize(word) for word in word_tokenize(tweet))
            feature = np.zeros(len(lex))
            for position, word in enumerate(lex):
                if word in present:
                    feature[position] = 1
            # NOTE(review): labels go into x_data and features into y_data,
            # which looks swapped relative to the names. The order is kept
            # so existing callers keep working -- confirm against the caller.
            x_data.append(label)
            y_data.append(feature)
    return x_data, y_data


def cnn_network(X, input_size, dropout_keep_prob):
    """Build a text CNN: embedding, parallel conv + max-pool branches, dropout, linear output.

    The output width comes from the module-level name ``n_output_layer``.

    Args:
        X: Tensor of ids passed to ``tf.nn.embedding_lookup``
            (presumably shaped (batch, input_size) -- TODO confirm).
        input_size: Number of rows of the embedding matrix; also used to
            size the max-pool window (``input_size - filter_size + 1``).
        dropout_keep_prob: Keep probability handed to ``tf.nn.dropout``.

    Returns:
        A tuple ``(output, embedded_chars, embedded_chars_expanded)``: the
        result of the final ``xw_plus_b``, the embedding lookup result and
        the same tensor with a trailing dimension added.
    """
    embedding_size = 128
    num_filters = 128
    filter_sizes = [3, 4, 5]
    unit_strides = [1, 1, 1, 1]

    # Embedding layer, pinned to the CPU.
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_matrix = tf.Variable(
            tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(embedding_matrix, X)
        # Trailing axis acts as the single input channel for conv2d.
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # One convolution + max-pool branch per filter size.
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            kernel = tf.Variable(tf.truncated_normal(
                [filter_size, embedding_size, 1, num_filters], stddev=0.1))
            bias = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(
                embedded_chars_expanded, kernel,
                strides=unit_strides, padding="VALID")
            activated = tf.nn.relu(tf.nn.bias_add(conv, bias))
            pool_window = [1, input_size - filter_size + 1, 1, 1]
            pooled_outputs.append(tf.nn.max_pool(
                activated, ksize=pool_window,
                strides=unit_strides, padding='VALID'))

    # Join the branches along the channel axis and flatten per example.
    num_filters_total = num_filters * len(filter_sizes)
    h_pool_flat = tf.reshape(
        tf.concat(pooled_outputs, 3), [-1, num_filters_total])

    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # Linear output layer with Xavier-initialised weights.
    with tf.name_scope("output"):
        out_weights = tf.get_variable(
            "W",
            shape=[num_filters_total, n_output_layer],
            initializer=tf.contrib.layers.xavier_initializer())
        out_bias = tf.Variable(tf.constant(0.1, shape=[n_output_layer]))
        output = tf.nn.xw_plus_b(h_drop, out_weights, out_bias)

    return output, embedded_chars, embedded_chars_expanded
