IMDB情感分析例子(keras)

来源:互联网 发布:免费充值软件 编辑:程序博客网 时间:2024/05/05 14:45

加载IMDB数据集

X_train[0]=[1,14,22,.....32]    长度为228

X_train=sequence.pad_sequences(X_train,maxlen=500) 

X_train[0]变为[0,0,0.......1,14,22,....32]   长度为500(不足500的部分在前面补0)

import numpy
from keras.datasets import imdb
from matplotlib import pyplot
from keras.preprocessing import sequence

# Load the IMDB review dataset; each review is a list of integer word indices.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

print("Train data:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_train[0])

print("first length:")
print(len(X_train[0]))

print("classes:")
print(numpy.unique(y_train))

print("number of words:")
print(len(numpy.unique(numpy.hstack(X_train))))

print("review length:")
# Collect the ORIGINAL review lengths into a real list before padding.
# A bare map() object is a one-shot iterator on Python 3, so it cannot be
# handed to numpy.mean / numpy.std / boxplot / hist one after another.
result = [len(review) for review in X_train]

# sequence.pad_sequences: bring every review to exactly 500 entries,
# e.g. a 228-word review gets zeros filled in at the front.
X_train = sequence.pad_sequences(X_train, maxlen=500)
print(X_train[0])
print(len(X_train[0]))

# Statistics and plots describe the un-padded lengths gathered above.
print("mean %.2f words(%f)" % (numpy.mean(result), numpy.std(result)))
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()

Word Embeddings

imdb.load_data(nb_words=5000,test_split=0.33)

X_train=sequence.pad_sequences(X_train,maxlen=500)

X_test=sequence.pad_sequences(X_test,maxlen=500)

model.add(Embedding(5000,32,input_length=500))

5000词汇量,每个句子500长度,每个词用32维向量表示


普通神经网络

import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Fix the NumPy random seed so runs are repeatable.
seed = 7
numpy.random.seed(seed)

# Keep only the top_words most frequent words of the vocabulary.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Train and evaluate on the first tenth of each split to keep the run short.
# Integer division is required: a float is not a valid slice bound.
num_lizi = X_train.shape[0] // 10
num_lizi2 = X_test.shape[0] // 10
X_train = X_train[0:num_lizi]
y_train = y_train[0:num_lizi]
X_test = X_test[0:num_lizi2]
y_test = y_test[0:num_lizi2]

# Pad (or truncate) every review to exactly max_words entries.
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Embedding -> Flatten -> Dense(250) -> single output unit.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# binary_crossentropy expects a probability in [0, 1], so the output unit
# needs a sigmoid activation (a linear output can make the loss diverge).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; do not wrap it in print().
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=2, batch_size=128, verbose=1)

# evaluate() returns [loss, accuracy] given metrics=['accuracy'] above.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

一维CNN处理IMDB问题

# CNN for the IMDB problem
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Fix the NumPy random seed so runs are repeatable.
seed = 7
numpy.random.seed(seed)

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# pad dataset to a maximum review length in words
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Embedding -> 1-D convolution -> max pooling -> Flatten -> Dense(250) -> sigmoid.
# Keyword names follow the Keras 2 API (filters / kernel_size / padding /
# pool_size / epochs), matching the num_words argument used above; the
# Keras 1 spellings (nb_filter, filter_length, border_mode, pool_length,
# nb_epoch) are deprecated and rejected by later releases.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; do not wrap it in print().
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=2, batch_size=128, verbose=1)

# evaluate() returns [loss, accuracy] given metrics=['accuracy'] above.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))


原创粉丝点击