Recommendation Algorithm Study 2: Hybrid Personalized Recommendation on MovieLens with MXNet (Continued): Adding CNN Text Processing
Strictly speaking, the network definition in the previous article did not exactly follow the paddle2 example. This article builds on the previous one by adding CNN-based feature extraction for the movie titles. The CNN text processing follows the MXNet tutorial "Text Classification Using a Convolutional Neural Network on MXNet" and the paper "Convolutional Neural Networks for Sentence Classification".
The additions are as follows:
1. Title preprocessing: build a vocabulary from the words appearing in the titles, and pad every title to the maximum length so the data becomes fixed-width, much like images of identical size, which allows convolution kernels of different sizes to be applied.
import itertools
from collections import Counter

import pandas as pd

def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined
    by the longest sentence. Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences

def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

def buildTitles(sentences):
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    vocab_size = len(vocabulary)
    # Maps sentences to index vectors based on the vocabulary.
    x = pd.Series([[vocabulary[word] for word in sentence]
                   for sentence in sentences_padded])
    return (x, vocab_size)
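A quick sanity check of the preprocessing, using two made-up titles (in the real pipeline the titles come from movies.dat). The exact indices depend on Counter tie-breaking, so the printed values are only illustrative:

# Usage sketch with made-up titles; real input comes from movies.dat.
titles = [s.split(" ") for s in ["Toy Story (1995)", "Heat (1995)"]]
title_arr, vocab_size = buildTitles(titles)
print title_arr[0]   # e.g. [1, 2, 0] -- 'Toy Story (1995)' as word indices
print title_arr[1]   # e.g. [3, 0, 4] -- 'Heat (1995)' padded with '</s>'
print vocab_size     # 5: '(1995)', 'Toy', 'Story', 'Heat', '</s>'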
2. Network definition

def get_one_layer_mlp(max_userid, max_itemid, max_gender, max_age,
                      max_job, max_title, sentence_size, k, batch_size):
    # user profile
    userid = mx.symbol.Variable('userid')
    gender = mx.symbol.Variable('gender')
    age = mx.symbol.Variable('age')
    job = mx.symbol.Variable('job')
    # item profile
    itemid = mx.symbol.Variable('itemid')
    title = mx.symbol.Variable('title')
    cat = mx.symbol.Variable('cat')
    score = mx.symbol.Variable('score')

    # user latent features
    userid = mx.symbol.Embedding(data=userid, input_dim=max_userid,
                                 output_dim=k/2, name='userid_Embedding')
    userid = mx.symbol.FullyConnected(data=userid, num_hidden=k/2)
    gender = mx.symbol.Embedding(data=gender, input_dim=max_gender,
                                 output_dim=k/4, name='gender_Embedding')
    gender = mx.symbol.FullyConnected(data=gender, num_hidden=k/4)
    age = mx.symbol.Embedding(data=age, input_dim=max_age,
                              output_dim=k/2, name='age_Embedding')
    age = mx.symbol.FullyConnected(data=age, num_hidden=k/2)
    job = mx.symbol.Embedding(data=job, input_dim=max_job,
                              output_dim=k/2, name='job_Embedding')
    job = mx.symbol.FullyConnected(data=job, num_hidden=k/2)
    user = mx.symbol.concat(userid, gender, age, job, dim=1)
    user = mx.symbol.FullyConnected(data=user, num_hidden=k)
    user = mx.symbol.Activation(data=user, act_type="relu")

    # item latent features
    itemid = mx.symbol.Embedding(data=itemid, input_dim=max_itemid,
                                 output_dim=k/2, name='itemid_Embedding')
    itemid = mx.symbol.FullyConnected(data=itemid, num_hidden=k/2)

    # item title features
    num_embed = k/2
    embed_layer = mx.symbol.Embedding(data=title, input_dim=max_title,
                                      output_dim=num_embed, name='title_Embedding')
    conv_input = mx.symbol.Reshape(data=embed_layer,
                                   target_shape=(batch_size, 1, sentence_size, num_embed))

    # create a convolution + (max) pooling layer for each filter size
    filter_list = [3, 4, 5]  # the sizes of filters to use
    num_filter = 50
    pooled_outputs = []
    for i, filter_size in enumerate(filter_list):
        convi = mx.symbol.Convolution(data=conv_input,
                                      kernel=(filter_size, num_embed),
                                      num_filter=num_filter)
        relui = mx.symbol.Activation(data=convi, act_type='relu')
        pooli = mx.symbol.Pooling(data=relui, pool_type='max',
                                  kernel=(sentence_size - filter_size + 1, 1),
                                  stride=(1, 1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    total_filters = num_filter * len(filter_list)
    concat = mx.symbol.Concat(*pooled_outputs, dim=1)
    # reshape for the next layer
    h_pool = mx.symbol.Reshape(data=concat, target_shape=(batch_size, total_filters))
    h_drop = mx.sym.Dropout(data=h_pool, p=0.5)
    # fix the length of the title output
    title = mx.sym.FullyConnected(data=h_drop, num_hidden=k/2, name='title_Fc')

    # category latent features
    cat = mx.symbol.FullyConnected(data=cat, num_hidden=k/2, name='cat_Fc')

    # concatenate all item info
    item = mx.symbol.concat(itemid, title, cat, dim=1)
    item = mx.symbol.FullyConnected(data=item, num_hidden=k)
    item = mx.symbol.Activation(data=item, act_type="relu")

    pred = calc_cos_sim(user, item, 1, 5)
    pred = mx.symbol.Flatten(data=pred)
    # loss layer
    pred = mx.symbol.LinearRegressionOutput(data=pred, label=score)
    return pred
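calc_cos_sim is carried over from the previous article. For readability, here is a minimal sketch of what such a helper could look like, assuming it rescales the cosine similarity of the two k-dimensional latent vectors from [-1, 1] into the rating range [low, high]; the original implementation may differ:

def calc_cos_sim(user, item, low, high):
    # Hypothetical reconstruction (the real helper is defined in the
    # previous article): cosine similarity of the user and item latent
    # vectors, rescaled from [-1, 1] into the rating range [low, high].
    dot = mx.symbol.sum(user * item, axis=1, keepdims=True)
    user_norm = mx.symbol.sqrt(mx.symbol.sum(user * user, axis=1, keepdims=True))
    item_norm = mx.symbol.sqrt(mx.symbol.sum(item * item, axis=1, keepdims=True))
    cos = dot / (user_norm * item_norm)
    return low + (cos + 1) / 2 * (high - low)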
3. Training
def trainingModel():
    TRAIN_DIR = 'C:/Users/chuanxie/PycharmProjects/mxnetlearn/data/movie/'
    ratingdf = LoadRatingData(TRAIN_DIR + 'ml-1m/ratings.dat', delimiter='\t')
    userdf = LoadUserData(TRAIN_DIR + 'ml-1m/users.dat', delimiter='\t')
    itemdf = LoadItemData(TRAIN_DIR + 'ml-1m/movies.dat', delimiter='\t')
    np_encodedcat = encodeTag(itemdf)
    print 'ratingdf.shape:', ratingdf.shape
    print 'np_encodedcat.shape:', np_encodedcat.shape
    fulldf = ratingdf.join(userdf, on='userid') \
                     .join(itemdf, on='itemid') \
                     .join(np_encodedcat, on='itemid')

    # titles
    titlematrix = fulldf['title'].as_matrix()
    titles = [s.split(" ") for s in titlematrix]
    title_arr, vocab_size = buildTitles(titles)
    sentence_size = len(title_arr[0])
    print 'title_arr.shape:', title_arr.shape

    # reconstruct series to dataframe
    matrix_encoded_cat = fulldf['encoded_cat'].as_matrix()
    df_encoded_cat = np.array(matrix_encoded_cat.tolist())
    print 'df_encoded_cat.shape:', df_encoded_cat.shape

    data = np.array([fulldf['userid'], fulldf['gender'], fulldf['age'], fulldf['job'],
                     fulldf['itemid'], title_arr, fulldf['encoded_cat']])
    print 'train data shape:', data.shape
    label = np.array([fulldf['score']])

    context = mx.gpu()
    BATCH_SIZE = 400
    num_epoch = 200
    trainIter = CustDataIter2(
        ['userid', 'gender', 'age', 'job', 'itemid', 'title', 'cat'], data,
        [(BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,),
         (BATCH_SIZE, sentence_size), (BATCH_SIZE, df_encoded_cat.shape[1])],
        ['score'], label, [(BATCH_SIZE,)],
        context, BATCH_SIZE, data.shape[1] / BATCH_SIZE)

    max_userid = pd.Series(fulldf['userid']).max()
    max_itemid = pd.Series(fulldf['itemid']).max()
    max_gender = pd.Series(fulldf['gender']).max()
    max_age = pd.Series(fulldf['age']).max()
    max_job = pd.Series(fulldf['job']).max()
    max_title = vocab_size

    net = get_one_layer_mlp(max_userid=max_userid, max_itemid=max_itemid,
                            max_gender=max_gender, max_age=max_age,
                            max_job=max_job, max_title=max_title,
                            sentence_size=sentence_size, k=96,
                            batch_size=BATCH_SIZE)
    mx.viz.plot_network(net, shape={'userid': (BATCH_SIZE,), 'gender': (BATCH_SIZE,),
                                    'age': (BATCH_SIZE,), 'job': (BATCH_SIZE,),
                                    'itemid': (BATCH_SIZE,),
                                    'title': (BATCH_SIZE, sentence_size),
                                    'cat': (BATCH_SIZE, 73)}).view()
    # Train module
    train(net, trainIter, None, context, num_epoch=num_epoch,
          learning='rmsprop', learning_rate=0.001)
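train, along with LoadRatingData, LoadUserData, LoadItemData, encodeTag and CustDataIter2, comes from the previous article. For orientation, a minimal sketch of what the train helper could look like using the Module API; the signature is taken from the call above, the body is an assumption:

def train(net, train_iter, val_iter, context, num_epoch, learning, learning_rate):
    # Hypothetical sketch of the train() helper from the previous article:
    # a plain mx.mod.Module fit loop with an RMSE metric.
    mod = mx.mod.Module(symbol=net,
                        data_names=['userid', 'gender', 'age', 'job',
                                    'itemid', 'title', 'cat'],
                        label_names=['score'],
                        context=context)
    mod.fit(train_iter,
            eval_data=val_iter,
            eval_metric='rmse',
            optimizer=learning,                      # 'rmsprop' above
            optimizer_params={'learning_rate': learning_rate},
            num_epoch=num_epoch)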
After adding the CNN, both GPU memory consumption and computation increased enormously. Without the CNN I could use a batch size of 10K; with it, the batch size had to come down to 500 (my graphics card has only 1 GB of memory, and the screen went black whenever the batch size got larger). The time for one training epoch also rose from under 10 s to over 2 minutes. A rough activation-size estimate follows the training log below.
INFO:root:Epoch[0] Train-rmse=0.975520
INFO:root:Epoch[0] Time cost=136.348
INFO:root:Epoch[1] Train-rmse=0.933322
INFO:root:Epoch[1] Time cost=134.050
INFO:root:Epoch[2] Train-rmse=0.912823
INFO:root:Epoch[2] Time cost=133.927
INFO:root:Epoch[3] Train-rmse=0.899103
INFO:root:Epoch[3] Time cost=133.942
INFO:root:Epoch[4] Train-rmse=0.893966
INFO:root:Epoch[4] Time cost=134.107
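A rough, assumption-laden estimate of the title branch's forward activations shows why the batch size matters so much. Here sentence_size is taken as 16 and num_embed as 48 (k = 96); MXNet additionally allocates gradient buffers and a convolution workspace on top of this, and the other branches of the network are not counted:

# Back-of-envelope: float32 forward activations of the title-CNN branch.
# sentence_size=16 and num_embed=48 are assumed values for illustration.
def title_branch_mb(batch_size, sentence_size=16, num_embed=48,
                    num_filter=50, filter_list=(3, 4, 5)):
    embed = batch_size * sentence_size * num_embed
    convs = sum(batch_size * num_filter * (sentence_size - f + 1)
                for f in filter_list)
    return (embed + convs) * 4 / 1024.0 / 1024.0

print title_branch_mb(10000)  # ~103.7 MB forward alone (plus backward/workspace)
print title_branch_mb(500)    # ~5.2 MB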