Recommendation Algorithm Study 2: MXNet MovieLens Hybrid Personalized Recommendation (Continued) - Adding CNN Text Processing

Strictly speaking, the network definition in the previous article did not exactly follow the example given in paddle2. This article builds on the previous one by adding a CNN to extract features from the movie titles. The CNN text processing follows the MXNet tutorial "Text Classification Using a Convolutional Neural Network on MXNet" and the paper "Convolutional Neural Networks for Sentence Classification".


The additions are as follows:

1. Title preprocessing: build a vocabulary from the words appearing in the titles, and pad every title to the maximum length so that the data has a fixed width, much like images of the same size. This makes it possible to apply convolution kernels of different sizes.

import itertools
from collections import Counter

import pandas as pd


def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def buildTitles(sentences):
    """
    Maps the padded title sentences to index vectors based on the vocabulary.
    Returns the index vectors and the vocabulary size.
    """
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    vocab_size = len(vocabulary)
    x = pd.Series([[vocabulary[word] for word in sentence] for sentence in sentences_padded])
    return (x, vocab_size)
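
For illustration, a quick check of buildTitles on a few hand-picked titles (the titles below are hypothetical inputs, not part of the original pipeline): the shorter titles are padded with "</s>" and every title comes back as an index sequence of the same length.

# hypothetical sample titles, only to illustrate the padding and vocabulary mapping
sample_titles = [s.split(" ") for s in ["Toy Story (1995)", "Jumanji (1995)", "Waiting to Exhale (1995)"]]
title_ids, vocab_size = buildTitles(sample_titles)
print title_ids[0]   # index sequence for "Toy Story (1995)", padded to the longest title
print vocab_size     # number of distinct tokens, including the padding word "</s>"
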
2. Network definition

def get_one_layer_mlp(max_userid, max_itemid, max_gender, max_age, max_job,
                      max_title, sentence_size, k, batch_size):
    # user profile
    userid = mx.symbol.Variable('userid')
    gender = mx.symbol.Variable('gender')
    age = mx.symbol.Variable('age')
    job = mx.symbol.Variable('job')
    # item profile
    itemid = mx.symbol.Variable('itemid')
    title = mx.symbol.Variable('title')
    cat = mx.symbol.Variable('cat')
    score = mx.symbol.Variable('score')

    # user latent features
    userid = mx.symbol.Embedding(data=userid, input_dim=max_userid, output_dim=k/2, name='userid_Embedding')
    userid = mx.symbol.FullyConnected(data=userid, num_hidden=k/2)
    gender = mx.symbol.Embedding(data=gender, input_dim=max_gender, output_dim=k/4, name='gender_Embedding')
    gender = mx.symbol.FullyConnected(data=gender, num_hidden=k/4)
    age = mx.symbol.Embedding(data=age, input_dim=max_age, output_dim=k/2, name='age_Embedding')
    age = mx.symbol.FullyConnected(data=age, num_hidden=k/2)
    job = mx.symbol.Embedding(data=job, input_dim=max_job, output_dim=k/2, name='job_Embedding')
    job = mx.symbol.FullyConnected(data=job, num_hidden=k/2)
    user = mx.symbol.concat(userid, gender, age, job, dim=1)
    user = mx.symbol.FullyConnected(data=user, num_hidden=k)
    user = mx.symbol.Activation(data=user, act_type="relu")

    # item latent features
    itemid = mx.symbol.Embedding(data=itemid, input_dim=max_itemid, output_dim=k/2, name='itemid_Embedding')
    itemid = mx.symbol.FullyConnected(data=itemid, num_hidden=k/2)

    # item title features: embed the word indices, then treat each title as a
    # 1 x sentence_size x num_embed "image" for the convolution layers
    num_embed = k/2
    embed_layer = mx.symbol.Embedding(data=title, input_dim=max_title, output_dim=num_embed, name='title_Embedding')
    conv_input = mx.symbol.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed))

    # create a convolution + (max) pooling layer for each filter size
    filter_list = [3, 4, 5]  # the filter sizes to use
    num_filter = 50
    pooled_outputs = []
    for i, filter_size in enumerate(filter_list):
        convi = mx.symbol.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relui = mx.symbol.Activation(data=convi, act_type='relu')
        pooli = mx.symbol.Pooling(data=relui, pool_type='max',
                                  kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    total_filters = num_filter * len(filter_list)
    concat = mx.symbol.Concat(*pooled_outputs, dim=1)
    # reshape for the next layer
    h_pool = mx.symbol.Reshape(data=concat, target_shape=(batch_size, total_filters))
    h_drop = mx.sym.Dropout(data=h_pool, p=0.5)
    # fix the length of the title output
    title = mx.sym.FullyConnected(data=h_drop, num_hidden=k/2, name='title_Fc')

    # categories latent features
    cat = mx.symbol.FullyConnected(data=cat, num_hidden=k/2, name='cat_Fc')

    # concatenate all item info
    item = mx.symbol.concat(itemid, title, cat, dim=1)
    item = mx.symbol.FullyConnected(data=item, num_hidden=k)
    item = mx.symbol.Activation(data=item, act_type="relu")

    pred = calc_cos_sim(user, item, 1, 5)
    pred = mx.symbol.Flatten(data=pred)
    # loss layer
    pred = mx.symbol.LinearRegressionOutput(data=pred, label=score)
    return pred
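
calc_cos_sim is reused from the previous article and is not shown again here. As a reminder of what it does, here is a minimal sketch, assuming it computes the cosine similarity of the user and item vectors and rescales it to the rating range [1, 5]; the actual implementation in the previous article may differ.

# Hypothetical sketch of calc_cos_sim (the real version is defined in the previous article).
def calc_cos_sim(user, item, low, high):
    # cosine similarity between the user and item latent vectors
    dot = mx.symbol.sum(user * item, axis=1, keepdims=True)
    user_norm = mx.symbol.sqrt(mx.symbol.sum(user * user, axis=1, keepdims=True))
    item_norm = mx.symbol.sqrt(mx.symbol.sum(item * item, axis=1, keepdims=True))
    cos = dot / (user_norm * item_norm)
    # map the cosine from [-1, 1] onto the rating scale [low, high]
    return low + (cos + 1) * (high - low) / 2.0
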

3. Training

def trainingModel():
    TRAIN_DIR = 'C:/Users/chuanxie/PycharmProjects/mxnetlearn/data/movie/'
    ratingdf = LoadRatingData(TRAIN_DIR + 'ml-1m/ratings.dat', delimiter='\t')
    userdf = LoadUserData(TRAIN_DIR + 'ml-1m/users.dat', delimiter='\t')
    itemdf = LoadItemData(TRAIN_DIR + 'ml-1m/movies.dat', delimiter='\t')
    np_encodedcat = encodeTag(itemdf)
    print 'ratingdf.shape:', ratingdf.shape
    print 'np_encodedcat.shape:', np_encodedcat.shape
    fulldf = ratingdf.join(userdf, on='userid').join(itemdf, on='itemid').join(np_encodedcat, on='itemid')

    # titles
    titlematrix = fulldf['title'].as_matrix()
    titles = [s.split(" ") for s in titlematrix]
    title_arr, vocab_size = buildTitles(titles)
    sentence_size = len(title_arr[0])
    print 'title_arr.shape:', title_arr.shape

    # reconstruct series to dataframe
    matrix_encoded_cat = fulldf['encoded_cat'].as_matrix()
    df_encoded_cat = np.array(matrix_encoded_cat.tolist())
    print 'df_encoded_cat.shape:', df_encoded_cat.shape

    data = np.array([fulldf['userid'], fulldf['gender'], fulldf['age'], fulldf['job'],
                     fulldf['itemid'], title_arr, fulldf['encoded_cat']])
    print 'train data shape:', data.shape
    label = np.array([fulldf['score']])

    context = mx.gpu()
    BATCH_SIZE = 400
    num_epoch = 200
    trainIter = CustDataIter2(['userid', 'gender', 'age', 'job', 'itemid', 'title', 'cat'], data,
                              [(BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,),
                               (BATCH_SIZE, sentence_size), (BATCH_SIZE, df_encoded_cat.shape[1])],
                              ['score'], label, [(BATCH_SIZE,)], context, BATCH_SIZE, data.shape[1] / BATCH_SIZE)

    max_userid = pd.Series(fulldf['userid']).max()
    max_itemid = pd.Series(fulldf['itemid']).max()
    max_gender = pd.Series(fulldf['gender']).max()
    max_age = pd.Series(fulldf['age']).max()
    max_job = pd.Series(fulldf['job']).max()
    max_title = vocab_size

    net = get_one_layer_mlp(max_userid=max_userid, max_itemid=max_itemid, max_gender=max_gender,
                            max_age=max_age, max_job=max_job, max_title=max_title,
                            sentence_size=sentence_size, k=96, batch_size=BATCH_SIZE)
    mx.viz.plot_network(net, shape={'userid': (BATCH_SIZE,), 'gender': (BATCH_SIZE,), 'age': (BATCH_SIZE,),
                                    'job': (BATCH_SIZE,), 'itemid': (BATCH_SIZE,),
                                    'title': (BATCH_SIZE, sentence_size), 'cat': (BATCH_SIZE, 73)}).view()

    # train the module
    train(net, trainIter, None, context, num_epoch=num_epoch, learning='rmsprop', learning_rate=0.001)
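
The train helper and CustDataIter2 also come from the previous article. As a rough sketch, and assuming the custom iterator behaves like a standard mx.io.DataIter (i.e. exposes provide_data and provide_label), train could be a plain Module fit loop like the one below; the real helper may differ.

# Hypothetical sketch of the train() helper reused from the previous article.
def train(net, train_iter, val_iter, context, num_epoch, learning, learning_rate):
    # derive the data/label names from the iterator's provide_data / provide_label
    data_names = [d[0] for d in train_iter.provide_data]
    label_names = [l[0] for l in train_iter.provide_label]
    mod = mx.mod.Module(symbol=net, data_names=data_names,
                        label_names=label_names, context=context)
    mod.fit(train_iter,
            eval_data=val_iter,
            eval_metric='rmse',
            optimizer=learning,                      # e.g. 'rmsprop'
            optimizer_params={'learning_rate': learning_rate},
            num_epoch=num_epoch)
    return mod
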

After adding the CNN, both GPU memory consumption and the amount of computation increase dramatically. Before the CNN I could use a batch size of 10K; with the CNN it can only be set to about 500. My graphics card has only 1 GB of memory, and with a larger batch size the screen simply goes black. The time for one training epoch also went up from under 10 seconds to more than 2 minutes.

INFO:root:Epoch[0] Train-rmse=0.975520
INFO:root:Epoch[0] Time cost=136.348
INFO:root:Epoch[1] Train-rmse=0.933322
INFO:root:Epoch[1] Time cost=134.050
INFO:root:Epoch[2] Train-rmse=0.912823
INFO:root:Epoch[2] Time cost=133.927
INFO:root:Epoch[3] Train-rmse=0.899103
INFO:root:Epoch[3] Time cost=133.942
INFO:root:Epoch[4] Train-rmse=0.893966
INFO:root:Epoch[4] Time cost=134.107
