Document classification by inversion of distributed language representations
来源:互联网 发布:工业三维建模软件 编辑:程序博客网 时间:2024/05/21 09:22
In [1]:
# An example binary huffman tree.!dot -Tpng paper/graphs/bht.dot -o paper/graphs/bht.pngfrom IPython.display import ImageImage(filename='paper/graphs/bht.png') # Note that it is a prefix tree: you know the total length after each point given bits to that point.
Out[1]:
In [2]:
import sys
import numpy as np
import pandas as pd
from copy import deepcopy
from gensim.models import Word2Vec
from gensim.models import Phrases

# Peek at one raw 1-star training review to sanity-check the data format.
fin = open("data/yelptrain1star.txt")
firstbadreview = fin.readline()
print(firstbadreview)
In [3]:
## define a review generator
import re

# Sentence-terminal "!" or "?" (with surrounding spaces); these are promoted
# to their own " . " sentence breaks below so they count as boundaries.
alteos = re.compile(r'( [!\?] )')

def revsplit(l):
    """Split one raw review line into a list of sentences (token lists).

    "!"/"?" are kept as tokens but also treated as sentence boundaries.
    NOTE(review): str.rstrip takes a *character set*, not a regex -- this
    strips any trailing '(', ')', '*', '\\', '.', space, or newline chars.
    """
    l = alteos.sub(r' \1 . ', l).rstrip("( \. )*\n")
    return [s.split() for s in l.split(" . ")]

def YelpReviews(stars=(1, 2, 3, 4, 5), prefix="train"):
    """Yield one parsed review (list of sentences) per line of each star file.

    FIX: the default for `stars` was a mutable list, a classic shared-default
    pitfall; an immutable tuple with the same values is backward compatible
    (the argument is only iterated).

    stars  -- iterable of star ratings to read (default: all five).
    prefix -- "train" or "test", selecting data/yelp<prefix><n>star.txt.
    """
    for nstar in stars:
        for line in open("data/yelp%s%dstar.txt" % (prefix, nstar)):
            yield revsplit(line)
In [4]:
## grab all sentences: good, bad, and ugly (every star rating, train set)
allsentences = [sentence for review in YelpReviews() for sentence in review]
len(allsentences)
Out[4]:
In [5]:
# Coarse sentiment grouping: 1-2 star reviews are "neg", 5-star reviews are
# "pos"; 3-4 star reviews are held out of the coarse task.
docgrp = {'neg': [1, 2], 'pos': [5]}
list(docgrp)
Out[5]:
In [6]:
# Load the training reviews for each coarse group and count documents.
reviews = {g: list(YelpReviews(docgrp[g])) for g in docgrp}
ndoc = pd.Series({g: len(reviews[g]) for g in docgrp}, dtype="float64")
In [7]:
# Peek at sentences 6-9 of the first negative-group review (each sentence
# is a list of tokens).
reviews['neg'][0][6:10]
Out[7]:
In [8]:
# Build one shared vocabulary (and huffman coding) that all of the
# per-group models will start from, so their scores are comparable.
jointmodel = Word2Vec(workers=4)
np.random.shuffle(allsentences)
jointmodel.build_vocab(allsentences)
In [9]:
# One word2vec model per coarse group, each starting from the shared
# vocabulary/huffman tree of jointmodel.
model = { g: deepcopy(jointmodel) for g in docgrp }
In [10]:
def trainW2V(g, T=20):
    """Train the group-`g` word2vec model for T epochs on that group's
    sentences, decaying the learning rate by 0.9 each epoch.

    alpha and min_alpha are pinned together so every epoch runs at a single
    fixed learning rate; the sentence list is reshuffled before each epoch.
    """
    m = model[g]
    sent = [sentence for review in reviews[g] for sentence in review]
    m.min_alpha = m.alpha
    for t in range(T):
        print(t, end=" ")
        np.random.shuffle(sent)
        m.train(sent)
        m.alpha *= 0.9
        m.min_alpha = m.alpha
    print(".")
In [11]:
# Fit one model per coarse sentiment group.
for g in docgrp:
    print(g, end=": ")
    trainW2V(g)
In [12]:
def nearby(word, g):
    """Print `word` and its nearest neighbors under group `g`'s model."""
    print(word)
    print("%s:" % str(g), end=" ")
    neighbors = model[g].most_similar([word])
    for neighbor, _score in neighbors:
        print(neighbor, end=" ")
    print("\n")
In [13]:
# Compare nearest neighbors across the groups for a few probe words.
for g in docgrp:
    nearby("food", g)
for g in docgrp:
    nearby("service", g)
for g in docgrp:
    nearby("value", g)
In [14]:
# Held-out test reviews for each coarse group, parsed like the train set.
testrev = { g: list(YelpReviews(docgrp[g], "test")) for g in docgrp }
In [15]:
def getprobs(rev, grp, models=None):
    """Document-level class probabilities by inverting per-group models.

    Each model scores every sentence (log likelihood); Bayes' rule with a
    flat prior turns the scores into per-sentence class probabilities,
    which are then averaged within each document.

    rev    -- list of documents, each a list of sentences (token lists).
    grp    -- iterable of group labels; becomes the output columns.
    models -- mapping label -> model with a .score(sentences) method.
              Defaults to the notebook-global `model` dict (backward
              compatible with the original two-argument signature).
    Returns a DataFrame: one row per document, one probability column per
    group, rows summing to 1.
    """
    if models is None:
        models = model
    # Flatten to (doc_index, sentence) pairs so scores can be grouped back.
    sentences = [(i, s) for i, r in enumerate(rev) for s in r]
    eta = pd.DataFrame({g: models[g].score([s for i, s in sentences]) for g in grp})
    # Softmax across groups, subtracting the row max for numerical stability.
    probs = eta.subtract(eta.max('columns'), 'rows')
    probs = np.exp(probs)
    probs = probs.divide(probs.sum('columns'), "rows")
    # Average sentence probabilities within each document via a count column.
    probs['cnt'] = 1
    probs = probs.groupby([i for i, s in sentences]).sum()
    # FIX: positional axis in drop() was removed in pandas 2.0; use keyword.
    probs = probs.divide(probs["cnt"], 'rows').drop("cnt", axis=1)
    return probs
In [16]:
# Out-of-sample class probabilities for each true group's test reviews.
probs = {g: getprobs(testrev[g], docgrp) for g in docgrp }
In [17]:
import matplotlib.pyplot as plt%matplotlib inlinefig = plt.figure(figsize=(7,4))plt.hist(probs['neg']['pos'],normed=1, color="red", alpha=.6, label="true negative", linewidth=1)plt.hist(probs['pos']['pos'],normed=1, color="yellow", alpha=.6, label="true positive", linewidth=1)plt.xlim([0,1])plt.ylim([0,5])plt.legend(frameon=False, loc='upper center')plt.xlabel("prob positive")plt.ylabel("density")#fig.savefig("graphs/coarseprobs.pdf", format="pdf", bbox_inches="tight")
Out[17]:
In [18]:
# Predict the argmax-probability class and tabulate misclassification.
yhat = {g: probs[g].idxmax('columns') for g in docgrp}
mc = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp},
    'n': {g: len(testrev[g]) for g in docgrp},
})
print(mc)
# Overall rate = n-weighted average of the per-class misclassification rates.
overall = mc.product("columns").sum() / mc['n'].sum()
print("\nOverall MCR: %.3f" % overall)
In [19]:
# Stack the P(pos) scores for all test docs (neg block first, then pos),
# and flatten each review back to a single token list for display below.
svec = np.concatenate((probs['neg']['pos'], probs['pos']['pos']), axis=0)
allrev = [[w for s in r for w in s] for r in testrev['neg'] + testrev['pos']]
In [20]:
import pandas as pd

# Show the reviews the classifier is most / least confident are positive.
diff = pd.Series(svec)
# FIX: Series.order() was removed from pandas (deprecated 0.17, later
# dropped). nlargest/nsmallest produce the same top/bottom-5 selections
# and exist in both old (>= 0.14) and current pandas.
tops = diff.nlargest(5)
print("TOPS\n")
for i in tops.index:
    print(" ".join(allrev[i]), end="\n\n")
bots = diff.nsmallest(5)
print("BOTTOMS\n")
for i in bots.index:
    print(" ".join(allrev[i]), end="\n\n")
In [21]:
# Fine-grained grouping: one class per star rating, keyed by string label.
docgrp_fine = {str(s): [s] for s in range(1, 6)}
docgrp_fine
Out[21]:
In [22]:
# Load the per-star training reviews and fit one word2vec model per rating,
# each seeded from the shared jointmodel vocabulary.
for g in docgrp_fine:
    print(g, end=": ")
    reviews[g] = list(YelpReviews(docgrp_fine[g]))
    model[g] = deepcopy(jointmodel)
    trainW2V(g)
In [23]:
# Score the fine-grained test reviews and predict the argmax star class.
for g in docgrp_fine:
    testrev[g] = list(YelpReviews(docgrp_fine[g], "test"))
    probs[g] = getprobs(testrev[g], docgrp_fine)
    yhat[g] = probs[g].idxmax("columns")
In [24]:
# Fine-grained misclassification table and its n-weighted overall rate.
mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine},
})
print(mc_fine)
ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum() / ntest
print("\nOverall Fine-Scale MCR: %.3f" % overall_fine)
In [25]:
# Detect common bigram phrases (e.g. "ice_cream") over the full corpus;
# higher threshold means fewer, stronger phrases.
phraser = Phrases(allsentences,threshold=5.0)
In [26]:
# Show the first sentence with detected phrases merged into single tokens.
for w in phraser[allsentences[0]]: print(w, end=" ")
In [27]:
# Write one "docid|word|stars|sample" row per phrased token for downstream
# modeling; `i` is a running document id across train then test.
i = 0
# FIX: use a context manager so the output file is flushed and closed
# (the original left `fout` open with no close()).
with open("data/yelp_phrases.txt", "w") as fout:
    for samp in ["train", "test"]:
        for stars in range(1, 6):
            if samp == "train":
                rev = reviews[str(stars)]
            else:
                rev = testrev[str(stars)]
            for r in rev:
                for s in r:
                    for w in phraser[s]:
                        if "|" not in w:  # "|" is the field delimiter
                            fout.write("%d|%s|%d|%s\n" % (i, w, stars, samp))
                i += 1  # advance the document id once per review
In [28]:
# Fine-grained probability matrices for every train/test star slice,
# stacked (train blocks first, then test) and written out for R.
Pfine = {}
for stars in range(1, 6):
    print(stars)
    Pfine['train%d' % stars] = getprobs(reviews[str(stars)], docgrp_fine)
    Pfine['test%d' % stars] = getprobs(testrev[str(stars)], docgrp_fine)
pmatfine = pd.concat(
    [Pfine['train%d' % s] for s in range(1, 6)]
    + [Pfine['test%d' % s] for s in range(1, 6)]
)
pmatfine.to_csv("data/yelpw2vprobs.csv", index=False)
In [29]:
# Document counts per star rating for the train and test samples.
ntrain = [len(reviews[grp]) for grp in docgrp_fine]
ntest = [len(testrev[grp]) for grp in docgrp_fine]
In [30]:
# Total number of documents across both samples.
sum(ntrain)+sum(ntest)
Out[30]:
In [31]:
# Sanity check: the running doc id from the phrase-file loop should equal
# the total document count above.
i
Out[31]:
In [32]:
# A single model over ALL reviews (shared vocabulary), used to build
# aggregate document vectors below.
fullmodel = deepcopy(jointmodel)
In [33]:
# Train the all-reviews model: 10 epochs, learning rate decayed by 0.9 each
# epoch, with min_alpha pinned so every epoch runs at one fixed rate.
fullmodel.min_alpha = fullmodel.alpha
for t in range(10):
    print(t, end=" ")
    np.random.shuffle(allsentences)
    fullmodel.train(allsentences)
    fullmodel.alpha *= 0.9
    fullmodel.min_alpha = fullmodel.alpha
print(".")
In [34]:
def aggvec(rev, w2v=None):
    """Aggregate word2vec vector for one review: the mean over sentences of
    each sentence's mean word vector (out-of-vocabulary words are skipped).

    rev -- list of sentences, each a list of word tokens.
    w2v -- word-vector model exposing `layer1_size` and `w2v[word]` lookup
           that raises KeyError for OOV words; defaults to the notebook's
           global `fullmodel` (backward compatible with the 1-arg call).

    BUG FIX: the original initialized `ns = 0.0` but never incremented it,
    so the final `if ns > 0` division never ran and the function returned
    the SUM of per-sentence means rather than their average. `ns` now counts
    each non-empty sentence.
    """
    if w2v is None:
        w2v = fullmodel
    av = np.zeros(w2v.layer1_size)
    ns = 0.0
    for s in rev:
        sv = np.zeros(w2v.layer1_size)
        nw = 0.0
        for w in s:
            nw += 1.0  # counts every token, including OOV ones
            try:
                sv += w2v[w]
            except KeyError:
                pass  # out-of-vocabulary word: contributes nothing
        if nw > 0.0:
            av += sv / nw
            ns += 1.0  # FIX: count this sentence toward the average
    if ns > 0:
        av = av / ns
    return av
In [35]:
# Aggregate vector for every document, train blocks then test blocks,
# in the same order as the phrase file; progress printed every 10k docs.
i = 0
AV = np.zeros((sum(ntrain) + sum(ntest), fullmodel.layer1_size))
for samp in ["train", "test"]:
    for stars in range(1, 6):
        rev = reviews[str(stars)] if samp == "train" else testrev[str(stars)]
        for r in rev:
            AV[i, :] = aggvec(r)
            i += 1
            if np.remainder(i, 10000) == 0:
                print(i)
In [36]:
# Persist the aggregate document vectors, pipe-delimited to match the
# phrase file's field separator.
np.savetxt("data/yelp_vectors.txt", AV, delimiter="|", fmt='%.6f')
In [37]:
# Sanity check: row counter should equal the total document count.
i
Out[37]:
In [38]:
from gensim.models.doc2vec import *

def YelpLabeledSentence(stars=(1, 2, 3, 4, 5), prefix="train"):
    """Yield one LabeledSentence per sentence, labeled "<prefix>-<star>-<doc>".

    Parsing mirrors YelpReviews (promote "!"/"?" to sentence breaks, strip
    trailing punctuation characters, split on " . "). All sentences of a
    document share that document's label.

    FIX: the default for `stars` was a mutable list; an immutable tuple with
    the same values is backward compatible (the argument is only iterated).
    """
    for nstar in stars:
        i = 0
        for line in open("data/yelp%s%dstar.txt" % (prefix, nstar)):
            line = alteos.sub(r' \1 . ', line).rstrip("( \. )*\n")
            lab = "%s-%d-%d" % (prefix, nstar, i)
            rev = [s.split() for s in line.split(" . ")]
            i += 1
            for s in rev:
                yield LabeledSentence(s, [lab])
In [39]:
# Materialize the labeled sentences: training iterates them many times,
# so generators must be converted to lists up front.
trainsent = list(YelpLabeledSentence())
testsent = list(YelpLabeledSentence(prefix="test"))
In [40]:
mdm0 = Doc2Vec(workers=4, size=100, window=5, dm=0)mdm1 = Doc2Vec(workers=4, size=100, window=5, dm=1)%time mdm0.build_vocab(trainsent+testsent)%time mdm1.build_vocab(trainsent+testsent)
In [41]:
def trainD2V(mod, sent, T=20):
    """Run T doc2vec training epochs with a 0.9 learning-rate decay.

    mod  -- a doc2vec model exposing alpha/min_alpha and train(sentences).
    sent -- sentence list; shuffled in place before every epoch.
    T    -- number of epochs (default 20).

    alpha and min_alpha are pinned to each other so each epoch runs at a
    single fixed learning rate.
    """
    decay = 0.9
    mod.min_alpha = mod.alpha
    for t in range(T):
        print(t, end=" ")
        np.random.shuffle(sent)
        mod.train(sent)
        mod.alpha = mod.alpha * decay
        mod.min_alpha = mod.alpha
    print(".")
In [42]:
%time trainD2V(mdm0, trainsent)%time trainD2V(mdm1, trainsent)
In [43]:
## turn of training of word vecs, just score label vecsmdm0.train_words=Falsemdm1.train_words=False%time trainD2V(mdm0, testsent)%time trainD2V(mdm1, testsent)
In [44]:
def writeD2V(mod, fname, prefix):
    """Dump the document vectors whose labels match `prefix` to
    data/<fname>.csv: rows indexed by label ("prefix-stars-i"), columns
    x1..xD for the vector plus a final "stars" rating column.
    """
    labels = []
    stars_col = np.empty(0)
    vecs = np.empty([0, mod.syn0.shape[1]])
    for stars in range(1, 6):
        # Labels for this (prefix, stars) slice, in vocab order.
        labs = [w for w in mod.vocab if re.match("%s-%d-\d+" % (prefix, stars), w)]
        labels += labs
        rows = [mod.vocab[w].index for w in labs]
        stars_col = np.append(stars_col, np.repeat(stars, len(rows)))
        vecs = np.vstack((vecs, mod.syn0[rows, :]))
    veclab = ["x%d" % d for d in range(1, vecs.shape[1] + 1)]
    df = pd.DataFrame(vecs, index=labels, columns=veclab)
    df["stars"] = stars_col
    df.to_csv("data/%s.csv" % fname, index_label="id")
In [45]:
for prefix in ["train","test"]: writeD2V(mdm0, "yelpD2V%s0"%prefix, prefix) writeD2V(mdm1, "yelpD2V%s1"%prefix, prefix)
In [50]:
# Fit the downstream regression models in R on the exported feature files.
!Rscript code/linmod.R
In [55]:
# Display the results figure produced by the R script.
Image(filename='paper/graphs/yelp_logistic.png', width=600,height=300)
Out[55]:
In [ ]:
This website does not host notebooks, it only renders notebooks available on other websites.
Delivered by Fastly, Rendered by Rackspace
nbviewer GitHub repository.
nbviewer version: 08dff96
notebook version:
nbconvert version: 4.1.0
Rendered 12 days ago
0 0
- Document classification by inversion of distributed language representations
- 《Document Classification by Inversion of Distributed Language Representations》分享
- 【deep learning学习笔记】Distributed Representations of Sentences and Documents
- Distributed Representations of Words and Phrasesand their Compositionality
- Distributed Representations of Words and Phrasesand their Compositionality
- [NLP论文阅读]Distributed Representations of Sentences and Documents
- 《Distributed Representations of Words and Phrases and their Compositionality》笔记
- 翻译:Distributed Representations of Words and Phrases and their Compositionality
- NLP-文献-Distributed Representations of Sentences and Documents
- The Survey of Programming Language Classification
- 读论文《Distributed Representations of Words and Phrases and their Compositionality》
- Get the distributed keys of Greenplum by SQL query
- Ending Spam: Bayesian Content Filtering and the Art of Statistical Language Classification
- Multi-Language Programming : Distributed Object
- Implementation of Hierarchical Attention Networks for Document Classification的讲解与Tensorflow实现
- Are sparse representations really relevant for image classification?
- How to enable the use of 'Ad Hoc Distributed Queries' by using sp_configure
- hive order by,sort by,distributed by
- warning: ISO C90 forbids mixed declarations and code
- 如何防止运营商网络劫持,避免被他人强行插入广告?
- TCP实现多个客户端与服务端 数据 传输
- 开局首季问大势(2016.5.9)
- 1.1 背景相关与系统架构
- Document classification by inversion of distributed language representations
- hdu 1598(最小生成树变形)
- ListView之ContextMenu无法弹出
- Asianux 3 linux系统下GitLab客户端安装
- unity 基于socket的多人群聊实现
- 数据库的百花争艳
- javascript库之Mustache库使用说明
- C#点滴
- python读取大文件并逐行写入另外一个文件