Machine Learning Systems Design: Study Notes
Chapter 1

# Timing the same computation three ways: a pure-Python generator expression,
# naive NumPy (sum over an element-wise product) and the vectorized dot product.
import timeit

normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))', number=10000)  # range() here; the book's Python 2 code used xrange
naive_np_sec = timeit.timeit('sum(na*na)',
                             setup="import numpy as np; na=np.arange(1000)",
                             number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
                            setup="import numpy as np; na=np.arange(1000)",
                            number=10000)
print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)


# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01
import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt

sp.random.seed(3)  # to reproduce the data later on

x = sp.arange(1, 31 * 24)
y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
y += gamma.rvs(15, loc=0, scale=100, size=len(x))  # add gamma-distributed noise
y += 2 * sp.exp(x / 100.0)
y = sp.ma.array(y, mask=[y < 0])  # masked array: entries below zero are marked invalid
print(sum(y), sum(y < 0))

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]],
           ['week %i' % (w + 1) for w in [0, 1, 2, 3, 4]])
plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join("..", "1400_01_01.png"))

# note the path handling: os.path.join, os.path.dirname, os.path.realpath(__file__)
data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
#            zip(x[~y.mask], y[~y.mask]), delimiter="\t", fmt="%i")
sp.savetxt(os.path.join(data_dir, "web_traffic.tsv"),
           list(zip(x, y)), delimiter="\t", fmt="%s")
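The script above flags sp.ma.array: a masked array keeps the original values but marks some entries as invalid, so reductions skip them. A minimal sketch with made-up values (plain NumPy, not part of the book's script):

import numpy as np

vals = np.array([5, -3, 8, -1, 4])
masked = np.ma.array(vals, mask=vals < 0)  # negative entries are marked invalid
print(masked)               # [5 -- 8 -- 4]
print(masked.sum())         # 17: masked entries are ignored in reductions
print(masked.compressed())  # [5 8 4], only the valid values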
# Load the generated data and fit polynomial models of increasing degree.
import os
import scipy as sp
import matplotlib.pyplot as plt

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")  # load the TSV with SciPy
print(data[:10])

# all examples will have three classes in this file
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]  # ~ negates the boolean mask
y = y[~sp.isnan(y)]


# plot input data
def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):  # plot each model in turn
            # print "Model:", model
            # print "Coeffs:", model.coeffs
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)  # save the figure


# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)

f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))  # poly1d wraps the fitted coefficients as a callable polynomial
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about inflection point
inflection = int(3.5 * 7 * 24)  # cast to int so it can be used as a slice index
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))
plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))


def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)


print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))

# extrapolating into the future
plot_models(x, y, [f1, f2, f3, f10, f100],
            os.path.join("..", "1400_01_06.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(x, y, [fb1, fb2, fb3, fb10, fb100],
            os.path.join("..", "1400_01_07.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])

fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
            os.path.join("..", "1400_01_08.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
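The last step leans on two facts worth spelling out: subtracting a constant from a poly1d yields another polynomial, and fsolve then finds where that shifted polynomial crosses zero. A tiny self-contained sketch with a made-up polynomial (not the fitted fbt2):

import numpy as np
from scipy.optimize import fsolve

f = np.poly1d([1, 0, -4])   # x**2 - 4
g = f - 5                   # poly1d arithmetic gives x**2 - 9
root = fsolve(g, x0=2.0)    # start the search near x = 2
print(root)                 # [3.], since g(3) == 0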
Chapter 2

# Scatter plots of every pair of iris features.
import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt

data = load_iris()
features = data['data']
feature_names = data['feature_names']
target = data['target']

pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
for i, (p0, p1) in enumerate(pairs):  # note enumerate with tuple unpacking: i, (p0, p1)
    plt.subplot(2, 3, i + 1)
    for t, marker, c in zip(range(3), ">ox", "rgb"):
        plt.scatter(features[target == t, p0],
                    features[target == t, p1],
                    marker=marker, c=c)
    plt.xlabel(feature_names[p0])
    plt.ylabel(feature_names[p1])
    plt.xticks([])
    plt.yticks([])
plt.savefig('../1400_02_01.png')


# Simple sanity tests for the dataset loader.
from load import load_dataset

def test_iris():
    features, labels = load_dataset('iris')
    assert len(features[0]) == 4  # assertions as cheap sanity checks
    assert len(features)
    assert len(features) == len(labels)

def test_seeds():
    features, labels = load_dataset('seeds')
    assert len(features[0]) == 7
    assert len(features)
    assert len(features) == len(labels)


# Export the milksets datasets as TSV files.
import milksets.iris
import milksets.seeds

def save_as_tsv(fname, module):
    features, labels = module.load()
    nlabels = [module.label_names[ell] for ell in labels]
    with open(fname, 'w') as ofile:
        for f, n in zip(features, nlabels):
            # Python 3 form; the book's Python 2 code was: print >>ofile, "\t".join(map(str, f) + [n])
            print("\t".join(list(map(str, f)) + [n]), file=ofile)

save_as_tsv('iris.tsv', milksets.iris)
save_as_tsv('seeds.tsv', milksets.seeds)


# Visualize a single threshold cut separating virginica from versicolor.
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
feature_names = data['feature_names']
species = data['target_names'][data['target']]

setosa = (species == 'setosa')
features = features[~setosa]
species = species[~setosa]
virginica = species == 'virginica'

t = 1.75
p0, p1 = 3, 2

if COLOUR_FIGURE:
    area1c = (1., .8, .8)
    area2c = (.8, .8, 1.)
else:
    area1c = (1., 1, 1)
    area2c = (.7, .7, .7)

x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]  # unpack a two-element list into two variables
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]

plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)  # fill the two decision regions
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
plt.plot([t, t], [y0, y1], 'k--', lw=2)
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
plt.scatter(features[virginica, p0], features[virginica, p1], c='b', marker='o')
plt.scatter(features[~virginica, p0], features[~virginica, p1], c='r', marker='x')
plt.ylim(y0, y1)
plt.xlim(x0, x1)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.savefig('../1400_02_02.png')


# Plot the 1-NN decision regions for the seeds dataset, before and after z-scoring.
COLOUR_FIGURE = False
# the following line is an IPython magic; it only works inside a notebook
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap  # colormap built from an explicit list of colours
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

feature_names = [
    'area', 'perimeter', 'compactness', 'length of kernel', 'width of kernel',
    'asymmetry coefficient', 'length of kernel groove',
]

def train_plot(features, labels):
    y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
    x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)  # meshgrid builds the grid of evaluation points
    model = learn_model(1, features[:, (0, 2)], np.array(labels))
    # ravel returns a view (flatten would copy); classify every grid point
    C = apply_model(np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])  # ListedColormap usage
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            plt.plot(features[labels == lab, 0],
                     features[labels == lab, 2], ma, c=(1., 1., 1.))

features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

train_plot(features, labels)
plt.savefig('../1400_02_04.png')

features -= features.mean(0)  # concise z-scoring: subtract the mean,
features /= features.std(0)   # then divide by the standard deviation
train_plot(features, labels)
plt.savefig('../1400_02_05.png')


# Train the threshold model on alternating iris samples and report train/test accuracy.
from matplotlib import pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from threshold import learn_model, apply_model, accuracy

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')

testing = np.tile([True, False], 50)  # np.tile repeats [True, False] 50 times: an alternating test mask
training = ~testing  # concise boolean negation selects the complement

model = learn_model(features[training], virginica[training])
train_error = accuracy(features[training], virginica[training], model)
test_error = accuracy(features[testing], virginica[testing], model)

# note the str.format() style with {0}, {1}, {2} placeholders
print('''\
Training error was {0:.1%}.
Testing error was {1:.1%} (N = {2}).
'''.format(train_error, test_error, testing.sum()))
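The np.tile note above is easy to misread; for the 1-D case used here it simply repeats the sequence, which is what turns [True, False] into an alternating train/test mask. A quick sketch with a toy length of 4 instead of 50:

import numpy as np

testing = np.tile([True, False], 4)   # every other sample becomes a test sample
training = ~testing                   # the complement, via boolean negation
print(testing)                        # [ True False  True False  True False  True False]
print(training)                       # [False  True False  True False  True False  True]
print(testing.sum(), training.sum())  # 4 4, an even 50/50 split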
# knn.py: the k-nearest-neighbour helper module imported above.
import numpy as np

def learn_model(k, features, labels):
    return k, features.copy(), labels.copy()

def plurality(xs):
    from collections import defaultdict  # note: import inside the function
    counts = defaultdict(int)  # missing keys default to 0
    for x in xs:
        counts[x] += 1
    maxv = max(counts.values())
    for k, v in counts.items():
        if v == maxv:
            return k

def apply_model(features, model):
    k, train_feats, labels = model
    results = []
    for f in features:
        label_dist = []
        for t, ell in zip(train_feats, labels):
            label_dist.append((np.linalg.norm(f - t), ell))
        label_dist.sort(key=lambda d_ell: d_ell[0])
        label_dist = label_dist[:k]
        results.append(plurality([ell for _, ell in label_dist]))
    return np.array(results)

def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)  # mean of the element-wise equality array is the accuracy


# load.py: the dataset loader imported above.
import numpy as np

def load_dataset(dataset_name):
    '''
    data, labels = load_dataset(dataset_name)

    Load a given dataset

    Returns
    -------
    data : numpy ndarray
    labels : list of str
    '''
    data = []
    labels = []
    with open('../data/{0}.tsv'.format(dataset_name)) as ifile:  # note the with open(...) as ... idiom
        for line in ifile:
            tokens = line.strip().split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    data = np.array(data)
    labels = np.array(labels)
    return data, labels


# Ten-fold cross-validation of 1-NN on the seeds dataset.
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

features, labels = load_dataset('seeds')

def cross_validate(features, labels):
    error = 0.0
    for fold in range(10):
        # np.ones plus the slice [fold::10] hold out every 10th sample for this fold
        training = np.ones(len(features), bool)
        training[fold::10] = 0
        testing = ~training  # the test set is simply the complement
        model = learn_model(1, features[training], labels[training])
        test_error = accuracy(features[testing], labels[testing], model)
        error += test_error
    return error / 10.0

error = cross_validate(features, labels)
print('Ten fold cross-validated error was {0:.1%}.'.format(error))

features -= features.mean(0)
features /= features.std(0)
error = cross_validate(features, labels)
print('Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(error))
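The cross-validation loop relies on extended slicing: training[fold::10] = 0 clears every 10th position starting at fold, so each fold holds out a different tenth of the samples. A minimal sketch with 20 samples instead of the full seeds set:

import numpy as np

n = 20
for fold in range(3):                     # show the first three folds only
    training = np.ones(n, dtype=bool)
    training[fold::10] = False            # every 10th index, offset by the fold number
    testing = ~training
    print(fold, np.flatnonzero(testing))  # fold 0 -> [0 10], fold 1 -> [1 11], fold 2 -> [2 12]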
# Ten-fold cross-validation of the threshold model on the seeds dataset.
from load import load_dataset
import numpy as np
from threshold import learn_model, apply_model, accuracy

features, labels = load_dataset('seeds')
labels = labels == 'Canadian'

error = 0.0
for fold in range(10):
    training = np.ones(len(features), bool)
    training[fold::10] = 0
    testing = ~training
    model = learn_model(features[training], labels[training])
    test_error = accuracy(features[testing], labels[testing], model)
    error += test_error
error /= 10.0
print('Ten fold cross-validated error was {0:.1%}.'.format(error))


# Setosa can be separated from the other species by petal length alone.
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

plength = features[:, 2]
is_setosa = (labels == 'setosa')
print('Maximum of setosa: {0}.'.format(plength[is_setosa].max()))
print('Minimum of others: {0}.'.format(plength[~is_setosa].min()))


# Exhaustive search for the best single-feature threshold separating virginica from versicolor.
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')  # the comparison yields a boolean array

best_acc = -1.0
for fi in range(features.shape[1]):
    thresh = features[:, fi].copy()
    thresh.sort()
    for t in thresh:
        pred = (features[:, fi] > t)
        acc = (pred == virginica).mean()
        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
print('Best cut is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format(
    best_t, best_fi, best_acc))


# threshold.py: the threshold model imported above.
import numpy as np

def learn_model(features, labels):
    best_acc = -1.0
    for fi in range(features.shape[1]):
        thresh = features[:, fi].copy()
        thresh.sort()
        for t in thresh:
            pred = (features[:, fi] > t)
            acc = (pred == labels).mean()
            if acc > best_acc:
                best_acc = acc
                best_fi = fi
                best_t = t
    return best_t, best_fi

def apply_model(features, model):
    t, fi = model
    return features[:, fi] > t

def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)


Chapter 3

# Vectorize the toy posts and find the one most similar to a new post.
import os
import sys
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

DIR = r"../data/toy"
posts = [open(os.path.join(DIR, f)).read() for f in os.listdir(DIR)]  # os.listdir to read every document in the directory
new_post = "imaging databases"

import nltk.stem  # stemming library
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):  # subclass the vectorizer to add stemming
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# vectorizer = CountVectorizer(min_df=1, stop_words='english',
#                              preprocessor=stemmer)
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# charset_error was renamed to decode_error in later scikit-learn releases
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', charset_error='ignore')
print(vectorizer)

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_post_vec = vectorizer.transform([new_post])
print(new_post_vec, type(new_post_vec))
print(new_post_vec.toarray())
print(vectorizer.get_feature_names())

def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())  # linalg.norm computes the Euclidean distance

def dist_norm(v1, v2):
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

dist = dist_norm
best_dist = sys.maxsize  # sys.maxsize as a largest-int sentinel in Python 3
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))


# The next script (continuing below) clusters 20newsgroups posts and looks for posts similar to a new one.
import sklearn.datasets
import scipy as sp

new_post = \
    """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? 
Thanks."""MLCOMP_DIR = r"P:\Dropbox\pymlbook\data"groups = [ 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.ma c.hardware', 'comp.windows.x', 'sci.space']dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR, categories=groups)print("Number of posts:", len(dataset.filenames))labels = dataset.targetnum_clusters = 50 # sp.unique(labels).shape[0]import nltk.stemenglish_stemmer = nltk.stem.SnowballStemmer('english')from sklearn.feature_extraction.text import TfidfVectorizerclass StemmedTfidfVectorizer(TfidfVectorizer): def build_analyzer(self): analyzer = super(TfidfVectorizer, self).build_analyzer() return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, # max_features=1000, stop_words='english', charset_error='ignore' )vectorized = vectorizer.fit_transform(dataset.data)num_samples, num_features = vectorized.shapeprint("#samples: %d, #features: %d" % (num_samples, num_features))from sklearn.cluster import KMeanskm = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, verbose=1)clustered = km.fit(vectorized)from sklearn import metrics #许多评估函数在此包中print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_))print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels, km.labels_))print(("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectorized, labels, sample_size=1000)))new_post_vec = vectorizer.transform([new_post])new_post_label = km.predict(new_post_vec)[0]similar_indices = (km.labels_ == new_post_label).nonzero()[0]similar = []for i in similar_indices: dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray()) similar.append((dist, dataset.data[i]))similar = sorted(similar)import pdbpdb.set_trace()show_at_1 = similar[0]show_at_2 = similar[len(similar) / 2]show_at_3 = similar[-1]print(show_at_1)print(show_at_2)print(show_at_3)import scipy as spdef tfidf(t, d, D): tf = float(d.count(t)) / sum(d.count(w) for w in set(d)) idf = sp.log(float(len(D)) / (len([doc for doc in D if t in doc]))) return tf * idfa, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]D = [a, abb, abc]print(tfidf("a", a, D))print(tfidf("b", abb, D))print(tfidf("a", abc, D))print(tfidf("b", abc, D))print(tfidf("c", abc, D))import osimport scipy as spfrom scipy.stats import normfrom matplotlib import pylabfrom sklearn.cluster import KMeansseed = 2sp.random.seed(seed) # to reproduce the data later onnum_clusters = 3def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None): pylab.figure(num=None, figsize=(8, 6)) if km: pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y)))) # 此处使用zip else: pylab.scatter(x, y, s=50) pylab.title(title) pylab.xlabel("Occurrence word 1") pylab.ylabel("Occurrence word 2") # pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)]) pylab.autoscale(tight=True) pylab.ylim(ymin=0, ymax=1) pylab.xlim(xmin=0, xmax=1) pylab.grid(True, linestyle='-', color='0.75') return pylabxw1 = norm(loc=0.3, scale=.15).rvs(20) #rvs随机数的个数yw1 = norm(loc=0.3, scale=.15).rvs(20)xw2 = norm(loc=0.7, scale=.15).rvs(20)yw2 = norm(loc=0.7, scale=.15).rvs(20)xw3 = norm(loc=0.2, scale=.15).rvs(20)yw3 = norm(loc=0.8, scale=.15).rvs(20)x = sp.append(sp.append(xw1, xw2), 
xw3)y = sp.append(sp.append(yw1, yw2), yw3)i = 1plot_clustering(x, y, "Vectors")pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001))km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=1, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 1", km=km)pylab.imshow(Z, interpolation='nearest', ##imshow图片显示? extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')c1a, c1b, c1c = km.cluster_centers_pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1#################### 2 iterations ####################km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=2, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 2", km=km)pylab.imshow(Z, interpolation='nearest', extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')c2a, c2b, c2c = km.cluster_centers_pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')# import pdb;pdb.set_trace()pylab.gca().add_patch( pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1))pylab.gca().add_patch( pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1))pylab.gca().add_patch( pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1))pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1#################### 3 iterations ####################km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=10, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 10", km=km)pylab.imshow(Z, interpolation='nearest', extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1第四章from __future__ import print_functionfrom gensim import corpora, models, similarities #LDA主题分析包from mpltools import styleimport matplotlib.pyplot as pltimport numpy as npfrom os import pathstyle.use('ggplot') #画图风格if not path.exists('./data/ap/ap.dat'): #检测路径存在方式 os.exists print('Error: Expected data to be present at data/ap/')corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')model = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=None)for ti in xrange(84): words = model.show_topic(ti, 64) tf = sum(f for f,w in words) print('\n'.join('{}:{}'.format(w, int(1000.*f/tf)) for f,w in words)) print() print() print()thetas = [model[c] for c in corpus]plt.hist([len(t) for t in thetas], np.arange(42))plt.ylabel('Nr of documents')plt.xlabel('Nr of topics')plt.savefig('../1400OS_04_01+.png')model1 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.)thetas1 = [model1[c] for c in corpus]#model8 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.e-8)#thetas8 = 
[model8[c] for c in corpus]plt.clf()plt.hist([[len(t) for t in thetas], [len(t) for t in thetas1]], np.arange(42))plt.ylabel('Nr of documents')plt.xlabel('Nr of topics')plt.text(9,223, r'default alpha')plt.text(26,156, 'alpha=1.0')plt.savefig('../1400OS_04_02+.png')import nltk.corpusimport milkimport numpy as npimport stringfrom gensim import corpora, models, similaritiesimport sklearn.datasetsimport nltk.stemfrom collections import defaultdictenglish_stemmer = nltk.stem.SnowballStemmer('english')stopwords = set(nltk.corpus.stopwords.words('english'))stopwords.update(['from:', 'subject:', 'writes:', 'writes'])class DirectText(corpora.textcorpus.TextCorpus): def get_texts(self): return self.input def __len__(self): return len(self.input)dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root='../data')otexts = dataset.datatexts = dataset.datatexts = [t.decode('utf-8', 'ignore') for t in texts]texts = [t.split() for t in texts]texts = [map(lambda w: w.lower(), t) for t in texts] #注意使用map lambda 在列表推导式中 , 结果处理 texts = [filter(lambda s : not len(set("+-.?!()>@012345689") & set(s)), t) for t in texts]texts = [filter(lambda s : (len(s) > 3) and (s not in stopwords), t) for t in texts]texts = [map(english_stemmer.stem,t) for t in texts]usage = defaultdict(int)for t in texts: for w in set(t): usage[w] += 1limit = len(texts)/10too_common = [w for w in usage if usage[w] > limit]too_common = set(too_common)texts = [filter(lambda s : s not in too_common, t) for t in texts]corpus = DirectText(texts)dictionary = corpus.dictionarytry: dictionary['computer']except: passmodel = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=dictionary.id2token)thetas = np.zeros((len(texts),100))for i,c in enumerate(corpus): for ti,v in model[c]: thetas[i,ti] += vdistances = milk.unsupervised.pdist(thetas) #milk库large = distances.max() + 1for i in xrange(len(distances)): distances[i,i] = largeprint otexts[1]printprintprintprint otexts[distances[1].argmin()]from __future__ import print_functionimport numpy as npimport logging, gensimlogging.basicConfig( format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)id2word = gensim.corpora.Dictionary.load_from_text('data/wiki_en_output_wordids.txt')mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')model = gensim.models.ldamodel.LdaModel( corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)model.save('wiki_lda.pkl')topics = [model[doc] for doc in mm]lens = np.array([len(t) for t in topics])print(np.mean(lens <= 10))print(np.mean(lens))counts = np.zeros(100)for doc_top in topics: for ti,_ in doc_toc: counts[ti] += 1 for doc_top in topics: for ti,_ in doc_top: counts[ti] += 1 words = model.show_topic(counts.argmax(), 64)print(words)print()print()print()words = model.show_topic(counts.argmin(), 64)print(words)print()print()print() 第五章import ostry: import ujson as json # UltraJSON if available 注意try, except 用在importexcept: import jsonimport sysfrom collections import defaultdicttry: import enchantexcept: print( "Enchant is not installed. You can get it from http://packages.python.org/pyenchant/. 
Exitting...") sys.exit(1)from data import chosen, chosen_meta, filtered, filtered_metafiltered_meta = json.load(open(filtered_meta, "r"))speller = enchant.Dict("en_US")def misspelled_fraction(p): tokens = p.split() if not tokens: return 0.0 return 1 - float(sum(speller.check(t) for t in tokens)) / len(tokens) ## 在此处使用for in,在函数sum内部,类似列表表达式,def data(filename, col=None): for line in open(filename, "r"): data = line.strip().split("\t") # check format Id, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = data if col: yield data[col] #生成器, 注意生成器一定和for in 等循环之类的在一起使用? else: yield dataposts_to_keep = set()found_questions = 0num_qestion_sample = 1000# keep the best and worst, but only if we have one with positive and one with negative score# filter_method = "negative_positive"# if true, only keep the lowest scoring answer per class in addition to the accepted one# filter_method = "only_one_per_class "# if not None, specifies the number of unaccepted per question# filter_method = "sample_per_question"filter_method = "negative_positive" # warning: this does not retrieve many!# filter_method = "only_one_per_class"MaxAnswersPerQuestions = 10 # filter_method == "sample_per_question"# filter_method = "all"# equal share of questions that are unanswered and those that are answered# filter_method = "half-half"unaccepted_scores = {}has_q_accepted_a = {}num_q_with_accepted_a = 0num_q_without_accepted_a = 0for ParentId, posts in filtered_meta.items(): assert ParentId != -1 if len(posts) < 2: continue ParentId = int(ParentId) AllIds = set([ParentId]) AcceptedId = None UnacceptedId = None UnacceptedIds = [] UnacceptedScore = sys.maxsize NegativeScoreIds = [] PositiveScoreIds = [] if filter_method == "half-half": has_accepted_a = False for post in posts: Id, IsAccepted, TimeToAnswer, Score = post if IsAccepted: has_accepted_a = True break has_q_accepted_a[ParentId] = has_accepted_a if has_accepted_a: if num_q_with_accepted_a < num_qestion_sample / 2: num_q_with_accepted_a += 1 posts_to_keep.add(ParentId) else: if num_q_without_accepted_a < num_qestion_sample / 2: num_q_without_accepted_a += 1 posts_to_keep.add(ParentId) if num_q_without_accepted_a + num_q_with_accepted_a > num_qestion_sample: assert -1 not in posts_to_keep break else: for post in posts: Id, IsAccepted, TimeToAnswer, Score = post if filter_method == "all": AllIds.add(int(Id)) elif filter_method == "only_one_per_class": if IsAccepted: AcceptedId = Id elif Score < UnacceptedScore: UnacceptedScore = Score UnacceptedId = Id elif filter_method == "sample_per_question": if IsAccepted: AcceptedId = Id else: UnacceptedIds.append(Id) elif filter_method == "negative_positive": if Score < 0: NegativeScoreIds.append((Score, Id)) elif Score > 0: PositiveScoreIds.append((Score, Id)) else: raise ValueError(filter_method) added = False if filter_method == "all": posts_to_keep.update(AllIds) added = True elif filter_method == "only_one_per_class": if AcceptedId is not None and UnacceptedId is not None: posts_to_keep.add(ParentId) posts_to_keep.add(AcceptedId) posts_to_keep.add(UnacceptedId) added = True elif filter_method == "sample_per_question": if AcceptedId is not None and UnacceptedIds is not None: ##注意python中None 不是零,不是NULL,不是空字符串 posts_to_keep.add(ParentId) posts_to_keep.add(AcceptedId) posts_to_keep.update(UnacceptedIds[:MaxAnswersPerQuestions]) added = True elif filter_method == "negative_positive": if PositiveScoreIds and NegativeScoreIds: posts_to_keep.add(ParentId) posScore, posId = 
sorted(PositiveScoreIds)[-1] posts_to_keep.add(posId) negScore, negId = sorted(NegativeScoreIds)[0] posts_to_keep.add(negId) print("%i: %i/%i %i/%i" % (ParentId, posId, posScore, negId, negScore)) added = True if added: found_questions += 1 if num_qestion_sample and found_questions >= num_qestion_sample: breaktotal = 0kept = 0already_written = set()chosen_meta_dict = defaultdict(dict)with open(chosen, "w") as f: for line in data(filtered): strId, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = line Text = Text.strip() total += 1 Id = int(strId) if Id in posts_to_keep: if Id in already_written: print(Id, "is already written") continue if kept % 100 == 0: print(kept) # setting meta info post = chosen_meta_dict[Id] post['ParentId'] = int(ParentId) post['IsAccepted'] = int(IsAccepted) post['TimeToAnswer'] = int(TimeToAnswer) post['Score'] = int(Score) post['NumTextTokens'] = int(NumTextTokens) post['NumCodeLines'] = int(NumCodeLines) post['LinkCount'] = int(LinkCount) post['MisSpelledFraction'] = misspelled_fraction(Text) post['NumImages'] = int(NumImages) post['idx'] = kept # index into the file if int(ParentId) == -1: q = chosen_meta_dict[Id] if not 'Answers' in q: q['Answers'] = [] if filter_method == "half-half": q['HasAcceptedAnswer'] = has_q_accepted_a[Id] else: q = chosen_meta_dict[int(ParentId)] if int(IsAccepted) == 1: assert 'HasAcceptedAnswer' not in q q['HasAcceptedAnswer'] = True if 'Answers' not in q: q['Answers'] = [Id] else: q['Answers'].append(Id) f.writelines("%s\t%s\n" % (Id, Text)) kept += 1with open(chosen_meta, "w") as fm: json.dump(chosen_meta_dict, fm)print("total=", total)print("kept=", kept)import timestart_time = time.time()import numpy as npfrom sklearn.metrics import classification_reportfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import KFoldfrom sklearn import neighborsfrom data import chosen, chosen_metafrom utils import plot_roc, plot_prfrom utils import plot_feat_importancefrom utils import load_metafrom utils import fetch_postsfrom utils import plot_feat_histfrom utils import plot_bias_variancefrom utils import plot_k_complexity# question Id -> {'features'->feature vector, 'answers'->[answer Ids]}, 'scores'->[scores]}# scores will be added on-the-fly as the are not in metameta, id_to_idx, idx_to_id = load_meta(chosen_meta)import nltk# splitting questions into train (70%) and test(30%) and then take their# answersall_posts = list(meta.keys())all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500]feature_names = np.array(( 'NumTextTokens', 'NumCodeLines', 'LinkCount', 'AvgSentLen', 'AvgWordLen', 'NumAllCaps', 'NumExclams', 'NumImages'))# activate the following for reduced feature space"""feature_names = np.array(( 'NumTextTokens', 'LinkCount',))"""def prepare_sent_features(): for pid, text in fetch_posts(chosen, with_index=True): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) meta[pid]['AvgWordLen'] = np.mean( [len(w) for w in nltk.word_tokenize(text)]) meta[pid]['NumAllCaps'] = np.sum( [word.isupper() for word in nltk.word_tokenize(text)]) meta[pid]['NumExclams'] = text.count('!')prepare_sent_features()def get_features(aid): return tuple(meta[aid][fn] for fn in feature_names)qa_X = np.asarray([get_features(aid) for aid in 
all_answers])# Score > 0 tests => positive class is good answer# Score <= 0 tests => positive class is poor answerqa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])classifying_answer = "good"for idx, feat in enumerate(feature_names): plot_feat_hist([(qa_X[:, idx], feat)])"""plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')"""avg_scores_summary = []def measure(clf_class, parameters, name, data_size=None, plot=False): start_time_clf = time.time() if data_size is None: X = qa_X Y = qa_Y else: X = qa_X[:data_size] Y = qa_Y[:data_size] cv = KFold(n=len(X), n_folds=10, indices=True) train_errors = [] test_errors = [] scores = [] roc_scores = [] fprs, tprs = [], [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_class(**parameters) ##可变参数 clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) label_idx = 1 fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, label_idx]) roc_scores.append(auc(fpr, tpr)) fprs.append(fpr) tprs.append(tpr) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) print(classification_report(y_test, proba[:, label_idx] > 0.63, target_names=['not accepted', 'accepted'])) # get medium clone scores_to_sort = pr_scores # roc_scores medium = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] if plot: #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium]) plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium], classifying_answer + " answers") if hasattr(clf, 'coef_'): plot_feat_importance(feature_names, clf, name) summary = (name, np.mean(scores), np.std(scores), np.mean(roc_scores), np.std(roc_scores), np.mean(pr_scores), np.std(pr_scores), time.time() - start_time_clf) print(summary) avg_scores_summary.append(summary) precisions = precisions[medium] recalls = recalls[medium] thresholds = np.hstack(([0], thresholds[medium])) idx80 = precisions >= 0.8 print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[ idx80][0], thresholds[idx80][0])) return np.mean(train_errors), np.mean(test_errors)def bias_variance_analysis(clf_class, parameters, name): data_sizes = np.arange(60, 2000, 4) train_errors = [] test_errors = [] for data_size in data_sizes: train_error, test_error = measure( clf_class, parameters, name, data_size=data_size) train_errors.append(train_error) test_errors.append(test_error) plot_bias_variance(data_sizes, train_errors, test_errors, name, "Bias-Variance for '%s'" % name)def k_complexity_analysis(clf_class, parameters): ks = np.hstack((np.arange(1, 20), np.arange(21, 100, 5))) train_errors = [] test_errors = [] for k in ks: parameters['n_neighbors'] = k train_error, test_error = measure( clf_class, parameters, "%dNN" % k, data_size=2000) train_errors.append(train_error) test_errors.append(test_error) plot_k_complexity(ks, train_errors, test_errors)for k in [5]: #[5, 10, 40, 90]: bias_variance_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k, 'warn_on_equidistant':False}, "%iNN"%k) 
k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k, 'warn_on_equidistant':False}) #measure(neighbors.KNeighborsClassifier, {'n_neighbors': k, 'p': 2, #'warn_on_equidistant': False}, "%iNN" % k)from sklearn.linear_model import LogisticRegressionfor C in [0.1]: #[0.01, 0.1, 1.0, 10.0]: name = "LogReg C=%.2f" % C bias_variance_analysis(LogisticRegression, {'penalty':'l2', 'C':C}, name) measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True)print("=" * 50)from operator import itemgetterfor s in reversed(sorted(avg_scores_summary, key=itemgetter(1))): print("%-20s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" % s)print("time spent:", time.time() - start_time)import os# DATA_DIR = r"C:\pymlbook-data\ch05"DATA_DIR = r"/media/sf_C/pymlbook-data/ch05"CHART_DIR = os.path.join("..", "charts")filtered = os.path.join(DATA_DIR, "filtered.tsv")filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")chosen = os.path.join(DATA_DIR, "chosen.tsv")chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json")import numpy as npfrom scipy.stats import normfrom matplotlib import pyplotnp.random.seed(3)num_per_class = 40X = np.hstack((norm.rvs(2, size=num_per_class, scale=2), norm.rvs(8, size=num_per_class, scale=3)))y = np.hstack((np.zeros(num_per_class), np.ones(num_per_class)))def lr_model(clf, X): return 1.0 / (1.0 + np.exp(-(clf.intercept_ + clf.coef_ * X)))from sklearn.linear_model import LogisticRegressionlogclf = LogisticRegression()print(logclf)logclf.fit(X.reshape(num_per_class * 2, 1), y) ## reshapeprint(np.exp(logclf.intercept_), np.exp(logclf.coef_.ravel())) ##注意线性回归的截距和参数print("P(x=-1)=%.2f\tP(x=7)=%.2f" % (lr_model(logclf, -1), lr_model(logclf, 7)))X_test = np.arange(-5, 20, 0.1)pyplot.figure(figsize=(10, 4))pyplot.xlim((-5, 20))pyplot.scatter(X, y, c=y)pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_example_data.png", bbox_inches="tight")def lin_model(clf, X): return clf.intercept_ + clf.coef_ * Xfrom sklearn.linear_model import LinearRegressionclf = LinearRegression()print(clf)clf.fit(X.reshape(num_per_class * 2, 1), y)X_odds = np.arange(0, 1, 0.001)pyplot.figure(figsize=(10, 4))pyplot.subplot(1, 2, 1)pyplot.scatter(X, y, c=y)pyplot.plot(X_test, lin_model(clf, X_test))pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.title("linear fit on original data")pyplot.grid(True, linestyle='-', color='0.75')X_ext = np.hstack((X, norm.rvs(20, size=100, scale=5)))y_ext = np.hstack((y, np.ones(100)))clf = LinearRegression()clf.fit(X_ext.reshape(num_per_class * 2 + 100, 1), y_ext)pyplot.subplot(1, 2, 2)pyplot.scatter(X_ext, y_ext, c=y_ext)pyplot.plot(X_ext, lin_model(clf, X_ext))pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.title("linear fit on additional data")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_log_linear_fit.png", bbox_inches="tight")pyplot.figure(figsize=(10, 4))pyplot.xlim((-5, 20))pyplot.scatter(X, y, c=y)pyplot.plot(X_test, lr_model(logclf, X_test).ravel())pyplot.plot(X_test, np.ones(X_test.shape[0]) * 0.5, "--")pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_example_fitted.png", bbox_inches="tight")X = np.arange(0, 1, 0.001)pyplot.figure(figsize=(10, 4))pyplot.subplot(1, 2, 1)pyplot.xlim((0, 1))pyplot.ylim((0, 10))pyplot.plot(X, X / (1 - X))pyplot.xlabel("P")pyplot.ylabel("odds = P / (1-P)")pyplot.grid(True, linestyle='-', color='0.75')pyplot.subplot(1, 2, 
2)pyplot.xlim((0, 1))pyplot.plot(X, np.log(X / (1 - X)))pyplot.xlabel("P")pyplot.ylabel("log(odds) = log(P / (1-P))")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_log_odds.png", bbox_inches="tight")import refrom operator import itemgetter #多条件排序from collections import Mappingimport scipy.sparse as spfrom sklearn.base import BaseEstimator ##注意评估器选择函数from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicodeimport nltkfrom collections import Countertry: import ujson as json # UltraJSON if availableexcept: import jsonposcache_filename = "poscache.json"class PosCounter(Counter): def __init__(self, iterable=(), normalize=True, poscache=None, **kwargs): self.n_sents = 0 self.normalize = normalize self.poscache = poscache super(PosCounter, self).__init__(iterable, **kwargs) def update(self, other): """Adds counts for elements in other""" if isinstance(other, self.__class__): self.n_sents += other.n_sents for x, n in other.items(): self[x] += n else: for sent in other: self.n_sents += 1 # import pdb;pdb.set_trace() if self.poscache is not None: if sent in self.poscache: tags = self.poscache[sent] else: self.poscache[sent] = tags = nltk.pos_tag( nltk.word_tokenize(sent)) else: tags = nltk.pos_tag(nltk.word_tokenize(sent)) for x in tags: tok, tag = x self[tag] += 1 if self.normalize: for x, n in self.items(): self[x] /= float(self.n_sents)class PosTagFreqVectorizer(BaseEstimator): """ Convert a collection of raw documents to a matrix Pos tag frequencies """ def __init__(self, input='content', charset='utf-8', charset_error='strict', strip_accents=None, vocabulary=None, normalize=True, dtype=float): self.input = input self.charset = charset self.charset_error = charset_error self.strip_accents = strip_accents if vocabulary is not None: self.fixed_vocabulary = True if not isinstance(vocabulary, Mapping): vocabulary = dict((t, i) for i, t in enumerate(vocabulary)) self.vocabulary_ = vocabulary else: self.fixed_vocabulary = False try: self.poscache = json.load(open(poscache_filename, "r")) except IOError: self.poscache = {} self.normalize = normalize self.dtype = dtype def write_poscache(self): json.dump(self.poscache, open(poscache_filename, "w")) def decode(self, doc): """Decode the input into a string of unicode symbols The decoding strategy depends on the vectorizer parameters. """ if self.input == 'filename': doc = open(doc, 'rb').read() elif self.input == 'file': doc = doc.read() if isinstance(doc, bytes): doc = doc.decode(self.charset, self.charset_error) return doc def build_preprocessor(self): """Return a function to preprocess the text before tokenization""" # unfortunately python functools package does not have an efficient # `compose` function that would have allowed us to chain a dynamic # number of functions. However the however of a lambda call is a few # hundreds of nanoseconds which is negligible when compared to the # cost of tokenizing a string of 1000 chars for instance. 
noop = lambda x: x # accent stripping if not self.strip_accents: strip_accents = noop elif hasattr(self.strip_accents, '__call__'): strip_accents = self.strip_accents elif self.strip_accents == 'ascii': strip_accents = strip_accents_ascii elif self.strip_accents == 'unicode': strip_accents = strip_accents_unicode else: raise ValueError('Invalid value for "strip_accents": %s' % self.strip_accents) only_prose = lambda s: re.sub('<[^>]*>', '', s).replace("\n", " ") return lambda x: strip_accents(only_prose(x)) def build_tokenizer(self): """Return a function that split a string in sequence of tokens""" return nltk.sent_tokenize def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" preprocess = self.build_preprocessor() tokenize = self.build_tokenizer() return lambda doc: tokenize(preprocess(self.decode(doc))) def _term_count_dicts_to_matrix(self, term_count_dicts): i_indices = [] j_indices = [] values = [] vocabulary = self.vocabulary_ for i, term_count_dict in enumerate(term_count_dicts): for term, count in term_count_dict.items(): j = vocabulary.get(term) if j is not None: i_indices.append(i) j_indices.append(j) values.append(count) # free memory as we go term_count_dict.clear() shape = (len(term_count_dicts), max(vocabulary.values()) + 1) spmatrix = sp.csr_matrix((values, (i_indices, j_indices)), shape=shape, dtype=self.dtype) return spmatrix def fit(self, raw_documents, y=None): """Learn a vocabulary dictionary of all tokens in the raw documents Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- self """ self.fit_transform(raw_documents) return self def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return the count vectors This is more efficient than calling fit followed by transform. Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- vectors: array, [n_samples, n_features] """ if self.fixed_vocabulary: # No need to fit anything, directly perform the transformation. # We intentionally don't call the transform method to make it # fit_transform overridable without unwanted side effects in # TfidfVectorizer analyze = self.build_analyzer() term_counts_per_doc = [PosCounter(analyze(doc), normalize=self.normalize, poscache=self.poscache) for doc in raw_documents] return self._term_count_dicts_to_matrix(term_counts_per_doc) self.vocabulary_ = {} # result of document conversion to term count dicts term_counts_per_doc = [] term_counts = Counter() analyze = self.build_analyzer() for doc in raw_documents: term_count_current = PosCounter( analyze(doc), normalize=self.normalize, poscache=self.poscache) term_counts.update(term_count_current) term_counts_per_doc.append(term_count_current) self.write_poscache() terms = set(term_counts) # store map from term name to feature integer index: we sort the term # to have reproducible outcome for the vocabulary structure: otherwise # the mapping from feature name to indices might depend on the memory # layout of the machine. Furthermore sorted terms might make it # possible to perform binary search in the feature names array. self.vocabulary_ = dict(((t, i) for i, t in enumerate(sorted(terms)))) return self._term_count_dicts_to_matrix(term_counts_per_doc) def transform(self, raw_documents): """Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided in the constructor. 
Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- vectors: sparse matrix, [n_samples, n_features] """ if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0: raise ValueError("Vocabulary wasn't fitted or is empty!") # raw_documents can be an iterable so we don't know its size in # advance # XXX @larsmans tried to parallelize the following loop with joblib. # The result was some 20% slower than the serial version. analyze = self.build_analyzer() term_counts_per_doc = [Counter(analyze(doc)) for doc in raw_documents] return self._term_count_dicts_to_matrix(term_counts_per_doc) def get_feature_names(self): """Array mapping from feature integer indices to feature name""" if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0: raise ValueError("Vocabulary wasn't fitted or is empty!") return [t for t, i in sorted(iter(self.vocabulary_.items()), key=itemgetter(1))]## This script filters the posts and keeps those posts that are or belong# to a question that has been asked in 2011 or 2012.#import osimport retry: import ujson as json # UltraJSON if availableexcept: import jsonfrom dateutil import parser as dateparserfrom operator import itemgetterfrom xml.etree import cElementTree as etreefrom collections import defaultdictfrom data import DATA_DIRfilename = os.path.join(DATA_DIR, "posts-2011-12.xml")filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")q_creation = {} # creation datetimes of questionsq_accepted = {} # id of accepted answermeta = defaultdict( list) # question -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]# regegx to find code snippetscode_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)link_match = re.compile( '<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)def filter_html(s): num_code_lines = 0 link_count_in_code = 0 code_free_s = s num_images = len(img_match.findall(s)) # remove source code and count how many lines 正则表达式的用法 for match_str in code_match.findall(s): num_code_lines += match_str.count('\n') code_free_s = code_match.sub("", code_free_s) # sometimes source code contain links, which we don't want to count link_count_in_code += len(link_match.findall(match_str)) anchors = link_match.findall(s) link_count = len(anchors) link_count -= link_count_in_code html_free_s = re.sub( " +", " ", tag_match.sub('', code_free_s)).replace("\n", "") link_free_s = html_free_s for anchor in anchors: if anchor.lower().startswith("http://"): link_free_s = link_free_s.replace(anchor, '') num_text_tokens = html_free_s.count(" ") return link_free_s, num_text_tokens, num_code_lines, link_count, num_imagesyears = defaultdict(int)num_questions = 0num_answers = 0def parsexml(filename): global num_questions, num_answers counter = 0 it = map(itemgetter(1), iter(etree.iterparse(filename, events=('start',)))) root = next(it) # get posts element for elem in it: if counter % 100000 == 0: print(counter) counter += 1 if elem.tag == 'row': creation_date = dateparser.parse(elem.get('CreationDate')) # import pdb;pdb.set_trace() # if creation_date.year < 2011: # continue Id = int(elem.get('Id')) PostTypeId = int(elem.get('PostTypeId')) Score = int(elem.get('Score')) if PostTypeId == 1: num_questions += 1 years[creation_date.year] += 1 ParentId = -1 TimeToAnswer = 0 q_creation[Id] = creation_date accepted = elem.get('AcceptedAnswerId') if accepted: 
q_accepted[Id] = int(accepted) IsAccepted = 0 elif PostTypeId == 2: num_answers += 1 ParentId = int(elem.get('ParentId')) if not ParentId in q_creation: # question was too far in the past continue TimeToAnswer = (creation_date - q_creation[ParentId]).seconds if ParentId in q_accepted: IsAccepted = int(q_accepted[ParentId] == Id) else: IsAccepted = 0 meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score)) else: continue Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html( elem.get('Body')) values = (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages) yield values root.clear() # preserve memorywith open(os.path.join(DATA_DIR, filename_filtered), "w") as f: for item in parsexml(filename): line = "\t".join(map(str, item)) f.write(line.encode("utf-8") + "\n")with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f: json.dump(meta, f)print("years:", years)print("#qestions: %i" % num_questions)print("#answers: %i" % num_answers)import ostry: import ujson as json # UltraJSON if availableexcept: import jsonfrom matplotlib import pylabimport numpy as npfrom data import CHART_DIRdef fetch_data(filename, col=None, line_count=-1, only_questions=False): count = 0 for line in open(filename, "r"): count += 1 if line_count > 0 and count > line_count: break data = Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFraction = line.split( "\t") IsQuestion = int(IsQuestion) if only_questions and not IsQuestion: continue if col: if col < 6: val = int(data[col]) else: val = data[col] yield val else: Id = int(Id) assert Id >= 0, line ParentId = int(ParentId) IsAccepted = int(IsAccepted) assert not IsQuestion == IsAccepted == 1, "%i %i --- %s" % ( IsQuestion, IsAccepted, line) assert (ParentId == -1 and IsQuestion) or ( ParentId >= 0 and not IsQuestion), "%i %i --- %s" % (ParentId, IsQuestion, line) TimeToAnswer = int(TimeToAnswer) Score = int(Score) NumTextTokens = int(NumTextTokens) NumCodeLines = int(NumCodeLines) LinkCount = int(LinkCount) MisSpelledFraction = float(MisSpelledFraction) yield Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFractiondef fetch_posts(filename, with_index=True, line_count=-1): count = 0 for line in open(filename, "r"): count += 1 if line_count > 0 and count > line_count: break Id, Text = line.split("\t") Text = Text.strip() if with_index: yield int(Id), Text else: yield Textdef load_meta(filename): meta = json.load(open(filename, "r")) keys = list(meta.keys()) # JSON only allows string keys, changing that to int for key in keys: meta[int(key)] = meta[key] del meta[key] # post Id to index in vectorized id_to_idx = {} # and back idx_to_id = {} for PostId, Info in meta.items(): id_to_idx[PostId] = idx = Info['idx'] idx_to_id[idx] = PostId return meta, id_to_idx, idx_to_iddef plot_roc(auc_score, name, fpr, tpr): pylab.figure(num=None, figsize=(6, 5)) pylab.plot([0, 1], [0, 1], 'k--') pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('False Positive Rate') pylab.ylabel('True Positive Rate') pylab.title('Receiver operating characteristic (AUC=%0.2f)\n%s' % ( auc_score, name)) pylab.legend(loc="lower right") pylab.grid(True, linestyle='-', color='0.75') pylab.fill_between(tpr, fpr, alpha=0.5) pylab.plot(fpr, tpr, lw=1) pylab.savefig(os.path.join(CHART_DIR, "roc_" + name.replace(" ", "_")+ ".png"))def plot_pr(auc_score, name, precision, recall, label=None): 
pylab.figure(num=None, figsize=(6, 5)) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label)) pylab.fill_between(recall, precision, alpha=0.5) pylab.grid(True, linestyle='-', color='0.75') pylab.plot(recall, precision, lw=1) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"))def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2))def plot_feat_importance(feature_names, clf, name): pylab.figure(num=None, figsize=(6, 5)) coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): if len(data_name_list)>1: assert filename is not None pylab.figure(num=None, figsize=(8, 6)) num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Fraction') # the histogram of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='blue', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_bias_variance(data_sizes, train_errors, test_errors, name, title): pylab.figure(num=None, figsize=(6, 5)) pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, test_errors, "--", data_sizes, train_errors, "b-", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig(os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight")def plot_k_complexity(ks, train_errors, test_errors): pylab.figure(num=None, figsize=(6, 5)) pylab.ylim([0.0, 1.0]) pylab.xlabel('k') pylab.ylabel('Error') pylab.title('Errors for for different values of k') pylab.plot( ks, test_errors, "--", ks, train_errors, "-", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig(os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight")第六章import osimport collectionsfrom matplotlib import pylabimport numpy as npDATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")import csvimport jsondef tweak_labels(Y, pos_sent_list): pos = Y == pos_sent_list[0] #注意此种写法,最后赋值的是逻辑值 for sent_label in pos_sent_list[1:]: pos |= Y == sent_label #|= 注意竖杠和等于号在一起, a|=b 等价于a=a|b 按位或 Y = 
np.zeros(Y.shape[0]) Y[pos] = 1 Y = Y.astype(int) #类型转化 return Ydef load_sanders_data(dirname=".", line_count=-1): count = 0 topics = [] labels = [] tweets = [] with open(os.path.join(DATA_DIR, dirname, "corpus.csv"), "r") as csvfile: metareader = csv.reader(csvfile, delimiter=',', quotechar='"') #注意格式 for line in metareader: count += 1 if line_count > 0 and count > line_count: break topic, label, tweet_id = line # import vimpdb;vimpdb.set_trace() tweet_fn = os.path.join( DATA_DIR, dirname, 'rawdata', '%s.json' % tweet_id) tweet = json.load(open(tweet_fn, "r")) if 'text' in tweet and tweet['user']['lang']=="en": topics.append(topic) labels.append(label) tweets.append(tweet['text']) tweets = np.asarray(tweets) labels = np.asarray(labels) # return topics, tweets, labels return tweets, labelsdef load_kaggle_data(filename="kaggle/training.txt", line_count=-1): count = 0 labels = [] texts = [] read_texts = set([]) for line in open(os.path.join(DATA_DIR, filename), "r"): count += 1 if line_count > 0 and count > line_count: break label, text = line.split("\t") # Some tweets occur multiple times, so we have to # remove them to not bias the training set. if text in read_texts: continue read_texts.add(text) labels.append(label) texts.append(text) texts = np.asarray(texts) labels = np.asarray(labels, dtype=np.int) return texts, labelsdef plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.clf() #清除图像 pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.fill_between(recall, precision, alpha=0.5) pylab.plot(recall, precision, lw=1) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png"%(filename, phase)), bbox_inches="tight")def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)def plot_log(): pylab.clf() pylab.figure(num=None, figsize=(6, 5)) x = np.arange(0.001, 1, 0.001) y = np.log(x) pylab.title('Relationship between probabilities and their logarithm') pylab.plot(x, y) pylab.grid(True) pylab.xlabel('P') pylab.ylabel('log(P)') filename = 'log_probs.png' pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_feat_importance(feature_names, clf, name): pylab.clf() coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(range(len(coef))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): pylab.clf() # import pdb;pdb.set_trace() num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Density') # the histogram 
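# --- Illustrative sketch (not from the book): what tweak_labels() above does,
# shown on a tiny array. `Y == label` yields a boolean mask, `|=` ORs the masks
# of all positive classes together, and the result becomes a 0/1 integer vector.
import numpy as np

Y = np.array(["positive", "negative", "neutral", "positive", "irrelevant"])
pos = Y == "positive"
pos |= Y == "negative"            # mark both sentiment-bearing classes as 1
Y_bin = np.zeros(Y.shape[0], dtype=int)
Y_bin[pos] = 1
print(Y_bin)                      # -> [1 1 0 1 0]
# --- end of illustrative sketch ---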
of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='green', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_bias_variance(data_sizes, train_errors, test_errors, name): pylab.clf() pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid() pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))def load_sent_word_net(): sent_scores = collections.defaultdict(list) with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): #注意用法 continue if len(line)==1: continue POS,ID,PosScore,NegScore,SynsetTerms,Gloss = line if len(POS)==0 or len(ID)==0: continue #print POS,PosScore,NegScore,SynsetTerms for term in SynsetTerms.split(" "): term = term.split("#")[0] # drop #number at the end of every term term = term.replace("-", " ").replace("_", " ") #两个replace 连用 key = "%s/%s"%(POS,term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) for key, value in sent_scores.iteritems(): sent_scores[key] = np.mean(value, axis=0) return sent_scoresdef log_false_positives(clf, X, y, name): with open("FP_"+name.replace(" ", "_")+".tsv", "w") as f: false_positive = clf.predict(X)!=y for tweet, false_class in zip(X[false_positive], y[false_positive]): f.write("%s\t%s\n"%(false_class, tweet.encode("ascii", "ignore")))if __name__ == '__main__': plot_log()import timestart_time = time.time()import numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline #管道机制from sklearn.naive_bayes import MultinomialNBdef create_ngram_model(): tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False) clf = MultinomialNB() pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) return pipelinedef train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] if plot: plot_pr(pr_scores[median], name, "01", precisions[median], recalls[median], label=name) summary = 
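# --- Illustrative sketch (not from the book): the same TF-IDF n-gram +
# MultinomialNB pipeline evaluated with ShuffleSplit, but written against the
# current scikit-learn API (sklearn.cross_validation was removed; use
# sklearn.model_selection). The six example tweets are hypothetical.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit, cross_val_score

X = np.array(["good movie", "bad movie", "great film", "awful film",
              "loved it", "hated it"])
y = np.array([1, 0, 1, 0, 1, 0])
pipeline = Pipeline([
    ("vect", TfidfVectorizer(ngram_range=(1, 3), analyzer="word")),
    ("clf", MultinomialNB()),
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
print(cross_val_score(pipeline, X, y, cv=cv).mean())
# --- end of illustrative sketch ---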
(np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_timeimport timestart_time = time.time()import numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipelinefrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.naive_bayes import MultinomialNBphase = "02"def create_ngram_model(params=None): tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False) clf = MultinomialNB() pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__stop_words=[None, "english"], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) 
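# --- Illustrative sketch (not from the book): the grid search above, redone
# with the current API. GridSearchCV moved to sklearn.model_selection and the
# old score_func= argument became scoring=; double underscores still address
# parameters of the named pipeline steps (vect__..., clf__...).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit

pipeline = Pipeline([("vect", TfidfVectorizer()), ("clf", MultinomialNB())])
param_grid = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vect__min_df": [1, 2],
    "clf__alpha": [0.01, 0.05, 0.1, 0.5, 1],
}
grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                           cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=0),
                           scoring="f1", verbose=1)
# grid_search.fit(X, y)                # X, y as in the previous sketch
# print(grid_search.best_estimator_)   # the best-scoring fitted pipeline
# --- end of illustrative sketch ---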
thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, vect__smooth_idf=False, vect__use_idf=False, vect__sublinear_tf=True, vect__binary=False, clf__alpha=0.01, ) best_clf = create_ngram_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_ngram_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_timeimport timestart_time = time.time()import reimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.pipeline import Pipelinefrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()phase = "03"emo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_ngram_model(params=None): def preprocessor(tweet): global emoticons_replaced tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet tfidf_ngrams = 
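# --- Illustrative sketch (not from the book): the emoticon / contraction
# normalisation done by the preprocessor above, runnable on its own. The
# longest emoticons are replaced first so that ":dd" is not half-eaten by ":d".
# The replacement tables here are shortened, hypothetical versions.
import re

emo_repl = {":d": " good ", ":dd": " good ", ":)": " good ", ":(": " bad "}
emo_repl_order = [k for (k_len, k) in
                  reversed(sorted((len(k), k) for k in emo_repl))]
re_repl = {r"\br\b": "are", r"\bu\b": "you", r"\bcan't\b": "can not"}

def preprocess(tweet):
    tweet = tweet.lower()
    for k in emo_repl_order:
        tweet = tweet.replace(k, emo_repl[k])
    for pattern, repl in re_repl.items():   # .iteritems() is Python 2 only
        tweet = re.sub(pattern, repl, tweet)
    return tweet

print(preprocess("u can't be serious :DD"))   # -> 'you can not be serious  good '
# --- end of illustrative sketch ---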
TfidfVectorizer(preprocessor=preprocessor, analyzer="word") clf = MultinomialNB() pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) #使用predict_proba进行预测,结果是可能性 fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(tfidf__ngram_range=(1, 2), tfidf__min_df=1, tfidf__stop_words=None, tfidf__smooth_idf=False, tfidf__use_idf=False, tfidf__sublinear_tf=True, tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_ngram_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time# This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnion #特征联合,并行from sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class StructCounter(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question', 'hashtag', 'mentioning']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(d.split()) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) avg_neg_val = np.mean(neg_vals) return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] hashtag = [] mentioning = [] for d in documents: #import pdb;pdb.set_trace() allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) hashtag.append(d.count("#")) mentioning.append(d.count("@")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question, hashtag, mentioning]).T return resultdef create_union_model(params=None): def preprocessor(tweet): global emoticons_replaced #return tweet.lower() repl = { # positive emoticons "<3": " good ", ":D": " good ", "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", ";-)": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", } for a,b in repl.iteritems(): tweet = tweet.replace(a,b) return tweet.lower() tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") struct_stats = StructCounter() all_features = FeatureUnion([('struct', struct_stats), ('tfidf', 
tfidf_ngrams)]) #注意FeatureUNion, 是特征并列处理,然后组合在一起,简单的lbind #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('struct', struct_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(all__tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)], all__tfidf__min_df=[1, 2], all__tfidf__stop_words=[None, "english"], all__tfidf__smooth_idf=[False, True], all__tfidf__use_idf=[False, True], all__tfidf__sublinear_tf=[False, True], all__tfidf__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ # grid_search找出最好的模型 print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) grid_search_model(create_union_model, X, Y) print "== Pos/neg vs. 
irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) grid_search_model(create_union_model, X, Y) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) grid_search_model(create_union_model, X, Y) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) grid_search_model(create_union_model, X, Y) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w"))## This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()phase = "04"import jsonposcache_filename = "poscache.json"try: # try, except 模式 poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) #tuple类型 if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. 
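# --- Illustrative sketch (not from the book): the poscache idea above, i.e.
# memoising nltk.pos_tag() results in a JSON file because POS tagging is by far
# the slowest step. Assumes the NLTK tokenizer and tagger models have been
# downloaded (e.g. nltk.download('punkt') and the perceptron tagger).
import json
import nltk

poscache_filename = "poscache.json"
try:
    poscache = json.load(open(poscache_filename))
except IOError:
    poscache = {}

def tag(sentence):
    if sentence not in poscache:
        poscache[sentence] = nltk.pos_tag(nltk.word_tokenize(sentence))
    return poscache[sentence]

# print(tag("This phone is surprisingly good"))
# json.dump(poscache, open(poscache_filename, "w"))   # persist for the next run
# --- end of illustrative sketch ---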
for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) #注意 avg_neg_val = np.mean(neg_vals) #import pdb;pdb.set_trace() return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = 
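# --- Illustrative sketch (not from the book): how the Penn Treebank tags from
# nltk.pos_tag are mapped to the "pos/word" keys produced by load_sent_word_net()
# above. sent_word_net here is a tiny hand-made dictionary with made-up scores.
import numpy as np

sent_word_net = {"a/good": (0.75, 0.0), "n/phone": (0.0, 0.0)}   # hypothetical scores

def tag_to_swn_pos(treebank_tag):
    for prefix, swn in (("NN", "n"), ("JJ", "a"), ("VB", "v"), ("RB", "r")):
        if treebank_tag.startswith(prefix):
            return swn
    return None

tagged = [("good", "JJ"), ("phone", "NN"), ("really", "RB")]
pos_vals, neg_vals = [], []
for word, tag in tagged:
    swn = tag_to_swn_pos(tag)
    if swn is not None:
        p, n = sent_word_net.get("%s/%s" % (swn, word), (0.0, 0.0))
        pos_vals.append(p)
        neg_vals.append(n)
print(np.mean(pos_vals), np.mean(neg_vals))   # -> 0.25 0.0
# --- end of illustrative sketch ---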
X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w")) #dump 序列化## This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.max(pos_vals) avg_neg_val = np.max(neg_vals) return [max(0,1-avg_pos_val-avg_neg_val), avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", 
r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c 
in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w")) This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import BernoulliNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. 
for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) avg_neg_val = np.mean(neg_vals) #import pdb;pdb.set_trace() return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") count_ngrams = CountVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('count', count_ngrams)]) #all_features = FeatureUnion([('count', count_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = BernoulliNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(all__count__ngram_range=[(1, 1), (1, 2), (1, 3)], all__count__min_df=[1, 2], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = 
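# --- Illustrative sketch (not from the book): the variant introduced in this
# last script, which swaps the TF-IDF vectoriser for binary word counts and
# MultinomialNB for BernoulliNB (presence/absence of words). Minimal runnable
# version with hypothetical example texts, current scikit-learn API.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

X = np.array(["good good phone", "bad phone", "good screen", "bad battery"])
y = np.array([1, 0, 1, 0])
pipeline = Pipeline([
    ("count", CountVectorizer(binary=True)),   # 0/1 occurrence features
    ("clf", BernoulliNB()),
])
pipeline.fit(X, y)
print(pipeline.predict(np.array(["good battery"])))
# --- end of illustrative sketch ---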
clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__count__ngram_range=(1, 2), all__count__min_df=1, all__count__stop_words=None, all__count__binary=True, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) #best_clf = __grid_search_model(create_union_model, X, Y) #print best_clf train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w"))第七章import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltboston = load_boston()x = np.array([np.concatenate((v,[1])) for v in boston.data]) ##concatenate 矩阵连接,vstack,hstack,等一起y = boston.targets,total_error,_,_ = np.linalg.lstsq(x,y)rmse = np.sqrt(total_error[0]/len(x))print('Residual: {}'.format(rmse)) # 注意是空的 {}, 可以两个空的连着plt.plot(np.dot(x,s), boston.target,'ro') #矩阵乘法 dotplt.plot([0,50],[0,50], 'g-')plt.xlabel('predicted')plt.ylabel('real')plt.show()from __future__ import print_functionfrom sklearn.cross_validation import KFoldfrom sklearn.linear_model import ElasticNet, Lasso, Ridgefrom sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCVimport numpy as npx = np.array([np.concatenate((v,[1])) for v in boston.data])y = boston.targetfor name,met in [ ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), # 注意模型也在内部,此种写法 ('lasso(.5)', Lasso(fit_intercept=True, alpha=0.5)), ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), ]: met.fit(x,y) p = np.array([met.predict(xi) for xi in x]) e = p-y total_error = np.dot(e,e) # 注意此种写法 平方 rmse_train = np.sqrt(total_error/len(p)) kf = KFold(len(x), n_folds=10) err = 0 for train,test in kf: met.fit(x[train],y[train]) # 注意此种交叉验证方式设计,采用的是索引 p = np.array([met.predict(xi) for xi in x[test]]) e = p-y[test] err += np.dot(e,e) rmse_10cv = np.sqrt(err/len(x)) print('Method: {}'.format(name)) print('RMSE on training: {}'.format(rmse_train)) print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) print() print()from sklearn.cross_validation import KFoldfrom sklearn.linear_model import LinearRegression, ElasticNetimport numpy as npfrom sklearn.datasets import load_bostonboston = load_boston()x = np.array([np.concatenate((v,[1])) for v in boston.data])y = boston.targetFIT_EN = Falseif FIT_EN: model = ElasticNet(fit_intercept=True, alpha=0.5) # 巧妙的使用判断,和fit结合else: model = LinearRegression(fit_intercept=True)model.fit(x,y)p = np.array([model.predict(xi) for xi in x])e = p-ytotal_error = np.dot(e,e)rmse_train = np.sqrt(total_error/len(p))kf = KFold(len(x), n_folds=10) #此处是索引err = 0for train,test in kf: model.fit(x[train],y[train]) p = np.array([model.predict(xi) for xi in x[test]]) e = p-y[test] err += np.dot(e,e)rmse_10cv = np.sqrt(err/len(x))print('RMSE on training: {}'.format(rmse_train))print('RMSE on 10-fold CV: {}'.format(rmse_10cv))import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltfrom mpltools import stylestyle.use('ggplot')boston = load_boston()plt.scatter(boston.data[:,5], boston.target)plt.xlabel("RM")plt.ylabel("House Price")x = boston.data[:,5]x = np.array([[v] for v in x])y = boston.targetslope,res,_,_ = np.linalg.lstsq(x,y) #_, 使用在不想起名字不使用的情况下?plt.plot([0,boston.data[:,5].max()+1],[0,slope*(boston.data[:,5].max()+1)], '-', lw=4)plt.savefig('Figure1.png',dpi=150)rmse = np.sqrt(res[0]/len(x))print('Residual: {}'.format(rmse))import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltfrom mpltools import style # 使用方法,sytle.usestyle.use('ggplot')boston = load_boston()plt.scatter(boston.data[:,5], boston.target)plt.xlabel("RM")plt.ylabel("House Price")x = boston.data[:,5]xmin = x.min()xmax = x.max()x = np.array([[v,1] for v in x])y = boston.target(slope,bias),res,_,_ = np.linalg.lstsq(x,y)plt.plot([xmin,xmax],[slope*xmin + bias, slope*xmax + bias], '-', 
lw=4)plt.savefig('Figure2.png',dpi=150)rmse = np.sqrt(res[0]/len(x))print('Residual: {}'.format(rmse))import numpy as npfrom sklearn.datasets import load_svmlight_filefrom sklearn.linear_model import ElasticNet, LinearRegressiondata,target = load_svmlight_file('E2006.train')lr = LinearRegression(fit_intercept=True)from sklearn.cross_validation import KFoldkf = KFold(len(target), n_folds=10)err = 0for train,test in kf: lr.fit(data[train],target[train]) p = map(lr.predict, data[test]) p = np.array(p).ravel() ##注意ravel的使用, 和flatten的区别 e = p-target[test] err += np.dot(e,e)rmse_10cv = np.sqrt(err/len(target))lr.fit(data,target)p = np.array(map(lr.predict, data)) # 推到表达式也可实现p = p.ravel()e = p-targettotal_error = np.dot(e,e)rmse_train = np.sqrt(total_error/len(p))print('RMSE on training: {}'.format(rmse_train))print('RMSE on 10-fold CV: {}'.format(rmse_10cv))import numpy as npfrom scipy import sparsefrom sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCVfrom sklearn.cross_validation import KFolddata = np.array([[int(tok) for tok in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')]) #多层列表推导式,取出需要的值ij = data[:,:2]ij -= 1 # original data is in 1-based systemvalues = data[:,2]reviews = sparse.csc_matrix((values,ij.T)).astype(float) #两种稀疏矩阵压缩,astypereg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.])def movie_norm(xc): xc = xc.copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc]) x1 = np.nan_to_num(x1) #注意 np.nan_to_num的使用 for i in range(xc.shape[0]): xc[i] -= (xc[i] > 0) * x1[i] return xc, x1def learn_for(i): u = reviews[i] us = np.delete(np.arange(reviews.shape[0]), i) ps, = np.where(u.toarray().ravel() > 0) x = reviews[us][:,ps].T y = u.data err = 0 eb = 0 kf = KFold(len(y), n_folds=4) for train,test in kf: xc,x1 = movie_norm(x[train]) reg.fit(xc, y[train]-x1) xc,x1 = movie_norm(x[test]) p = np.array([reg.predict(xi) for xi in xc]).ravel() e = (p+x1)-y[test] err += np.sum(e*e) eb += np.sum( (y[train].mean() - y[test])**2 ) return np.sqrt(err/float(len(y))), np.sqrt(eb/float(len(y)))whole_data = []for i in range(reviews.shape[0]): s = learn_for(i) print(s[0] < s[1]) print(s) whole_data.append(s)第八章def apriori(dataset, minsupport, maxsize): ''' freqsets, baskets = apriori(dataset, minsupport, maxsize) Parameters ---------- dataset : sequence of sequences input dataset minsupport : int Minimal support for frequent items maxsize : int Maximal size of frequent items to return Returns ------- freqsets : sequence of sequences baskets : dictionary ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) for i,ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) pointers = dict([(k,frozenset(v)) for k,v in pointers.items()]) baskets = dict([(k,frozenset(v)) for k,v in baskets.items()]) valid = set(list(el)[0] for el,c in baskets.items() if (len(c) >= minsupport)) dataset = [[el for el in ds if (el in valid)] for ds in dataset] dataset = [ds for ds in dataset if len(ds) > 1] dataset = map(frozenset,dataset) itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize-1): print(len(itemsets)) newsets = [] for i,ell in enumerate(itemsets): ccounts = baskets[ell] for v_,pv in pointers.items(): if v_ not in ell: csup = (ccounts & pv) if len(csup) >= minsupport: new = frozenset(ell|set([v_])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets return freqsets,basketsdef 
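# --- Illustrative sketch (not from the book): building the sparse user x item
# rating matrix the way the ml-100k loader above does, but from a few in-line
# (user, item, rating) triples instead of data/ml-100k/u.data.
import numpy as np
from scipy import sparse

data = np.array([[1, 1, 5], [1, 3, 3], [2, 2, 4], [3, 1, 2]])   # 1-based ids, as in the file
ij = data[:, :2] - 1                           # shift to 0-based row/column indices
values = data[:, 2].astype(float)
reviews = sparse.csc_matrix((values, ij.T))    # column-compressed; csr_matrix is row-compressed
print(reviews.toarray())
# [[5. 0. 3.]
#  [0. 4. 0.]
#  [2. 0. 0.]]
# --- end of illustrative sketch ---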
association_rules(dataset, freqsets, baskets, minlift): ''' for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): ... This function takes the returns from ``apriori``. Parameters ---------- dataset : sequence of sequences input dataset freqsets : sequence of sequences baskets : dictionary minlift : int minimal lift of yielded rules ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] for fset in freqsets: for f in fset: consequent = frozenset([f]) antecendent = fset-consequent base = len(baskets[consequent])/nr_transactions py_x = len(baskets[fset])/float(len(baskets[antecendent])) lift = py_x / base if lift > minlift: yield (antecendent, consequent, base, py_x, lift) # 注意生成器的使用from apriori import apriori, association_rulesfrom gzip import GzipFiledataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]freqsets,baskets = apriori(dataset, 80, maxsize=5)nr_transactions = float(len(dataset))for ant,con,base,pyx,lift in association_rules(dataset, freqsets,baskets, 30): print('{} | {} | {} ({:%}) | {} | {} | {}' .format(ant, con, len(baskets[con]), len(baskets[con])/ nr_transactions, len(baskets[ant]), len(baskets[con|ant]), int(lift)))import numpy as npfrom collections import defaultdictfrom itertools import chain # 注意itertools 模块提供迭代器from gzip import GzipFileminsupport = 44 dataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]dataset = dataset[::20]counts = defaultdict(int)for elem in chain(*dataset): #注意chain内部是迭代器,注意此处使用的函数参数为*不固定的 counts[elem] += 1valid = set(el for el,c in counts.items() if (c >= minsupport))dataset = [[el for el in ds if (el in valid)] for ds in dataset]dataset = [frozenset(ds) for ds in dataset if len(ds) > 1]itemsets = [frozenset([v]) for v in valid]allsets = [itemsets]for i in range(16): print(len(itemsets)) nextsets = [] for i,ell in enumerate(itemsets): for v_ in valid: if v_ not in ell: c = (ell|set([v_])) if sum(1 for d in dataset if d.issuperset(c)) > minsupport: nextsets.append(c) allsets.append(nextsets) itemsets = nextsetsimport numpy as npfrom collections import defaultdictfrom itertools import chainfrom gzip import GzipFiledataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]counts = defaultdict(int)for elem in chain(*dataset): counts[elem] += 1counts = np.array(list(counts.values()))bins = [1,2,4,8,16,32,64,128,512]print(' {0:11} | {1:12}'.format('Nr of baskets', 'Nr of products'))print('--------------------------------')for i in range(len(bins)): bot = bins[i] top = (bins[i+1] if (i+1) < len(bins) else 100000000000) print(' {0:4} - {1:3} | {2:12}'.format(bot, (top if top < 1000 else ''), np.sum( (counts >= bot) & (counts < top) )))# 注意在.format中使用了if else等语法import numpy as np# This is the version in the book:def all_correlations(bait, target): ''' corrs = all_correlations(bait, target) corrs[i] is the correlation between bait and target[i] ''' return np.array( [np.corrcoef(bait, c)[0,1] for c in target])# This is a faster, but harder to read, implementation:def all_correlations(y, X): ''' Cs = all_correlations(y, X) Cs[i] = np.corrcoef(y, X[i])[0,1] ''' X = np.asanyarray(X, float) y = np.asanyarray(y, float) xy = np.dot(X, y) y_ = y.mean() ys_ = y.std() x_ = X.mean(1) xs_ = X.std(1) n = float(len(y)) ys_ += 1e-5 # Handle zeros in ys xs_ += 1e-5 # Handle zeros in x return (xy - x_*y_*n)/n/xs_/ys_import numpy as np# This is the version in the book:def 
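# --- Illustrative sketch (not from the book): the quantities association_rules()
# above yields, computed by hand for one candidate rule on a toy basket list.
baskets = [
    {"milk", "bread"},
    {"milk", "bread", "butter"},
    {"bread", "butter"},
    {"milk"},
    {"milk", "bread", "jam"},
]
n = float(len(baskets))
antecedent, consequent = {"milk"}, {"bread"}
support_ante = sum(1 for b in baskets if antecedent <= b)                 # 4 baskets contain milk
support_both = sum(1 for b in baskets if (antecedent | consequent) <= b)  # 3 contain milk and bread
base = sum(1 for b in baskets if consequent <= b) / n                     # P(bread) = 0.8
confidence = support_both / float(support_ante)                          # P(bread | milk) = 0.75
lift = confidence / base                                                  # 0.9375: milk does not lift bread here
print(base, confidence, lift)
# --- end of illustrative sketch ---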
all_correlations(bait, target): ''' corrs = all_correlations(bait, target) corrs[i] is the correlation between bait and target[i] ''' return np.array( [np.corrcoef(bait, c)[0,1] for c in target])# This is a faster, but harder to read, implementation:def all_correlations(y, X): ''' Cs = all_correlations(y, X) Cs[i] = np.corrcoef(y, X[i])[0,1] ''' X = np.asanyarray(X, float) #把matrix转换为array用asarray() asanyarray()根据和你的输入的类型保持一致 y = np.asanyarray(y, float) xy = np.dot(X, y) y_ = y.mean() ys_ = y.std() x_ = X.mean(1) xs_ = X.std(1) n = float(len(y)) ys_ += 1e-5 # Handle zeros in ys xs_ += 1e-5 # Handle zeros in x return (xy - x_*y_*n)/n/xs_/ys_from __future__ import print_functionfrom all_correlations import all_correlationsimport numpy as npfrom scipy import sparsefrom load_ml100k import loadreviews = load()def estimate_user(user, rest): bu = user > 0 br = rest > 0 ws = all_correlations(bu,br) selected = ws.argsort()[-100:] estimates = rest[selected].mean(0) estimates /= (.1+br[selected].mean(0)) return estimatesdef train_test(user, rest): estimates = estimate_user(user, rest) bu = user > 0 br = rest > 0 err = estimates[bu]-user[bu] null = rest.mean(0) null /= (.1+br.mean(0)) nerr = null[bu]-user[bu] return np.dot(err,err), np.dot(nerr, nerr)def cross_validate_all(): err = [] for i in xrange(reviews.shape[0]): err.append( train_test(reviews[i], np.delete(reviews, i, 0)) ) revs = (reviews > 0).sum(1) err = np.array(err) rmse = np.sqrt(err / revs[:,None]) print(np.mean(rmse, 0)) print(np.mean(rmse[revs > 60], 0))def all_estimates(reviews): reviews = reviews.toarray() estimates = np.zeros_like(reviews) ##注意zeros_like和ones_like 和原矩阵的shape一样, 但数据都是0, 或1 for i in xrange(reviews.shape[0]): estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) return estimatesfrom load_ml100k import loadfrom matplotlib import pyplot as pltdata = load()data = data.toarray() # 和todense一样,都是把稀疏举证转为数组plt.gray()plt.imshow(data[:200,:200], interpolation='nearest') ##imshow显示二值型数据,颜色深浅plt.xlabel('User ID')plt.ylabel('Film ID')plt.savefig('../1400_08_03+.png')import numpy as npfrom scipy import sparsedef load(): data = np.array([[int(t) for t in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')]) ij = data[:,:2] ij -= 1 # original data is in 1-based system values = data[:,2] reviews = sparse.csc_matrix((values,ij.T)).astype(float) ##csc 按列压缩矩阵, csr按行压缩矩阵 return reviewsfrom __future__ import print_functionimport numpy as npfrom load_ml100k import loadfrom all_correlations import all_correlationsdef nn_movie(ureviews, reviews, uid, mid, k=1): X = ureviews y = ureviews[mid].copy() y -= y.mean() y /= (y.std()+1e-5) corrs = np.dot(X, y) likes = corrs.argsort() likes = likes[::-1] c = 0 pred = 3. 
for ell in likes: if ell == mid: continue if reviews[uid,ell] > 0: pred = reviews[uid,ell] if c == k: return pred c += 1 return preddef all_estimates(reviews, k=1): reviews = reviews.astype(float) k -= 1 nusers, nmovies = reviews.shape estimates = np.zeros_like(reviews) for u in range(nusers): ureviews = np.delete(reviews, u, 0) ureviews -= ureviews.mean(0) ureviews /= (ureviews.std(0)+1e-4) ureviews = ureviews.T.copy() for m in np.where(reviews[u] > 0)[0]: estimates[u,m] = nn_movie(ureviews, reviews, u, m, k) return estimatesif __name__ == '__main__': reviews = load().toarray() estimates = all_estimates(reviews) error = (estimates-reviews) error **= 2 #注意**用法 error = error[reviews > 0] print(np.sqrt(error).mean())from __future__ import print_functionfrom sklearn.linear_model import LinearRegressionfrom load_ml100k import loadimport numpy as npimport similar_movieimport usermodelimport corrneighboursreviews = load()reg = LinearRegression()es = np.array([ usermodel.all_estimates(reviews), corrneighbours.all_estimates(reviews), similar_movies.all_estimates(reviews), ])reviews = reviews.toarray() total_error = 0.0coefficients = []for u in xrange(reviews.shape[0]): es0 = np.delete(es,u,1) r0 = np.delete(reviews, u, 0) X,Y = np.where(r0 > 0) X = es[:,X,Y] y = r0[r0 > 0] reg.fit(X.T,y) coefficients.append(reg.coef_) r0 = reviews[u] X = np.where(r0 > 0) p0 = reg.predict(es[:,u,X].squeeze().T) err0 = r0[r0 > 0]-p0 total_error += np.dot(err0,err0) print(u)from sklearn.linear_model import LinearRegressionfrom load_ml100k import loadimport numpy as npimport similar_movieimport usermodelimport corrneighbourssreviews = load()reviews = sreviews.toarray()reg = LinearRegression()es = np.array([ usermodel.all_estimates(sreviews), similar_movie.all_estimates(reviews,k=1), similar_movie.all_estimates(reviews,k=2), similar_movie.all_estimates(reviews,k=3), similar_movie.all_estimates(reviews,k=4), similar_movie.all_estimates(reviews,k=5), ])total_error = 0.0coefficients = []for u in xrange(reviews.shape[0]): es0 = np.delete(es,u,1) #delete的用法 留一 r0 = np.delete(reviews, u, 0) X,Y = np.where(r0 > 0) X = es[:,X,Y] y = r0[r0 > 0] reg.fit(X.T,y) coefficients.append(reg.coef_) r0 = reviews[u] X = np.where(r0 > 0) p0 = reg.predict(es[:,u,X].squeeze().T) err0 = r0[r0 > 0]-p0 total_error += np.dot(err0,err0)coefficients = np.array(coefficients)import numpy as npfrom sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCVfrom sklearn.cross_validation import KFoldfrom load_ml100k import loaddef learn_for(reviews, i): reg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.]) u = reviews[i] us = range(reviews.shape[0]) del us[i] ps, = np.where(u.toarray().ravel() > 0) x = reviews[us][:,ps].T y = u.data kf = KFold(len(y), n_folds=4) predictions = np.zeros(len(ps)) for train,test in kf: xc = x[train].copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc]) x1 = np.nan_to_num(x1) for i in xrange(xc.shape[0]): xc[i] -= (xc[i] > 0) * x1[i] reg.fit(xc, y[train]-x1) xc = x[test].copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc])第九章import osfrom matplotlib import pylabimport numpy as npDATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")GENRE_DIR = "/media/sf_P/pymlbook-data/09-genre-class/genres"GENRE_LIST = ["classical", "jazz", "country", "pop", "rock", "metal"]def plot_confusion_matrix(cm, genre_list, name, title): pylab.clf() pylab.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0) # 注意用matshow画混淆举证,并设置cmap颜色等 ax = pylab.axes() 
ax.set_xticks(range(len(genre_list))) # 注意图形的建立方式 ax.set_xticklabels(genre_list) ax.xaxis.set_ticks_position("bottom") ax.set_yticks(range(len(genre_list))) ax.set_yticklabels(genre_list) pylab.title(title) pylab.colorbar() pylab.grid(False) pylab.show() pylab.xlabel('Predicted class') pylab.ylabel('True class') pylab.grid(False) pylab.savefig(os.path.join(CHART_DIR, "confusion_matrix_%s.png"%name), bbox_inches="tight")def plot_pr(auc_score, name, precision, recall, label=None): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.fill_between(recall, precision, alpha=0.5) # 使用fill_between pylab.plot(recall, precision, lw=1) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R curve (AUC = %0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"), bbox_inches="tight")def plot_roc(auc_score, name, tpr, fpr, label=None): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.plot([0, 1], [0, 1], 'k--') pylab.plot(fpr, tpr) pylab.fill_between(fpr, tpr, alpha=0.5) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('False Positive Rate') pylab.ylabel('True Positive Rate') pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom") pylab.legend(loc="lower right") filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "roc_" + filename + ".png"), bbox_inches="tight")def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)def plot_log(): pylab.clf() x = np.arange(0.001, 1, 0.001) y = np.log(x) pylab.title('Relationship between probabilities and their logarithm') pylab.plot(x, y) pylab.grid(True) pylab.xlabel('P') pylab.ylabel('log(P)') filename = 'log_probs.png' pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_feat_importance(feature_names, clf, name): pylab.clf() coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(range(len(coef))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): pylab.clf() # import pdb;pdb.set_trace() num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) # 注意subplot 动态设置行列。 x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Density') # the histogram of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='green', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def 
plot_bias_variance(data_sizes, train_errors, test_errors, name): pylab.clf() pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True) pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))import numpy as npfrom collections import defaultdictfrom sklearn.metrics import precision_recall_curve, roc_curvefrom sklearn.metrics import aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.metrics import confusion_matrixfrom utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LISTfrom fft import read_fftTEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"genre_list = GENRE_LISTdef train_model(clf_factory, X, Y, name, plot=False): labels = np.unique(Y) #np.unique函数使用 cv = ShuffleSplit( n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = defaultdict(list) precisions, recalls, thresholds = defaultdict( list), defaultdict(list), defaultdict(list) roc_scores = defaultdict(list) tprs = defaultdict(list) fprs = defaultdict(list) clfs = [] # just to later get the median cms = [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) scores.append(test_score) train_errors.append(1 - train_score) test_errors.append(1 - test_score) y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) cms.append(cm) for label in labels: y_label_test = np.asarray(y_test == label, dtype=int) # 注意在此处嵌入dtype语句 proba = clf.predict_proba(X_test) proba_label = proba[:, label] precision, recall, pr_thresholds = precision_recall_curve( y_label_test, proba_label) pr_scores[label].append(auc(recall, precision)) precisions[label].append(precision) recalls[label].append(recall) thresholds[label].append(pr_thresholds) fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) roc_scores[label].append(auc(fpr, tpr)) tprs[label].append(tpr) fprs[label].append(fpr) if plot: for label in labels: print "Plotting", genre_list[label] scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] desc = "%s %s" % (name, genre_list[label]) plot_pr(pr_scores[label][median], desc, precisions[label][median], recalls[label][median], label='%s vs rest' % genre_list[label]) plot_roc(roc_scores[label][median], desc, tprs[label][median], fprs[label][median], label='%s vs rest' % genre_list[label]) all_pr_scores = np.asarray(pr_scores.values()).flatten() summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)def create_model(): from sklearn.linear_model.logistic import LogisticRegression clf = LogisticRegression() return clfif __name__ == "__main__": X, y = read_fft(genre_list) train_avg, test_avg, cms = train_model( create_model, X, y, "Log Reg FFT", plot=True) cm_avg = np.mean(cms, axis=0) cm_norm = cm_avg / np.sum(cm_avg, axis=0) print cm_norm plot_confusion_matrix(cm_norm, genre_list, "fft", "Confusion matrix of an FFT based classifier") import numpy as npfrom collections import defaultdictfrom sklearn.metrics import precision_recall_curve, roc_curvefrom 
sklearn.metrics import aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.metrics import confusion_matrixfrom utils import plot_roc, plot_confusion_matrix, GENRE_LISTfrom ceps import read_cepsTEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"genre_list = GENRE_LISTdef train_model(clf_factory, X, Y, name, plot=False): labels = np.unique(Y) cv = ShuffleSplit( n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = defaultdict(list) precisions, recalls, thresholds = defaultdict( list), defaultdict(list), defaultdict(list) roc_scores = defaultdict(list) tprs = defaultdict(list) fprs = defaultdict(list) clfs = [] # just to later get the median cms = [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) scores.append(test_score) train_errors.append(1 - train_score) test_errors.append(1 - test_score) y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) cms.append(cm) for label in labels: y_label_test = np.asarray(y_test == label, dtype=int) proba = clf.predict_proba(X_test) proba_label = proba[:, label] precision, recall, pr_thresholds = precision_recall_curve( y_label_test, proba_label) pr_scores[label].append(auc(recall, precision)) precisions[label].append(precision) recalls[label].append(recall) thresholds[label].append(pr_thresholds) fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) roc_scores[label].append(auc(fpr, tpr)) tprs[label].append(tpr) fprs[label].append(fpr) if plot: for label in labels: print "Plotting", genre_list[label] scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] desc = "%s %s" % (name, genre_list[label]) plot_roc(roc_scores[label][median], desc, tprs[label][median], fprs[label][median], label='%s vs rest' % genre_list[label]) all_pr_scores = np.asarray(pr_scores.values()).flatten() summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)def create_model(): from sklearn.linear_model.logistic import LogisticRegression clf = LogisticRegression() return clfif __name__ == "__main__": X, y = read_ceps(genre_list) train_avg, test_avg, cms = train_model( create_model, X, y, "Log Reg CEPS", plot=True) cm_avg = np.mean(cms, axis=0) cm_norm = cm_avg / np.sum(cm_avg, axis=0) print cm_norm plot_confusion_matrix(cm_norm, genre_list, "ceps", "Confusion matrix of a CEPS based classifier")import osimport globimport sysimport numpy as npimport scipyimport scipy.io.wavfile #利用scipy对wav文件进行处理from scikits.talkbox.features import mfcc #talkbox对音频进行XXX变换的包from utils import GENRE_DIRdef write_ceps(ceps, fn): """ Write the MFCC to separate files to speed up processing. 
""" base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".ceps" np.save(data_fn, ceps) print "Written", data_fndef create_ceps(fn): sample_rate, X = scipy.io.wavfile.read(fn) ceps, mspec, spec = mfcc(X) write_ceps(ceps, fn)def read_ceps(genre_list, base_dir=GENRE_DIR): X = [] y = [] for label, genre in enumerate(genre_list): for fn in glob.glob(os.path.join(base_dir, genre, "*.ceps.npy")): ceps = np.load(fn) num_ceps = len(ceps) X.append( np.mean(ceps[int(num_ceps / 10):int(num_ceps * 9 / 10)], axis=0)) y.append(label) return np.array(X), np.array(y)if __name__ == "__main__": os.chdir(GENRE_DIR) glob_wav = os.path.join(sys.argv[1], "*.wav") print glob_wav for fn in glob.glob(glob_wav): create_ceps(fn)import sysimport osimport globimport numpy as npimport scipyimport scipy.io.wavfilefrom utils import GENRE_DIR, CHART_DIRimport matplotlib.pyplot as pltfrom matplotlib.ticker import EngFormatterdef write_fft(fft_features, fn): """ Write the FFT features to separate files to speed up processing. """ base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".fft" np.save(data_fn, fft_features) print "Written", data_fndef create_fft(fn): sample_rate, X = scipy.io.wavfile.read(fn) fft_features = abs(scipy.fft(X)[:1000]) write_fft(fft_features, fn)def read_fft(genre_list, base_dir=GENRE_DIR): X = [] y = [] for label, genre in enumerate(genre_list): genre_dir = os.path.join(base_dir, genre, "*.fft.npy") file_list = glob.glob(genre_dir) assert(file_list), genre_dir for fn in file_list: fft_features = np.load(fn) X.append(fft_features[:2000]) y.append(label) return np.array(X), np.array(y)def plot_wav_fft(wav_filename, desc=None): plt.clf() plt.figure(num=None, figsize=(6, 4)) sample_rate, X = scipy.io.wavfile.read(wav_filename) spectrum = np.fft.fft(X) freq = np.fft.fftfreq(len(X), 1.0 / sample_rate) plt.subplot(211) num_samples = 200.0 plt.xlim(0, num_samples / sample_rate) plt.xlabel("time [s]") plt.title(desc or wav_filename) plt.plot(np.arange(num_samples) / sample_rate, X[:num_samples]) plt.grid(True) plt.subplot(212) plt.xlim(0, 5000) plt.xlabel("frequency [Hz]") plt.xticks(np.arange(5) * 1000) if desc: desc = desc.strip() fft_desc = desc[0].lower() + desc[1:] else: fft_desc = wav_filename plt.title("FFT of %s" % fft_desc) plt.plot(freq, abs(spectrum), linewidth=5) plt.grid(True) plt.tight_layout() rel_filename = os.path.split(wav_filename)[1] plt.savefig("%s_wav_fft.png" % os.path.splitext(rel_filename)[0], bbox_inches='tight') plt.show()def plot_wav_fft_demo(): plot_wav_fft("sine_a.wav", "400Hz sine wave") plot_wav_fft("sine_b.wav", "3,000Hz sine wave") plot_wav_fft("sine_mix.wav", "Mixed sine wave")def plot_specgram(ax, fn): sample_rate, X = scipy.io.wavfile.read(fn) ax.specgram(X, Fs=sample_rate, xextent=(0, 30))def plot_specgrams(base_dir=CHART_DIR): """ Plot a bunch of spectrograms of wav files in different genres """ plt.clf() genres = ["classical", "jazz", "country", "pop", "rock", "metal"] num_files = 3 f, axes = plt.subplots(len(genres), num_files) for genre_idx, genre in enumerate(genres): for idx, fn in enumerate(glob.glob(os.path.join(GENRE_DIR, genre, "*.wav"))): if idx == num_files: break axis = axes[genre_idx, idx] axis.yaxis.set_major_formatter(EngFormatter()) axis.set_title("%s song %i" % (genre, idx + 1)) plot_specgram(axis, fn) #经过后台处理,直接使用specgram画图 specgram_file = os.path.join(base_dir, "Spectrogram_Genres.png") plt.savefig(specgram_file, bbox_inches="tight") plt.show()if __name__ == "__main__": # for fn in glob.glob(os.path.join(sys.argv[1], "*.wav")): # create_fft(fn) 
# plot_decomp() if len(sys.argv) > 1: plot_wav_fft(sys.argv[1], desc="some sample song") else: plot_wav_fft_demo() plot_specgrams()第十章 图像处理import numpy as npimport mahotas as mh #读取图像文件def edginess_sobel(image): ''' edgi = edginess_sobel(image) Measure the "edginess" of an image ''' edges = mh.sobel(image, just_filter=True) edges = edges.ravel() return np.sqrt(np.dot(edges, edges))import numpy as npimport mahotas as mhtext = mh.imread("simple-dataset/text21.jpg")scene = mh.imread("simple-dataset/scene00.jpg")h,w,_ = text.shapecanvas = np.zeros((h,2*w+128,3), np.uint8)canvas[:,-w:] = scenecanvas[:,:w] = textcanvas = canvas[::4,::4]mh.imsave('../1400OS_10_10+.jpg', canvas)import mahotas as mhfrom mahotas.colors import rgb2greyimport numpy as npim = mh.imread('lenna.jpg')im = rgb2grey(im) #灰度处理salt = np.random.random(im.shape) > .975pepper = np.random.random(im.shape) > .975im = np.maximum(salt*170, mh.stretch(im))im = np.minimum(pepper*30 + im*(~pepper), im)mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8))import mahotas as mhfrom sklearn import cross_validationfrom sklearn.linear_model.logistic import LogisticRegressionfrom mpltools import stylefrom matplotlib import pyplot as pltimport numpy as npfrom glob import globbasedir = 'AnimTransDistr'def features_for(images): fs = [] for im in images: im = mh.imread(im,as_grey=True).astype(np.uint8) fs.append(mh.features.haralick(im).mean(0)) return np.array(fs)def features_labels(groups): labels = np.zeros(sum(map(len,groups))) st = 0 for i,g in enumerate(groups): labels[st:st+len(g)] = i st += len(g) return np.vstack(groups), labels classes = [ 'Anims', 'Cars', 'Distras', 'Trans',]features = []labels = []for ci,cl in enumerate(classes): images = glob('{}/{}/*.jpg'.format(basedir,cl)) features.extend(features_for(images)) labels.extend([ci for _ in images])features = np.array(features)labels = np.array(labels)scores0 = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=10)print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % (0.1* round(1000*scores0.mean())))tfeatures = featuresfrom sklearn.cluster import KMeansfrom mahotas.features import surfimages = []labels = []for ci,cl in enumerate(classes): curimages = glob('{}/{}/*.jpg'.format(basedir,cl)) images.extend(curimages) labels.extend([ci for _ in curimages])labels = np.array(labels)alldescriptors = []for im in images: im = mh.imread(im, as_grey=1) im = im.astype(np.uint8) #alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) alldescriptors.append(surf.surf(im, descriptor_only=True))print('Descriptors done')k = 256km = KMeans(k)concatenated = np.concatenate(alldescriptors)concatenated = concatenated[::64]print('k-meaning...')km.fit(concatenated)features = []for d in alldescriptors: c = km.predict(d) features.append( np.array([np.sum(c == i) for i in xrange(k)]) )features = np.array(features)print('predicting...')scoreSURFlr = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5).mean()print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % (0.1* round(1000*scoreSURFlr.mean())))print('combined...')allfeatures = np.hstack([features, tfeatures])scoreSURFplr = cross_validation.cross_val_score(LogisticRegression(), allfeatures, labels, cv=5).mean()print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % (0.1* round(1000*scoreSURFplr.mean())))style.use('ggplot')plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8)plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'o', mec='#cccccc', mew=12, mfc='white')plt.xlim(-.5,2.5)plt.ylim(scores0.mean()*90., scoreSURFplr*110)plt.xticks([0,1,2], ["baseline", "SURF", "combined"])plt.ylabel('Accuracy (%)')plt.savefig('../1400OS_10_18+.png')from matplotlib import pyplot as pltimport numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image)im8 = mh.gaussian_filter(image,8)im16 = mh.gaussian_filter(image,16)im32 = mh.gaussian_filter(image,32)h,w = im8.shapecanvas = np.ones((h,3*w+256), np.uint8)canvas *= 255canvas[:,:w] = im8canvas[:,w+128:2*w+128] = im16canvas[:,-w:] = im32mh.imsave('../1400OS_10_05+.jpg', canvas[:,::2])im32 = mh.stretch(im32)ot32 = mh.otsu(im32)mh.imsave('../1400OS_10_06+.jpg', (im32 > ot32).astype(np.uint8)*255)from matplotlib import pyplot as pltimport numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image, dtype=np.uint8)image = image[::4,::4]thresh = mh.sobel(image)filtered = mh.sobel(image, just_filter=True)thresh = mh.dilate(thresh,np.ones((7,7)))filtered = mh.dilate(mh.stretch(filtered),np.ones((7,7)))h,w = thresh.shapecanvas = 255*np.ones((h,w*2+64), np.uint8)canvas[:,:w] = thresh*255canvas[:,-w:] = filteredmh.imsave('../1400OS_10_09+.jpg', canvas)import mahotas as mhimport numpy as npim = mh.imread('lenna.jpg')r,g,b = im.transpose(2,0,1)h,w = r.shaper12 = mh.gaussian_filter(r, 12.)g12 = mh.gaussian_filter(g, 12.)b12 = mh.gaussian_filter(b, 12.)im12 = mh.as_rgb(r12,g12,b12)X,Y = np.mgrid[:h,:w]X = X-h/2.Y = Y-w/2.X /= X.max()Y /= Y.max()C = np.exp(-2.*(X**2+ Y**2))C -= C.min()C /= C.ptp()C = C[:,:,None]ring = mh.stretch(im*C + (1-C)*im12)mh.imsave('lenna-ring.jpg', ring)import mahotas as mhfrom sklearn import cross_validationfrom sklearn.linear_model.logistic import LogisticRegressionimport numpy as npfrom glob import globfrom edginess import edginess_sobelbasedir = 'simple-dataset'def features_for(im): im = mh.imread(im,as_grey=True).astype(np.uint8) return mh.features.haralick(im).mean(0)features = []sobels = []labels = []images = glob('{}/*.jpg'.format(basedir))for im in images: features.append(features_for(im)) sobels.append(edginess_sobel(mh.imread(im, as_grey=True))) labels.append(im[:-len('00.jpg')])features = np.array(features)labels = np.array(labels)scores = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5)print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format(0.1* round(1000*scores.mean())))scores = cross_validation.cross_val_score(LogisticRegression(), np.hstack([np.atleast_2d(sobels).T,features]), labels, cv=5).mean()print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format(0.1* round(1000*scores.mean())))import numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image, dtype=np.uint8)thresh = mh.thresholding.otsu(image)print(thresh)otsubin = (image > thresh)mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8)*255)otsubin = ~ mh.close(~otsubin, np.ones((15,15)))mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8)*255)thresh = mh.thresholding.rc(image)print(thresh)mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8)*255)第十一章 相关性import osfrom matplotlib import pylabimport numpy as 
npimport scipyfrom scipy.stats import norm, pearsonr ##相关性DATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")def _plot_correlation_func(x, y): r, p = pearsonr(x, y) title = "Cor($X_1$, $X_2$) = %.3f" % r pylab.scatter(x, y) pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") f1 = scipy.poly1d(scipy.polyfit(x, y, 1)) pylab.plot(x, f1(x), "r--", linewidth=2) # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in # [0,1,2,3,4]])def plot_correlation_demo(): np.random.seed(0) # to reproduce the data later on pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(0, 10, 0.2) pylab.subplot(221) y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(222) y = 0.5 * x + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(223) y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(224) y = norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_correlation_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "corr_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(-5, 5, 0.2) pylab.subplot(221) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(222) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(223) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(224) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_correlation_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "corr_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_correlation_demo()import osfrom matplotlib import pylabimport numpy as npfrom scipy.stats import norm, entropy #互信息的计算DATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")def mutual_info(x, y, bins=10): counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins)) counts_x, bins = np.histogram(x, bins=bins) counts_y, bins = np.histogram(y, bins=bins) counts_xy += 1 counts_x += 1 counts_y += 1 P_xy = counts_xy / np.sum(counts_xy, dtype=float) P_x = counts_x / np.sum(counts_x, dtype=float) P_y = counts_y / np.sum(counts_y, dtype=float) I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y))) return I_xy / (entropy(counts_x) + entropy(counts_y))def plot_entropy(): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) title = "Entropy $H(X)$" pylab.title(title) pylab.xlabel("$P(X=$coin will show heads up$)$") pylab.ylabel("$H(X)$") pylab.xlim(xmin=0, xmax=1.1) x = np.arange(0.001, 1, 0.001) y = -x * np.log2(x) - (1 - x) * np.log2(1 - x) pylab.plot(x, y) # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in # [0,1,2,3,4]]) pylab.autoscale(tight=True) pylab.grid(True) filename = "entropy_demo.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def _plot_mi_func(x, y): mi = mutual_info(x, y) title = "NI($X_1$, $X_2$) = %.3f" % mi pylab.scatter(x, y) pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$")def plot_mi_demo(): np.random.seed(0) # to reproduce the data later on pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(0, 10, 0.2) pylab.subplot(221) y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_mi_func(x, y) pylab.subplot(222) y = 0.5 * x + 
norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(223) y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(224) y = norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_mi_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "mi_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(-5, 5, 0.2) pylab.subplot(221) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_mi_func(x, y) pylab.subplot(222) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(223) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(224) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_mi_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "mi_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_entropy() plot_mi_demo()from sklearn.feature_selection import RFE #特征选择方法from sklearn.linear_model import LogisticRegressionfrom sklearn.datasets import make_classificationX, y = make_classification( n_samples=100, n_features=10, n_informative=3, random_state=0)clf = LogisticRegression()clf.fit(X, y)for i in range(1, 11): selector = RFE(clf, i) selector = selector.fit(X, y) print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_)) import osfrom matplotlib import pylabimport numpy as npfrom sklearn import linear_model, decomposition #PCA方法from sklearn import lda #LDA方法logistic = linear_model.LogisticRegression()CHART_DIR = os.path.join("..", "charts")np.random.seed(3)x1 = np.arange(0, 10, .2)x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))def plot_simple_demo_1(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") x1 = np.arange(0, 10, .2) x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1)) good = (x1 > 5) | (x2 > 5) bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] pca = decomposition.PCA(n_components=1) #PCA Xtrans = pca.fit_transform(X) Xg = Xtrans[good] Xb = Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) print(pca.explained_variance_ratio_) pylab.grid(True) pylab.autoscale(tight=True) filename = "pca_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_simple_demo_2(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") x1 = np.arange(0, 10, .2) x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1)) good = x1 > x2 bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] pca = decomposition.PCA(n_components=1) Xtrans = pca.fit_transform(X) Xg = Xtrans[good] Xb = 
Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) print(pca.explained_variance_ratio_) pylab.grid(True) pylab.autoscale(tight=True) filename = "pca_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_simple_demo_lda(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") good = x1 > x2 bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] lda_inst = lda.LDA(n_components=1) #LDA 方法 Xtrans = lda_inst.fit_transform(X, good) Xg = Xtrans[good] Xb = Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) pylab.grid(True) pylab.autoscale(tight=True) filename = "lda_demo.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_simple_demo_1() plot_simple_demo_2() plot_simple_demo_lda()import osimport numpy as npfrom matplotlib import pylabfrom mpl_toolkits.mplot3d import Axes3Dfrom sklearn import linear_model, manifold, decomposition, datasets #MDA在manifold包中logistic = linear_model.LogisticRegression()CHART_DIR = os.path.join("..", "charts")np.random.seed(3)# all examples will have three classes in this filecolors = ['r', 'g', 'b']markers = ['o', 6, '*']def plot_demo_1(): X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T ##np.c_ Translates slice objects to concatenation along the second axis, 和vstock等有类似又有区分 y = np.array([0, 1, 2]) fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') #set_axis_bgcolor mds = manifold.MDS(n_components=3) #MDS 分析方法 Xtrans = mds.fit_transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("MDS on example data set in 3 dimensions") ax.view_init(10, -15) mds = manifold.MDS(n_components=2) Xtrans = mds.fit_transform(X) ax = fig.add_subplot(122) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') pylab.title("MDS on example data set in 2 dimensions") filename = "mds_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_iris_mds(): iris = datasets.load_iris() X = iris.data y = iris.target # MDS fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') mds = manifold.MDS(n_components=3) Xtrans = mds.fit_transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("MDS on Iris data set in 3 dimensions") ax.view_init(10, -15) mds = manifold.MDS(n_components=2) Xtrans = 
mds.fit_transform(X)
    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("MDS on Iris data set in 2 dimensions")
    filename = "mds_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

    # PCA
    fig = pylab.figure(figsize=(10, 4))
    ax = fig.add_subplot(121, projection='3d')
    ax.set_axis_bgcolor('white')
    pca = decomposition.PCA(n_components=3)
    Xtrans = pca.fit(X).transform(X)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)
    pca = decomposition.PCA(n_components=2)
    Xtrans = pca.fit_transform(X)
    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("PCA on Iris data set in 2 dimensions")
    filename = "pca_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

if __name__ == '__main__':
    plot_demo_1()
    plot_iris_mds()

Chapter 12

from jug import TaskGenerator   # jug: task-based parallel / "big data" computation
from time import sleep

@TaskGenerator
def double(x):
    sleep(4)
    return 2 * x

@TaskGenerator
def add(a, b):
    return a + b

@TaskGenerator
def print_final_result(oname, value):
    with open(oname, 'w') as output:
        print >>output, "Final result:", value

input = 2
y = double(input)
z = double(y)
y2 = double(7)
z2 = double(y2)
print_final_result('output.txt', add(z, z2))
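Running the jug script: the functions above only build a task graph; nothing executes until the jug runner is started. From memory of jug's command line (it is not shown in the chapter code, so treat the names below as an assumption), a typical session looks like this, and intermediate results are cached on disk so repeated runs skip finished tasks:

# Assuming the script above is saved as jugfile.py:
#
#   jug execute jugfile.py     # run tasks; start it in several shells for parallelism
#   jug status  jugfile.py     # show which tasks are finished / still waiting
#
# Results are memoised on disk (a jugfile.jugdata/ directory by default, if I
# remember correctly), so re-running "jug execute" does not redo completed tasks.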
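A number of the scripts above import KFold and ShuffleSplit from sklearn.cross_validation, which only exists in old scikit-learn releases; in current releases the module is sklearn.model_selection and the constructor no longer takes the sample count. A minimal sketch of the newer call, assuming a recent scikit-learn:

import numpy as np
from sklearn.model_selection import KFold   # replaces sklearn.cross_validation

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

kf = KFold(n_splits=5)                      # old API: KFold(len(y), n_folds=5)
for train, test in kf.split(X):             # old API iterated over the KFold object itself
    print(train, test)                      # integer index arrays, same as used above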
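The Chapter 7 regression notes point out the difference between ravel and flatten; a minimal check of the distinction (ravel returns a view when it can, flatten always copies):

import numpy as np

a = np.arange(6).reshape(2, 3)

r = a.ravel()     # a view when possible: shares memory with a
f = a.flatten()   # always a copy

r[0] = 99
f[1] = -1
print(a[0, 0], a[0, 1])   # 99 1 -> only the ravel() write reached a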
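The ml-100k loader builds the ratings matrix with sparse.csc_matrix((values, ij.T)); as the note says, csc compresses by column and csr by row. A small sketch of the same (data, (rows, cols)) constructor:

import numpy as np
from scipy import sparse

rows = np.array([0, 0, 1, 2])
cols = np.array([0, 2, 1, 2])
vals = np.array([4.0, 5.0, 3.0, 1.0])

m_csc = sparse.csc_matrix((vals, (rows, cols)))   # fast column slicing
m_csr = sparse.csr_matrix((vals, (rows, cols)))   # fast row slicing

print(m_csc[:, 2].toarray().ravel())   # [5. 0. 1.]
print(m_csr[0].toarray().ravel())      # [4. 0. 5.]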
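movie_norm above centres every row on the mean of its rated (non-zero) entries and relies on np.nan_to_num to handle rows with no ratings at all; the same step in isolation:

import numpy as np

xc = np.array([[5., 0., 3.],
               [0., 0., 0.],    # no ratings: mean of an empty slice is nan
               [4., 2., 0.]])

x1 = np.array([row[row > 0].mean() for row in xc])
x1 = np.nan_to_num(x1)          # nan -> 0.0 for the empty row

for i in range(xc.shape[0]):
    xc[i] -= (xc[i] > 0) * x1[i]    # subtract the mean only where a rating exists

print(x1)    # [4. 0. 3.]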
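The Chapter 8 note on chain(*dataset) is worth a tiny check: chain takes the baskets as separate arguments and yields their items lazily, which is what makes the support count a single pass:

from collections import defaultdict
from itertools import chain

dataset = [[1, 2, 3], [2, 3], [1, 3, 4]]

counts = defaultdict(int)
for elem in chain(*dataset):    # 1,2,3,2,3,1,3,4 -- no flattened list is built
    counts[elem] += 1

print(dict(counts))             # {1: 2, 2: 2, 3: 3, 4: 1}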
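The association-rule printout uses the str.format mini-language (including {:%} for percentages) together with inline if/else expressions inside the arguments; both in one place:

nr_transactions = 200.0
support = 15

print('{} ({:%})'.format(support, support / nr_transactions))   # 15 (7.500000%)

top = 1000
print('{0:4} - {1:3}'.format(1, top if top < 1000 else ''))      # blank when top >= 1000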
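Two small NumPy idioms the Chapter 8/9 code leans on: np.unique to enumerate the class labels, and np.zeros_like to allocate an estimate matrix with the same shape and dtype as the ratings:

import numpy as np

y = np.array([2, 0, 1, 1, 2, 0])
print(np.unique(y))             # [0 1 2] -- sorted distinct labels

reviews = np.array([[3., 0.], [0., 5.]])
estimates = np.zeros_like(reviews)
print(estimates.shape, estimates.dtype)   # (2, 2) float64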
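In the genre classifiers the averaged confusion matrix is normalised before matshow draws it. A toy sketch of building one with sklearn and normalising per true class (by row sums; note the chapter code divides by np.sum(cm_avg, axis=0), i.e. per predicted class):

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 2, 2, 2]
y_pred = [0, 1, 1, 1, 2, 0, 2]

cm = confusion_matrix(y_true, y_pred)    # rows: true class, columns: predicted class
cm_norm = cm / cm.sum(axis=1, keepdims=True).astype(float)
print(cm_norm)                           # each row now sums to 1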
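Chapter 11 contrasts Pearson correlation with mutual information: pearsonr only measures linear dependence, which is why the quadratic panels come out near zero even though the relationship is strong. A quick check:

import numpy as np
from scipy.stats import pearsonr

np.random.seed(0)
x = np.arange(-5, 5, 0.2)

r_lin, _ = pearsonr(x, 0.5 * x + np.random.normal(scale=0.1, size=len(x)))
r_quad, _ = pearsonr(x, 0.5 * x ** 2 + np.random.normal(scale=0.1, size=len(x)))

print(round(r_lin, 3))    # close to 1: linear dependence
print(round(r_quad, 3))   # close to 0: strong but non-linear dependence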
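plot_simple_demo_1 prints pca.explained_variance_ratio_ after projecting the two correlated features onto one component; in isolation, that ratio is simply the fraction of total variance the kept components retain:

import numpy as np
from sklearn.decomposition import PCA

np.random.seed(3)
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))
X = np.c_[(x1, x2)]

pca = PCA(n_components=1)
Xtrans = pca.fit_transform(X)

print(Xtrans.shape)                    # (50, 1): one feature left after projection
print(pca.explained_variance_ratio_)   # fraction of the variance kept, close to 1 here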