Machine Learning Systems Design: Study Notes
Chapter 1

# Timing the same computation three ways: a pure-Python generator expression,
# naive NumPy (sum over an element-wise product) and the vectorized dot product.
import timeit

normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))', number=10000)  # range() here; the book's Python 2 code used xrange
naive_np_sec = timeit.timeit('sum(na*na)',
                             setup="import numpy as np; na=np.arange(1000)",
                             number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
                            setup="import numpy as np; na=np.arange(1000)",
                            number=10000)
print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)


# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01
import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt

sp.random.seed(3)  # to reproduce the data later on

x = sp.arange(1, 31 * 24)
y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
y += gamma.rvs(15, loc=0, scale=100, size=len(x))  # add gamma-distributed noise
y += 2 * sp.exp(x / 100.0)
y = sp.ma.array(y, mask=[y < 0])  # masked array: entries below zero are marked invalid
print(sum(y), sum(y < 0))

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]],
           ['week %i' % (w + 1) for w in [0, 1, 2, 3, 4]])
plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join("..", "1400_01_01.png"))

# note the path handling: os.path.join, os.path.dirname, os.path.realpath(__file__)
data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
#            zip(x[~y.mask], y[~y.mask]), delimiter="\t", fmt="%i")
sp.savetxt(os.path.join(data_dir, "web_traffic.tsv"),
           list(zip(x, y)), delimiter="\t", fmt="%s")
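The script above flags sp.ma.array: a masked array keeps the original values but marks some entries as invalid, so reductions skip them. A minimal sketch with made-up values (plain NumPy, not part of the book's script):

import numpy as np

vals = np.array([5, -3, 8, -1, 4])
masked = np.ma.array(vals, mask=vals < 0)  # negative entries are marked invalid
print(masked)               # [5 -- 8 -- 4]
print(masked.sum())         # 17: masked entries are ignored in reductions
print(masked.compressed())  # [5 8 4], only the valid values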
# Load the generated data and fit polynomial models of increasing degree.
import os
import scipy as sp
import matplotlib.pyplot as plt

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")  # load the TSV with SciPy
print(data[:10])

# all examples will have three classes in this file
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]  # ~ negates the boolean mask
y = y[~sp.isnan(y)]


# plot input data
def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):  # plot each model in turn
            # print "Model:", model
            # print "Coeffs:", model.coeffs
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)  # save the figure


# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)

f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))  # poly1d wraps the fitted coefficients as a callable polynomial
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about inflection point
inflection = int(3.5 * 7 * 24)  # cast to int so it can be used as a slice index
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))
plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))


def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)


print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))

# extrapolating into the future
plot_models(x, y, [f1, f2, f3, f10, f100],
            os.path.join("..", "1400_01_06.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(x, y, [fb1, fb2, fb3, fb10, fb100],
            os.path.join("..", "1400_01_07.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])

fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
            os.path.join("..", "1400_01_08.png"),
            mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
            ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
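The last step leans on two facts worth spelling out: subtracting a constant from a poly1d yields another polynomial, and fsolve then finds where that shifted polynomial crosses zero. A tiny self-contained sketch with a made-up polynomial (not the fitted fbt2):

import numpy as np
from scipy.optimize import fsolve

f = np.poly1d([1, 0, -4])   # x**2 - 4
g = f - 5                   # poly1d arithmetic gives x**2 - 9
root = fsolve(g, x0=2.0)    # start the search near x = 2
print(root)                 # [3.], since g(3) == 0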
Chapter 2

# Scatter plots of every pair of iris features.
import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt

data = load_iris()
features = data['data']
feature_names = data['feature_names']
target = data['target']

pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
for i, (p0, p1) in enumerate(pairs):  # note enumerate with tuple unpacking: i, (p0, p1)
    plt.subplot(2, 3, i + 1)
    for t, marker, c in zip(range(3), ">ox", "rgb"):
        plt.scatter(features[target == t, p0],
                    features[target == t, p1],
                    marker=marker, c=c)
    plt.xlabel(feature_names[p0])
    plt.ylabel(feature_names[p1])
    plt.xticks([])
    plt.yticks([])
plt.savefig('../1400_02_01.png')


# Simple sanity tests for the dataset loader.
from load import load_dataset

def test_iris():
    features, labels = load_dataset('iris')
    assert len(features[0]) == 4  # assertions as cheap sanity checks
    assert len(features)
    assert len(features) == len(labels)

def test_seeds():
    features, labels = load_dataset('seeds')
    assert len(features[0]) == 7
    assert len(features)
    assert len(features) == len(labels)


# Export the milksets datasets as TSV files.
import milksets.iris
import milksets.seeds

def save_as_tsv(fname, module):
    features, labels = module.load()
    nlabels = [module.label_names[ell] for ell in labels]
    with open(fname, 'w') as ofile:
        for f, n in zip(features, nlabels):
            # Python 3 form; the book's Python 2 code was: print >>ofile, "\t".join(map(str, f) + [n])
            print("\t".join(list(map(str, f)) + [n]), file=ofile)

save_as_tsv('iris.tsv', milksets.iris)
save_as_tsv('seeds.tsv', milksets.seeds)


# Visualize a single threshold cut separating virginica from versicolor.
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
feature_names = data['feature_names']
species = data['target_names'][data['target']]

setosa = (species == 'setosa')
features = features[~setosa]
species = species[~setosa]
virginica = species == 'virginica'

t = 1.75
p0, p1 = 3, 2

if COLOUR_FIGURE:
    area1c = (1., .8, .8)
    area2c = (.8, .8, 1.)
else:
    area1c = (1., 1, 1)
    area2c = (.7, .7, .7)

x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]  # unpack a two-element list into two variables
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]

plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)  # fill the two decision regions
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
plt.plot([t, t], [y0, y1], 'k--', lw=2)
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
plt.scatter(features[virginica, p0], features[virginica, p1], c='b', marker='o')
plt.scatter(features[~virginica, p0], features[~virginica, p1], c='r', marker='x')
plt.ylim(y0, y1)
plt.xlim(x0, x1)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.savefig('../1400_02_02.png')


# Plot the 1-NN decision regions for the seeds dataset, before and after z-scoring.
COLOUR_FIGURE = False
# the following line is an IPython magic; it only works inside a notebook
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap  # colormap built from an explicit list of colours
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

feature_names = [
    'area', 'perimeter', 'compactness', 'length of kernel', 'width of kernel',
    'asymmetry coefficient', 'length of kernel groove',
]

def train_plot(features, labels):
    y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
    x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)  # meshgrid builds the grid of evaluation points
    model = learn_model(1, features[:, (0, 2)], np.array(labels))
    # ravel returns a view (flatten would copy); classify every grid point
    C = apply_model(np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])  # ListedColormap usage
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            plt.plot(features[labels == lab, 0],
                     features[labels == lab, 2], ma, c=(1., 1., 1.))

features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

train_plot(features, labels)
plt.savefig('../1400_02_04.png')

features -= features.mean(0)  # concise z-scoring: subtract the mean,
features /= features.std(0)   # then divide by the standard deviation
train_plot(features, labels)
plt.savefig('../1400_02_05.png')


# Train the threshold model on alternating iris samples and report train/test accuracy.
from matplotlib import pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from threshold import learn_model, apply_model, accuracy

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')

testing = np.tile([True, False], 50)  # np.tile repeats [True, False] 50 times: an alternating test mask
training = ~testing  # concise boolean negation selects the complement

model = learn_model(features[training], virginica[training])
train_error = accuracy(features[training], virginica[training], model)
test_error = accuracy(features[testing], virginica[testing], model)

# note the str.format() style with {0}, {1}, {2} placeholders
print('''\
Training error was {0:.1%}.
Testing error was {1:.1%} (N = {2}).
'''.format(train_error, test_error, testing.sum()))
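The np.tile note above is easy to misread; for the 1-D case used here it simply repeats the sequence, which is what turns [True, False] into an alternating train/test mask. A quick sketch with a toy length of 4 instead of 50:

import numpy as np

testing = np.tile([True, False], 4)   # every other sample becomes a test sample
training = ~testing                   # the complement, via boolean negation
print(testing)                        # [ True False  True False  True False  True False]
print(training)                       # [False  True False  True False  True False  True]
print(testing.sum(), training.sum())  # 4 4, an even 50/50 split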
# knn.py: the k-nearest-neighbour helper module imported above.
import numpy as np

def learn_model(k, features, labels):
    return k, features.copy(), labels.copy()

def plurality(xs):
    from collections import defaultdict  # note: import inside the function
    counts = defaultdict(int)  # missing keys default to 0
    for x in xs:
        counts[x] += 1
    maxv = max(counts.values())
    for k, v in counts.items():
        if v == maxv:
            return k

def apply_model(features, model):
    k, train_feats, labels = model
    results = []
    for f in features:
        label_dist = []
        for t, ell in zip(train_feats, labels):
            label_dist.append((np.linalg.norm(f - t), ell))
        label_dist.sort(key=lambda d_ell: d_ell[0])
        label_dist = label_dist[:k]
        results.append(plurality([ell for _, ell in label_dist]))
    return np.array(results)

def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)  # mean of the element-wise equality array is the accuracy


# load.py: the dataset loader imported above.
import numpy as np

def load_dataset(dataset_name):
    '''
    data, labels = load_dataset(dataset_name)

    Load a given dataset

    Returns
    -------
    data : numpy ndarray
    labels : list of str
    '''
    data = []
    labels = []
    with open('../data/{0}.tsv'.format(dataset_name)) as ifile:  # note the with open(...) as ... idiom
        for line in ifile:
            tokens = line.strip().split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    data = np.array(data)
    labels = np.array(labels)
    return data, labels


# Ten-fold cross-validation of 1-NN on the seeds dataset.
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

features, labels = load_dataset('seeds')

def cross_validate(features, labels):
    error = 0.0
    for fold in range(10):
        # np.ones plus the slice [fold::10] hold out every 10th sample for this fold
        training = np.ones(len(features), bool)
        training[fold::10] = 0
        testing = ~training  # the test set is simply the complement
        model = learn_model(1, features[training], labels[training])
        test_error = accuracy(features[testing], labels[testing], model)
        error += test_error
    return error / 10.0

error = cross_validate(features, labels)
print('Ten fold cross-validated error was {0:.1%}.'.format(error))

features -= features.mean(0)
features /= features.std(0)
error = cross_validate(features, labels)
print('Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(error))
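The cross-validation loop relies on extended slicing: training[fold::10] = 0 clears every 10th position starting at fold, so each fold holds out a different tenth of the samples. A minimal sketch with 20 samples instead of the full seeds set:

import numpy as np

n = 20
for fold in range(3):                     # show the first three folds only
    training = np.ones(n, dtype=bool)
    training[fold::10] = False            # every 10th index, offset by the fold number
    testing = ~training
    print(fold, np.flatnonzero(testing))  # fold 0 -> [0 10], fold 1 -> [1 11], fold 2 -> [2 12]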
# Ten-fold cross-validation of the threshold model on the seeds dataset.
from load import load_dataset
import numpy as np
from threshold import learn_model, apply_model, accuracy

features, labels = load_dataset('seeds')
labels = labels == 'Canadian'

error = 0.0
for fold in range(10):
    training = np.ones(len(features), bool)
    training[fold::10] = 0
    testing = ~training
    model = learn_model(features[training], labels[training])
    test_error = accuracy(features[testing], labels[testing], model)
    error += test_error
error /= 10.0
print('Ten fold cross-validated error was {0:.1%}.'.format(error))


# Setosa can be separated from the other species by petal length alone.
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

plength = features[:, 2]
is_setosa = (labels == 'setosa')
print('Maximum of setosa: {0}.'.format(plength[is_setosa].max()))
print('Minimum of others: {0}.'.format(plength[~is_setosa].min()))


# Exhaustive search for the best single-feature threshold separating virginica from versicolor.
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')  # the comparison yields a boolean array

best_acc = -1.0
for fi in range(features.shape[1]):
    thresh = features[:, fi].copy()
    thresh.sort()
    for t in thresh:
        pred = (features[:, fi] > t)
        acc = (pred == virginica).mean()
        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
print('Best cut is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format(
    best_t, best_fi, best_acc))


# threshold.py: the threshold model imported above.
import numpy as np

def learn_model(features, labels):
    best_acc = -1.0
    for fi in range(features.shape[1]):
        thresh = features[:, fi].copy()
        thresh.sort()
        for t in thresh:
            pred = (features[:, fi] > t)
            acc = (pred == labels).mean()
            if acc > best_acc:
                best_acc = acc
                best_fi = fi
                best_t = t
    return best_t, best_fi

def apply_model(features, model):
    t, fi = model
    return features[:, fi] > t

def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)


Chapter 3

# Vectorize the toy posts and find the one most similar to a new post.
import os
import sys
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

DIR = r"../data/toy"
posts = [open(os.path.join(DIR, f)).read() for f in os.listdir(DIR)]  # os.listdir to read every document in the directory
new_post = "imaging databases"

import nltk.stem  # stemming library
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):  # subclass the vectorizer to add stemming
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# vectorizer = CountVectorizer(min_df=1, stop_words='english',
#                              preprocessor=stemmer)
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# charset_error was renamed to decode_error in later scikit-learn releases
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', charset_error='ignore')
print(vectorizer)

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_post_vec = vectorizer.transform([new_post])
print(new_post_vec, type(new_post_vec))
print(new_post_vec.toarray())
print(vectorizer.get_feature_names())

def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())  # linalg.norm computes the Euclidean distance

def dist_norm(v1, v2):
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

dist = dist_norm
best_dist = sys.maxsize  # sys.maxsize as a largest-int sentinel in Python 3
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))


# The next script (continuing below) clusters 20newsgroups posts and looks for posts similar to a new one.
import sklearn.datasets
import scipy as sp

new_post = \
    """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? 
Thanks."""MLCOMP_DIR = r"P:\Dropbox\pymlbook\data"groups = [ 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.ma c.hardware', 'comp.windows.x', 'sci.space']dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR, categories=groups)print("Number of posts:", len(dataset.filenames))labels = dataset.targetnum_clusters = 50 # sp.unique(labels).shape[0]import nltk.stemenglish_stemmer = nltk.stem.SnowballStemmer('english')from sklearn.feature_extraction.text import TfidfVectorizerclass StemmedTfidfVectorizer(TfidfVectorizer): def build_analyzer(self): analyzer = super(TfidfVectorizer, self).build_analyzer() return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, # max_features=1000, stop_words='english', charset_error='ignore' )vectorized = vectorizer.fit_transform(dataset.data)num_samples, num_features = vectorized.shapeprint("#samples: %d, #features: %d" % (num_samples, num_features))from sklearn.cluster import KMeanskm = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, verbose=1)clustered = km.fit(vectorized)from sklearn import metrics #许多评估函数在此包中print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_))print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels, km.labels_))print(("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectorized, labels, sample_size=1000)))new_post_vec = vectorizer.transform([new_post])new_post_label = km.predict(new_post_vec)[0]similar_indices = (km.labels_ == new_post_label).nonzero()[0]similar = []for i in similar_indices: dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray()) similar.append((dist, dataset.data[i]))similar = sorted(similar)import pdbpdb.set_trace()show_at_1 = similar[0]show_at_2 = similar[len(similar) / 2]show_at_3 = similar[-1]print(show_at_1)print(show_at_2)print(show_at_3)import scipy as spdef tfidf(t, d, D): tf = float(d.count(t)) / sum(d.count(w) for w in set(d)) idf = sp.log(float(len(D)) / (len([doc for doc in D if t in doc]))) return tf * idfa, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]D = [a, abb, abc]print(tfidf("a", a, D))print(tfidf("b", abb, D))print(tfidf("a", abc, D))print(tfidf("b", abc, D))print(tfidf("c", abc, D))import osimport scipy as spfrom scipy.stats import normfrom matplotlib import pylabfrom sklearn.cluster import KMeansseed = 2sp.random.seed(seed) # to reproduce the data later onnum_clusters = 3def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None): pylab.figure(num=None, figsize=(8, 6)) if km: pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y)))) # 此处使用zip else: pylab.scatter(x, y, s=50) pylab.title(title) pylab.xlabel("Occurrence word 1") pylab.ylabel("Occurrence word 2") # pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)]) pylab.autoscale(tight=True) pylab.ylim(ymin=0, ymax=1) pylab.xlim(xmin=0, xmax=1) pylab.grid(True, linestyle='-', color='0.75') return pylabxw1 = norm(loc=0.3, scale=.15).rvs(20) #rvs随机数的个数yw1 = norm(loc=0.3, scale=.15).rvs(20)xw2 = norm(loc=0.7, scale=.15).rvs(20)yw2 = norm(loc=0.7, scale=.15).rvs(20)xw3 = norm(loc=0.2, scale=.15).rvs(20)yw3 = norm(loc=0.8, scale=.15).rvs(20)x = sp.append(sp.append(xw1, xw2), 
xw3)y = sp.append(sp.append(yw1, yw2), yw3)i = 1plot_clustering(x, y, "Vectors")pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001))km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=1, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 1", km=km)pylab.imshow(Z, interpolation='nearest', ##imshow图片显示? extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')c1a, c1b, c1c = km.cluster_centers_pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1#################### 2 iterations ####################km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=2, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 2", km=km)pylab.imshow(Z, interpolation='nearest', extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')c2a, c2b, c2c = km.cluster_centers_pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')# import pdb;pdb.set_trace()pylab.gca().add_patch( pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1))pylab.gca().add_patch( pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1))pylab.gca().add_patch( pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1))pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1#################### 3 iterations ####################km = KMeans(init='random', n_clusters=num_clusters, verbose=1, n_init=1, max_iter=10, random_state=seed)km.fit(sp.array(list(zip(x, y))))Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)plot_clustering(x, y, "Clustering iteration 10", km=km)pylab.imshow(Z, interpolation='nearest', extent=(mx.min(), mx.max(), my.min(), my.max()), cmap=pylab.cm.Blues, aspect='auto', origin='lower')pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], marker='x', linewidth=2, s=100, color='black')pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))pylab.clf()i += 1第四章from __future__ import print_functionfrom gensim import corpora, models, similarities #LDA主题分析包from mpltools import styleimport matplotlib.pyplot as pltimport numpy as npfrom os import pathstyle.use('ggplot') #画图风格if not path.exists('./data/ap/ap.dat'): #检测路径存在方式 os.exists print('Error: Expected data to be present at data/ap/')corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')model = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=None)for ti in xrange(84): words = model.show_topic(ti, 64) tf = sum(f for f,w in words) print('\n'.join('{}:{}'.format(w, int(1000.*f/tf)) for f,w in words)) print() print() print()thetas = [model[c] for c in corpus]plt.hist([len(t) for t in thetas], np.arange(42))plt.ylabel('Nr of documents')plt.xlabel('Nr of topics')plt.savefig('../1400OS_04_01+.png')model1 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.)thetas1 = [model1[c] for c in corpus]#model8 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.e-8)#thetas8 = 
[model8[c] for c in corpus]plt.clf()plt.hist([[len(t) for t in thetas], [len(t) for t in thetas1]], np.arange(42))plt.ylabel('Nr of documents')plt.xlabel('Nr of topics')plt.text(9,223, r'default alpha')plt.text(26,156, 'alpha=1.0')plt.savefig('../1400OS_04_02+.png')import nltk.corpusimport milkimport numpy as npimport stringfrom gensim import corpora, models, similaritiesimport sklearn.datasetsimport nltk.stemfrom collections import defaultdictenglish_stemmer = nltk.stem.SnowballStemmer('english')stopwords = set(nltk.corpus.stopwords.words('english'))stopwords.update(['from:', 'subject:', 'writes:', 'writes'])class DirectText(corpora.textcorpus.TextCorpus): def get_texts(self): return self.input def __len__(self): return len(self.input)dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root='../data')otexts = dataset.datatexts = dataset.datatexts = [t.decode('utf-8', 'ignore') for t in texts]texts = [t.split() for t in texts]texts = [map(lambda w: w.lower(), t) for t in texts] #注意使用map lambda 在列表推导式中 , 结果处理 texts = [filter(lambda s : not len(set("+-.?!()>@012345689") & set(s)), t) for t in texts]texts = [filter(lambda s : (len(s) > 3) and (s not in stopwords), t) for t in texts]texts = [map(english_stemmer.stem,t) for t in texts]usage = defaultdict(int)for t in texts: for w in set(t): usage[w] += 1limit = len(texts)/10too_common = [w for w in usage if usage[w] > limit]too_common = set(too_common)texts = [filter(lambda s : s not in too_common, t) for t in texts]corpus = DirectText(texts)dictionary = corpus.dictionarytry: dictionary['computer']except: passmodel = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=dictionary.id2token)thetas = np.zeros((len(texts),100))for i,c in enumerate(corpus): for ti,v in model[c]: thetas[i,ti] += vdistances = milk.unsupervised.pdist(thetas) #milk库large = distances.max() + 1for i in xrange(len(distances)): distances[i,i] = largeprint otexts[1]printprintprintprint otexts[distances[1].argmin()]from __future__ import print_functionimport numpy as npimport logging, gensimlogging.basicConfig( format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)id2word = gensim.corpora.Dictionary.load_from_text('data/wiki_en_output_wordids.txt')mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')model = gensim.models.ldamodel.LdaModel( corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)model.save('wiki_lda.pkl')topics = [model[doc] for doc in mm]lens = np.array([len(t) for t in topics])print(np.mean(lens <= 10))print(np.mean(lens))counts = np.zeros(100)for doc_top in topics: for ti,_ in doc_toc: counts[ti] += 1 for doc_top in topics: for ti,_ in doc_top: counts[ti] += 1 words = model.show_topic(counts.argmax(), 64)print(words)print()print()print()words = model.show_topic(counts.argmin(), 64)print(words)print()print()print() 第五章import ostry: import ujson as json # UltraJSON if available 注意try, except 用在importexcept: import jsonimport sysfrom collections import defaultdicttry: import enchantexcept: print( "Enchant is not installed. You can get it from http://packages.python.org/pyenchant/. 
Exitting...") sys.exit(1)from data import chosen, chosen_meta, filtered, filtered_metafiltered_meta = json.load(open(filtered_meta, "r"))speller = enchant.Dict("en_US")def misspelled_fraction(p): tokens = p.split() if not tokens: return 0.0 return 1 - float(sum(speller.check(t) for t in tokens)) / len(tokens) ## 在此处使用for in,在函数sum内部,类似列表表达式,def data(filename, col=None): for line in open(filename, "r"): data = line.strip().split("\t") # check format Id, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = data if col: yield data[col] #生成器, 注意生成器一定和for in 等循环之类的在一起使用? else: yield dataposts_to_keep = set()found_questions = 0num_qestion_sample = 1000# keep the best and worst, but only if we have one with positive and one with negative score# filter_method = "negative_positive"# if true, only keep the lowest scoring answer per class in addition to the accepted one# filter_method = "only_one_per_class "# if not None, specifies the number of unaccepted per question# filter_method = "sample_per_question"filter_method = "negative_positive" # warning: this does not retrieve many!# filter_method = "only_one_per_class"MaxAnswersPerQuestions = 10 # filter_method == "sample_per_question"# filter_method = "all"# equal share of questions that are unanswered and those that are answered# filter_method = "half-half"unaccepted_scores = {}has_q_accepted_a = {}num_q_with_accepted_a = 0num_q_without_accepted_a = 0for ParentId, posts in filtered_meta.items(): assert ParentId != -1 if len(posts) < 2: continue ParentId = int(ParentId) AllIds = set([ParentId]) AcceptedId = None UnacceptedId = None UnacceptedIds = [] UnacceptedScore = sys.maxsize NegativeScoreIds = [] PositiveScoreIds = [] if filter_method == "half-half": has_accepted_a = False for post in posts: Id, IsAccepted, TimeToAnswer, Score = post if IsAccepted: has_accepted_a = True break has_q_accepted_a[ParentId] = has_accepted_a if has_accepted_a: if num_q_with_accepted_a < num_qestion_sample / 2: num_q_with_accepted_a += 1 posts_to_keep.add(ParentId) else: if num_q_without_accepted_a < num_qestion_sample / 2: num_q_without_accepted_a += 1 posts_to_keep.add(ParentId) if num_q_without_accepted_a + num_q_with_accepted_a > num_qestion_sample: assert -1 not in posts_to_keep break else: for post in posts: Id, IsAccepted, TimeToAnswer, Score = post if filter_method == "all": AllIds.add(int(Id)) elif filter_method == "only_one_per_class": if IsAccepted: AcceptedId = Id elif Score < UnacceptedScore: UnacceptedScore = Score UnacceptedId = Id elif filter_method == "sample_per_question": if IsAccepted: AcceptedId = Id else: UnacceptedIds.append(Id) elif filter_method == "negative_positive": if Score < 0: NegativeScoreIds.append((Score, Id)) elif Score > 0: PositiveScoreIds.append((Score, Id)) else: raise ValueError(filter_method) added = False if filter_method == "all": posts_to_keep.update(AllIds) added = True elif filter_method == "only_one_per_class": if AcceptedId is not None and UnacceptedId is not None: posts_to_keep.add(ParentId) posts_to_keep.add(AcceptedId) posts_to_keep.add(UnacceptedId) added = True elif filter_method == "sample_per_question": if AcceptedId is not None and UnacceptedIds is not None: ##注意python中None 不是零,不是NULL,不是空字符串 posts_to_keep.add(ParentId) posts_to_keep.add(AcceptedId) posts_to_keep.update(UnacceptedIds[:MaxAnswersPerQuestions]) added = True elif filter_method == "negative_positive": if PositiveScoreIds and NegativeScoreIds: posts_to_keep.add(ParentId) posScore, posId = 
sorted(PositiveScoreIds)[-1] posts_to_keep.add(posId) negScore, negId = sorted(NegativeScoreIds)[0] posts_to_keep.add(negId) print("%i: %i/%i %i/%i" % (ParentId, posId, posScore, negId, negScore)) added = True if added: found_questions += 1 if num_qestion_sample and found_questions >= num_qestion_sample: breaktotal = 0kept = 0already_written = set()chosen_meta_dict = defaultdict(dict)with open(chosen, "w") as f: for line in data(filtered): strId, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = line Text = Text.strip() total += 1 Id = int(strId) if Id in posts_to_keep: if Id in already_written: print(Id, "is already written") continue if kept % 100 == 0: print(kept) # setting meta info post = chosen_meta_dict[Id] post['ParentId'] = int(ParentId) post['IsAccepted'] = int(IsAccepted) post['TimeToAnswer'] = int(TimeToAnswer) post['Score'] = int(Score) post['NumTextTokens'] = int(NumTextTokens) post['NumCodeLines'] = int(NumCodeLines) post['LinkCount'] = int(LinkCount) post['MisSpelledFraction'] = misspelled_fraction(Text) post['NumImages'] = int(NumImages) post['idx'] = kept # index into the file if int(ParentId) == -1: q = chosen_meta_dict[Id] if not 'Answers' in q: q['Answers'] = [] if filter_method == "half-half": q['HasAcceptedAnswer'] = has_q_accepted_a[Id] else: q = chosen_meta_dict[int(ParentId)] if int(IsAccepted) == 1: assert 'HasAcceptedAnswer' not in q q['HasAcceptedAnswer'] = True if 'Answers' not in q: q['Answers'] = [Id] else: q['Answers'].append(Id) f.writelines("%s\t%s\n" % (Id, Text)) kept += 1with open(chosen_meta, "w") as fm: json.dump(chosen_meta_dict, fm)print("total=", total)print("kept=", kept)import timestart_time = time.time()import numpy as npfrom sklearn.metrics import classification_reportfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import KFoldfrom sklearn import neighborsfrom data import chosen, chosen_metafrom utils import plot_roc, plot_prfrom utils import plot_feat_importancefrom utils import load_metafrom utils import fetch_postsfrom utils import plot_feat_histfrom utils import plot_bias_variancefrom utils import plot_k_complexity# question Id -> {'features'->feature vector, 'answers'->[answer Ids]}, 'scores'->[scores]}# scores will be added on-the-fly as the are not in metameta, id_to_idx, idx_to_id = load_meta(chosen_meta)import nltk# splitting questions into train (70%) and test(30%) and then take their# answersall_posts = list(meta.keys())all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500]feature_names = np.array(( 'NumTextTokens', 'NumCodeLines', 'LinkCount', 'AvgSentLen', 'AvgWordLen', 'NumAllCaps', 'NumExclams', 'NumImages'))# activate the following for reduced feature space"""feature_names = np.array(( 'NumTextTokens', 'LinkCount',))"""def prepare_sent_features(): for pid, text in fetch_posts(chosen, with_index=True): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) meta[pid]['AvgWordLen'] = np.mean( [len(w) for w in nltk.word_tokenize(text)]) meta[pid]['NumAllCaps'] = np.sum( [word.isupper() for word in nltk.word_tokenize(text)]) meta[pid]['NumExclams'] = text.count('!')prepare_sent_features()def get_features(aid): return tuple(meta[aid][fn] for fn in feature_names)qa_X = np.asarray([get_features(aid) for aid in 
all_answers])# Score > 0 tests => positive class is good answer# Score <= 0 tests => positive class is poor answerqa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])classifying_answer = "good"for idx, feat in enumerate(feature_names): plot_feat_hist([(qa_X[:, idx], feat)])"""plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')"""avg_scores_summary = []def measure(clf_class, parameters, name, data_size=None, plot=False): start_time_clf = time.time() if data_size is None: X = qa_X Y = qa_Y else: X = qa_X[:data_size] Y = qa_Y[:data_size] cv = KFold(n=len(X), n_folds=10, indices=True) train_errors = [] test_errors = [] scores = [] roc_scores = [] fprs, tprs = [], [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_class(**parameters) ##可变参数 clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) label_idx = 1 fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, label_idx]) roc_scores.append(auc(fpr, tpr)) fprs.append(fpr) tprs.append(tpr) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) print(classification_report(y_test, proba[:, label_idx] > 0.63, target_names=['not accepted', 'accepted'])) # get medium clone scores_to_sort = pr_scores # roc_scores medium = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] if plot: #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium]) plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium], classifying_answer + " answers") if hasattr(clf, 'coef_'): plot_feat_importance(feature_names, clf, name) summary = (name, np.mean(scores), np.std(scores), np.mean(roc_scores), np.std(roc_scores), np.mean(pr_scores), np.std(pr_scores), time.time() - start_time_clf) print(summary) avg_scores_summary.append(summary) precisions = precisions[medium] recalls = recalls[medium] thresholds = np.hstack(([0], thresholds[medium])) idx80 = precisions >= 0.8 print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[ idx80][0], thresholds[idx80][0])) return np.mean(train_errors), np.mean(test_errors)def bias_variance_analysis(clf_class, parameters, name): data_sizes = np.arange(60, 2000, 4) train_errors = [] test_errors = [] for data_size in data_sizes: train_error, test_error = measure( clf_class, parameters, name, data_size=data_size) train_errors.append(train_error) test_errors.append(test_error) plot_bias_variance(data_sizes, train_errors, test_errors, name, "Bias-Variance for '%s'" % name)def k_complexity_analysis(clf_class, parameters): ks = np.hstack((np.arange(1, 20), np.arange(21, 100, 5))) train_errors = [] test_errors = [] for k in ks: parameters['n_neighbors'] = k train_error, test_error = measure( clf_class, parameters, "%dNN" % k, data_size=2000) train_errors.append(train_error) test_errors.append(test_error) plot_k_complexity(ks, train_errors, test_errors)for k in [5]: #[5, 10, 40, 90]: bias_variance_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k, 'warn_on_equidistant':False}, "%iNN"%k) 
k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k, 'warn_on_equidistant':False}) #measure(neighbors.KNeighborsClassifier, {'n_neighbors': k, 'p': 2, #'warn_on_equidistant': False}, "%iNN" % k)from sklearn.linear_model import LogisticRegressionfor C in [0.1]: #[0.01, 0.1, 1.0, 10.0]: name = "LogReg C=%.2f" % C bias_variance_analysis(LogisticRegression, {'penalty':'l2', 'C':C}, name) measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True)print("=" * 50)from operator import itemgetterfor s in reversed(sorted(avg_scores_summary, key=itemgetter(1))): print("%-20s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" % s)print("time spent:", time.time() - start_time)import os# DATA_DIR = r"C:\pymlbook-data\ch05"DATA_DIR = r"/media/sf_C/pymlbook-data/ch05"CHART_DIR = os.path.join("..", "charts")filtered = os.path.join(DATA_DIR, "filtered.tsv")filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")chosen = os.path.join(DATA_DIR, "chosen.tsv")chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json")import numpy as npfrom scipy.stats import normfrom matplotlib import pyplotnp.random.seed(3)num_per_class = 40X = np.hstack((norm.rvs(2, size=num_per_class, scale=2), norm.rvs(8, size=num_per_class, scale=3)))y = np.hstack((np.zeros(num_per_class), np.ones(num_per_class)))def lr_model(clf, X): return 1.0 / (1.0 + np.exp(-(clf.intercept_ + clf.coef_ * X)))from sklearn.linear_model import LogisticRegressionlogclf = LogisticRegression()print(logclf)logclf.fit(X.reshape(num_per_class * 2, 1), y) ## reshapeprint(np.exp(logclf.intercept_), np.exp(logclf.coef_.ravel())) ##注意线性回归的截距和参数print("P(x=-1)=%.2f\tP(x=7)=%.2f" % (lr_model(logclf, -1), lr_model(logclf, 7)))X_test = np.arange(-5, 20, 0.1)pyplot.figure(figsize=(10, 4))pyplot.xlim((-5, 20))pyplot.scatter(X, y, c=y)pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_example_data.png", bbox_inches="tight")def lin_model(clf, X): return clf.intercept_ + clf.coef_ * Xfrom sklearn.linear_model import LinearRegressionclf = LinearRegression()print(clf)clf.fit(X.reshape(num_per_class * 2, 1), y)X_odds = np.arange(0, 1, 0.001)pyplot.figure(figsize=(10, 4))pyplot.subplot(1, 2, 1)pyplot.scatter(X, y, c=y)pyplot.plot(X_test, lin_model(clf, X_test))pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.title("linear fit on original data")pyplot.grid(True, linestyle='-', color='0.75')X_ext = np.hstack((X, norm.rvs(20, size=100, scale=5)))y_ext = np.hstack((y, np.ones(100)))clf = LinearRegression()clf.fit(X_ext.reshape(num_per_class * 2 + 100, 1), y_ext)pyplot.subplot(1, 2, 2)pyplot.scatter(X_ext, y_ext, c=y_ext)pyplot.plot(X_ext, lin_model(clf, X_ext))pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.title("linear fit on additional data")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_log_linear_fit.png", bbox_inches="tight")pyplot.figure(figsize=(10, 4))pyplot.xlim((-5, 20))pyplot.scatter(X, y, c=y)pyplot.plot(X_test, lr_model(logclf, X_test).ravel())pyplot.plot(X_test, np.ones(X_test.shape[0]) * 0.5, "--")pyplot.xlabel("feature value")pyplot.ylabel("class")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_example_fitted.png", bbox_inches="tight")X = np.arange(0, 1, 0.001)pyplot.figure(figsize=(10, 4))pyplot.subplot(1, 2, 1)pyplot.xlim((0, 1))pyplot.ylim((0, 10))pyplot.plot(X, X / (1 - X))pyplot.xlabel("P")pyplot.ylabel("odds = P / (1-P)")pyplot.grid(True, linestyle='-', color='0.75')pyplot.subplot(1, 2, 
2)pyplot.xlim((0, 1))pyplot.plot(X, np.log(X / (1 - X)))pyplot.xlabel("P")pyplot.ylabel("log(odds) = log(P / (1-P))")pyplot.grid(True, linestyle='-', color='0.75')pyplot.savefig("log_reg_log_odds.png", bbox_inches="tight")import refrom operator import itemgetter #多条件排序from collections import Mappingimport scipy.sparse as spfrom sklearn.base import BaseEstimator ##注意评估器选择函数from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicodeimport nltkfrom collections import Countertry: import ujson as json # UltraJSON if availableexcept: import jsonposcache_filename = "poscache.json"class PosCounter(Counter): def __init__(self, iterable=(), normalize=True, poscache=None, **kwargs): self.n_sents = 0 self.normalize = normalize self.poscache = poscache super(PosCounter, self).__init__(iterable, **kwargs) def update(self, other): """Adds counts for elements in other""" if isinstance(other, self.__class__): self.n_sents += other.n_sents for x, n in other.items(): self[x] += n else: for sent in other: self.n_sents += 1 # import pdb;pdb.set_trace() if self.poscache is not None: if sent in self.poscache: tags = self.poscache[sent] else: self.poscache[sent] = tags = nltk.pos_tag( nltk.word_tokenize(sent)) else: tags = nltk.pos_tag(nltk.word_tokenize(sent)) for x in tags: tok, tag = x self[tag] += 1 if self.normalize: for x, n in self.items(): self[x] /= float(self.n_sents)class PosTagFreqVectorizer(BaseEstimator): """ Convert a collection of raw documents to a matrix Pos tag frequencies """ def __init__(self, input='content', charset='utf-8', charset_error='strict', strip_accents=None, vocabulary=None, normalize=True, dtype=float): self.input = input self.charset = charset self.charset_error = charset_error self.strip_accents = strip_accents if vocabulary is not None: self.fixed_vocabulary = True if not isinstance(vocabulary, Mapping): vocabulary = dict((t, i) for i, t in enumerate(vocabulary)) self.vocabulary_ = vocabulary else: self.fixed_vocabulary = False try: self.poscache = json.load(open(poscache_filename, "r")) except IOError: self.poscache = {} self.normalize = normalize self.dtype = dtype def write_poscache(self): json.dump(self.poscache, open(poscache_filename, "w")) def decode(self, doc): """Decode the input into a string of unicode symbols The decoding strategy depends on the vectorizer parameters. """ if self.input == 'filename': doc = open(doc, 'rb').read() elif self.input == 'file': doc = doc.read() if isinstance(doc, bytes): doc = doc.decode(self.charset, self.charset_error) return doc def build_preprocessor(self): """Return a function to preprocess the text before tokenization""" # unfortunately python functools package does not have an efficient # `compose` function that would have allowed us to chain a dynamic # number of functions. However the however of a lambda call is a few # hundreds of nanoseconds which is negligible when compared to the # cost of tokenizing a string of 1000 chars for instance. 
noop = lambda x: x # accent stripping if not self.strip_accents: strip_accents = noop elif hasattr(self.strip_accents, '__call__'): strip_accents = self.strip_accents elif self.strip_accents == 'ascii': strip_accents = strip_accents_ascii elif self.strip_accents == 'unicode': strip_accents = strip_accents_unicode else: raise ValueError('Invalid value for "strip_accents": %s' % self.strip_accents) only_prose = lambda s: re.sub('<[^>]*>', '', s).replace("\n", " ") return lambda x: strip_accents(only_prose(x)) def build_tokenizer(self): """Return a function that split a string in sequence of tokens""" return nltk.sent_tokenize def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" preprocess = self.build_preprocessor() tokenize = self.build_tokenizer() return lambda doc: tokenize(preprocess(self.decode(doc))) def _term_count_dicts_to_matrix(self, term_count_dicts): i_indices = [] j_indices = [] values = [] vocabulary = self.vocabulary_ for i, term_count_dict in enumerate(term_count_dicts): for term, count in term_count_dict.items(): j = vocabulary.get(term) if j is not None: i_indices.append(i) j_indices.append(j) values.append(count) # free memory as we go term_count_dict.clear() shape = (len(term_count_dicts), max(vocabulary.values()) + 1) spmatrix = sp.csr_matrix((values, (i_indices, j_indices)), shape=shape, dtype=self.dtype) return spmatrix def fit(self, raw_documents, y=None): """Learn a vocabulary dictionary of all tokens in the raw documents Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- self """ self.fit_transform(raw_documents) return self def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return the count vectors This is more efficient than calling fit followed by transform. Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- vectors: array, [n_samples, n_features] """ if self.fixed_vocabulary: # No need to fit anything, directly perform the transformation. # We intentionally don't call the transform method to make it # fit_transform overridable without unwanted side effects in # TfidfVectorizer analyze = self.build_analyzer() term_counts_per_doc = [PosCounter(analyze(doc), normalize=self.normalize, poscache=self.poscache) for doc in raw_documents] return self._term_count_dicts_to_matrix(term_counts_per_doc) self.vocabulary_ = {} # result of document conversion to term count dicts term_counts_per_doc = [] term_counts = Counter() analyze = self.build_analyzer() for doc in raw_documents: term_count_current = PosCounter( analyze(doc), normalize=self.normalize, poscache=self.poscache) term_counts.update(term_count_current) term_counts_per_doc.append(term_count_current) self.write_poscache() terms = set(term_counts) # store map from term name to feature integer index: we sort the term # to have reproducible outcome for the vocabulary structure: otherwise # the mapping from feature name to indices might depend on the memory # layout of the machine. Furthermore sorted terms might make it # possible to perform binary search in the feature names array. self.vocabulary_ = dict(((t, i) for i, t in enumerate(sorted(terms)))) return self._term_count_dicts_to_matrix(term_counts_per_doc) def transform(self, raw_documents): """Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided in the constructor. 
Parameters ---------- raw_documents: iterable an iterable which yields either str, unicode or file objects Returns ------- vectors: sparse matrix, [n_samples, n_features] """ if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0: raise ValueError("Vocabulary wasn't fitted or is empty!") # raw_documents can be an iterable so we don't know its size in # advance # XXX @larsmans tried to parallelize the following loop with joblib. # The result was some 20% slower than the serial version. analyze = self.build_analyzer() term_counts_per_doc = [Counter(analyze(doc)) for doc in raw_documents] return self._term_count_dicts_to_matrix(term_counts_per_doc) def get_feature_names(self): """Array mapping from feature integer indices to feature name""" if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0: raise ValueError("Vocabulary wasn't fitted or is empty!") return [t for t, i in sorted(iter(self.vocabulary_.items()), key=itemgetter(1))]## This script filters the posts and keeps those posts that are or belong# to a question that has been asked in 2011 or 2012.#import osimport retry: import ujson as json # UltraJSON if availableexcept: import jsonfrom dateutil import parser as dateparserfrom operator import itemgetterfrom xml.etree import cElementTree as etreefrom collections import defaultdictfrom data import DATA_DIRfilename = os.path.join(DATA_DIR, "posts-2011-12.xml")filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")q_creation = {} # creation datetimes of questionsq_accepted = {} # id of accepted answermeta = defaultdict( list) # question -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]# regegx to find code snippetscode_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)link_match = re.compile( '<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)def filter_html(s): num_code_lines = 0 link_count_in_code = 0 code_free_s = s num_images = len(img_match.findall(s)) # remove source code and count how many lines 正则表达式的用法 for match_str in code_match.findall(s): num_code_lines += match_str.count('\n') code_free_s = code_match.sub("", code_free_s) # sometimes source code contain links, which we don't want to count link_count_in_code += len(link_match.findall(match_str)) anchors = link_match.findall(s) link_count = len(anchors) link_count -= link_count_in_code html_free_s = re.sub( " +", " ", tag_match.sub('', code_free_s)).replace("\n", "") link_free_s = html_free_s for anchor in anchors: if anchor.lower().startswith("http://"): link_free_s = link_free_s.replace(anchor, '') num_text_tokens = html_free_s.count(" ") return link_free_s, num_text_tokens, num_code_lines, link_count, num_imagesyears = defaultdict(int)num_questions = 0num_answers = 0def parsexml(filename): global num_questions, num_answers counter = 0 it = map(itemgetter(1), iter(etree.iterparse(filename, events=('start',)))) root = next(it) # get posts element for elem in it: if counter % 100000 == 0: print(counter) counter += 1 if elem.tag == 'row': creation_date = dateparser.parse(elem.get('CreationDate')) # import pdb;pdb.set_trace() # if creation_date.year < 2011: # continue Id = int(elem.get('Id')) PostTypeId = int(elem.get('PostTypeId')) Score = int(elem.get('Score')) if PostTypeId == 1: num_questions += 1 years[creation_date.year] += 1 ParentId = -1 TimeToAnswer = 0 q_creation[Id] = creation_date accepted = elem.get('AcceptedAnswerId') if accepted: 
q_accepted[Id] = int(accepted) IsAccepted = 0 elif PostTypeId == 2: num_answers += 1 ParentId = int(elem.get('ParentId')) if not ParentId in q_creation: # question was too far in the past continue TimeToAnswer = (creation_date - q_creation[ParentId]).seconds if ParentId in q_accepted: IsAccepted = int(q_accepted[ParentId] == Id) else: IsAccepted = 0 meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score)) else: continue Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html( elem.get('Body')) values = (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages) yield values root.clear() # preserve memorywith open(os.path.join(DATA_DIR, filename_filtered), "w") as f: for item in parsexml(filename): line = "\t".join(map(str, item)) f.write(line.encode("utf-8") + "\n")with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f: json.dump(meta, f)print("years:", years)print("#qestions: %i" % num_questions)print("#answers: %i" % num_answers)import ostry: import ujson as json # UltraJSON if availableexcept: import jsonfrom matplotlib import pylabimport numpy as npfrom data import CHART_DIRdef fetch_data(filename, col=None, line_count=-1, only_questions=False): count = 0 for line in open(filename, "r"): count += 1 if line_count > 0 and count > line_count: break data = Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFraction = line.split( "\t") IsQuestion = int(IsQuestion) if only_questions and not IsQuestion: continue if col: if col < 6: val = int(data[col]) else: val = data[col] yield val else: Id = int(Id) assert Id >= 0, line ParentId = int(ParentId) IsAccepted = int(IsAccepted) assert not IsQuestion == IsAccepted == 1, "%i %i --- %s" % ( IsQuestion, IsAccepted, line) assert (ParentId == -1 and IsQuestion) or ( ParentId >= 0 and not IsQuestion), "%i %i --- %s" % (ParentId, IsQuestion, line) TimeToAnswer = int(TimeToAnswer) Score = int(Score) NumTextTokens = int(NumTextTokens) NumCodeLines = int(NumCodeLines) LinkCount = int(LinkCount) MisSpelledFraction = float(MisSpelledFraction) yield Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFractiondef fetch_posts(filename, with_index=True, line_count=-1): count = 0 for line in open(filename, "r"): count += 1 if line_count > 0 and count > line_count: break Id, Text = line.split("\t") Text = Text.strip() if with_index: yield int(Id), Text else: yield Textdef load_meta(filename): meta = json.load(open(filename, "r")) keys = list(meta.keys()) # JSON only allows string keys, changing that to int for key in keys: meta[int(key)] = meta[key] del meta[key] # post Id to index in vectorized id_to_idx = {} # and back idx_to_id = {} for PostId, Info in meta.items(): id_to_idx[PostId] = idx = Info['idx'] idx_to_id[idx] = PostId return meta, id_to_idx, idx_to_iddef plot_roc(auc_score, name, fpr, tpr): pylab.figure(num=None, figsize=(6, 5)) pylab.plot([0, 1], [0, 1], 'k--') pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('False Positive Rate') pylab.ylabel('True Positive Rate') pylab.title('Receiver operating characteristic (AUC=%0.2f)\n%s' % ( auc_score, name)) pylab.legend(loc="lower right") pylab.grid(True, linestyle='-', color='0.75') pylab.fill_between(tpr, fpr, alpha=0.5) pylab.plot(fpr, tpr, lw=1) pylab.savefig(os.path.join(CHART_DIR, "roc_" + name.replace(" ", "_")+ ".png"))def plot_pr(auc_score, name, precision, recall, label=None): 
pylab.figure(num=None, figsize=(6, 5)) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label)) pylab.fill_between(recall, precision, alpha=0.5) pylab.grid(True, linestyle='-', color='0.75') pylab.plot(recall, precision, lw=1) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"))def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2))def plot_feat_importance(feature_names, clf, name): pylab.figure(num=None, figsize=(6, 5)) coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): if len(data_name_list)>1: assert filename is not None pylab.figure(num=None, figsize=(8, 6)) num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Fraction') # the histogram of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='blue', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_bias_variance(data_sizes, train_errors, test_errors, name, title): pylab.figure(num=None, figsize=(6, 5)) pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, test_errors, "--", data_sizes, train_errors, "b-", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig(os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight")def plot_k_complexity(ks, train_errors, test_errors): pylab.figure(num=None, figsize=(6, 5)) pylab.ylim([0.0, 1.0]) pylab.xlabel('k') pylab.ylabel('Error') pylab.title('Errors for for different values of k') pylab.plot( ks, test_errors, "--", ks, train_errors, "-", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig(os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight")第六章import osimport collectionsfrom matplotlib import pylabimport numpy as npDATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")import csvimport jsondef tweak_labels(Y, pos_sent_list): pos = Y == pos_sent_list[0] #注意此种写法,最后赋值的是逻辑值 for sent_label in pos_sent_list[1:]: pos |= Y == sent_label #|= 注意竖杠和等于号在一起, a|=b 等价于a=a|b 按位或 Y = 
np.zeros(Y.shape[0]) Y[pos] = 1 Y = Y.astype(int) #类型转化 return Ydef load_sanders_data(dirname=".", line_count=-1): count = 0 topics = [] labels = [] tweets = [] with open(os.path.join(DATA_DIR, dirname, "corpus.csv"), "r") as csvfile: metareader = csv.reader(csvfile, delimiter=',', quotechar='"') #注意格式 for line in metareader: count += 1 if line_count > 0 and count > line_count: break topic, label, tweet_id = line # import vimpdb;vimpdb.set_trace() tweet_fn = os.path.join( DATA_DIR, dirname, 'rawdata', '%s.json' % tweet_id) tweet = json.load(open(tweet_fn, "r")) if 'text' in tweet and tweet['user']['lang']=="en": topics.append(topic) labels.append(label) tweets.append(tweet['text']) tweets = np.asarray(tweets) labels = np.asarray(labels) # return topics, tweets, labels return tweets, labelsdef load_kaggle_data(filename="kaggle/training.txt", line_count=-1): count = 0 labels = [] texts = [] read_texts = set([]) for line in open(os.path.join(DATA_DIR, filename), "r"): count += 1 if line_count > 0 and count > line_count: break label, text = line.split("\t") # Some tweets occur multiple times, so we have to # remove them to not bias the training set. if text in read_texts: continue read_texts.add(text) labels.append(label) texts.append(text) texts = np.asarray(texts) labels = np.asarray(labels, dtype=np.int) return texts, labelsdef plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.clf() #清除图像 pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.fill_between(recall, precision, alpha=0.5) pylab.plot(recall, precision, lw=1) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png"%(filename, phase)), bbox_inches="tight")def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)def plot_log(): pylab.clf() pylab.figure(num=None, figsize=(6, 5)) x = np.arange(0.001, 1, 0.001) y = np.log(x) pylab.title('Relationship between probabilities and their logarithm') pylab.plot(x, y) pylab.grid(True) pylab.xlabel('P') pylab.ylabel('log(P)') filename = 'log_probs.png' pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_feat_importance(feature_names, clf, name): pylab.clf() coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(range(len(coef))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): pylab.clf() # import pdb;pdb.set_trace() num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Density') # the histogram 
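# --- Illustrative sketch (not from the book): what tweak_labels() above does,
# shown on a tiny array. `Y == label` yields a boolean mask, `|=` ORs the masks
# of all positive classes together, and the result becomes a 0/1 integer vector.
import numpy as np

Y = np.array(["positive", "negative", "neutral", "positive", "irrelevant"])
pos = Y == "positive"
pos |= Y == "negative"            # mark both sentiment-bearing classes as 1
Y_bin = np.zeros(Y.shape[0], dtype=int)
Y_bin[pos] = 1
print(Y_bin)                      # -> [1 1 0 1 0]
# --- end of illustrative sketch ---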
of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='green', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_bias_variance(data_sizes, train_errors, test_errors, name): pylab.clf() pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid() pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))def load_sent_word_net(): sent_scores = collections.defaultdict(list) with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): #注意用法 continue if len(line)==1: continue POS,ID,PosScore,NegScore,SynsetTerms,Gloss = line if len(POS)==0 or len(ID)==0: continue #print POS,PosScore,NegScore,SynsetTerms for term in SynsetTerms.split(" "): term = term.split("#")[0] # drop #number at the end of every term term = term.replace("-", " ").replace("_", " ") #两个replace 连用 key = "%s/%s"%(POS,term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) for key, value in sent_scores.iteritems(): sent_scores[key] = np.mean(value, axis=0) return sent_scoresdef log_false_positives(clf, X, y, name): with open("FP_"+name.replace(" ", "_")+".tsv", "w") as f: false_positive = clf.predict(X)!=y for tweet, false_class in zip(X[false_positive], y[false_positive]): f.write("%s\t%s\n"%(false_class, tweet.encode("ascii", "ignore")))if __name__ == '__main__': plot_log()import timestart_time = time.time()import numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline #管道机制from sklearn.naive_bayes import MultinomialNBdef create_ngram_model(): tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False) clf = MultinomialNB() pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) return pipelinedef train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] if plot: plot_pr(pr_scores[median], name, "01", precisions[median], recalls[median], label=name) summary = 
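# --- Illustrative sketch (not from the book): the same TF-IDF n-gram +
# MultinomialNB pipeline evaluated with ShuffleSplit, but written against the
# current scikit-learn API (sklearn.cross_validation was removed; use
# sklearn.model_selection). The six example tweets are hypothetical.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit, cross_val_score

X = np.array(["good movie", "bad movie", "great film", "awful film",
              "loved it", "hated it"])
y = np.array([1, 0, 1, 0, 1, 0])
pipeline = Pipeline([
    ("vect", TfidfVectorizer(ngram_range=(1, 3), analyzer="word")),
    ("clf", MultinomialNB()),
])
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
print(cross_val_score(pipeline, X, y, cv=cv).mean())
# --- end of illustrative sketch ---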
(np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_timeimport timestart_time = time.time()import numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipelinefrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.naive_bayes import MultinomialNBphase = "02"def create_ngram_model(params=None): tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False) clf = MultinomialNB() pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__stop_words=[None, "english"], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) 
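# --- Illustrative sketch (not from the book): the grid search above, redone
# with the current API. GridSearchCV moved to sklearn.model_selection and the
# old score_func= argument became scoring=; double underscores still address
# parameters of the named pipeline steps (vect__..., clf__...).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit

pipeline = Pipeline([("vect", TfidfVectorizer()), ("clf", MultinomialNB())])
param_grid = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vect__min_df": [1, 2],
    "clf__alpha": [0.01, 0.05, 0.1, 0.5, 1],
}
grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                           cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=0),
                           scoring="f1", verbose=1)
# grid_search.fit(X, y)                # X, y as in the previous sketch
# print(grid_search.best_estimator_)   # the best-scoring fitted pipeline
# --- end of illustrative sketch ---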
thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, vect__smooth_idf=False, vect__use_idf=False, vect__sublinear_tf=True, vect__binary=False, clf__alpha=0.01, ) best_clf = create_ngram_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_ngram_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_timeimport timestart_time = time.time()import reimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.pipeline import Pipelinefrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()phase = "03"emo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_ngram_model(params=None): def preprocessor(tweet): global emoticons_replaced tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet tfidf_ngrams = 
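# --- Illustrative sketch (not from the book): the emoticon / contraction
# normalisation done by the preprocessor above, runnable on its own. The
# longest emoticons are replaced first so that ":dd" is not half-eaten by ":d".
# The replacement tables here are shortened, hypothetical versions.
import re

emo_repl = {":d": " good ", ":dd": " good ", ":)": " good ", ":(": " bad "}
emo_repl_order = [k for (k_len, k) in
                  reversed(sorted((len(k), k) for k in emo_repl))]
re_repl = {r"\br\b": "are", r"\bu\b": "you", r"\bcan't\b": "can not"}

def preprocess(tweet):
    tweet = tweet.lower()
    for k in emo_repl_order:
        tweet = tweet.replace(k, emo_repl[k])
    for pattern, repl in re_repl.items():   # .iteritems() is Python 2 only
        tweet = re.sub(pattern, repl, tweet)
    return tweet

print(preprocess("u can't be serious :DD"))   # -> 'you can not be serious  good '
# --- end of illustrative sketch ---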
TfidfVectorizer(preprocessor=preprocessor, analyzer="word") clf = MultinomialNB() pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) #使用predict_proba进行预测,结果是可能性 fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(tfidf__ngram_range=(1, 2), tfidf__min_df=1, tfidf__stop_words=None, tfidf__smooth_idf=False, tfidf__use_idf=False, tfidf__sublinear_tf=True, tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_ngram_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time# This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnion #特征联合,并行from sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class StructCounter(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question', 'hashtag', 'mentioning']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(d.split()) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) avg_neg_val = np.mean(neg_vals) return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] hashtag = [] mentioning = [] for d in documents: #import pdb;pdb.set_trace() allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) hashtag.append(d.count("#")) mentioning.append(d.count("@")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question, hashtag, mentioning]).T return resultdef create_union_model(params=None): def preprocessor(tweet): global emoticons_replaced #return tweet.lower() repl = { # positive emoticons "<3": " good ", ":D": " good ", "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", ";-)": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", } for a,b in repl.iteritems(): tweet = tweet.replace(a,b) return tweet.lower() tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") struct_stats = StructCounter() all_features = FeatureUnion([('struct', struct_stats), ('tfidf', 
tfidf_ngrams)]) #注意FeatureUNion, 是特征并列处理,然后组合在一起,简单的lbind #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('struct', struct_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(all__tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)], all__tfidf__min_df=[1, 2], all__tfidf__stop_words=[None, "english"], all__tfidf__smooth_idf=[False, True], all__tfidf__use_idf=[False, True], all__tfidf__sublinear_tf=[False, True], all__tfidf__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ # grid_search找出最好的模型 print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) grid_search_model(create_union_model, X, Y) print "== Pos/neg vs. 
irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) grid_search_model(create_union_model, X, Y) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) grid_search_model(create_union_model, X, Y) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) grid_search_model(create_union_model, X, Y) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w"))## This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()phase = "04"import jsonposcache_filename = "poscache.json"try: # try, except 模式 poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) #tuple类型 if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. 
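# --- Illustrative sketch (not from the book): the poscache idea above, i.e.
# memoising nltk.pos_tag() results in a JSON file because POS tagging is by far
# the slowest step. Assumes the NLTK tokenizer and tagger models have been
# downloaded (e.g. nltk.download('punkt') and the perceptron tagger).
import json
import nltk

poscache_filename = "poscache.json"
try:
    poscache = json.load(open(poscache_filename))
except IOError:
    poscache = {}

def tag(sentence):
    if sentence not in poscache:
        poscache[sentence] = nltk.pos_tag(nltk.word_tokenize(sentence))
    return poscache[sentence]

# print(tag("This phone is surprisingly good"))
# json.dump(poscache, open(poscache_filename, "w"))   # persist for the next run
# --- end of illustrative sketch ---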
for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) #注意 avg_neg_val = np.mean(neg_vals) #import pdb;pdb.set_trace() return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = 
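# --- Illustrative sketch (not from the book): how the Penn Treebank tags from
# nltk.pos_tag are mapped to the "pos/word" keys produced by load_sent_word_net()
# above. sent_word_net here is a tiny hand-made dictionary with made-up scores.
import numpy as np

sent_word_net = {"a/good": (0.75, 0.0), "n/phone": (0.0, 0.0)}   # hypothetical scores

def tag_to_swn_pos(treebank_tag):
    for prefix, swn in (("NN", "n"), ("JJ", "a"), ("VB", "v"), ("RB", "r")):
        if treebank_tag.startswith(prefix):
            return swn
    return None

tagged = [("good", "JJ"), ("phone", "NN"), ("really", "RB")]
pos_vals, neg_vals = [], []
for word, tag in tagged:
    swn = tag_to_swn_pos(tag)
    if swn is not None:
        p, n = sent_word_net.get("%s/%s" % (swn, word), (0.0, 0.0))
        pos_vals.append(p)
        neg_vals.append(n)
print(np.mean(pos_vals), np.mean(neg_vals))   # -> 0.25 0.0
# --- end of illustrative sketch ---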
X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w")) #dump 序列化## This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import MultinomialNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.max(pos_vals) avg_neg_val = np.max(neg_vals) return [max(0,1-avg_pos_val-avg_neg_val), avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", 
r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('tfidf', tfidf_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = MultinomialNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__tfidf__ngram_range=(1, 2), all__tfidf__min_df=1, all__tfidf__stop_words=None, all__tfidf__smooth_idf=False, all__tfidf__use_idf=False, all__tfidf__sublinear_tf=True, all__tfidf__binary=False, clf__alpha=0.01, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c 
in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_union_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w")) This script trains tries to tweak hyperparameters to improve P/R AUC#import timestart_time = time.time()import reimport nltkimport numpy as npfrom sklearn.metrics import precision_recall_curve, roc_curve, aucfrom sklearn.cross_validation import ShuffleSplitfrom utils import plot_prfrom utils import load_sanders_datafrom utils import tweak_labelsfrom utils import log_false_positivesfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import f1_scorefrom sklearn.base import BaseEstimatorfrom sklearn.naive_bayes import BernoulliNBfrom utils import load_sent_word_netsent_word_net = load_sent_word_net()import jsonposcache_filename = "poscache.json"try: poscache = json.load(open(poscache_filename, "r"))except IOError: poscache = {}class LinguisticVectorizer(BaseEstimator): def get_feature_names(self): return np.array(['sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question']) def fit(self, documents, y=None): return self def _get_sentiments(self, d): # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. 
for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) avg_neg_val = np.mean(neg_vals) #import pdb;pdb.set_trace() return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l] def transform(self, documents): obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T allcaps = [] exclamation = [] question = [] for d in documents: allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2])) exclamation.append(d.count("!")) question.append(d.count("?")) result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps, exclamation, question]).T return resultemo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ", # negative emoticons: ":/": " bad ", ":>": " sad ", ":')": " sad ", ":-(": " bad ", ":(": " bad ", ":S": " bad ", ":-S": " bad ", }emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]re_repl = { r"\br\b": "are", r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", }def create_union_model(params=None): def preprocessor(tweet): tweet = tweet.lower() for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) for r, repl in re_repl.iteritems(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") count_ngrams = CountVectorizer(preprocessor=preprocessor, analyzer="word") ling_stats = LinguisticVectorizer() all_features = FeatureUnion([('ling', ling_stats), ('count', count_ngrams)]) #all_features = FeatureUnion([('count', count_ngrams)]) #all_features = FeatureUnion([('ling', ling_stats)]) clf = BernoulliNB() pipeline = Pipeline([('all', all_features), ('clf', clf)]) if params: pipeline.set_params(**params) return pipelinedef __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) param_grid = dict(all__count__ngram_range=[(1, 1), (1, 2), (1, 3)], all__count__min_df=[1, 2], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print clf return clfdef train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] clfs = [] # just to later get the median for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = 
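# --- Illustrative sketch (not from the book): the variant introduced in this
# last script, which swaps the TF-IDF vectoriser for binary word counts and
# MultinomialNB for BernoulliNB (presence/absence of words). Minimal runnable
# version with hypothetical example texts, current scikit-learn API.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

X = np.array(["good good phone", "bad phone", "good screen", "bad battery"])
y = np.array([1, 0, 1, 0])
pipeline = Pipeline([
    ("count", CountVectorizer(binary=True)),   # 0/1 occurrence features
    ("clf", BernoulliNB()),
])
pipeline.fit(X, y)
print(pipeline.predict(np.array(["good battery"])))
# --- end of illustrative sketch ---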
clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, precisions[median], recalls[median], label=name) log_false_positives(clfs[median], X_test, y_test, name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors)def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in xrange(len(X_wrong)): print "clf.predict('%s')=%i instead of %i" %\ (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])def get_best_model(): best_params = dict(all__count__ngram_range=(1, 2), all__count__min_df=1, all__count__stop_words=None, all__count__binary=True, ) best_clf = create_union_model(best_params) return best_clfif __name__ == "__main__": X_orig, Y_orig = load_sanders_data() #from sklearn.utils import shuffle #print "shuffle, sample" #X_orig, Y_orig = shuffle(X_orig, Y_orig) #X_orig = X_orig[:100,] #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: print "#%s: %i" % (c, sum(Y_orig == c)) print "== Pos vs. neg ==" pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print "== Pos/neg vs. irrelevant/neutral ==" X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) #best_clf = __grid_search_model(create_union_model, X, Y) #print best_clf train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) print "== Pos vs. rest ==" X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print "== Neg vs. 
rest ==" X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print "time spent:", time.time() - start_time json.dump(poscache, open(poscache_filename, "w"))第七章import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltboston = load_boston()x = np.array([np.concatenate((v,[1])) for v in boston.data]) ##concatenate 矩阵连接,vstack,hstack,等一起y = boston.targets,total_error,_,_ = np.linalg.lstsq(x,y)rmse = np.sqrt(total_error[0]/len(x))print('Residual: {}'.format(rmse)) # 注意是空的 {}, 可以两个空的连着plt.plot(np.dot(x,s), boston.target,'ro') #矩阵乘法 dotplt.plot([0,50],[0,50], 'g-')plt.xlabel('predicted')plt.ylabel('real')plt.show()from __future__ import print_functionfrom sklearn.cross_validation import KFoldfrom sklearn.linear_model import ElasticNet, Lasso, Ridgefrom sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCVimport numpy as npx = np.array([np.concatenate((v,[1])) for v in boston.data])y = boston.targetfor name,met in [ ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), # 注意模型也在内部,此种写法 ('lasso(.5)', Lasso(fit_intercept=True, alpha=0.5)), ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), ]: met.fit(x,y) p = np.array([met.predict(xi) for xi in x]) e = p-y total_error = np.dot(e,e) # 注意此种写法 平方 rmse_train = np.sqrt(total_error/len(p)) kf = KFold(len(x), n_folds=10) err = 0 for train,test in kf: met.fit(x[train],y[train]) # 注意此种交叉验证方式设计,采用的是索引 p = np.array([met.predict(xi) for xi in x[test]]) e = p-y[test] err += np.dot(e,e) rmse_10cv = np.sqrt(err/len(x)) print('Method: {}'.format(name)) print('RMSE on training: {}'.format(rmse_train)) print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) print() print()from sklearn.cross_validation import KFoldfrom sklearn.linear_model import LinearRegression, ElasticNetimport numpy as npfrom sklearn.datasets import load_bostonboston = load_boston()x = np.array([np.concatenate((v,[1])) for v in boston.data])y = boston.targetFIT_EN = Falseif FIT_EN: model = ElasticNet(fit_intercept=True, alpha=0.5) # 巧妙的使用判断,和fit结合else: model = LinearRegression(fit_intercept=True)model.fit(x,y)p = np.array([model.predict(xi) for xi in x])e = p-ytotal_error = np.dot(e,e)rmse_train = np.sqrt(total_error/len(p))kf = KFold(len(x), n_folds=10) #此处是索引err = 0for train,test in kf: model.fit(x[train],y[train]) p = np.array([model.predict(xi) for xi in x[test]]) e = p-y[test] err += np.dot(e,e)rmse_10cv = np.sqrt(err/len(x))print('RMSE on training: {}'.format(rmse_train))print('RMSE on 10-fold CV: {}'.format(rmse_10cv))import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltfrom mpltools import stylestyle.use('ggplot')boston = load_boston()plt.scatter(boston.data[:,5], boston.target)plt.xlabel("RM")plt.ylabel("House Price")x = boston.data[:,5]x = np.array([[v] for v in x])y = boston.targetslope,res,_,_ = np.linalg.lstsq(x,y) #_, 使用在不想起名字不使用的情况下?plt.plot([0,boston.data[:,5].max()+1],[0,slope*(boston.data[:,5].max()+1)], '-', lw=4)plt.savefig('Figure1.png',dpi=150)rmse = np.sqrt(res[0]/len(x))print('Residual: {}'.format(rmse))import numpy as npfrom sklearn.datasets import load_bostonimport pylab as pltfrom mpltools import style # 使用方法,sytle.usestyle.use('ggplot')boston = load_boston()plt.scatter(boston.data[:,5], boston.target)plt.xlabel("RM")plt.ylabel("House Price")x = boston.data[:,5]xmin = x.min()xmax = x.max()x = np.array([[v,1] for v in x])y = boston.target(slope,bias),res,_,_ = np.linalg.lstsq(x,y)plt.plot([xmin,xmax],[slope*xmin + bias, slope*xmax + bias], '-', 
lw=4)plt.savefig('Figure2.png',dpi=150)rmse = np.sqrt(res[0]/len(x))print('Residual: {}'.format(rmse))import numpy as npfrom sklearn.datasets import load_svmlight_filefrom sklearn.linear_model import ElasticNet, LinearRegressiondata,target = load_svmlight_file('E2006.train')lr = LinearRegression(fit_intercept=True)from sklearn.cross_validation import KFoldkf = KFold(len(target), n_folds=10)err = 0for train,test in kf: lr.fit(data[train],target[train]) p = map(lr.predict, data[test]) p = np.array(p).ravel() ##注意ravel的使用, 和flatten的区别 e = p-target[test] err += np.dot(e,e)rmse_10cv = np.sqrt(err/len(target))lr.fit(data,target)p = np.array(map(lr.predict, data)) # 推到表达式也可实现p = p.ravel()e = p-targettotal_error = np.dot(e,e)rmse_train = np.sqrt(total_error/len(p))print('RMSE on training: {}'.format(rmse_train))print('RMSE on 10-fold CV: {}'.format(rmse_10cv))import numpy as npfrom scipy import sparsefrom sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCVfrom sklearn.cross_validation import KFolddata = np.array([[int(tok) for tok in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')]) #多层列表推导式,取出需要的值ij = data[:,:2]ij -= 1 # original data is in 1-based systemvalues = data[:,2]reviews = sparse.csc_matrix((values,ij.T)).astype(float) #两种稀疏矩阵压缩,astypereg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.])def movie_norm(xc): xc = xc.copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc]) x1 = np.nan_to_num(x1) #注意 np.nan_to_num的使用 for i in range(xc.shape[0]): xc[i] -= (xc[i] > 0) * x1[i] return xc, x1def learn_for(i): u = reviews[i] us = np.delete(np.arange(reviews.shape[0]), i) ps, = np.where(u.toarray().ravel() > 0) x = reviews[us][:,ps].T y = u.data err = 0 eb = 0 kf = KFold(len(y), n_folds=4) for train,test in kf: xc,x1 = movie_norm(x[train]) reg.fit(xc, y[train]-x1) xc,x1 = movie_norm(x[test]) p = np.array([reg.predict(xi) for xi in xc]).ravel() e = (p+x1)-y[test] err += np.sum(e*e) eb += np.sum( (y[train].mean() - y[test])**2 ) return np.sqrt(err/float(len(y))), np.sqrt(eb/float(len(y)))whole_data = []for i in range(reviews.shape[0]): s = learn_for(i) print(s[0] < s[1]) print(s) whole_data.append(s)第八章def apriori(dataset, minsupport, maxsize): ''' freqsets, baskets = apriori(dataset, minsupport, maxsize) Parameters ---------- dataset : sequence of sequences input dataset minsupport : int Minimal support for frequent items maxsize : int Maximal size of frequent items to return Returns ------- freqsets : sequence of sequences baskets : dictionary ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) for i,ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) pointers = dict([(k,frozenset(v)) for k,v in pointers.items()]) baskets = dict([(k,frozenset(v)) for k,v in baskets.items()]) valid = set(list(el)[0] for el,c in baskets.items() if (len(c) >= minsupport)) dataset = [[el for el in ds if (el in valid)] for ds in dataset] dataset = [ds for ds in dataset if len(ds) > 1] dataset = map(frozenset,dataset) itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize-1): print(len(itemsets)) newsets = [] for i,ell in enumerate(itemsets): ccounts = baskets[ell] for v_,pv in pointers.items(): if v_ not in ell: csup = (ccounts & pv) if len(csup) >= minsupport: new = frozenset(ell|set([v_])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets return freqsets,basketsdef 
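# --- Illustrative sketch (not from the book): building the sparse user x item
# rating matrix the way the ml-100k loader above does, but from a few in-line
# (user, item, rating) triples instead of data/ml-100k/u.data.
import numpy as np
from scipy import sparse

data = np.array([[1, 1, 5], [1, 3, 3], [2, 2, 4], [3, 1, 2]])   # 1-based ids, as in the file
ij = data[:, :2] - 1                           # shift to 0-based row/column indices
values = data[:, 2].astype(float)
reviews = sparse.csc_matrix((values, ij.T))    # column-compressed; csr_matrix is row-compressed
print(reviews.toarray())
# [[5. 0. 3.]
#  [0. 4. 0.]
#  [2. 0. 0.]]
# --- end of illustrative sketch ---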
association_rules(dataset, freqsets, baskets, minlift): ''' for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): ... This function takes the returns from ``apriori``. Parameters ---------- dataset : sequence of sequences input dataset freqsets : sequence of sequences baskets : dictionary minlift : int minimal lift of yielded rules ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] for fset in freqsets: for f in fset: consequent = frozenset([f]) antecendent = fset-consequent base = len(baskets[consequent])/nr_transactions py_x = len(baskets[fset])/float(len(baskets[antecendent])) lift = py_x / base if lift > minlift: yield (antecendent, consequent, base, py_x, lift) # 注意生成器的使用from apriori import apriori, association_rulesfrom gzip import GzipFiledataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]freqsets,baskets = apriori(dataset, 80, maxsize=5)nr_transactions = float(len(dataset))for ant,con,base,pyx,lift in association_rules(dataset, freqsets,baskets, 30): print('{} | {} | {} ({:%}) | {} | {} | {}' .format(ant, con, len(baskets[con]), len(baskets[con])/ nr_transactions, len(baskets[ant]), len(baskets[con|ant]), int(lift)))import numpy as npfrom collections import defaultdictfrom itertools import chain # 注意itertools 模块提供迭代器from gzip import GzipFileminsupport = 44 dataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]dataset = dataset[::20]counts = defaultdict(int)for elem in chain(*dataset): #注意chain内部是迭代器,注意此处使用的函数参数为*不固定的 counts[elem] += 1valid = set(el for el,c in counts.items() if (c >= minsupport))dataset = [[el for el in ds if (el in valid)] for ds in dataset]dataset = [frozenset(ds) for ds in dataset if len(ds) > 1]itemsets = [frozenset([v]) for v in valid]allsets = [itemsets]for i in range(16): print(len(itemsets)) nextsets = [] for i,ell in enumerate(itemsets): for v_ in valid: if v_ not in ell: c = (ell|set([v_])) if sum(1 for d in dataset if d.issuperset(c)) > minsupport: nextsets.append(c) allsets.append(nextsets) itemsets = nextsetsimport numpy as npfrom collections import defaultdictfrom itertools import chainfrom gzip import GzipFiledataset = [ [int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')]counts = defaultdict(int)for elem in chain(*dataset): counts[elem] += 1counts = np.array(list(counts.values()))bins = [1,2,4,8,16,32,64,128,512]print(' {0:11} | {1:12}'.format('Nr of baskets', 'Nr of products'))print('--------------------------------')for i in range(len(bins)): bot = bins[i] top = (bins[i+1] if (i+1) < len(bins) else 100000000000) print(' {0:4} - {1:3} | {2:12}'.format(bot, (top if top < 1000 else ''), np.sum( (counts >= bot) & (counts < top) )))# 注意在.format中使用了if else等语法import numpy as np# This is the version in the book:def all_correlations(bait, target): ''' corrs = all_correlations(bait, target) corrs[i] is the correlation between bait and target[i] ''' return np.array( [np.corrcoef(bait, c)[0,1] for c in target])# This is a faster, but harder to read, implementation:def all_correlations(y, X): ''' Cs = all_correlations(y, X) Cs[i] = np.corrcoef(y, X[i])[0,1] ''' X = np.asanyarray(X, float) y = np.asanyarray(y, float) xy = np.dot(X, y) y_ = y.mean() ys_ = y.std() x_ = X.mean(1) xs_ = X.std(1) n = float(len(y)) ys_ += 1e-5 # Handle zeros in ys xs_ += 1e-5 # Handle zeros in x return (xy - x_*y_*n)/n/xs_/ys_import numpy as np# This is the version in the book:def 
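# --- Illustrative sketch (not from the book): the quantities association_rules()
# above yields, computed by hand for one candidate rule on a toy basket list.
baskets = [
    {"milk", "bread"},
    {"milk", "bread", "butter"},
    {"bread", "butter"},
    {"milk"},
    {"milk", "bread", "jam"},
]
n = float(len(baskets))
antecedent, consequent = {"milk"}, {"bread"}
support_ante = sum(1 for b in baskets if antecedent <= b)                 # 4 baskets contain milk
support_both = sum(1 for b in baskets if (antecedent | consequent) <= b)  # 3 contain milk and bread
base = sum(1 for b in baskets if consequent <= b) / n                     # P(bread) = 0.8
confidence = support_both / float(support_ante)                          # P(bread | milk) = 0.75
lift = confidence / base                                                  # 0.9375: milk does not lift bread here
print(base, confidence, lift)
# --- end of illustrative sketch ---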
all_correlations(bait, target): ''' corrs = all_correlations(bait, target) corrs[i] is the correlation between bait and target[i] ''' return np.array( [np.corrcoef(bait, c)[0,1] for c in target])# This is a faster, but harder to read, implementation:def all_correlations(y, X): ''' Cs = all_correlations(y, X) Cs[i] = np.corrcoef(y, X[i])[0,1] ''' X = np.asanyarray(X, float) #把matrix转换为array用asarray() asanyarray()根据和你的输入的类型保持一致 y = np.asanyarray(y, float) xy = np.dot(X, y) y_ = y.mean() ys_ = y.std() x_ = X.mean(1) xs_ = X.std(1) n = float(len(y)) ys_ += 1e-5 # Handle zeros in ys xs_ += 1e-5 # Handle zeros in x return (xy - x_*y_*n)/n/xs_/ys_from __future__ import print_functionfrom all_correlations import all_correlationsimport numpy as npfrom scipy import sparsefrom load_ml100k import loadreviews = load()def estimate_user(user, rest): bu = user > 0 br = rest > 0 ws = all_correlations(bu,br) selected = ws.argsort()[-100:] estimates = rest[selected].mean(0) estimates /= (.1+br[selected].mean(0)) return estimatesdef train_test(user, rest): estimates = estimate_user(user, rest) bu = user > 0 br = rest > 0 err = estimates[bu]-user[bu] null = rest.mean(0) null /= (.1+br.mean(0)) nerr = null[bu]-user[bu] return np.dot(err,err), np.dot(nerr, nerr)def cross_validate_all(): err = [] for i in xrange(reviews.shape[0]): err.append( train_test(reviews[i], np.delete(reviews, i, 0)) ) revs = (reviews > 0).sum(1) err = np.array(err) rmse = np.sqrt(err / revs[:,None]) print(np.mean(rmse, 0)) print(np.mean(rmse[revs > 60], 0))def all_estimates(reviews): reviews = reviews.toarray() estimates = np.zeros_like(reviews) ##注意zeros_like和ones_like 和原矩阵的shape一样, 但数据都是0, 或1 for i in xrange(reviews.shape[0]): estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) return estimatesfrom load_ml100k import loadfrom matplotlib import pyplot as pltdata = load()data = data.toarray() # 和todense一样,都是把稀疏举证转为数组plt.gray()plt.imshow(data[:200,:200], interpolation='nearest') ##imshow显示二值型数据,颜色深浅plt.xlabel('User ID')plt.ylabel('Film ID')plt.savefig('../1400_08_03+.png')import numpy as npfrom scipy import sparsedef load(): data = np.array([[int(t) for t in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')]) ij = data[:,:2] ij -= 1 # original data is in 1-based system values = data[:,2] reviews = sparse.csc_matrix((values,ij.T)).astype(float) ##csc 按列压缩矩阵, csr按行压缩矩阵 return reviewsfrom __future__ import print_functionimport numpy as npfrom load_ml100k import loadfrom all_correlations import all_correlationsdef nn_movie(ureviews, reviews, uid, mid, k=1): X = ureviews y = ureviews[mid].copy() y -= y.mean() y /= (y.std()+1e-5) corrs = np.dot(X, y) likes = corrs.argsort() likes = likes[::-1] c = 0 pred = 3. 
for ell in likes: if ell == mid: continue if reviews[uid,ell] > 0: pred = reviews[uid,ell] if c == k: return pred c += 1 return preddef all_estimates(reviews, k=1): reviews = reviews.astype(float) k -= 1 nusers, nmovies = reviews.shape estimates = np.zeros_like(reviews) for u in range(nusers): ureviews = np.delete(reviews, u, 0) ureviews -= ureviews.mean(0) ureviews /= (ureviews.std(0)+1e-4) ureviews = ureviews.T.copy() for m in np.where(reviews[u] > 0)[0]: estimates[u,m] = nn_movie(ureviews, reviews, u, m, k) return estimatesif __name__ == '__main__': reviews = load().toarray() estimates = all_estimates(reviews) error = (estimates-reviews) error **= 2 #注意**用法 error = error[reviews > 0] print(np.sqrt(error).mean())from __future__ import print_functionfrom sklearn.linear_model import LinearRegressionfrom load_ml100k import loadimport numpy as npimport similar_movieimport usermodelimport corrneighboursreviews = load()reg = LinearRegression()es = np.array([ usermodel.all_estimates(reviews), corrneighbours.all_estimates(reviews), similar_movies.all_estimates(reviews), ])reviews = reviews.toarray() total_error = 0.0coefficients = []for u in xrange(reviews.shape[0]): es0 = np.delete(es,u,1) r0 = np.delete(reviews, u, 0) X,Y = np.where(r0 > 0) X = es[:,X,Y] y = r0[r0 > 0] reg.fit(X.T,y) coefficients.append(reg.coef_) r0 = reviews[u] X = np.where(r0 > 0) p0 = reg.predict(es[:,u,X].squeeze().T) err0 = r0[r0 > 0]-p0 total_error += np.dot(err0,err0) print(u)from sklearn.linear_model import LinearRegressionfrom load_ml100k import loadimport numpy as npimport similar_movieimport usermodelimport corrneighbourssreviews = load()reviews = sreviews.toarray()reg = LinearRegression()es = np.array([ usermodel.all_estimates(sreviews), similar_movie.all_estimates(reviews,k=1), similar_movie.all_estimates(reviews,k=2), similar_movie.all_estimates(reviews,k=3), similar_movie.all_estimates(reviews,k=4), similar_movie.all_estimates(reviews,k=5), ])total_error = 0.0coefficients = []for u in xrange(reviews.shape[0]): es0 = np.delete(es,u,1) #delete的用法 留一 r0 = np.delete(reviews, u, 0) X,Y = np.where(r0 > 0) X = es[:,X,Y] y = r0[r0 > 0] reg.fit(X.T,y) coefficients.append(reg.coef_) r0 = reviews[u] X = np.where(r0 > 0) p0 = reg.predict(es[:,u,X].squeeze().T) err0 = r0[r0 > 0]-p0 total_error += np.dot(err0,err0)coefficients = np.array(coefficients)import numpy as npfrom sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCVfrom sklearn.cross_validation import KFoldfrom load_ml100k import loaddef learn_for(reviews, i): reg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.]) u = reviews[i] us = range(reviews.shape[0]) del us[i] ps, = np.where(u.toarray().ravel() > 0) x = reviews[us][:,ps].T y = u.data kf = KFold(len(y), n_folds=4) predictions = np.zeros(len(ps)) for train,test in kf: xc = x[train].copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc]) x1 = np.nan_to_num(x1) for i in xrange(xc.shape[0]): xc[i] -= (xc[i] > 0) * x1[i] reg.fit(xc, y[train]-x1) xc = x[test].copy().toarray() x1 = np.array([xi[xi > 0].mean() for xi in xc])第九章import osfrom matplotlib import pylabimport numpy as npDATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")GENRE_DIR = "/media/sf_P/pymlbook-data/09-genre-class/genres"GENRE_LIST = ["classical", "jazz", "country", "pop", "rock", "metal"]def plot_confusion_matrix(cm, genre_list, name, title): pylab.clf() pylab.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0) # 注意用matshow画混淆举证,并设置cmap颜色等 ax = pylab.axes() 
ax.set_xticks(range(len(genre_list))) # 注意图形的建立方式 ax.set_xticklabels(genre_list) ax.xaxis.set_ticks_position("bottom") ax.set_yticks(range(len(genre_list))) ax.set_yticklabels(genre_list) pylab.title(title) pylab.colorbar() pylab.grid(False) pylab.show() pylab.xlabel('Predicted class') pylab.ylabel('True class') pylab.grid(False) pylab.savefig(os.path.join(CHART_DIR, "confusion_matrix_%s.png"%name), bbox_inches="tight")def plot_pr(auc_score, name, precision, recall, label=None): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.fill_between(recall, precision, alpha=0.5) # 使用fill_between pylab.plot(recall, precision, lw=1) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.title('P/R curve (AUC = %0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"), bbox_inches="tight")def plot_roc(auc_score, name, tpr, fpr, label=None): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) pylab.grid(True) pylab.plot([0, 1], [0, 1], 'k--') pylab.plot(fpr, tpr) pylab.fill_between(fpr, tpr, alpha=0.5) pylab.xlim([0.0, 1.0]) pylab.ylim([0.0, 1.0]) pylab.xlabel('False Positive Rate') pylab.ylabel('True Positive Rate') pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom") pylab.legend(loc="lower right") filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "roc_" + filename + ".png"), bbox_inches="tight")def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)def plot_log(): pylab.clf() x = np.arange(0.001, 1, 0.001) y = np.log(x) pylab.title('Relationship between probabilities and their logarithm') pylab.plot(x, y) pylab.grid(True) pylab.xlabel('P') pylab.ylabel('log(P)') filename = 'log_probs.png' pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_feat_importance(feature_names, clf, name): pylab.clf() coef_ = clf.coef_ important = np.argsort(np.absolute(coef_.ravel())) f_imp = feature_names[important] coef = coef_.ravel()[important] inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] xpos = np.array(range(len(coef))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) ax = pylab.gca() ax.set_xticks(np.arange(len(coef))) labels = ax.set_xticklabels(f_imp) for label in labels: label.set_rotation(90) filename = name.replace(" ", "_") pylab.savefig(os.path.join( CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")def plot_feat_hist(data_name_list, filename=None): pylab.clf() # import pdb;pdb.set_trace() num_rows = 1 + (len(data_name_list) - 1) / 2 num_cols = 1 if len(data_name_list) == 1 else 2 pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): for j in range(num_cols): pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) # 注意subplot 动态设置行列。 x, name = data_name_list[i * num_cols + j] pylab.title(name) pylab.xlabel('Value') pylab.ylabel('Density') # the histogram of the data max_val = np.max(x) if max_val <= 1.0: bins = 50 elif max_val > 50: bins = 50 else: bins = max_val n, bins, patches = pylab.hist( x, bins=bins, normed=1, facecolor='green', alpha=0.75) pylab.grid(True) if not filename: filename = "feat_hist_%s.png" % name pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def 
plot_bias_variance(data_sizes, train_errors, test_errors, name): pylab.clf() pylab.ylim([0.0, 1.0]) pylab.xlabel('Data set size') pylab.ylabel('Error') pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1) pylab.legend(["train error", "test error"], loc="upper right") pylab.grid(True) pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))import numpy as npfrom collections import defaultdictfrom sklearn.metrics import precision_recall_curve, roc_curvefrom sklearn.metrics import aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.metrics import confusion_matrixfrom utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LISTfrom fft import read_fftTEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"genre_list = GENRE_LISTdef train_model(clf_factory, X, Y, name, plot=False): labels = np.unique(Y) #np.unique函数使用 cv = ShuffleSplit( n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = defaultdict(list) precisions, recalls, thresholds = defaultdict( list), defaultdict(list), defaultdict(list) roc_scores = defaultdict(list) tprs = defaultdict(list) fprs = defaultdict(list) clfs = [] # just to later get the median cms = [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) scores.append(test_score) train_errors.append(1 - train_score) test_errors.append(1 - test_score) y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) cms.append(cm) for label in labels: y_label_test = np.asarray(y_test == label, dtype=int) # 注意在此处嵌入dtype语句 proba = clf.predict_proba(X_test) proba_label = proba[:, label] precision, recall, pr_thresholds = precision_recall_curve( y_label_test, proba_label) pr_scores[label].append(auc(recall, precision)) precisions[label].append(precision) recalls[label].append(recall) thresholds[label].append(pr_thresholds) fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) roc_scores[label].append(auc(fpr, tpr)) tprs[label].append(tpr) fprs[label].append(fpr) if plot: for label in labels: print "Plotting", genre_list[label] scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] desc = "%s %s" % (name, genre_list[label]) plot_pr(pr_scores[label][median], desc, precisions[label][median], recalls[label][median], label='%s vs rest' % genre_list[label]) plot_roc(roc_scores[label][median], desc, tprs[label][median], fprs[label][median], label='%s vs rest' % genre_list[label]) all_pr_scores = np.asarray(pr_scores.values()).flatten() summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)def create_model(): from sklearn.linear_model.logistic import LogisticRegression clf = LogisticRegression() return clfif __name__ == "__main__": X, y = read_fft(genre_list) train_avg, test_avg, cms = train_model( create_model, X, y, "Log Reg FFT", plot=True) cm_avg = np.mean(cms, axis=0) cm_norm = cm_avg / np.sum(cm_avg, axis=0) print cm_norm plot_confusion_matrix(cm_norm, genre_list, "fft", "Confusion matrix of an FFT based classifier") import numpy as npfrom collections import defaultdictfrom sklearn.metrics import precision_recall_curve, roc_curvefrom 
sklearn.metrics import aucfrom sklearn.cross_validation import ShuffleSplitfrom sklearn.metrics import confusion_matrixfrom utils import plot_roc, plot_confusion_matrix, GENRE_LISTfrom ceps import read_cepsTEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"genre_list = GENRE_LISTdef train_model(clf_factory, X, Y, name, plot=False): labels = np.unique(Y) cv = ShuffleSplit( n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = defaultdict(list) precisions, recalls, thresholds = defaultdict( list), defaultdict(list), defaultdict(list) roc_scores = defaultdict(list) tprs = defaultdict(list) fprs = defaultdict(list) clfs = [] # just to later get the median cms = [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf = clf_factory() clf.fit(X_train, y_train) clfs.append(clf) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) scores.append(test_score) train_errors.append(1 - train_score) test_errors.append(1 - test_score) y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) cms.append(cm) for label in labels: y_label_test = np.asarray(y_test == label, dtype=int) proba = clf.predict_proba(X_test) proba_label = proba[:, label] precision, recall, pr_thresholds = precision_recall_curve( y_label_test, proba_label) pr_scores[label].append(auc(recall, precision)) precisions[label].append(precision) recalls[label].append(recall) thresholds[label].append(pr_thresholds) fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) roc_scores[label].append(auc(fpr, tpr)) tprs[label].append(tpr) fprs[label].append(fpr) if plot: for label in labels: print "Plotting", genre_list[label] scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] desc = "%s %s" % (name, genre_list[label]) plot_roc(roc_scores[label][median], desc, tprs[label][median], fprs[label][median], label='%s vs rest' % genre_list[label]) all_pr_scores = np.asarray(pr_scores.values()).flatten() summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores)) print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)def create_model(): from sklearn.linear_model.logistic import LogisticRegression clf = LogisticRegression() return clfif __name__ == "__main__": X, y = read_ceps(genre_list) train_avg, test_avg, cms = train_model( create_model, X, y, "Log Reg CEPS", plot=True) cm_avg = np.mean(cms, axis=0) cm_norm = cm_avg / np.sum(cm_avg, axis=0) print cm_norm plot_confusion_matrix(cm_norm, genre_list, "ceps", "Confusion matrix of a CEPS based classifier")import osimport globimport sysimport numpy as npimport scipyimport scipy.io.wavfile #利用scipy对wav文件进行处理from scikits.talkbox.features import mfcc #talkbox对音频进行XXX变换的包from utils import GENRE_DIRdef write_ceps(ceps, fn): """ Write the MFCC to separate files to speed up processing. 
""" base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".ceps" np.save(data_fn, ceps) print "Written", data_fndef create_ceps(fn): sample_rate, X = scipy.io.wavfile.read(fn) ceps, mspec, spec = mfcc(X) write_ceps(ceps, fn)def read_ceps(genre_list, base_dir=GENRE_DIR): X = [] y = [] for label, genre in enumerate(genre_list): for fn in glob.glob(os.path.join(base_dir, genre, "*.ceps.npy")): ceps = np.load(fn) num_ceps = len(ceps) X.append( np.mean(ceps[int(num_ceps / 10):int(num_ceps * 9 / 10)], axis=0)) y.append(label) return np.array(X), np.array(y)if __name__ == "__main__": os.chdir(GENRE_DIR) glob_wav = os.path.join(sys.argv[1], "*.wav") print glob_wav for fn in glob.glob(glob_wav): create_ceps(fn)import sysimport osimport globimport numpy as npimport scipyimport scipy.io.wavfilefrom utils import GENRE_DIR, CHART_DIRimport matplotlib.pyplot as pltfrom matplotlib.ticker import EngFormatterdef write_fft(fft_features, fn): """ Write the FFT features to separate files to speed up processing. """ base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".fft" np.save(data_fn, fft_features) print "Written", data_fndef create_fft(fn): sample_rate, X = scipy.io.wavfile.read(fn) fft_features = abs(scipy.fft(X)[:1000]) write_fft(fft_features, fn)def read_fft(genre_list, base_dir=GENRE_DIR): X = [] y = [] for label, genre in enumerate(genre_list): genre_dir = os.path.join(base_dir, genre, "*.fft.npy") file_list = glob.glob(genre_dir) assert(file_list), genre_dir for fn in file_list: fft_features = np.load(fn) X.append(fft_features[:2000]) y.append(label) return np.array(X), np.array(y)def plot_wav_fft(wav_filename, desc=None): plt.clf() plt.figure(num=None, figsize=(6, 4)) sample_rate, X = scipy.io.wavfile.read(wav_filename) spectrum = np.fft.fft(X) freq = np.fft.fftfreq(len(X), 1.0 / sample_rate) plt.subplot(211) num_samples = 200.0 plt.xlim(0, num_samples / sample_rate) plt.xlabel("time [s]") plt.title(desc or wav_filename) plt.plot(np.arange(num_samples) / sample_rate, X[:num_samples]) plt.grid(True) plt.subplot(212) plt.xlim(0, 5000) plt.xlabel("frequency [Hz]") plt.xticks(np.arange(5) * 1000) if desc: desc = desc.strip() fft_desc = desc[0].lower() + desc[1:] else: fft_desc = wav_filename plt.title("FFT of %s" % fft_desc) plt.plot(freq, abs(spectrum), linewidth=5) plt.grid(True) plt.tight_layout() rel_filename = os.path.split(wav_filename)[1] plt.savefig("%s_wav_fft.png" % os.path.splitext(rel_filename)[0], bbox_inches='tight') plt.show()def plot_wav_fft_demo(): plot_wav_fft("sine_a.wav", "400Hz sine wave") plot_wav_fft("sine_b.wav", "3,000Hz sine wave") plot_wav_fft("sine_mix.wav", "Mixed sine wave")def plot_specgram(ax, fn): sample_rate, X = scipy.io.wavfile.read(fn) ax.specgram(X, Fs=sample_rate, xextent=(0, 30))def plot_specgrams(base_dir=CHART_DIR): """ Plot a bunch of spectrograms of wav files in different genres """ plt.clf() genres = ["classical", "jazz", "country", "pop", "rock", "metal"] num_files = 3 f, axes = plt.subplots(len(genres), num_files) for genre_idx, genre in enumerate(genres): for idx, fn in enumerate(glob.glob(os.path.join(GENRE_DIR, genre, "*.wav"))): if idx == num_files: break axis = axes[genre_idx, idx] axis.yaxis.set_major_formatter(EngFormatter()) axis.set_title("%s song %i" % (genre, idx + 1)) plot_specgram(axis, fn) #经过后台处理,直接使用specgram画图 specgram_file = os.path.join(base_dir, "Spectrogram_Genres.png") plt.savefig(specgram_file, bbox_inches="tight") plt.show()if __name__ == "__main__": # for fn in glob.glob(os.path.join(sys.argv[1], "*.wav")): # create_fft(fn) 
# plot_decomp() if len(sys.argv) > 1: plot_wav_fft(sys.argv[1], desc="some sample song") else: plot_wav_fft_demo() plot_specgrams()第十章 图像处理import numpy as npimport mahotas as mh #读取图像文件def edginess_sobel(image): ''' edgi = edginess_sobel(image) Measure the "edginess" of an image ''' edges = mh.sobel(image, just_filter=True) edges = edges.ravel() return np.sqrt(np.dot(edges, edges))import numpy as npimport mahotas as mhtext = mh.imread("simple-dataset/text21.jpg")scene = mh.imread("simple-dataset/scene00.jpg")h,w,_ = text.shapecanvas = np.zeros((h,2*w+128,3), np.uint8)canvas[:,-w:] = scenecanvas[:,:w] = textcanvas = canvas[::4,::4]mh.imsave('../1400OS_10_10+.jpg', canvas)import mahotas as mhfrom mahotas.colors import rgb2greyimport numpy as npim = mh.imread('lenna.jpg')im = rgb2grey(im) #灰度处理salt = np.random.random(im.shape) > .975pepper = np.random.random(im.shape) > .975im = np.maximum(salt*170, mh.stretch(im))im = np.minimum(pepper*30 + im*(~pepper), im)mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8))import mahotas as mhfrom sklearn import cross_validationfrom sklearn.linear_model.logistic import LogisticRegressionfrom mpltools import stylefrom matplotlib import pyplot as pltimport numpy as npfrom glob import globbasedir = 'AnimTransDistr'def features_for(images): fs = [] for im in images: im = mh.imread(im,as_grey=True).astype(np.uint8) fs.append(mh.features.haralick(im).mean(0)) return np.array(fs)def features_labels(groups): labels = np.zeros(sum(map(len,groups))) st = 0 for i,g in enumerate(groups): labels[st:st+len(g)] = i st += len(g) return np.vstack(groups), labels classes = [ 'Anims', 'Cars', 'Distras', 'Trans',]features = []labels = []for ci,cl in enumerate(classes): images = glob('{}/{}/*.jpg'.format(basedir,cl)) features.extend(features_for(images)) labels.extend([ci for _ in images])features = np.array(features)labels = np.array(labels)scores0 = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=10)print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % (0.1* round(1000*scores0.mean())))tfeatures = featuresfrom sklearn.cluster import KMeansfrom mahotas.features import surfimages = []labels = []for ci,cl in enumerate(classes): curimages = glob('{}/{}/*.jpg'.format(basedir,cl)) images.extend(curimages) labels.extend([ci for _ in curimages])labels = np.array(labels)alldescriptors = []for im in images: im = mh.imread(im, as_grey=1) im = im.astype(np.uint8) #alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) alldescriptors.append(surf.surf(im, descriptor_only=True))print('Descriptors done')k = 256km = KMeans(k)concatenated = np.concatenate(alldescriptors)concatenated = concatenated[::64]print('k-meaning...')km.fit(concatenated)features = []for d in alldescriptors: c = km.predict(d) features.append( np.array([np.sum(c == i) for i in xrange(k)]) )features = np.array(features)print('predicting...')scoreSURFlr = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5).mean()print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % (0.1* round(1000*scoreSURFlr.mean())))print('combined...')allfeatures = np.hstack([features, tfeatures])scoreSURFplr = cross_validation.cross_val_score(LogisticRegression(), allfeatures, labels, cv=5).mean()print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % (0.1* round(1000*scoreSURFplr.mean())))style.use('ggplot')plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8)plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'o', mec='#cccccc', mew=12, mfc='white')plt.xlim(-.5,2.5)plt.ylim(scores0.mean()*90., scoreSURFplr*110)plt.xticks([0,1,2], ["baseline", "SURF", "combined"])plt.ylabel('Accuracy (%)')plt.savefig('../1400OS_10_18+.png')from matplotlib import pyplot as pltimport numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image)im8 = mh.gaussian_filter(image,8)im16 = mh.gaussian_filter(image,16)im32 = mh.gaussian_filter(image,32)h,w = im8.shapecanvas = np.ones((h,3*w+256), np.uint8)canvas *= 255canvas[:,:w] = im8canvas[:,w+128:2*w+128] = im16canvas[:,-w:] = im32mh.imsave('../1400OS_10_05+.jpg', canvas[:,::2])im32 = mh.stretch(im32)ot32 = mh.otsu(im32)mh.imsave('../1400OS_10_06+.jpg', (im32 > ot32).astype(np.uint8)*255)from matplotlib import pyplot as pltimport numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image, dtype=np.uint8)image = image[::4,::4]thresh = mh.sobel(image)filtered = mh.sobel(image, just_filter=True)thresh = mh.dilate(thresh,np.ones((7,7)))filtered = mh.dilate(mh.stretch(filtered),np.ones((7,7)))h,w = thresh.shapecanvas = 255*np.ones((h,w*2+64), np.uint8)canvas[:,:w] = thresh*255canvas[:,-w:] = filteredmh.imsave('../1400OS_10_09+.jpg', canvas)import mahotas as mhimport numpy as npim = mh.imread('lenna.jpg')r,g,b = im.transpose(2,0,1)h,w = r.shaper12 = mh.gaussian_filter(r, 12.)g12 = mh.gaussian_filter(g, 12.)b12 = mh.gaussian_filter(b, 12.)im12 = mh.as_rgb(r12,g12,b12)X,Y = np.mgrid[:h,:w]X = X-h/2.Y = Y-w/2.X /= X.max()Y /= Y.max()C = np.exp(-2.*(X**2+ Y**2))C -= C.min()C /= C.ptp()C = C[:,:,None]ring = mh.stretch(im*C + (1-C)*im12)mh.imsave('lenna-ring.jpg', ring)import mahotas as mhfrom sklearn import cross_validationfrom sklearn.linear_model.logistic import LogisticRegressionimport numpy as npfrom glob import globfrom edginess import edginess_sobelbasedir = 'simple-dataset'def features_for(im): im = mh.imread(im,as_grey=True).astype(np.uint8) return mh.features.haralick(im).mean(0)features = []sobels = []labels = []images = glob('{}/*.jpg'.format(basedir))for im in images: features.append(features_for(im)) sobels.append(edginess_sobel(mh.imread(im, as_grey=True))) labels.append(im[:-len('00.jpg')])features = np.array(features)labels = np.array(labels)scores = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5)print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format(0.1* round(1000*scores.mean())))scores = cross_validation.cross_val_score(LogisticRegression(), np.hstack([np.atleast_2d(sobels).T,features]), labels, cv=5).mean()print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format(0.1* round(1000*scores.mean())))import numpy as npimport mahotas as mhimage = mh.imread('../1400OS_10_01.jpeg')image = mh.colors.rgb2gray(image, dtype=np.uint8)thresh = mh.thresholding.otsu(image)print(thresh)otsubin = (image > thresh)mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8)*255)otsubin = ~ mh.close(~otsubin, np.ones((15,15)))mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8)*255)thresh = mh.thresholding.rc(image)print(thresh)mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8)*255)第十一章 相关性import osfrom matplotlib import pylabimport numpy as 
npimport scipyfrom scipy.stats import norm, pearsonr ##相关性DATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")def _plot_correlation_func(x, y): r, p = pearsonr(x, y) title = "Cor($X_1$, $X_2$) = %.3f" % r pylab.scatter(x, y) pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") f1 = scipy.poly1d(scipy.polyfit(x, y, 1)) pylab.plot(x, f1(x), "r--", linewidth=2) # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in # [0,1,2,3,4]])def plot_correlation_demo(): np.random.seed(0) # to reproduce the data later on pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(0, 10, 0.2) pylab.subplot(221) y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(222) y = 0.5 * x + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(223) y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(224) y = norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_correlation_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "corr_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(-5, 5, 0.2) pylab.subplot(221) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(222) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(223) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_correlation_func(x, y) pylab.subplot(224) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_correlation_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "corr_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_correlation_demo()import osfrom matplotlib import pylabimport numpy as npfrom scipy.stats import norm, entropy #互信息的计算DATA_DIR = os.path.join("..", "data")CHART_DIR = os.path.join("..", "charts")def mutual_info(x, y, bins=10): counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins)) counts_x, bins = np.histogram(x, bins=bins) counts_y, bins = np.histogram(y, bins=bins) counts_xy += 1 counts_x += 1 counts_y += 1 P_xy = counts_xy / np.sum(counts_xy, dtype=float) P_x = counts_x / np.sum(counts_x, dtype=float) P_y = counts_y / np.sum(counts_y, dtype=float) I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y))) return I_xy / (entropy(counts_x) + entropy(counts_y))def plot_entropy(): pylab.clf() pylab.figure(num=None, figsize=(5, 4)) title = "Entropy $H(X)$" pylab.title(title) pylab.xlabel("$P(X=$coin will show heads up$)$") pylab.ylabel("$H(X)$") pylab.xlim(xmin=0, xmax=1.1) x = np.arange(0.001, 1, 0.001) y = -x * np.log2(x) - (1 - x) * np.log2(1 - x) pylab.plot(x, y) # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in # [0,1,2,3,4]]) pylab.autoscale(tight=True) pylab.grid(True) filename = "entropy_demo.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def _plot_mi_func(x, y): mi = mutual_info(x, y) title = "NI($X_1$, $X_2$) = %.3f" % mi pylab.scatter(x, y) pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$")def plot_mi_demo(): np.random.seed(0) # to reproduce the data later on pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(0, 10, 0.2) pylab.subplot(221) y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_mi_func(x, y) pylab.subplot(222) y = 0.5 * x + 
norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(223) y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(224) y = norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_mi_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "mi_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") pylab.clf() pylab.figure(num=None, figsize=(8, 8)) x = np.arange(-5, 5, 0.2) pylab.subplot(221) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x)) _plot_mi_func(x, y) pylab.subplot(222) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(223) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x)) _plot_mi_func(x, y) pylab.subplot(224) y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x)) _plot_mi_func(x, y) pylab.autoscale(tight=True) pylab.grid(True) filename = "mi_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_entropy() plot_mi_demo()from sklearn.feature_selection import RFE #特征选择方法from sklearn.linear_model import LogisticRegressionfrom sklearn.datasets import make_classificationX, y = make_classification( n_samples=100, n_features=10, n_informative=3, random_state=0)clf = LogisticRegression()clf.fit(X, y)for i in range(1, 11): selector = RFE(clf, i) selector = selector.fit(X, y) print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_)) import osfrom matplotlib import pylabimport numpy as npfrom sklearn import linear_model, decomposition #PCA方法from sklearn import lda #LDA方法logistic = linear_model.LogisticRegression()CHART_DIR = os.path.join("..", "charts")np.random.seed(3)x1 = np.arange(0, 10, .2)x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))def plot_simple_demo_1(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") x1 = np.arange(0, 10, .2) x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1)) good = (x1 > 5) | (x2 > 5) bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] pca = decomposition.PCA(n_components=1) #PCA Xtrans = pca.fit_transform(X) Xg = Xtrans[good] Xb = Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) print(pca.explained_variance_ratio_) pylab.grid(True) pylab.autoscale(tight=True) filename = "pca_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_simple_demo_2(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") x1 = np.arange(0, 10, .2) x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1)) good = x1 > x2 bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] pca = decomposition.PCA(n_components=1) Xtrans = pca.fit_transform(X) Xg = Xtrans[good] Xb = 
Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) print(pca.explained_variance_ratio_) pylab.grid(True) pylab.autoscale(tight=True) filename = "pca_demo_2.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_simple_demo_lda(): pylab.clf() fig = pylab.figure(num=None, figsize=(10, 4)) pylab.subplot(121) title = "Original feature space" pylab.title(title) pylab.xlabel("$X_1$") pylab.ylabel("$X_2$") good = x1 > x2 bad = ~good x1g = x1[good] x2g = x2[good] pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") x1b = x1[bad] x2b = x2[bad] pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") pylab.grid(True) pylab.subplot(122) X = np.c_[(x1, x2)] lda_inst = lda.LDA(n_components=1) #LDA 方法 Xtrans = lda_inst.fit_transform(X, good) Xg = Xtrans[good] Xb = Xtrans[bad] pylab.scatter( Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") pylab.scatter( Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") title = "Transformed feature space" pylab.title(title) pylab.xlabel("$X'$") fig.axes[1].get_yaxis().set_visible(False) pylab.grid(True) pylab.autoscale(tight=True) filename = "lda_demo.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")if __name__ == '__main__': plot_simple_demo_1() plot_simple_demo_2() plot_simple_demo_lda()import osimport numpy as npfrom matplotlib import pylabfrom mpl_toolkits.mplot3d import Axes3Dfrom sklearn import linear_model, manifold, decomposition, datasets #MDA在manifold包中logistic = linear_model.LogisticRegression()CHART_DIR = os.path.join("..", "charts")np.random.seed(3)# all examples will have three classes in this filecolors = ['r', 'g', 'b']markers = ['o', 6, '*']def plot_demo_1(): X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T ##np.c_ Translates slice objects to concatenation along the second axis, 和vstock等有类似又有区分 y = np.array([0, 1, 2]) fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') #set_axis_bgcolor mds = manifold.MDS(n_components=3) #MDS 分析方法 Xtrans = mds.fit_transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("MDS on example data set in 3 dimensions") ax.view_init(10, -15) mds = manifold.MDS(n_components=2) Xtrans = mds.fit_transform(X) ax = fig.add_subplot(122) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') pylab.title("MDS on example data set in 2 dimensions") filename = "mds_demo_1.png" pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")def plot_iris_mds(): iris = datasets.load_iris() X = iris.data y = iris.target # MDS fig = pylab.figure(figsize=(10, 4)) ax = fig.add_subplot(121, projection='3d') ax.set_axis_bgcolor('white') mds = manifold.MDS(n_components=3) Xtrans = mds.fit_transform(X) for cl, color, marker in zip(np.unique(y), colors, markers): ax.scatter( Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') pylab.title("MDS on Iris data set in 3 dimensions") ax.view_init(10, -15) mds = manifold.MDS(n_components=2) Xtrans = 
mds.fit_transform(X)
    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("MDS on Iris data set in 2 dimensions")
    filename = "mds_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

    # PCA
    fig = pylab.figure(figsize=(10, 4))
    ax = fig.add_subplot(121, projection='3d')
    ax.set_axis_bgcolor('white')
    pca = decomposition.PCA(n_components=3)
    Xtrans = pca.fit(X).transform(X)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)
    pca = decomposition.PCA(n_components=2)
    Xtrans = pca.fit_transform(X)
    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
                   c=color, marker=marker, edgecolor='black')
    pylab.title("PCA on Iris data set in 2 dimensions")
    filename = "pca_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

if __name__ == '__main__':
    plot_demo_1()
    plot_iris_mds()

Chapter 12

from jug import TaskGenerator   # jug: task-based parallel / "big data" computation
from time import sleep

@TaskGenerator
def double(x):
    sleep(4)
    return 2 * x

@TaskGenerator
def add(a, b):
    return a + b

@TaskGenerator
def print_final_result(oname, value):
    with open(oname, 'w') as output:
        print >>output, "Final result:", value

input = 2
y = double(input)
z = double(y)
y2 = double(7)
z2 = double(y2)
print_final_result('output.txt', add(z, z2))
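Running the jug script: the functions above only build a task graph; nothing executes until the jug runner is started. From memory of jug's command line (it is not shown in the chapter code, so treat the names below as an assumption), a typical session looks like this, and intermediate results are cached on disk so repeated runs skip finished tasks:

# Assuming the script above is saved as jugfile.py:
#
#   jug execute jugfile.py     # run tasks; start it in several shells for parallelism
#   jug status  jugfile.py     # show which tasks are finished / still waiting
#
# Results are memoised on disk (a jugfile.jugdata/ directory by default, if I
# remember correctly), so re-running "jug execute" does not redo completed tasks.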
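A number of the scripts above import KFold and ShuffleSplit from sklearn.cross_validation, which only exists in old scikit-learn releases; in current releases the module is sklearn.model_selection and the constructor no longer takes the sample count. A minimal sketch of the newer call, assuming a recent scikit-learn:

import numpy as np
from sklearn.model_selection import KFold   # replaces sklearn.cross_validation

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

kf = KFold(n_splits=5)                      # old API: KFold(len(y), n_folds=5)
for train, test in kf.split(X):             # old API iterated over the KFold object itself
    print(train, test)                      # integer index arrays, same as used above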
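The Chapter 7 regression notes point out the difference between ravel and flatten; a minimal check of the distinction (ravel returns a view when it can, flatten always copies):

import numpy as np

a = np.arange(6).reshape(2, 3)

r = a.ravel()     # a view when possible: shares memory with a
f = a.flatten()   # always a copy

r[0] = 99
f[1] = -1
print(a[0, 0], a[0, 1])   # 99 1 -> only the ravel() write reached a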
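The ml-100k loader builds the ratings matrix with sparse.csc_matrix((values, ij.T)); as the note says, csc compresses by column and csr by row. A small sketch of the same (data, (rows, cols)) constructor:

import numpy as np
from scipy import sparse

rows = np.array([0, 0, 1, 2])
cols = np.array([0, 2, 1, 2])
vals = np.array([4.0, 5.0, 3.0, 1.0])

m_csc = sparse.csc_matrix((vals, (rows, cols)))   # fast column slicing
m_csr = sparse.csr_matrix((vals, (rows, cols)))   # fast row slicing

print(m_csc[:, 2].toarray().ravel())   # [5. 0. 1.]
print(m_csr[0].toarray().ravel())      # [4. 0. 5.]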
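movie_norm above centres every row on the mean of its rated (non-zero) entries and relies on np.nan_to_num to handle rows with no ratings at all; the same step in isolation:

import numpy as np

xc = np.array([[5., 0., 3.],
               [0., 0., 0.],    # no ratings: mean of an empty slice is nan
               [4., 2., 0.]])

x1 = np.array([row[row > 0].mean() for row in xc])
x1 = np.nan_to_num(x1)          # nan -> 0.0 for the empty row

for i in range(xc.shape[0]):
    xc[i] -= (xc[i] > 0) * x1[i]    # subtract the mean only where a rating exists

print(x1)    # [4. 0. 3.]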
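The Chapter 8 note on chain(*dataset) is worth a tiny check: chain takes the baskets as separate arguments and yields their items lazily, which is what makes the support count a single pass:

from collections import defaultdict
from itertools import chain

dataset = [[1, 2, 3], [2, 3], [1, 3, 4]]

counts = defaultdict(int)
for elem in chain(*dataset):    # 1,2,3,2,3,1,3,4 -- no flattened list is built
    counts[elem] += 1

print(dict(counts))             # {1: 2, 2: 2, 3: 3, 4: 1}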
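The association-rule printout uses the str.format mini-language (including {:%} for percentages) together with inline if/else expressions inside the arguments; both in one place:

nr_transactions = 200.0
support = 15

print('{} ({:%})'.format(support, support / nr_transactions))   # 15 (7.500000%)

top = 1000
print('{0:4} - {1:3}'.format(1, top if top < 1000 else ''))      # blank when top >= 1000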
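Two small NumPy idioms the Chapter 8/9 code leans on: np.unique to enumerate the class labels, and np.zeros_like to allocate an estimate matrix with the same shape and dtype as the ratings:

import numpy as np

y = np.array([2, 0, 1, 1, 2, 0])
print(np.unique(y))             # [0 1 2] -- sorted distinct labels

reviews = np.array([[3., 0.], [0., 5.]])
estimates = np.zeros_like(reviews)
print(estimates.shape, estimates.dtype)   # (2, 2) float64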
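In the genre classifiers the averaged confusion matrix is normalised before matshow draws it. A toy sketch of building one with sklearn and normalising per true class (by row sums; note the chapter code divides by np.sum(cm_avg, axis=0), i.e. per predicted class):

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 2, 2, 2]
y_pred = [0, 1, 1, 1, 2, 0, 2]

cm = confusion_matrix(y_true, y_pred)    # rows: true class, columns: predicted class
cm_norm = cm / cm.sum(axis=1, keepdims=True).astype(float)
print(cm_norm)                           # each row now sums to 1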
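Chapter 11 contrasts Pearson correlation with mutual information: pearsonr only measures linear dependence, which is why the quadratic panels come out near zero even though the relationship is strong. A quick check:

import numpy as np
from scipy.stats import pearsonr

np.random.seed(0)
x = np.arange(-5, 5, 0.2)

r_lin, _ = pearsonr(x, 0.5 * x + np.random.normal(scale=0.1, size=len(x)))
r_quad, _ = pearsonr(x, 0.5 * x ** 2 + np.random.normal(scale=0.1, size=len(x)))

print(round(r_lin, 3))    # close to 1: linear dependence
print(round(r_quad, 3))   # close to 0: strong but non-linear dependence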
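plot_simple_demo_1 prints pca.explained_variance_ratio_ after projecting the two correlated features onto one component; in isolation, that ratio is simply the fraction of total variance the kept components retain:

import numpy as np
from sklearn.decomposition import PCA

np.random.seed(3)
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))
X = np.c_[(x1, x2)]

pca = PCA(n_components=1)
Xtrans = pca.fit_transform(X)

print(Xtrans.shape)                    # (50, 1): one feature left after projection
print(pca.explained_variance_ratio_)   # fraction of the variance kept, close to 1 here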