Kaggle Hands-On Learning Notes


Study notes

Lesson 1: Data and Visualization

# numpy: scientific computing toolkit
import numpy as np
# Use make_classification to build 1000 samples, each with 20 features
from sklearn.datasets import make_classification
X, y = make_classification(1000, n_features=20, n_informative=2,
                           n_redundant=2, n_classes=2, random_state=0)
# Store as a DataFrame
from pandas import DataFrame
df = DataFrame(np.hstack((X, y[:, None])), columns=range(20) + ["class"])  # note the use of hstack
df[:6]

import matplotlib.pyplot as plt
import seaborn as sns
# Use pairplot to see how the data spreads out over pairs of feature dimensions
_ = sns.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5)
plt.show()

import matplotlib.pyplot as plt
plt.figure(figsize=(12, 10))
_ = sns.corrplot(df, annot=False)  # removed in newer seaborn versions
plt.show()

Learning curves

from sklearn.svm import LinearSVC
from sklearn.learning_curve import learning_curve

# Plot a learning curve to diagnose the state of the model
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot the learning curve of a model on the data.

    Parameters
    ----------
    estimator : the classifier you are using.
    title : title of the chart.
    X : input features, numpy array
    y : input target vector
    ylim : tuple (ymin, ymax) setting the lowest and highest points of the y-axis
    cv : number of folds for cross-validation; one fold is used as the cv set,
         the remaining n-1 folds as training (default 3)
    """
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid("on")
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()

# Learning curve with few samples
plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0)",
                    X, y, ylim=(0.8, 1.01),
                    train_sizes=np.linspace(.05, 0.2, 5))
# Increase the sample size a bit
plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0)",
                    X, y, ylim=(0.8, 1.1),
                    train_sizes=np.linspace(.1, 1.0, 5))
plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0) Features: 11&14",
                    X[:, [11, 14]], y, ylim=(0.8, 1.0),
                    train_sizes=np.linspace(.05, 0.2, 5))

Model ensembling: the stacking / blending approach

"""Kaggle competition: Predicting a Biological Response.

Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)

The predictions are saved in test.csv. The code below created my best
submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th over 711 teams :-)

Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.

Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""

from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition."""
    # np.clip bounds values to an interval: anything outside is clipped to the edges,
    # e.g. with [0, 1], values below 0 become 0 and values above 1 become 1.
    attempt = np.clip(attempt, epsilon, 1.0 - epsilon)
    # note the exact form of the log loss
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':
    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)  # random permutation of the indices
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))  # stratified K-fold

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):  # note the enumerate pattern
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
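# --- Aside: the same out-of-fold blending idea with the current scikit-learn API ---
# sklearn.cross_validation was removed in scikit-learn 0.20; this is a minimal sketch
# of the blending loop above using sklearn.model_selection instead. The helper name
# make_oof_features and its defaults are illustrative, not part of the original script.
def make_oof_features(clfs, X, y, X_submission, n_folds=5):
    """Return out-of-fold train meta-features and fold-averaged test meta-features."""
    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    train_meta = np.zeros((X.shape[0], len(clfs)))
    test_meta = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        test_meta_j = np.zeros((X_submission.shape[0], n_folds))
        for i, (tr, va) in enumerate(skf.split(X, y)):
            clf.fit(X[tr], y[tr])
            # every training row is predicted by a model that never saw it
            train_meta[va, j] = clf.predict_proba(X[va])[:, 1]
            test_meta_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        test_meta[:, j] = test_meta_j.mean(axis=1)  # average the fold models on the test set
    return train_meta, test_meta
# A level-2 LogisticRegression is then fit on train_meta (as above) and applied to
# test_meta to produce the blended submission.
# --- end aside ---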
tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',             #保存为文本               header='MoleculeId,PredictedProbability', comments='')# 基本CSV读写操作# 我们需要读取给定的训练数据,再进行后续的数据(特征等)处理def read_data(file_name):    f = open(file_name)    #ignore header    f.readline()    samples = []    target = []    for line in f:        line = line.strip().split(",")        sample = [float(x) for x in line]        samples.append(sample)    return samplesdef write_delimited_file(file_path, data,header=None, delimiter=","):    f_out = open(file_path,"w")    if header is not None:        f_out.write(delimiter.join(header) + "\n")    for line in data:        if isinstance(line, str):            ## 注意需要判断是否是实例, isinstance            f_out.write(line + "\n")        else:            f_out.write(delimiter.join(line) + "\n")    f_out.close()#!/usr/bin/env pythonbio competition  https://www.kaggle.com/c/bioresponse#descriptionfrom sklearn.linear_model import LogisticRegressionimport csv_io                ##此模块?import mathimport scipydef train_and_predict():    #read in the training file    train = read_data("train.csv")         #使用read_data 在csv_io 自写模块中    print '读取训练数据完毕\n...\n'    #set the training responses    target = [x[0] for x in train]    #set the training features    train = [x[1:] for x in train]    #read in the test file    realtest = read_data("test.csv")    print '读取待预测数据\n...\n'    # code for logistic regression    lr = LogisticRegression()    lr.fit(train, target)    print 'Logistic Regression训练完毕!\n...\n'    predicted_probs = lr.predict_proba(realtest)    # write solutions to file    predicted_probs = ["%f" % x[1] for x in predicted_probs]    write_delimited_file("lr_solution.csv", predicted_probs)    print 'Logistic Regression预测完毕! 
请提交lr_solution.csv文件到Kaggle'if __name__=="__main__":    train_and_predict()Kaggle旧金山犯罪类型分类问题,https://www.kaggle.com/c/sf-crimeimport pandas as pdfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.cross_validation import train_test_splitfrom sklearn.linear_model import LogisticRegressionfrom sklearn.metrics import log_lossimport numpy as np# 先了解自己的数据train = pd.read_csv('sf_data/train.csv', parse_dates=['Dates'])   # 注意设置时间test = pd.read_csv('sf_data/test.csv', parse_dates=['Dates'])train.head()test.head()all_addr = np.array(train.Address.tolist() + test.Address.tolist())list(all_addr)stop_words = ['dr', 'wy', 'bl', 'av', 'st', 'ct', 'ln', 'block', 'of']vectorizer = CountVectorizer(max_features=300, stop_words=stop_words)features = vectorizer.fit_transform(all_addr).toarray()        # 稀疏矩阵用toarray()转化为矩阵features[0,:]X = features[:train.shape[0]]y = train.Category#分成80%的训练集和20%的验证集X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)log_model = LogisticRegression().fit(X=X_train, y=y_train)results = log_model.predict_proba(X_test)np.round(results[1], 3)log_loss_score = log_loss(y_test, results)print('log loss score: {0}'.format(round(log_loss_score, 3)))log_model = LogisticRegression().fit(X=features[:train.shape[0]], y=train.Category)results = log_model.predict_proba(features[train.shape[0]:])resultssubmission = pd.DataFrame(results)      #注意转换为DataFrame,然后进行下面的一系列操作submission.columns = sorted(train.Category.unique())submission.set_index(test.Id)   #set_indexsubmission.index.name="Id"submission.to_csv('py_submission_logreg_addr_300.csv')经典又兼具备趣味性的Kaggle案例  https://www.kaggle.com/c/titanic # 这个ipython notebook主要是我解决Kaggle Titanic问题的思路和过程import pandas as pd #数据分析import numpy as np #科学计算from pandas import Series,DataFramedata_train = pd.read_csv("Train.csv")data_train.columns#data_train[data_train.Cabin.notnull()]['Survived'].value_counts()  data_train.info()data_train.describe()import matplotlib.pyplot as pltfig = plt.figure()fig.set(alpha=0.2)  # 设定图表颜色alpha参数plt.subplot2grid((2,3),(0,0))             # 在一张大图里分列几个小图data_train.Survived.value_counts().plot(kind='bar')# plots a bar graph of those who surived vs those who did not. 
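# subplot2grid((2, 3), (0, 0)) carves the figure into a 2x3 grid of axes and puts this
# plot at row 0, column 0; the calls below fill the remaining cells, and colspan=2
# later lets the age-density panel span two columns. The bar plot above is simply the
# value_counts of Survived (0 = did not survive, 1 = survived).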
plt.title(u"获救情况 (1为获救)") # puts a title on our graphplt.ylabel(u"人数")  plt.subplot2grid((2,3),(0,1))data_train.Pclass.value_counts().plot(kind="bar")plt.ylabel(u"人数")plt.title(u"乘客等级分布")plt.subplot2grid((2,3),(0,2))plt.scatter(data_train.Survived, data_train.Age)plt.ylabel(u"年龄")                         # sets the y axis lableplt.grid(b=True, which='major', axis='y') # formats the grid line style of our graphsplt.title(u"按年龄看获救分布 (1为获救)")plt.subplot2grid((2,3),(1,0), colspan=2)data_train.Age[data_train.Pclass == 1].plot(kind='kde')   # plots a kernel desnsity estimate of the subset of the 1st class passanges's agedata_train.Age[data_train.Pclass == 2].plot(kind='kde')data_train.Age[data_train.Pclass == 3].plot(kind='kde')plt.xlabel(u"年龄")# plots an axis lableplt.ylabel(u"密度") plt.title(u"各等级的乘客年龄分布")plt.legend((u'头等舱', u'2等舱',u'3等舱'),loc='best') # sets our legend for our graph.plt.subplot2grid((2,3),(1,2))data_train.Embarked.value_counts().plot(kind='bar')plt.title(u"各登船口岸上船人数")plt.ylabel(u"人数")  plt.show()#看看各乘客等级的获救情况fig = plt.figure()fig.set(alpha=0.2)  # 设定图表颜色alpha参数Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})df.plot(kind='bar', stacked=True)plt.title(u"各乘客等级的获救情况")plt.xlabel(u"乘客等级") plt.ylabel(u"人数") plt.show()#看看各登录港口的获救情况fig = plt.figure()fig.set(alpha=0.2)  # 设定图表颜色alpha参数Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})df.plot(kind='bar', stacked=True)plt.title(u"各登录港口乘客的获救情况")plt.xlabel(u"登录港口") plt.ylabel(u"人数") plt.show()#看看各性别的获救情况fig = plt.figure()fig.set(alpha=0.2)  # 设定图表颜色alpha参数Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()df=pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})df.plot(kind='bar', stacked=True)plt.title(u"按性别看获救情况")plt.xlabel(u"性别") plt.ylabel(u"人数")plt.show()#然后我们再来看看各种舱级别情况下各性别的获救情况fig=plt.figure()fig.set(alpha=0.65) # 设置图像透明度,无所谓plt.title(u"根据舱等级和性别的获救情况")ax1=fig.add_subplot(141)data_train.Survived[data_train.Sex == 'female'][data_train.Pclass != 3].value_counts().plot(kind='bar', label="female highclass", color='#FA2479')ax1.set_xticklabels([u"获救", u"未获救"], rotation=0)ax1.legend([u"女性/高级舱"], loc='best')ax2=fig.add_subplot(142, sharey=ax1)data_train.Survived[data_train.Sex == 'female'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='female, low class', color='pink')ax2.set_xticklabels([u"未获救", u"获救"], rotation=0)plt.legend([u"女性/低级舱"], loc='best')ax3=fig.add_subplot(143, sharey=ax1)data_train.Survived[data_train.Sex == 'male'][data_train.Pclass != 3].value_counts().plot(kind='bar', label='male, high class',color='lightblue')ax3.set_xticklabels([u"未获救", u"获救"], rotation=0)plt.legend([u"男性/高级舱"], loc='best')ax4=fig.add_subplot(144, sharey=ax1)data_train.Survived[data_train.Sex == 'male'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='male low class', color='steelblue')ax4.set_xticklabels([u"未获救", u"获救"], rotation=0)plt.legend([u"男性/低级舱"], loc='best')plt.show()g = data_train.groupby(['SibSp','Survived'])          # 注意分组统计,多层分组的应用df = pd.DataFrame(g.count()['PassengerId'])          ###使用此种方式进行多层分组统计data_train.Cabin.value_counts()  # value_counts 应用, 和count区别 values_counts分类了, 
count没有分类,统一统计了#cabin的值计数太分散了,绝大多数Cabin值只出现一次。感觉上作为类目,加入特征未必会有效#那我们一起看看这个值的有无,对于survival的分布状况,影响如何吧fig = plt.figure()fig.set(alpha=0.2)  # 设定图表颜色alpha参数Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()df=pd.DataFrame({u'有':Survived_cabin, u'无':Survived_nocabin}).transpose()   ##注意用词典的方式进行画图。用上面过两维,和有无两维,组成了类似2*2交叉表,此处需要transpose,作图的时候df.plot(kind='bar', stacked=True)plt.title(u"按Cabin有无看获救情况")plt.xlabel(u"Cabin有无") plt.ylabel(u"人数")plt.show()#似乎有cabin记录的乘客survival比例稍高,那先试试把这个值分为两类,有cabin值/无cabin值,一会儿加到类别特征好了from sklearn.ensemble import RandomForestRegressor ### 使用 RandomForestClassifier 填补缺失的年龄属性def set_missing_ages(df):        # 把已有的数值型特征取出来丢进Random Forest Regressor中    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]   #注意使用此种方式固定列,便于以后操作    # 乘客分成已知年龄和未知年龄两部分    known_age = age_df[age_df.Age.notnull()].as_matrix()     # 注意notnull, isnull 的用法,并且转化为矩阵    unknown_age = age_df[age_df.Age.isnull()].as_matrix()    # y即目标年龄    y = known_age[:, 0]    # X即特征属性值    X = known_age[:, 1:]    # fit到RandomForestRegressor之中    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)    rfr.fit(X, y)        # 用得到的模型进行未知年龄结果预测    predictedAges = rfr.predict(unknown_age[:, 1::])   #此种写法,不包含第一列        # 用得到的预测结果填补原缺失数据    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges     # 用loc取        return df, rfrdef set_Cabin_type(df):    df.loc[ (df.Cabin.notnull()), 'Cabin' ] = "Yes"    df.loc[ (df.Cabin.isnull()), 'Cabin' ] = "No"    return dfdata_train, rfr = set_missing_ages(data_train)data_train = set_Cabin_type(data_train)data_train# 因为逻辑回归建模时,需要输入的特征都是数值型特征# 我们先对类目型的特征离散/因子化# 以Cabin为例,原本一个属性维度,因为其取值可以是['yes','no'],而将其平展开为'Cabin_yes','Cabin_no'两个属性# 原本Cabin取值为yes的,在此处的'Cabin_yes'下取值为1,在'Cabin_no'下取值为0# 原本Cabin取值为no的,在此处的'Cabin_yes'下取值为0,在'Cabin_no'下取值为1# 我们使用pandas的get_dummies来完成这个工作,并拼接在原来的data_train之上,如下所示dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix= 'Cabin')dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix= 'Embarked')   #get_dummies的使用方法, dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)     #注意最后用pd的concat 连接, numpy有concatenate连接方法df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)   #注意inplace, 删除列用dropdf# 接下来我们要接着做一些数据预处理的工作,比如scaling,将一些变化幅度较大的特征化到[-1,1]之内# 这样可以加速logistic regression的收敛import sklearn.preprocessing as preprocessingscaler = preprocessing.StandardScaler()age_scale_param = scaler.fit(df['Age'])df['Age_scaled'] = scaler.fit_transform(df['Age'], age_scale_param)       #注意这种写法和普通的不同,Age,Fare开始使用相同的scaler,但又使用同的训练数据,所以把训练完的对象又单独加入fit_transform的参数中fare_scale_param = scaler.fit(df['Fare'])df['Fare_scaled'] = scaler.fit_transform(df['Fare'], fare_scale_param)df# 我们把需要的feature字段取出来,转成numpy格式,使用scikit-learn中的LogisticRegression建模from sklearn import linear_modeltrain_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')   # 使用正则表达式过滤,filter(regex=  |)train_np = train_df.as_matrix()# y即Survival结果y = train_np[:, 0]# X即特征属性值X = train_np[:, 1:]# fit到RandomForestRegressor之中clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)clf.fit(X, y)   clfX.shape#测试集和训练集做一样的操作data_test = pd.read_csv("test.csv")data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0# 
接着我们对test_data做和train_data中一致的特征变换# 首先用同样的RandomForestRegressor模型填上丢失的年龄tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]null_age = tmp_df[data_test.Age.isnull()].as_matrix()# 根据特征属性X预测年龄并补上X = null_age[:, 1:]predictedAges = rfr.predict(X)   #注意此处是训练集得来的模型data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAgesdata_test = set_Cabin_type(data_test)dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'], age_scale_param)       #来此训练集df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'], fare_scale_param)df_testtest = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')predictions = clf.predict(test)result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})result.to_csv("logistic_regression_predictions.csv", index=False)pd.read_csv("logistic_regression_predictions.csv")import numpy as npimport matplotlib.pyplot as pltfrom sklearn.learning_curve import learning_curve# 用sklearn的learning_curve得到training_score和cv_score,使用matplotlib画出learning curvedef plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,                         train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):    """    画出data在某模型上的learning curve.    参数解释    ----------    estimator : 你用的分类器。    title : 表格的标题。    X : 输入的feature,numpy类型    y : 输入的target vector    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)    n_jobs : 并行的的任务数(默认1)    """    train_sizes, train_scores, test_scores = learning_curve(        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)  #注意此处的train_sizes是0.05到1的值,如何为实际        train_scores_mean = np.mean(train_scores, axis=1)    train_scores_std = np.std(train_scores, axis=1)    test_scores_mean = np.mean(test_scores, axis=1)    test_scores_std = np.std(test_scores, axis=1)        if plot:        plt.figure()        plt.title(title)        if ylim is not None:          #注意在Python中None和NULL的区分            plt.ylim(*ylim)   #平方??        
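# The "*" above is tuple unpacking, not squaring: plt.ylim(*ylim) expands the
# (ymin, ymax) tuple into two positional arguments, i.e. plt.ylim(ylim[0], ylim[1]).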
plt.xlabel(u"训练样本数")        plt.ylabel(u"得分")        plt.gca().invert_yaxis()         ###利用gca()获得ax的属性,然后利用invert_yaxis() 反向Y轴        plt.grid()            plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,                          alpha=0.1, color="b")        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,                          alpha=0.1, color="r")        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")            plt.legend(loc="best")                plt.draw()        plt.gca().invert_yaxis()        plt.show()        midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])    return midpoint, diffplot_learning_curve(clf, u"学习曲线", X, y)pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})from sklearn import cross_validation# 简单看看打分情况clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')X = all_data.as_matrix()[:,1:]y = all_data.as_matrix()[:,0]print cross_validation.cross_val_score(clf, X, y, cv=5)# 分割数据split_train, split_cv = cross_validation.train_test_split(df, test_size=0.3, random_state=0)train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')# 生成模型clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)clf.fit(train_df.as_matrix()[:,1:], train_df.as_matrix()[:,0])# 对cross validation数据进行预测cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')predictions = clf.predict(cv_df.as_matrix()[:,1:])split_cv[ predictions != cv_df.as_matrix()[:,0] ].drop()# 去除预测错误的case看原始dataframe数据#split_cv['PredictResult'] = predictionsorigin_data_train = pd.read_csv("Train.csv")bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(split_cv[predictions != cv_df.as_matrix()[:,0]]['PassengerId'].values)]  #注意去除写法 isinbad_casesdata_train[data_train['Name'].str.contains("Major")]data_train = pd.read_csv("Train.csv")data_train['Sex_Pclass'] = data_train.Sex + "_" + data_train.Pclass.map(str)from sklearn.ensemble import RandomForestRegressor ### 使用 RandomForestClassifier 填补缺失的年龄属性def set_missing_ages(df):        # 把已有的数值型特征取出来丢进Random Forest Regressor中    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]    # 乘客分成已知年龄和未知年龄两部分    known_age = age_df[age_df.Age.notnull()].as_matrix()    unknown_age = age_df[age_df.Age.isnull()].as_matrix()    # y即目标年龄    y = known_age[:, 0]    # X即特征属性值    X = known_age[:, 1:]    # fit到RandomForestRegressor之中    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)    rfr.fit(X, y)        # 用得到的模型进行未知年龄结果预测    predictedAges = rfr.predict(unknown_age[:, 1::])        # 用得到的预测结果填补原缺失数据    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges         return df, rfrdef set_Cabin_type(df):    df.loc[ (df.Cabin.notnull()), 'Cabin' ] = "Yes"    df.loc[ (df.Cabin.isnull()), 'Cabin' ] = "No"    return dfdata_train, rfr = set_missing_ages(data_train)data_train = set_Cabin_type(data_train)dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix= 'Cabin')dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix= 
'Embarked')dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')dummies_Sex_Pclass = pd.get_dummies(data_train['Sex_Pclass'], prefix= 'Sex_Pclass')   #新增加特征df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)import sklearn.preprocessing as preprocessingscaler = preprocessing.StandardScaler()age_scale_param = scaler.fit(df['Age'])df['Age_scaled'] = scaler.fit_transform(df['Age'], age_scale_param)fare_scale_param = scaler.fit(df['Fare'])df['Fare_scaled'] = scaler.fit_transform(df['Fare'], fare_scale_param)from sklearn import linear_modeltrain_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')train_np = train_df.as_matrix()# y即Survival结果y = train_np[:, 0]# X即特征属性值X = train_np[:, 1:]# fit到RandomForestRegressor之中clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)clf.fit(X, y)clfdata_test = pd.read_csv("test.csv")data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0data_test['Sex_Pclass'] = data_test.Sex + "_" + data_test.Pclass.map(str)# 接着我们对test_data做和train_data中一致的特征变换# 首先用同样的RandomForestRegressor模型填上丢失的年龄tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]null_age = tmp_df[data_test.Age.isnull()].as_matrix()# 根据特征属性X预测年龄并补上X = null_age[:, 1:]predictedAges = rfr.predict(X)data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAgesdata_test = set_Cabin_type(data_test)dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')dummies_Sex_Pclass = pd.get_dummies(data_test['Sex_Pclass'], prefix= 'Sex_Pclass')df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'], age_scale_param)df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'], fare_scale_param)df_testtest = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')predictions = clf.predict(test)result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})result.to_csv("logistic_regression_predictions2.csv", index=False)from sklearn.ensemble import BaggingRegressortrain_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')train_np = train_df.as_matrix()# y即Survival结果y = train_np[:, 0]# X即特征属性值X = train_np[:, 1:]# fit到BaggingRegressor之中clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)bagging_clf.fit(X, y) #用同一个模型,数据集分为10份test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')predictions = bagging_clf.predict(test)result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})result.to_csv("/Users/MLS/Downloads/logistic_regression_predictions2.csv", 
index=False)用两个分类器import numpy as npimport pandas as pdfrom pandas import  DataFramefrom patsy import dmatrices   #用于生成设计矩阵import stringfrom operator import itemgetterimport jsonfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.cross_validation import cross_val_scorefrom sklearn.pipeline import Pipelinefrom sklearn.grid_search import GridSearchCVfrom sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFoldfrom sklearn import preprocessingfrom sklearn.metrics import classification_reportfrom sklearn.externals import joblib      #持久化模块##Read configuration parameterstrain_file="train.csv"MODEL_PATH="./"test_file="test.csv"SUBMISSION_PATH="./"seed= 0print train_file,seed# 输出得分def report(grid_scores, n_top=3):    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]    #注意此种排序的写法    for i, score in enumerate(top_scores):        print("Model with rank: {0}".format(i + 1))        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(              score.mean_validation_score,              np.std(score.cv_validation_scores)))        print("Parameters: {0}".format(score.parameters))        print("")#清理和处理数据def substrings_in_string(big_string, substrings):    for substring in substrings:        if string.find(big_string, substring) != -1:  ##注意!=-1此种写法            return substring    print big_string    return np.nanle = preprocessing.LabelEncoder()   #标签编码enc=preprocessing.OneHotEncoder()   #OneHot编码def clean_and_munge_data(df):    #处理缺省值    df.Fare = df.Fare.map(lambda x: np.nan if x==0 else x)   #注意此种写法缺失值转为为0的写法    #处理一下名字,生成Title字段    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',                'Don', 'Jonkheer']    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))  #lambda 和map结合在处理缺失值的妙用    #处理特殊的称呼,全处理成mr, mrs, miss, master    def replace_titles(x):        title=x['Title']        if title in ['Mr','Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:            return 'Mr'        elif title in ['Master']:            return 'Master'        elif title in ['Countess', 'Mme','Mrs']:            return 'Mrs'        elif title in ['Mlle', 'Ms','Miss']:            return 'Miss'        elif title =='Dr':            if x['Sex']=='Male':                return 'Mr'            else:                return 'Mrs'        elif title =='':            if x['Sex']=='Male':                return 'Master'            else:                return 'Miss'        else:            return title    df['Title']=df.apply(replace_titles, axis=1)  #apply(func,args,kwargs)从Python2.3开始,已经被func(*args,**kwargs)代替了.    
#看看家族是否够大,咳咳    df['Family_Size']=df['SibSp']+df['Parch']    df['Family']=df['SibSp']*df['Parch']    df.loc[ (df.Fare.isnull())&(df.Pclass==1),'Fare'] =np.median(df[df['Pclass'] == 1]['Fare'].dropna())  #众数填充    df.loc[ (df.Fare.isnull())&(df.Pclass==2),'Fare'] =np.median( df[df['Pclass'] == 2]['Fare'].dropna())    df.loc[ (df.Fare.isnull())&(df.Pclass==3),'Fare'] = np.median(df[df['Pclass'] == 3]['Fare'].dropna())    df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)       #注意map内部是字典    df['AgeFill']=df['Age']    mean_ages = np.zeros(4)    mean_ages[0]=np.average(df[df['Title'] == 'Miss']['Age'].dropna())    mean_ages[1]=np.average(df[df['Title'] == 'Mrs']['Age'].dropna())    mean_ages[2]=np.average(df[df['Title'] == 'Mr']['Age'].dropna())    mean_ages[3]=np.average(df[df['Title'] == 'Master']['Age'].dropna())    df.loc[ (df.Age.isnull()) & (df.Title == 'Miss') ,'AgeFill'] = mean_ages[0]    df.loc[ (df.Age.isnull()) & (df.Title == 'Mrs') ,'AgeFill'] = mean_ages[1]    df.loc[ (df.Age.isnull()) & (df.Title == 'Mr') ,'AgeFill'] = mean_ages[2]    df.loc[ (df.Age.isnull()) & (df.Title == 'Master') ,'AgeFill'] = mean_ages[3]    df['AgeCat']=df['AgeFill']    df.loc[ (df.AgeFill<=10) ,'AgeCat'] = 'child'    df.loc[ (df.AgeFill>60),'AgeCat'] = 'aged'    df.loc[ (df.AgeFill>10) & (df.AgeFill <=30) ,'AgeCat'] = 'adult'    df.loc[ (df.AgeFill>30) & (df.AgeFill <=60) ,'AgeCat'] = 'senior'    df.Embarked = df.Embarked.fillna('S')    df.loc[ df.Cabin.isnull()==True,'Cabin'] = 0.5    df.loc[ df.Cabin.isnull()==False,'Cabin'] = 1.5    df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)    #Age times class    df['AgeClass']=df['AgeFill']*df['Pclass']    df['ClassFare']=df['Pclass']*df['Fare_Per_Person']    df['HighLow']=df['Pclass']    df.loc[ (df.Fare_Per_Person<8) ,'HighLow'] = 'Low'    df.loc[ (df.Fare_Per_Person>=8) ,'HighLow'] = 'High'    le.fit(df['Sex'] )    x_sex=le.transform(df['Sex'])    df['Sex']=x_sex.astype(np.float)    le.fit( df['Ticket'])    x_Ticket=le.transform( df['Ticket'])    df['Ticket']=x_Ticket.astype(np.float)    le.fit(df['Title'])    x_title=le.transform(df['Title'])    df['Title'] =x_title.astype(np.float)    le.fit(df['HighLow'])    x_hl=le.transform(df['HighLow'])    df['HighLow']=x_hl.astype(np.float)    le.fit(df['AgeCat'])    x_age=le.transform(df['AgeCat'])    df['AgeCat'] =x_age.astype(np.float)    le.fit(df['Embarked'])    x_emb=le.transform(df['Embarked'])    df['Embarked']=x_emb.astype(np.float)    df = df.drop(['PassengerId','Name','Age','Cabin'], axis=1) #remove Name,Age and PassengerId    return df#读取数据traindf=pd.read_csv(train_file)##清洗数据df=clean_and_munge_data(traindf)########################################formula################################ formula_ml='Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size' #这一部重要,需要研究y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')   # 生成矩阵,根据参数见的关系生成。参数之间相关性。 y_train = np.asarray(y_train).ravel()print y_train.shape,x_train.shape##选择训练和测试集X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.2,random_state=seed)#初始化分类器clf=RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=5, min_samples_split=1,  min_samples_leaf=1, max_features='auto',    bootstrap=False, oob_score=False, n_jobs=1, random_state=seed,  verbose=0)###grid search找到最好的参数param_grid = dict( )##创建分类pipelinepipeline=Pipeline([ ('clf',clf) ])grid_search = GridSearchCV(pipeline, param_grid=param_grid, 
verbose=3,scoring='accuracy',\cv=StratifiedShuffleSplit(Y_train, n_iter=10, test_size=0.2, train_size=None, indices=None, \   #CV嵌套在GridSearch里面,CV使用Y_train分割random_state=seed, n_iterations=None)).fit(X_train, Y_train)# 对结果打分print("Best score: %0.3f" % grid_search.best_score_)      #注意,best_score_print(grid_search.best_estimator_)report(grid_search.grid_scores_) print('-----grid search end------------')print ('on all train set')scores = cross_val_score(grid_search.best_estimator_, x_train, y_train,cv=3,scoring='accuracy') #全量, 注意best_estimatorprint scores.mean(),scoresprint ('on test set')scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test,cv=3,scoring='accuracy')print scores.mean(),scores# 对结果打分print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train) ))print('test data')print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test) ))model_file=MODEL_PATH+'model-rf.pkl'joblib.dump(grid_search.best_estimator_, model_file)Kaggle自行车租赁预测比赛   https://www.kaggle.com/c/bike-sharing-demandimport numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport seaborn as sns%matplotlib inlinedf_train = pd.read_csv('kaggle_bike_competition_train.csv',header = 0)df_train.head(10)df_train.dtypes#让它告诉我们形状df_train.shapedf_train.count()type(df_train.datetime)# 把月、日、和 小时单独拎出来,放到3列中df_train['month'] = pd.DatetimeIndex(df_train.datetime).month       #处理时间,使用pd.DatetimeIndex().monthdf_train['day'] = pd.DatetimeIndex(df_train.datetime).dayofweekdf_train['hour'] = pd.DatetimeIndex(df_train.datetime).hour# 那个,保险起见,咱们还是先存一下吧df_train_origin = df_train# 抛掉不要的字段df_train = df_train.drop(['datetime','casual','registered'], axis = 1)# 看一眼df_train.head(5)df_train_target = df_train['count'].values       #注意后面加了valuedf_train_data = df_train.drop(['count'],axis = 1).valuesprint 'df_train_data shape is ', df_train_data.shapeprint 'df_train_target shape is ', df_train_target.shapefrom sklearn import linear_modelfrom sklearn import cross_validationfrom sklearn import svmfrom sklearn.ensemble import RandomForestRegressorfrom sklearn.learning_curve import learning_curvefrom sklearn.grid_search import GridSearchCVfrom sklearn.metrics import explained_variance_score# 总得切分一下数据咯(训练集和测试集)cv = cross_validation.ShuffleSplit(len(df_train_data), n_iter=3, test_size=0.2,   # 注意在此处使用的是len,最终使用的是索引    random_state=0)# 各种模型来一圈print "岭回归"    for train, test in cv:        svc = linear_model.Ridge().fit(df_train_data[train], df_train_target[train])    print("train score: {0:.3f}, test score: {1:.3f}\n".format(        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))    print "支持向量回归/SVR(kernel='rbf',C=10,gamma=.001)"for train, test in cv:        svc = svm.SVR(kernel ='rbf', C = 10, gamma = .001).fit(df_train_data[train], df_train_target[train])    print("train score: {0:.3f}, test score: {1:.3f}\n".format(        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))    print "随机森林回归/Random Forest(n_estimators = 100)"    for train, test in cv:        svc = RandomForestRegressor(n_estimators = 100).fit(df_train_data[train], df_train_target[train])    print("train score: {0:.3f}, test score: {1:.3f}\n".format(        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))X = df_train_datay = df_train_targetX_train, X_test, y_train, y_test = cross_validation.train_test_split(    X, y, test_size=0.2, 
random_state=0)tuned_parameters = [{'n_estimators':[10,100,500]}]       scores = ['r2']for score in scores:        print score        clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)    clf.fit(X_train, y_train)    print("别!喝!咖!啡!了!最佳参数找到了亲!!:")    print ""    #best_estimator_ returns the best estimator chosen by the search    print(clf.best_estimator_)    print ""    print("得分分别是:")    print ""    #grid_scores_的返回值:    #    * a dict of parameter settings    #    * the mean score over the cross-validation folds     #    * the list of scores for each fold    for params, mean_score, scores in clf.grid_scores_:   # grid_scores_里面只有测试集的分数?        print("%0.3f (+/-%0.03f) for %r"              % (mean_score, scores.std() / 2, params))    print ""def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):        plt.figure()    plt.title(title)    if ylim is not None:        plt.ylim(*ylim)    plt.xlabel("Training examples")    plt.ylabel("Score")    train_sizes, train_scores, test_scores = learning_curve(        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)    train_scores_mean = np.mean(train_scores, axis=1)    train_scores_std = np.std(train_scores, axis=1)    test_scores_mean = np.mean(test_scores, axis=1)    test_scores_std = np.std(test_scores, axis=1)    plt.grid()    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,                     train_scores_mean + train_scores_std, alpha=0.1,                     color="r")    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,                     test_scores_mean + test_scores_std, alpha=0.1, color="g")    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",             label="Training score")    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",             label="Cross-validation score")    plt.legend(loc="best")    return plttitle = "Learning Curves (Random Forest, n_estimators = 100)"cv = cross_validation.ShuffleSplit(df_train_data.shape[0], n_iter=10,test_size=0.2, random_state=0)estimator = RandomForestRegressor(n_estimators = 100)plot_learning_curve(estimator, title, X, y, (0.0, 1.01), cv=cv, n_jobs=4)plt.show()# 尝试一下缓解过拟合,当然,未必成功print "随机森林回归/Random Forest(n_estimators=200, max_features=0.6, max_depth=15)"   # 这里调高了n_estimators,max_ 的数量,在Random Forest里面降低模型复杂度?for train, test in cv:     svc = RandomForestRegressor(n_estimators = 200, max_features=0.6, max_depth=15).fit(df_train_data[train], df_train_target[train])    print("train score: {0:.3f}, test score: {1:.3f}\n".format(        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))# 看你们自己的咯df_train_registered = df_train_origin.drop(['datetime','casual','count'], axis = 1)df_train_casual = df_train_origin.drop(['datetime','count','registered'], axis = 1)df_train_registered.head()# 风速df_train_origin.groupby('windspeed').mean().plot(y='count', marker='o')  #注意groupby分组统计后直接作图plt.show()# 湿度df_train_origin.groupby('humidity').mean().plot(y='count', marker='o')plt.show()#温度湿度变化df_train_origin.plot(x='temp', y='humidity', kind='scatter')           #直接作图plt.show()# scatter一下各个维度fig, axs = plt.subplots(2, 3, sharey=True)df_train_origin.plot(kind='scatter', x='temp', y='count', ax=axs[0, 0], figsize=(16, 8), color='magenta')df_train_origin.plot(kind='scatter', x='atemp', y='count', ax=axs[0, 1], color='cyan')df_train_origin.plot(kind='scatter', x='humidity', 
y='count', ax=axs[0, 2], color='red')df_train_origin.plot(kind='scatter', x='windspeed', y='count', ax=axs[1, 0], color='yellow')df_train_origin.plot(kind='scatter', x='month', y='count', ax=axs[1, 1], color='blue')df_train_origin.plot(kind='scatter', x='hour', y='count', ax=axs[1, 2], color='green')sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count")  # 注意seabosn中的pairplot 画多个变量之间的关系corr = df_train_origin[['temp','weather','windspeed','day', 'month', 'hour','count']].corr() # corr计算各特征变量之间的关联度corrplt.figure()plt.matshow(corr)         # 显示相关性图,matshowplt.colorbar()          # 颜色parplt.show()特征工程 数据集来源于Data Hackathon 3.ximport pandas as pdimport numpy as np%matplotlib inline#载入数据:train = pd.read_csv('Train.csv')test = pd.read_csv('Test.csv')train.dtypestrain.head(5)#合成一个总的datatrain['source']= 'train'test['source'] = 'test'data=pd.concat([train, test],ignore_index=True)       # 合成的使用用pandas的concate 或python的。。。data.shapedata.apply(lambda x: sum(x.isnull()))  # 注意用此方式查看缺省值  var = ['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']for v in var:    print '\n%s这一列数据的不同取值和出现的次数\n'%v    print data[v].value_counts()len(data['City'].unique())  # 注意unique的使用data.drop('City',axis=1,inplace=True)data['DOB'].head()#创建一个年龄的字段Agedata['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))data['Age'].head()#把原始的DOB字段去掉:data.drop('DOB',axis=1,inplace=True)data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')#好像缺失值比较多,干脆就开一个新的字段,表明是缺失值还是不是缺失值data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)#原始那一列就可以不要了data.drop('EMI_Loan_Submitted',axis=1,inplace=True)len(data['Employer_Name'].value_counts())#丢掉data.drop('Employer_Name',axis=1,inplace=True)data.boxplot(column='Existing_EMI',return_type='axes')data['Existing_EMI'].describe()#缺省值不多,用均值代替data['Existing_EMI'].fillna(0, inplace=True)data.boxplot(column=['Interest_Rate'],return_type='axes')#缺省值太多,也造一个字段,表示有无data['Interest_Rate_Missing'] = data['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0)    #造一个有无的字段print data[['Interest_Rate','Interest_Rate_Missing']].head(10)data.drop('Interest_Rate',axis=1,inplace=True)#找中位数去填补缺省值(因为缺省的不多)data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)# 缺省值太多。。。是否缺省。。。data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)data['Source'].value_counts()from sklearn.preprocessing import LabelEncoderle = LabelEncoder()        # 数值编码, 原来为object类型,转化为int类型var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']for col in var_to_encode:    data[col] = le.fit_transform(data[col])data = pd.get_dummies(data, columns=var_to_encode)  #类别型的One-Hot 编码, 此处先把类别行的用LabelEncoder编码为数字,然后在转化为one_hot编码,可以直接one_hot,只是起的列名字不同而已data.columnstrain = data.loc[data['source']=='train']test = data.loc[data['source']=='test']XGBoost模型调优 import pandas as pdimport numpy as npimport xgboost as xgbfrom xgboost.sklearn import XGBClassifierfrom sklearn import cross_validation, metricsfrom sklearn.grid_search import GridSearchCVimport matplotlib.pylab as 
plt%matplotlib inlinefrom matplotlib.pylab import rcParamsrcParams['figure.figsize'] = 12, 4train = pd.read_csv('train_modified.csv')test = pd.read_csv('test_modified.csv')import pandas as pdimport numpy as npimport xgboost as xgbfrom xgboost.sklearn import XGBClassifierfrom sklearn import cross_validation, metricsfrom sklearn.grid_search import GridSearchCVimport matplotlib.pylab as plt%matplotlib inlinefrom matplotlib.pylab import rcParamsrcParams['figure.figsize'] = 12, 4           #注意此处, 默认设置图形大小train = pd.read_csv('train_modified.csv')test = pd.read_csv('test_modified.csv')train.shape, test.shapetarget='Disbursed'IDcol = 'ID'train['Disbursed'].value_counts()#test_results = pd.read_csv('test_results.csv')def modelfit(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):    if useTrainCV:        xgb_param = alg.get_xgb_params()        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)        xgtest = xgb.DMatrix(dtest[predictors].values)        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,             early_stopping_rounds=early_stopping_rounds, show_progress=False)        alg.set_params(n_estimators=cvresult.shape[0])        #建模    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')            #对训练集预测    dtrain_predictions = alg.predict(dtrain[predictors])    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]            #输出模型的一些结果    print "\n关于现在这个模型"    print "准确率 : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)    print "AUC 得分 (训练集): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)                    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)    feat_imp.plot(kind='bar', title='Feature Importances')    plt.ylabel('Feature Importance Score')predictors = [x for x in train.columns if x not in [target, IDcol]]xgb1 = XGBClassifier(        learning_rate =0.1,        n_estimators=1000,        max_depth=5,        min_child_weight=1,        gamma=0,        subsample=0.8,        colsample_bytree=0.8,        objective= 'binary:logistic',        nthread=4,        scale_pos_weight=1,        seed=27)modelfit(xgb1, train, test, predictors)#对subsample 和 max_features 用grid search查找最好的参数param_test1 = {    'max_depth':range(3,10,2),    'min_child_weight':range(1,6,2)}gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27),                        param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch1.fit(train[predictors],train[target])gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_# 对于max_depth和min_child_weight查找最好的参数param_test2 = {    'max_depth':[4,5,6],    'min_child_weight':[4,5,6]}gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch2.fit(train[predictors],train[target])#交叉验证对min_child_weight寻找最合适的参数param_test2b = {    
'min_child_weight':[6,8,10,12]}gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test2b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch2b.fit(train[predictors],train[target])#Grid seach选择合适的gammaparam_test3 = {    'gamma':[i/10.0 for i in range(0,5)]}gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch3.fit(train[predictors],train[target])predictors = [x for x in train.columns if x not in [target, IDcol]]xgb2 = XGBClassifier(        learning_rate =0.1,        n_estimators=1000,        max_depth=4,        min_child_weight=6,        gamma=0,        subsample=0.8,        colsample_bytree=0.8,        objective= 'binary:logistic',        nthread=4,        scale_pos_weight=1,        seed=27)modelfit(xgb2, train, test, predictors)#对subsample 和 colsample_bytree用grid search寻找最合适的参数param_test4 = {    'subsample':[i/10.0 for i in range(6,10)],    'colsample_bytree':[i/10.0 for i in range(6,10)]}gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch4.fit(train[predictors],train[target])# 同上param_test5 = {    'subsample':[i/100.0 for i in range(75,90,5)],    'colsample_bytree':[i/100.0 for i in range(75,90,5)]}gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch5.fit(train[predictors],train[target])#对reg_alpha用grid search寻找最合适的参数param_test6 = {    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),                        param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch6.fit(train[predictors],train[target])# 换一组参数对reg_alpha用grid search寻找最合适的参数param_test7 = {    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]}gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,                                        objective= 'binary:logistic', nthread=4, 
scale_pos_weight=1,seed=27),                        param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)gsearch7.fit(train[predictors],train[target])xgb3 = XGBClassifier(        learning_rate =0.1,        n_estimators=1000,        max_depth=4,        min_child_weight=6,        gamma=0,        subsample=0.8,        colsample_bytree=0.8,        reg_alpha=0.005,        objective= 'binary:logistic',        nthread=4,        scale_pos_weight=1,        seed=27)modelfit(xgb3, train, test, predictors)第二课房价预测案例import numpy as npimport pandas as pdimport xgboosttrain_df = pd.read_csv('../input/train.csv', index_col=0)    #注意../ 代表上一个目录?test_df = pd.read_csv('../input/test.csv', index_col=0)%matplotlib inlineprices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})   # 注意此处使用了lo1p,它是log(1+X) 防止X为零的一个类似拉普拉斯平滑,log1p()就需要expm1(); 回归的时候如果初始数据不是正态分布,需要做处理, 分类的时候没必要prices.hist()y_train = np.log1p(train_df.pop('SalePrice'))all_df = pd.concat((train_df, test_df), axis=0)  # pandas 里面的concat 合并all_df.shapeall_df['MSSubClass'].dtypesall_df['MSSubClass'] = all_df['MSSubClass'].astype(str)  # 转换为astypeall_df['MSSubClass'].value_counts()pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()all_dummy_df = pd.get_dummies(all_df)    #把所有的数据进行了one-hot-encodeall_dummy_df.head()all_dummy_df.isnull().sum().sort_values(ascending=False).head(10)  #缺失值, sum , sort_valuesmean_cols = all_dummy_df.mean()mean_cols.head(10)all_dummy_df = all_dummy_df.fillna(mean_cols)all_dummy_df.isnull().sum().sum()numeric_cols = all_df.columns[all_df.dtypes != 'object']         #判断对那些列是numerical类型,即不是对象类型的, 注意此处使用的是dtypes!='object', 注意取出是个列表numeric_colsnumeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()            #计算出所有数值型数字的标准列numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()             all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std  # 标准化dummy_train_df = all_dummy_df.loc[train_df.index]dummy_test_df = all_dummy_df.loc[test_df.index]from sklearn.linear_model import Ridgefrom sklearn.model_selection import cross_val_scoreX_train = dummy_train_df.values    #此处是values 注意DataFrame 转化为narray的方式X_test = dummy_test_df.valuesalphas = np.logspace(-3, 2, 50)test_scores = []for alpha in alphas:    clf = Ridge(alpha)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))  #注意此处有个负号, 和scoring 的选择有关    test_scores.append(np.mean(test_score))import matplotlib.pyplot as plt%matplotlib inlineplt.plot(alphas, test_scores)plt.title("Alpha vs CV Error");max_features = [.1, .3, .5, .7, .9, .99]test_scores = []for max_feat in max_features:    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))    test_scores.append(np.mean(test_score))plt.plot(max_features, test_scores)plt.title("Max Features vs CV Error");ridge = Ridge(alpha=15)rf = RandomForestRegressor(n_estimators=500, max_features=.3)ridge.fit(X_train, y_train)rf.fit(X_train, y_train)y_ridge = np.expm1(ridge.predict(X_test))   #注意前面用了log(x+1),此处我们用的expm1是反过程y_rf = np.expm1(rf.predict(X_test))y_final = (y_ridge + y_rf) / 2          #模型融合  取平均submission_df = pd.DataFrame(data= {'Id' : test_df.index, 'SalePrice': y_final})房价预测案例(进阶版)dummy_train_df = all_dummy_df.loc[train_df.index]dummy_test_df = all_dummy_df.loc[test_df.index]dummy_train_df.shape, dummy_test_df.shapeX_train = 
dummy_train_df.valuesX_test = dummy_test_df.valuesfrom sklearn.linear_model import Ridgeridge = Ridge(15)from sklearn.ensemble import BaggingRegressorfrom sklearn.model_selection import cross_val_scoreparams = [1, 10, 15, 20, 25, 30, 40]test_scores = []for param in params:                #此处bagging 用的是同一个模型下面不同分类器的组合    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))    test_scores.append(np.mean(test_score))import matplotlib.pyplot as plt%matplotlib inlineplt.plot(params, test_scores)plt.title("n_estimator vs CV Error");params = [10, 15, 20, 25, 30, 40, 50, 60, 70, 100]test_scores = []for param in params:    clf = BaggingRegressor(n_estimators=param)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))    test_scores.append(np.mean(test_score))from sklearn.ensemble import AdaBoostRegressorparams = [10, 15, 20, 25, 30, 35, 40, 45, 50]test_scores = []for param in params:    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))    test_scores.append(np.mean(test_score))from xgboost import XGBRegressorparams = [1,2,3,4,5,6]test_scores = []for param in params:    clf = XGBRegressor(max_depth=param)    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))    test_scores.append(np.mean(test_score))用每日新闻预测金融市场变化 标准版from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizerimport pandas as pdimport numpy as npfrom sklearn.svm import SVCfrom sklearn.metrics import roc_auc_scorefrom datetime import datedata = pd.read_csv('../input/Combined_News_DJIA.csv')data["combined_news"] = data.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1)  #注意pd中使用filter.后使用apply, 在axis=1,此时lambda 中的x是一列一列,是每个Seriestrain = data[data['Date'] < '2015-01-01']test = data[data['Date'] > '2014-12-31']feature_extraction = TfidfVectorizer()X_train = feature_extraction.fit_transform(train["combined_news"].values)X_test = feature_extraction.transform(test["combined_news"].values)y_train = train["Label"].valuesy_test = test["Label"].valuesclf = SVC(probability=True, kernel='rbf')clf = SVC(probability=True, kernel='rbf')clf.fit(X_train, y_train)predictions = clf.predict_proba(X_test)print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:,1])))进阶版X_train = train["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()X_test = test["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()from nltk.corpus import stopwordsstop = stopwords.words('english')  import redef hasNumbers(inputString):    return bool(re.search(r'\d', inputString))from nltk.stem import WordNetLemmatizerwordnet_lemmatizer = WordNetLemmatizer()def check(word):    """    如果需要这个单词,则True    如果应该去除,则False    """    if word in stop:        return False    elif hasNumbers(word):        return False    else:        return TrueX_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])print(X_test[1611])X_train = X_train.apply(lambda x: ' '.join(x))X_test = X_test.apply(lambda x: ' '.join(x))print(X_test[1611])feature_extraction = TfidfVectorizer(lowercase=False)X_train = 
Predicting Financial Market Movements from Daily News (Standard Version)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from datetime import date

data = pd.read_csv('../input/Combined_News_DJIA.csv')
# filter selects the Top* news columns; apply with axis=1 then passes each row to the lambda as a Series
data["combined_news"] = data.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1)

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

feature_extraction = TfidfVectorizer()
X_train = feature_extraction.fit_transform(train["combined_news"].values)
X_test = feature_extraction.transform(test["combined_news"].values)
y_train = train["Label"].values
y_test = test["Label"].values

clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:, 1])))

Advanced Version

X_train = train["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()
X_test = test["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()

from nltk.corpus import stopwords
stop = stopwords.words('english')

import re
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """
    Return True if the word should be kept,
    False if it should be dropped.
    """
    if word in stop:
        return False
    elif hasNumbers(word):
        return False
    else:
        return True

X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
print(X_test[1611])

X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
print(X_test[1611])

feature_extraction = TfidfVectorizer(lowercase=False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)

clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:, 1])))

Lesson 3: Click-Through Rate Prediction

Lesson 4: Keyword Search

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

df_train = pd.read_csv('../input/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('../input/test.csv', encoding="ISO-8859-1")
df_desc = pd.read_csv('../input/product_descriptions.csv')
df_train.head()

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_desc, how='left', on='product_uid')

stemmer = SnowballStemmer('english')
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())
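The standard-version notes break off after defining the two helpers. Presumably they are applied much like in the advanced version below; a minimal sketch of that step, using the same column names the advanced version uses:

df_all['search_term'] = df_all['search_term'].map(lambda x: str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x: str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x: str_stemmer(x))
df_all['commons_in_title'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_title']), axis=1)
df_all['commons_in_desc'] = df_all.apply(lambda x: str_common_word(x['search_term'], x['product_description']), axis=1)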
Keyword Search (Advanced Version)

df_all = pd.merge(df_all, df_desc, how='left', on='product_uid')

stemmer = SnowballStemmer('english')
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())

df_all['search_term'] = df_all['search_term'].map(lambda x: str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x: str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x: str_stemmer(x))

import Levenshtein
Levenshtein.ratio('hello', 'hello world')
df_all['dist_in_title'] = df_all.apply(lambda x: Levenshtein.ratio(x['search_term'], x['product_title']), axis=1)
df_all['dist_in_desc'] = df_all.apply(lambda x: Levenshtein.ratio(x['search_term'], x['product_description']), axis=1)

df_all['all_texts'] = df_all['product_title'] + ' . ' + df_all['product_description'] + ' . '

from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in df_all['all_texts'].values)
print(dictionary)

class MyCorpus(object):
    def __iter__(self):
        for x in df_all['all_texts'].values:
            yield dictionary.doc2bow(list(tokenize(x, errors='ignore')))

# The only point of this generator class is to stay memory-friendly: with a large corpus,
# materialising everything as one big list makes the whole run very slow.
# So we yield one bag-of-words at a time; conceptually it still looks like
# [['sentence', '1'], ['sentence', '2'], ...]
corpus = MyCorpus()

from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus)
tfidf[dictionary.doc2bow(list(tokenize('hello world, good morning', errors='ignore')))]

from gensim.similarities import MatrixSimilarity

# wrap the line above into a helper
def to_tfidf(text):
    res = tfidf[dictionary.doc2bow(list(tokenize(text, errors='ignore')))]
    return res

# then build a cosine-similarity comparison on top of it
def cos_sim(text1, text2):
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary))
    sim = index[tfidf2]
    # sim comes back as an array; we only need a single number,
    # so cast it straight to float
    return float(sim[0])

text1 = 'hello world'
text2 = 'hello from the other side'
cos_sim(text1, text2)

df_all['tfidf_cos_sim_in_title'] = df_all.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
df_all['tfidf_cos_sim_in_title'][:5]
df_all['tfidf_cos_sim_in_desc'] = df_all.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)

import nltk
# nltk also ships a solid sentence splitter
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(df_all['all_texts'].values[0])
sentences = [tokenizer.tokenize(x) for x in df_all['all_texts'].values]
sentences = [y for x in sentences for y in x]   # flatten the list of lists

from nltk.tokenize import word_tokenize
w2v_corpus = [word_tokenize(x) for x in sentences]

from gensim.models.word2vec import Word2Vec
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

# grab the full vocabulary first
vocab = model.vocab

# turn an arbitrary text into a vector by averaging its word vectors
def get_vector(text):
    # start from an all-zero array
    res = np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        if word in vocab:
            res += model[word]
            count += 1
    return res / count   # assumes at least one word of the text is in the vocabulary
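The notes use w2v_cos_sim below but never define it; a minimal reconstruction, assuming it is simply the cosine similarity of the two averaged word vectors from get_vector above:

from scipy import spatial

def w2v_cos_sim(text1, text2):
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
        return float(sim)
    except Exception:
        # texts with no in-vocabulary words would make get_vector divide by zero
        return float(0)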
df_all['w2v_cos_sim_in_title'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
df_all['w2v_cos_sim_in_desc'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)

df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'all_texts'], axis=1)

df_train = df_all.loc[df_train.index]
df_test = df_all.loc[df_test.index]
test_ids = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
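The notes end this lesson at the imports. A plausible final step, mirroring the parameter sweeps used in the house-price lesson (the parameter grid below is my guess, not from the notes); one would then refit with the best max_depth and predict on X_test:

params = [1, 3, 5, 6, 7, 8, 9, 10]
test_scores = []
for param in params:
    clf = RandomForestRegressor(n_estimators=30, max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))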
Lesson 7: Convenience Store Sales Forecasting

import pandas as pd
import datetime
import csv
import numpy as np
import os
import scipy as sp
import xgboost as xgb
import itertools
import operator
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import TransformerMixin
from sklearn import cross_validation
from matplotlib import pylab as plt
plot = True

goal = 'Sales'
myid = 'Id'

Defining transformations and the evaluation metric

def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

store = pd.read_csv('./data/store.csv')
store.head()
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

def load_data():
    """
    Load the data and split the columns into numeric and non-numeric features.
    """
    store = pd.read_csv('./data/store.csv')
    train_org = pd.read_csv('./data/train.csv', dtype={'StateHoliday': pd.np.string_})
    test_org = pd.read_csv('./data/test.csv', dtype={'StateHoliday': pd.np.string_})
    train = pd.merge(train_org, store, on='Store', how='left')
    test = pd.merge(test_org, store, on='Store', how='left')
    features = test.columns.tolist()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features_numeric = test.select_dtypes(include=numerics).columns.tolist()
    features_non_numeric = [f for f in features if f not in features_numeric]
    return (train, test, features, features_non_numeric)

Data and feature processing

def process_data(train, test, features, features_non_numeric):
    """
    Feature engineering and selection.
    """
    # FEATURE ENGINEERING
    train = train[train['Sales'] > 0]

    for data in [train, test]:
        # year / month / day
        data['year'] = data.Date.apply(lambda x: x.split('-')[0])
        data['year'] = data['year'].astype(float)
        data['month'] = data.Date.apply(lambda x: x.split('-')[1])
        data['month'] = data['month'].astype(float)
        data['day'] = data.Date.apply(lambda x: x.split('-')[2])
        data['day'] = data['day'].astype(float)
        # promo interval such as "Jan,Apr,Jul,Oct"
        data['promojan'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jan" in x else 0)
        data['promofeb'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Feb" in x else 0)
        data['promomar'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Mar" in x else 0)
        data['promoapr'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Apr" in x else 0)
        data['promomay'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "May" in x else 0)
        data['promojun'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jun" in x else 0)
        data['promojul'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jul" in x else 0)
        data['promoaug'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Aug" in x else 0)
        data['promosep'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Sep" in x else 0)
        data['promooct'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Oct" in x else 0)
        data['promonov'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Nov" in x else 0)
        data['promodec'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Dec" in x else 0)

    # Features set.
    noisy_features = [myid, 'Date']
    features = [c for c in features if c not in noisy_features]
    features_non_numeric = [c for c in features_non_numeric if c not in noisy_features]
    features.extend(['year', 'month', 'day'])

    # Fill NA
    class DataFrameImputer(TransformerMixin):
        # http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
        def __init__(self):
            """Impute missing values.
            Columns of dtype object are imputed with the most frequent value
            in column.
            Columns of other types are imputed with mean of column.
            """
        def fit(self, X, y=None):
            self.fill = pd.Series([X[c].value_counts().index[0]  # mode
                if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],  # mean
                index=X.columns)
            return self
        def transform(self, X, y=None):
            return X.fillna(self.fill)
    train = DataFrameImputer().fit_transform(train)
    test = DataFrameImputer().fit_transform(test)

    # Pre-processing non-numeric values
    le = LabelEncoder()
    for col in features_non_numeric:
        le.fit(list(train[col]) + list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

    # Models like LR and neural networks are extremely sensitive to feature scale, so standardize first
    scaler = StandardScaler()
    for col in set(features) - set(features_non_numeric) - \
      set([]):  # TODO: add what not to scale
        scaler.fit(list(train[col]) + list(test[col]))
        train[col] = scaler.transform(train[col])
        test[col] = scaler.transform(test[col])
    return (train, test, features, features_non_numeric)
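The twelve promo-month lines inside process_data are easier to read when generated in a loop; a compact equivalent sketch (same behaviour, my reformulation), meant to sit inside the "for data in [train, test]:" loop above:

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for m in months:
    # missing PromoInterval values arrive as float NaN, hence the isinstance check
    data['promo' + m.lower()] = data.PromoInterval.apply(
        lambda x, m=m: 0 if isinstance(x, float) else int(m in x))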
Training and analysis

def XGB_native(train, test, features, features_non_numeric):
    depth = 13
    eta = 0.01
    ntrees = 8000
    mcw = 3
    params = {"objective": "reg:linear",
              "booster": "gbtree",
              "eta": eta,
              "max_depth": depth,
              "min_child_weight": mcw,
              "subsample": 0.9,
              "colsample_bytree": 0.7,
              "silent": 1
              }
    print "Running with params: " + str(params)
    print "Running with ntrees: " + str(ntrees)
    print "Running with features: " + str(features)

    # Train model with local split
    tsize = 0.05
    X_train, X_test = cross_validation.train_test_split(train, test_size=tsize)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train[goal] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test[goal] + 1))
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, ntrees, evals=watchlist, early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)
    train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
    indices = train_probs < 0
    train_probs[indices] = 0
    error = rmspe(np.exp(train_probs) - 1, X_test[goal].values)
    print error

    # Predict and export
    test_probs = gbm.predict(xgb.DMatrix(test[features]))
    indices = test_probs < 0
    test_probs[indices] = 0
    submission = pd.DataFrame({myid: test[myid], goal: np.exp(test_probs) - 1})
    if not os.path.exists('result/'):
        os.makedirs('result/')
    submission.to_csv("./result/dat-xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.csv" % (str(depth), str(eta), str(ntrees), str(mcw), str(tsize)), index=False)

    # Feature importance
    if plot:
      outfile = open('xgb.fmap', 'w')
      i = 0
      for feat in features:
          outfile.write('{0}\t{1}\tq\n'.format(i, feat))
          i = i + 1
      outfile.close()
      importance = gbm.get_fscore(fmap='xgb.fmap')
      importance = sorted(importance.items(), key=operator.itemgetter(1))
      df = pd.DataFrame(importance, columns=['feature', 'fscore'])
      df['fscore'] = df['fscore'] / df['fscore'].sum()
      # Plot it up
      plt.figure()
      df.plot()
      df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
      plt.title('XGBoost Feature Importance')
      plt.xlabel('relative importance')
      plt.gcf().savefig('Feature_Importance_xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.png' % (str(depth), str(eta), str(ntrees), str(mcw), str(tsize)))

print "=> Loading data..."
train, test, features, features_non_numeric = load_data()
print "=> Processing data and engineering features..."
train, test, features, features_non_numeric = process_data(train, test, features, features_non_numeric)
print "=> Training with XGBoost..."
XGB_native(train, test, features, features_non_numeric)

Kaggle Event Recommendation Competition

from __future__ import division

import itertools
import cPickle
import datetime
import hashlib
import locale
import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize

1. Data cleaning utilities

class DataCleaner:
  """
  Common utilities for converting strings to equivalent numbers
  or number buckets.
  """
  def __init__(self):
    # load locales
    self.localeIdMap = defaultdict(int)
    for i, l in enumerate(locale.locale_alias.keys()):
      self.localeIdMap[l] = i + 1
    # load countries
    self.countryIdMap = defaultdict(int)
    ctryIdx = defaultdict(int)
    for i, c in enumerate(pycountry.countries):
      self.countryIdMap[c.name.lower()] = i + 1
      if c.name.lower() == "usa":
        ctryIdx["US"] = i
      if c.name.lower() == "canada":
        ctryIdx["CA"] = i
    for cc in ctryIdx.keys():
      for s in pycountry.subdivisions.get(country_code=cc):
        self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
    # load the gender id map
    self.genderIdMap = defaultdict(int, {"male": 1, "female": 2})

  def getLocaleId(self, locstr):
    return self.localeIdMap[locstr.lower()]

  def getGenderId(self, genderStr):
    return self.genderIdMap[genderStr]

  def getJoinedYearMonth(self, dateString):
    dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
    return "".join([str(dttm.year), str(dttm.month)])

  def getCountryId(self, location):
    if (isinstance(location, str)
        and len(location.strip()) > 0
        and location.rfind("  ") > -1):
      return self.countryIdMap[location[location.rindex("  ") + 2:].lower()]
    else:
      return 0

  def getBirthYearInt(self, birthYear):
    try:
      return 0 if birthYear == "None" else int(birthYear)
    except:
      return 0

  def getTimezoneInt(self, timezone):
    try:
      return int(timezone)
    except:
      return 0

  def getFeatureHash(self, value):
    if len(value.strip()) == 0:
      return -1
    else:
      return int(hashlib.sha224(value).hexdigest()[0:4], 16)

  def getFloatValue(self, value):
    if len(value.strip()) == 0:
      return 0.0
    else:
      return float(value)
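A quick, illustrative use of the cleaner (the argument values below are made-up examples, not taken from the competition files):

dc = DataCleaner()
print dc.getGenderId("male")                              # 1
print dc.getJoinedYearMonth("2012-10-02T10:53:05.754Z")   # "201210"
print dc.getTimezoneInt("-240")                           # -240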
2. Processing the user-event relation data

class ProgramEntities:
  """
  We only care about the users and events that appear in train and test,
  so the relation data is restricted to those.
  """
  def __init__(self):
    # collect the distinct users and events in the training/test sets
    uniqueUsers = set()
    uniqueEvents = set()
    eventsForUser = defaultdict(set)
    usersForEvent = defaultdict(set)
    for filename in ["train.csv", "test.csv"]:
      f = open(filename, 'rb')
      f.readline().strip().split(",")
      for line in f:
        cols = line.strip().split(",")
        uniqueUsers.add(cols[0])
        uniqueEvents.add(cols[1])
        eventsForUser[cols[0]].add(cols[1])
        usersForEvent[cols[1]].add(cols[0])
      f.close()
    self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
    self.userIndex = dict()
    self.eventIndex = dict()
    for i, u in enumerate(uniqueUsers):
      self.userIndex[u] = i
    for i, e in enumerate(uniqueEvents):
      self.eventIndex[e] = i
    ftrain = open("train.csv", 'rb')
    ftrain.readline()
    for line in ftrain:
      cols = line.strip().split(",")
      i = self.userIndex[cols[0]]
      j = self.eventIndex[cols[1]]
      self.userEventScores[i, j] = int(cols[4]) - int(cols[5])   # interested minus not_interested
    ftrain.close()
    sio.mmwrite("PE_userEventScores", self.userEventScores)
    # To avoid unnecessary computation, collect the associated user pairs and event pairs.
    # An associated user pair is two users who acted on at least one common event;
    # an associated event pair is two events that share at least one user who acted on both.
    self.uniqueUserPairs = set()
    self.uniqueEventPairs = set()
    for event in uniqueEvents:
      users = usersForEvent[event]
      if len(users) > 2:
        self.uniqueUserPairs.update(itertools.combinations(users, 2))
    for user in uniqueUsers:
      events = eventsForUser[user]
      if len(events) > 2:
        self.uniqueEventPairs.update(itertools.combinations(events, 2))
    cPickle.dump(self.userIndex, open("PE_userIndex.pkl", 'wb'))
    cPickle.dump(self.eventIndex, open("PE_eventIndex.pkl", 'wb'))

3. User-user similarity matrix

class Users:
  """
  Build the user/user similarity matrix.
  """
  def __init__(self, programEntities, sim=ssd.correlation):
    cleaner = DataCleaner()
    nusers = len(programEntities.userIndex.keys())
    fin = open("users.csv", 'rb')
    colnames = fin.readline().strip().split(",")
    self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
    for line in fin:
      cols = line.strip().split(",")
      # only keep users that appear in train.csv / test.csv
      if programEntities.userIndex.has_key(cols[0]):
        i = programEntities.userIndex[cols[0]]
        self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
        self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
        self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
        self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
        self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
        self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
    fin.close()
    # normalize the user matrix
    self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
    sio.mmwrite("US_userMatrix", self.userMatrix)
    # compute the user similarity matrix; it is used later on
    self.userSimMatrix = ss.dok_matrix((nusers, nusers))
    for i in range(0, nusers):
      self.userSimMatrix[i, i] = 1.0
    for u1, u2 in programEntities.uniqueUserPairs:
      i = programEntities.userIndex[u1]
      j = programEntities.userIndex[u2]
      if not self.userSimMatrix.has_key((i, j)):
        usim = sim(self.userMatrix.getrow(i).todense(),
          self.userMatrix.getrow(j).todense())
        self.userSimMatrix[i, j] = usim
        self.userSimMatrix[j, i] = usim
    sio.mmwrite("US_userSimMatrix", self.userSimMatrix)
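Note that scipy's ssd.correlation used as the default sim above is a correlation distance rather than a similarity: identical rows give 0 and perfectly anti-correlated rows give 2. A quick check:

import scipy.spatial.distance as ssd
print ssd.correlation([1, 2, 3], [1, 2, 3])   # 0.0 -> identical profiles
print ssd.correlation([1, 2, 3], [3, 2, 1])   # 2.0 -> perfectly anti-correlated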
4. Mining user social relations

class UserFriends:
  """
  Find a user's friends. The idea is simple:
  1) if you have more friends, you are probably outgoing and more likely to attend events;
  2) if your friends attend an event, you may well go along with them.
  """
  def __init__(self, programEntities):
    nusers = len(programEntities.userIndex.keys())
    self.numFriends = np.zeros((nusers))
    self.userFriends = ss.dok_matrix((nusers, nusers))
    fin = open("user_friends.csv", 'rb')
    fin.readline()                # skip header
    ln = 0
    for line in fin:
      if ln % 200 == 0:
        print "Loading line: ", ln
      cols = line.strip().split(",")
      user = cols[0]
      if programEntities.userIndex.has_key(user):
        friends = cols[1].split(" ")
        i = programEntities.userIndex[user]
        self.numFriends[i] = len(friends)
        for friend in friends:
          if programEntities.userIndex.has_key(friend):
            j = programEntities.userIndex[friend]
            # the objective of this score is to infer the degree to
            # and direction in which this friend will influence the
            # user's decision, so we sum the user/event score for
            # this user across all training events.
            eventsForUser = programEntities.userEventScores.getrow(j).todense()
            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
            self.userFriends[i, j] += score
            self.userFriends[j, i] += score
      ln += 1
    fin.close()
    # normalize the arrays
    sumNumFriends = self.numFriends.sum(axis=0)
    self.numFriends = self.numFriends / sumNumFriends
    sio.mmwrite("UF_numFriends", np.matrix(self.numFriends))
    self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
    sio.mmwrite("UF_userFriends", self.userFriends)

5. Building event-event similarity data

class Events:
  """
  Build event-event similarity. There are two kinds of similarity here:
  1) a collaborative-filtering style similarity derived from user-event behaviour;
  2) a content similarity computed from the event's own metadata.
  """
  def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
    cleaner = DataCleaner()
    fin = open("events.csv", 'rb')
    fin.readline()  # skip header
    nevents = len(programEntities.eventIndex.keys())
    self.eventPropMatrix = ss.dok_matrix((nevents, 7))
    self.eventContMatrix = ss.dok_matrix((nevents, 100))
    ln = 0
    for line in fin.readlines():
#      if ln > 10:
#        break
      cols = line.strip().split(",")
      eventId = cols[0]
      if programEntities.eventIndex.has_key(eventId):
        i = programEntities.eventIndex[eventId]
        self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
        self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])  # city
        self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])  # state
        self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])  # zip
        self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])  # country
        self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])  # lat
        self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])  # lon
        for j in range(9, 109):
          self.eventContMatrix[i, j-9] = cols[j]
        ln += 1
    fin.close()
    self.eventPropMatrix = normalize(self.eventPropMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("EV_eventPropMatrix", self.eventPropMatrix)
    self.eventContMatrix = normalize(self.eventContMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("EV_eventContMatrix", self.eventContMatrix)
    # calculate similarity between event pairs based on the two matrices
    self.eventPropSim = ss.dok_matrix((nevents, nevents))
    self.eventContSim = ss.dok_matrix((nevents, nevents))
    for e1, e2 in programEntities.uniqueEventPairs:
      i = programEntities.eventIndex[e1]
      j = programEntities.eventIndex[e2]
      if not self.eventPropSim.has_key((i, j)):
        epsim = psim(self.eventPropMatrix.getrow(i).todense(),
          self.eventPropMatrix.getrow(j).todense())
        self.eventPropSim[i, j] = epsim
        self.eventPropSim[j, i] = epsim
      if not self.eventContSim.has_key((i, j)):
        ecsim = csim(self.eventContMatrix.getrow(i).todense(),
          self.eventContMatrix.getrow(j).todense())
        self.eventContSim[i, j] = ecsim
        self.eventContSim[j, i] = ecsim
    sio.mmwrite("EV_eventPropSim", self.eventPropSim)
    sio.mmwrite("EV_eventContSim", self.eventContSim)
6. Activity level / event popularity data

class EventAttendees():
  """
  Count how many people attended versus skipped an event,
  as the basis for an event popularity measure.
  """
  def __init__(self, programEvents):
    nevents = len(programEvents.eventIndex.keys())
    self.eventPopularity = ss.dok_matrix((nevents, 1))
    f = open("event_attendees.csv", 'rb')
    f.readline()  # skip header
    for line in f:
      cols = line.strip().split(",")
      eventId = cols[0]
      if programEvents.eventIndex.has_key(eventId):
        i = programEvents.eventIndex[eventId]
        self.eventPopularity[i, 0] = \
          len(cols[1].split(" ")) - len(cols[4].split(" "))   # yes count minus no count
    f.close()
    self.eventPopularity = normalize(self.eventPopularity, norm="l1",
      axis=0, copy=False)
    sio.mmwrite("EA_eventPopularity", self.eventPopularity)

7. Chaining together the whole data preparation pipeline

def data_prepare():
  """
  Compute all of the above and store it as matrices or other formats,
  so feature extraction and modeling can pick it up later.
  """
  print "Step 1: collecting user and event statistics..."
  pe = ProgramEntities()
  print "Step 1 done...\n"
  print "Step 2: computing user similarities and storing them as a matrix..."
  Users(pe)
  print "Step 2 done...\n"
  print "Step 3: computing and storing user social relations..."
  UserFriends(pe)
  print "Step 3 done...\n"
  print "Step 4: computing event similarities and storing them as matrices..."
  Events(pe)
  print "Step 4 done...\n"
  print "Step 5: computing event popularity..."
  EventAttendees(pe)
  print "Step 5 done...\n"

# run the data preparation
data_prepare()
  """    朋友对用户的影响    主要考虑用户所有的朋友中,有多少是非常喜欢参加各种社交活动/event的    用户的朋友圈如果都积极参与各种event,可能会对当前用户有一定的影响    """    nusers = np.shape(self.userFriends)[1]    i = self.userIndex[userId]    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]  def eventPop(self, eventId):    """    本活动本身的热度    主要是通过参与的人数来界定的    """    i = self.eventIndex[eventId]    return self.eventPopularity[i, 0]  def rewriteData(self, start=1, train=True, header=True):    """    把前面user-based协同过滤 和 item-based协同过滤,以及各种热度和影响度作为特征组合在一起    生成新的训练数据,用于分类器分类使用    """    fn = "train.csv" if train else "test.csv"    fin = open(fn, 'rb')    fout = open("data_" + fn, 'wb')    # write output header    if header:      ocolnames = ["invited", "user_reco", "evt_p_reco",        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]      if train:        ocolnames.append("interested")        ocolnames.append("not_interested")      fout.write(",".join(ocolnames) + "\n")    ln = 0    for line in fin:      ln += 1      if ln < start:        continue      cols = line.strip().split(",")      userId = cols[0]      eventId = cols[1]      invited = cols[2]      if ln%500 == 0:          print "%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId)      user_reco = self.userReco(userId, eventId)      evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)      user_pop = self.userPop(userId)      frnd_infl = self.friendInfluence(userId)      evt_pop = self.eventPop(eventId)      ocols = [invited, user_reco, evt_p_reco,        evt_c_reco, user_pop, frnd_infl, evt_pop]      if train:        ocols.append(cols[4]) # interested        ocols.append(cols[5]) # not_interested      fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")    fin.close()    fout.close()  def rewriteTrainingSet(self):    self.rewriteData(True)  def rewriteTestSet(self):    self.rewriteData(False)# When running with cython, the actual class will be converted to a .so# file, and the following code (along with the commented out import below)# will need to be put into another .py and this should be run.#import CRegressionData as rddr = DataRewriter()print "生成训练数据...\n"dr.rewriteData(train=True, start=2, header=True)print "生成预测数据...\n"dr.rewriteData(train=False, start=2, header=True)9.建模与预测# 建模与预测from __future__ import divisionimport mathimport numpy as npimport pandas as pdfrom sklearn.cross_validation import KFoldfrom sklearn.linear_model import SGDClassifierdef train():  """  在我们得到的特征上训练分类器,target为1(感兴趣),或者是0(不感兴趣)  """  trainDf = pd.read_csv("data_train.csv")  X = np.matrix(pd.DataFrame(trainDf, index=None,    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",    "user_pop", "frnd_infl", "evt_pop"]))  y = np.array(trainDf.interested)  clf = SGDClassifier(loss="log", penalty="l2")  clf.fit(X, y)  return clfdef validate():  """  10折的交叉验证,并输出交叉验证的平均准确率  """  trainDf = pd.read_csv("data_train.csv")  X = np.matrix(pd.DataFrame(trainDf, index=None,    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",    "user_pop", "frnd_infl", "evt_pop"]))  y = np.array(trainDf.interested)  nrows = len(trainDf)  kfold = KFold(nrows, 10)  avgAccuracy = 0  run = 0  for train, test in kfold:    Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]    clf = SGDClassifier(loss="log", penalty="l2")    clf.fit(Xtrain, ytrain)    accuracy = 0    ntest = len(ytest)    for i in range(0, ntest):      yt = clf.predict(Xtest[i, :])      if yt == ytest[i]:        accuracy += 1    accuracy = accuracy / ntest    print "accuracy (run %d): %f" % (run, accuracy)    avgAccuracy += 
9. Modeling and prediction

# modeling and prediction script
from __future__ import division

import math
import numpy as np
import pandas as pd

from sklearn.cross_validation import KFold
from sklearn.linear_model import SGDClassifier

def train():
  """
  Train a classifier on the features we built; the target is 1 (interested) or 0 (not interested).
  """
  trainDf = pd.read_csv("data_train.csv")
  X = np.matrix(pd.DataFrame(trainDf, index=None,
    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
    "user_pop", "frnd_infl", "evt_pop"]))
  y = np.array(trainDf.interested)
  clf = SGDClassifier(loss="log", penalty="l2")
  clf.fit(X, y)
  return clf

def validate():
  """
  10-fold cross-validation, reporting the average accuracy.
  """
  trainDf = pd.read_csv("data_train.csv")
  X = np.matrix(pd.DataFrame(trainDf, index=None,
    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
    "user_pop", "frnd_infl", "evt_pop"]))
  y = np.array(trainDf.interested)
  nrows = len(trainDf)
  kfold = KFold(nrows, 10)
  avgAccuracy = 0
  run = 0
  for train, test in kfold:
    Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
    clf = SGDClassifier(loss="log", penalty="l2")
    clf.fit(Xtrain, ytrain)
    accuracy = 0
    ntest = len(ytest)
    for i in range(0, ntest):
      yt = clf.predict(Xtest[i, :])
      if yt == ytest[i]:
        accuracy += 1
    accuracy = accuracy / ntest
    print "accuracy (run %d): %f" % (run, accuracy)
    avgAccuracy += accuracy
    run += 1
  print "Average accuracy", (avgAccuracy / run)

def test(clf):
  """
  Read the test data and predict with the trained classifier.
  """
  origTestDf = pd.read_csv("test.csv")
  users = origTestDf.user
  events = origTestDf.event
  testDf = pd.read_csv("data_test.csv")
  fout = open("result.csv", 'wb')
  fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
  nrows = len(testDf)
  Xp = np.matrix(testDf)
  yp = np.zeros((nrows, 2))
  for i in range(0, nrows):
    xp = Xp[i, :]
    yp[i, 0] = clf.predict(xp)
    yp[i, 1] = clf.decision_function(xp)
    fout.write(",".join(map(lambda x: str(x),
      [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")
  fout.close()

clf = train()
test(clf)

Generating the submission file

# format the results for submission
from __future__ import division

import pandas as pd

def byDist(x, y):
  return int(y[1] - x[1])

def generate_submition_file():
  # output file
  fout = open("final_result.csv", 'wb')
  fout.write(",".join(["User", "Events"]) + "\n")
  resultDf = pd.read_csv("result.csv")
  # group remaining user/events
  grouped = resultDf.groupby("user")
  for name, group in grouped:
    user = str(name)
    tuples = zip(list(group.event), list(group.dist), list(group.outcome))
#    tuples = filter(lambda x: x[2]==1, tuples)
    tuples = sorted(tuples, cmp=byDist)
    events = "\"" + str(map(lambda x: x[0], tuples)) + "\""
    fout.write(",".join([user, events]) + "\n")
  fout.close()

generate_submition_file()

