项目2: 为CharityML寻找捐献者
来源:互联网 发布:知乎搞笑的事情 编辑:程序博客网 时间:2024/05/01 03:58
In [3]:
# 为这个项目导入需要的库import numpy as npimport pandas as pdfrom time import timefrom IPython.display import display # 允许为DataFrame使用display()# 导入附加的可视化代码visuals.pyimport visuals as vs# 为notebook提供更加漂亮的可视化%matplotlib inline# 导入人口普查数据data = pd.read_csv("census.csv")# 成功 - 显示第一条记录display(data.head(n=1))
In [4]:
# TODO: Total number of records
n_records = data.shape[0]

# TODO: Number of records where the individual's income is more than $50,000
# ('>50K' only matches the '>50K' label; '<=50K' does not contain it)
n_greater_50k = data[data.income.str.contains('>50K')].shape[0]

# TODO: Number of records where the individual's income is at most $50,000
n_at_most_50k = data[data.income.str.contains('<=50K')].shape[0]

# TODO: Percentage of individuals whose income is more than $50,000
# (float() forces true division under Python 2 integer semantics)
greater_percent = np.divide(n_greater_50k, float(n_records)) * 100

# Print the results (parenthesized form works in both Python 2 and 3)
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent))
In [5]:
# Split the data into features and the target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize the skewed continuous features of the original data
vs.distribution(data)
In [6]:
# Apply a log transform to the heavily-skewed features.
# log(x + 1) keeps zero values defined (log(0) would be -inf).
skewed = ['capital-gain', 'capital-loss']
# NOTE(review): transforming data[skewed] rather than features_raw[skewed] is
# equivalent here because features_raw only dropped the 'income' column.
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_raw, transformed=True)
In [7]:
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# BUGFIX: the original scaled data[numerical] (the raw values), which silently
# overwrote the log transform applied to features_raw in the previous cell.
# Scale features_raw so the log-transformed capital-gain/loss are preserved.
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])

# Show an example of a record with scaling applied
display(features_raw.head(n=1))
In [8]:
# TODO: One-hot encode the 'features_raw' data using pandas.get_dummies()
features = pd.get_dummies(features_raw)

# TODO: Encode the 'income_raw' data to numerical values (1 = '>50K', 0 = '<=50K')
income = income_raw.replace(['>50K', '<=50K'], [1, 0])

# Print the number of features after one-hot encoding
encoded = list(features.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Uncomment the following line to see the encoded feature names
# print encoded
In [9]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
# (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    features, income, test_size=0.2, random_state=0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
In [10]:
# Naive predictor: always predict '>50K' (the positive class).
# TODO: Calculate accuracy — fraction of records that actually are '>50K'
accuracy = np.divide(n_greater_50k, float(n_records))

# TODO: Calculate F-score using the formula above with beta = 0.5
# Recall = TP / (TP + FN) = 1.0, since every positive is predicted positive.
recall = np.divide(n_greater_50k, n_greater_50k)
# Precision = TP / (TP + FP) = positives / all records, since everything is
# predicted positive.
precision = np.divide(n_greater_50k, float(n_records))
fscore = (1 + np.power(0.5, 2)) * np.multiply(precision, recall) \
    / (np.power(0.5, 2) * precision + recall)

# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
In [11]:
# TODO: Import the two evaluation metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score


def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Train `learner` on a slice of the training data, then evaluate it.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set

    returns a dict with training/prediction times and accuracy / F0.5 scores
    on the test set and on the first 300 training samples.
    '''
    results = {}

    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    start = time()  # start of training
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # end of training

    # TODO: Record the training time
    results['train_time'] = end - start

    # TODO: Get predictions on the test set,
    # then get predictions on the first 300 training samples
    start = time()  # start of prediction
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # end of prediction

    # TODO: Record the prediction time
    results['pred_time'] = end - start

    # TODO: Accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)

    # TODO: Accuracy on the test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # TODO: F-score on the first 300 training samples (beta=0.5 favors precision)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)

    # TODO: F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
In [12]:
# TODO: Import the three supervised learning models from sklearn
from sklearn import tree, svm, ensemble

# TODO: Initialize the three models
clf_A = tree.DecisionTreeClassifier()
clf_B = svm.SVC()
clf_C = ensemble.AdaBoostClassifier()

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(X_train.shape[0] * 0.01)
samples_10 = int(X_train.shape[0] * 0.1)
samples_100 = int(X_train.shape[0] * 1)
print([samples_1, samples_10, samples_100])

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)
In [21]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier

# TODO: Initialize the classifier (fixed random_state for reproducibility)
clf = AdaBoostClassifier(random_state=0)

# TODO: Create the parameters list you wish to tune
parameters = {'n_estimators': [50, 100, 200]}

# TODO: Make an fbeta_score scoring object (beta=0.5 favors precision)
scorer = make_scorer(fbeta_score, beta=0.5)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method
kfold = KFold(n_splits=10)
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=kfold)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_obj.best_estimator_

# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
In [23]:
# TODO: Import a supervised learning model that has 'feature_importances_'
from sklearn.ensemble import RandomForestClassifier

# TODO: Train the supervised model on the training set
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# TODO: Extract the feature importances
importances = model.feature_importances_
# Also extract importances from the tuned AdaBoost model for comparison
importances_AdaBoost = best_clf.feature_importances_

# Plot both sets of importances
vs.feature_plot(importances, X_train, y_train)
vs.feature_plot(importances_AdaBoost, X_train, y_train)
In [25]:
# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space to the 5 most important features
# (argsort descending, take the top 5 column names)
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances_AdaBoost)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances_AdaBoost)[::-1])[:5]]]

# Train the "best" model found from grid search earlier on the reduced data
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))
阅读全文
0 0
- 项目2: 为CharityML寻找捐献者
- 寻找兼职.net 项目
- 寻找和为Sum的多个数-2
- 航模项目流程 2——寻找物料,画原理图
- 为自己开发软件寻找理由!
- 不要为自己寻找理由
- 不要为自己寻找理由
- 网络项目寻找创业合作伙伴
- 网络项目寻找创业合作伙伴
- 浩易南:网上如何寻找项目?
- 寻找长度为n数组中出现次数超过n/2的元素
- 寻找大小为n的数组中出现次数超过n/2的那个数
- 为心寻找漂泊中的归宿
- 寻找两数之和为某个值
- (转)过来人,为年轻人寻找明天
- 如何为路由器配置寻找IP地址
- 只为寻找一个有希望明天
- Linus,一生只为寻找欢笑
- 安卓学习日记 6-9 Rxjava
- Java锁机制--synchronized
- 启程了!在CSDN
- Android在webview下加载本地html的方式
- 数据结构实验之求二叉树后序遍历和层次遍历
- 项目2: 为CharityML寻找捐献者
- FATFS文件系统+源码分析——学习笔记
- MySQL添加用户、删除用户与授权
- Android Studio引入外部so的方法
- lnmp一键安装包搭建的环境,mysql在局域网内连接不上的问题
- C语言的内存分配calloc()函数
- Django学习笔记(二)
- web缓存(代理服务器)
- 希尔排序