python数据分析学习笔记十
来源:互联网 发布:php require use 编辑:程序博客网 时间:2024/03/29 23:13
第十章 预测性分析和机器学习
监督学习
无监督学习
强化学习
1 scikit-learn
略
2 预处理
"""Summarise, standardise, binarise and label-encode the rainfall series."""
import numpy as np
from scipy.stats import anderson
from sklearn import preprocessing

# Load the data and rescale it by 0.1.
rain = np.load('rain.npy')
rain = .1 * rain
# Negative readings are replaced by 0.025.
# NOTE(review): presumably negatives are sentinels for trace amounts - confirm.
rain[rain < 0] = .05 / 2

# Mean, variance and Anderson-Darling statistic of the raw series.
print("Rain mean", rain.mean())
print("Rain Variance", rain.var())
print("Anderson Rain", anderson(rain))

# Same statistics after standardising the series with preprocessing.scale.
scaled = preprocessing.scale(rain)
print("Scaled mean", scaled.mean())
print("Scaled Variance", scaled.var())
print("Anderson Scaled", anderson(scaled))

# Convert the numeric feature to 0/1 values. binarize() expects a 2-D
# (n_samples, n_features) array, so the series is passed as a single column;
# the unique values and the sum are unaffected by the reshape.
binarized = preprocessing.binarize(rain.reshape(-1, 1))
print("binarized", np.unique(binarized), binarized.sum())

# Collect the distinct integer rainfall classes seen in the data.
lb = preprocessing.LabelBinarizer()
lb.fit(rain.astype(int))
print(lb.classes_)
运行结果如下:
Rain mean 2.17919594267
Rain Variance 18.803443919
Anderson Rain AndersonResult(statistic=inf, critical_values=array([ 0.576, 0.656, 0.787, 0.918, 1.092]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))
Scaled mean 3.41301602808e-17
Scaled Variance 1.0
Anderson Scaled AndersonResult(statistic=inf, critical_values=array([ 0.576, 0.656, 0.787, 0.918, 1.092]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))
binarized [ 0. 1.] 24594.0
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44 45 46 47 48 49 50
52 53 55 58 61 62]
3 基于逻辑回归的分类
该算法可以用来预测事件发生的概率,或者某一事物属于某一类别的概率
"""Classify rain/no-rain days and iris samples with logistic regression."""
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold


def classify(x, y):
    """Print the mean 10-fold cross-validation accuracy of logistic regression.

    Args:
        x: Feature matrix with one row per sample.
        y: Target labels, one per row of ``x``.
    """
    clf = LogisticRegression(random_state=12)
    scores = []
    # KFold lives in sklearn.model_selection; the fold count is given by
    # n_splits and the index pairs come from split().
    kf = KFold(n_splits=10)
    # Fit on each training fold and record the accuracy on the held-out fold.
    for train, test in kf.split(x):
        clf.fit(x[train], y[train])
        scores.append(clf.score(x[test], y[test]))
    print(np.mean(scores))


# Load the rainfall amounts and the matching day-of-year values.
rain = np.load('rain.npy')
dates = np.load('doy.npy')
# Features: day of year and rainfall of a given day.
x = np.vstack((dates[:-1], rain[:-1]))
# Target: sign of the next day's rainfall value.
y = np.sign(rain[1:])
classify(x.T, y)

# Same procedure on the first two feature columns of the iris data set.
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target
classify(x, y)
运行结果如下:
0.576726256477
0.413333333333
4 基于支持向量机的分类
支持向量机 Support vector machines SVM
支持向量回归 Support vector Regression SVR
可以用来进行回归分析,也可以用来分类
示例代码如下:
"""Grid-search SVM kernels and C values on the rain and iris data sets."""
from pprint import PrettyPrinter

import numpy as np
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


def classify(x, y):
    """Grid-search an SVC and print its score plus per-candidate CV results.

    Args:
        x: Feature matrix with one row per sample.
        y: Target labels, one per row of ``x``.
    """
    clf = GridSearchCV(SVC(random_state=42, max_iter=100),
                       {'kernel': ['linear', 'poly', 'rbf'], 'C': [1, 10]})
    clf.fit(x, y)
    print("Score", clf.score(x, y))

    # grid_scores_ no longer exists; the same mean/std/params triple per
    # candidate is read from cv_results_ instead.
    results = clf.cv_results_
    summary = [
        {'mean': float(mean), 'std': float(std), 'params': params}
        for mean, std, params in zip(results['mean_test_score'],
                                     results['std_test_score'],
                                     results['params'])
    ]
    PrettyPrinter().pprint(summary)


# Features: day of year and rainfall of a given day; target: sign of the
# next day's rainfall value.
rain = np.load('rain.npy')
dates = np.load('doy.npy')
x = np.vstack((dates[:-1], rain[:-1]))
y = np.sign(rain[1:])
classify(x.T, y)

# Same procedure on the first two feature columns of the iris data set.
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target
classify(x, y)
运行结果如下:
#天气数据
Score 0.559660687823
[mean: 0.42879, std: 0.11308, params:{'kernel': 'linear', 'C': 1},
mean: 0.55570, std: 0.00559, params:{'kernel': 'poly', 'C': 1},
mean: 0.36939, std: 0.00169, params:{'kernel': 'rbf', 'C': 1},
mean: 0.30658, std: 0.03034, params:{'kernel': 'linear', 'C': 10},
mean: 0.41673, std: 0.20214, params:{'kernel': 'poly', 'C': 10},
mean: 0.49195, std: 0.08911, params:{'kernel': 'rbf', 'C': 10}]
#鸢尾花样本数据
Score 0.82
[mean: 0.80000, std: 0.03949, params:{'kernel': 'linear', 'C': 1},
mean: 0.58667, std: 0.12603, params:{'kernel': 'poly', 'C': 1},
mean: 0.80000, std: 0.03254, params:{'kernel': 'rbf', 'C': 1},
mean: 0.74667, std: 0.07391, params:{'kernel': 'linear', 'C': 10},
mean: 0.56667, std: 0.13132, params:{'kernel': 'poly', 'C': 10},
mean: 0.79333, std: 0.03467, params:{'kernel': 'rbf', 'C': 10}]
5 基于elasticNetCV的回归分类
弹性网络正则化 Elastic net Regularization 降低回归分析的过拟合风险
实际上是LASSO(The Least Absolute Shrinkage and Selection Operator)算法和岭回归方法的线性组合
示例代码如下:
"""Fit ElasticNetCV regressions on the rain and Boston data sets."""
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.linear_model import ElasticNetCV


def regress(x, y, title):
    """Fit an ElasticNetCV model, print its score and plot prediction vs target.

    Args:
        x: Feature matrix with one row per sample.
        y: Target values, one per row of ``x``.
        title: Suffix for the plot title; a title containing "Boston" also
            gets a "Perfect Fit" reference line and a legend.
    """
    clf = ElasticNetCV(
        max_iter=200,  # maximum number of iterations
        cv=10,  # number of cross-validation folds
        # 0 means ridge regression only, 1 means LASSO only, anything in
        # between is a mix of the two.
        l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    )
    clf.fit(x, y)
    print("Score", clf.score(x, y))
    pred = clf.predict(x)

    plt.title("Scatter plot of prediction and " + title)
    plt.xlabel("Prediction")
    plt.ylabel("Target")
    # Predictions go on the x axis and targets on the y axis so that the data
    # agrees with the axis labels set above.
    plt.scatter(pred, y)
    if "Boston" in title:
        plt.plot(y, y, label="Perfect Fit")
        plt.legend()
    # Call grid(); assigning to plt.grid would replace the function and leave
    # the grid switched off.
    plt.grid(True)
    plt.show()


# Rescale rainfall by 0.1 and replace negative readings with 0.025.
rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05 / 2
dates = np.load("doy.npy")
# Features: day of year and rainfall of a given day; target: next day's rain.
x = np.vstack((dates[:-1], rain[:-1]))
y = rain[1:]
regress(x.T, y, "rain data")

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this part
# needs an older release - confirm the targeted version.
boston = datasets.load_boston()
x = boston.data
y = boston.target
regress(x, y, "Boston house prices")
运行结果如下:
Score 0.0527838760942
Score 0.683143903455
6 支持向量回归
示例代码如下:
"""Plot learning curves of support vector regression on rain and Boston data."""
import multiprocessing

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.svm import SVR

# Warning reported while running this script:
# D:\Python35\lib\site-packages\sklearn\svm\base.py:220: ConvergenceWarning: Solver terminated early (max_iter=800). Consider pre-processing your data with StandardScaler or MinMaxScaler.
# % self.max_iter, ConvergenceWarning)


def regress(x, y, ncpus, title):
    """Draw the SVR learning curve for one data set and print the best test score.

    Args:
        x: Feature matrix with one row per sample.
        y: Target values, one per row of ``x``.
        ncpus: Used both as the number of parallel jobs and, multiplied by
            200, as the solver's iteration limit.
        title: Plot title, also appended to the printed message.
    """
    scaled_features = preprocessing.scale(x)
    scaled_target = preprocessing.scale(y)
    model = SVR(max_iter=ncpus * 200)
    # One parallel job per CPU.
    sizes, fit_scores, held_out_scores = learning_curve(
        model, scaled_features, scaled_target, n_jobs=ncpus)

    # Average the scores across folds and plot them against training size.
    plt.figure()
    plt.title(title)
    curves = (
        (fit_scores, (), "Train score"),
        (held_out_scores, ('--',), "Test score"),
    )
    for scores, line_format, legend_text in curves:
        plt.plot(sizes, scores.mean(axis=1), *line_format, label=legend_text)
    print("Max test score " + title, held_out_scores.max())
    plt.grid(True)
    plt.legend(loc='best')
    plt.show()


def main():
    """Run the learning-curve experiment on the rain data, then on Boston."""
    worker_count = multiprocessing.cpu_count()

    # Rescale rainfall by 0.1 and replace negative readings with 0.025.
    precipitation = np.load('rain.npy') * .1
    precipitation[precipitation < 0] = .05 / 2
    day_of_year = np.load('doy.npy')
    # Features: day of year and rainfall of a day; target: next day's rain.
    features = np.vstack((day_of_year[:-1], precipitation[:-1]))
    regress(features.T, precipitation[1:], worker_count, "Rain")

    housing = datasets.load_boston()
    regress(housing.data, housing.target, worker_count, "Boston")


if __name__ == '__main__':
    main()
运行结果如下:
Max test score Rain -0.0272088393925
Max test score Boston 0.662188537037
7 基于相似性传播算法的聚类分析
聚类分析就是把数据分成一些组,这些组就是所谓的聚类
聚类分析,属无监督学习
相似性传播 affinity propagation
示例代码如下:
"""Cluster three generated blobs with affinity propagation and plot them."""
# The snippet uses these names, so it imports them to be runnable on its own.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import cluster
from sklearn import datasets
from sklearn.metrics import euclidean_distances

# Generate three blobs of 2-D points.
x, _ = datasets.make_blobs(n_samples=100, centers=3, n_features=2,
                           random_state=10)
# Build the pairwise Euclidean distance matrix.
S = euclidean_distances(x)

# Fit affinity propagation on the matrix and read each sample's cluster label.
aff_pro = cluster.AffinityPropagation().fit(S)
labels = aff_pro.labels_

# Plot the members of each cluster with their own marker style.
styles = ['o', 'x', '^']
for style, label in zip(styles, np.unique(labels)):
    print(label)
    plt.plot(x[labels == label], style, label=label)
plt.title("Clustering Blobs")
plt.grid(True)
plt.legend(loc='best')
plt.show()
运行结果如下:
0
1
2
8 均值漂移算法
一种不需要估算聚类数的聚类算法(可以应用于图像处理,是不是等同于中值滤波)
示例代码如下:
"""Cluster the per-day average rainfall with the mean shift algorithm."""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import cluster

# Rescale rainfall by 0.1 and replace negative readings with 0.025.
rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05 / 2
dates = np.load('doy.npy')
x = np.vstack((dates, rain))

# Build a DataFrame and average the rainfall for each day of the year.
df = pd.DataFrame.from_records(x.T, columns=['dates', 'rain'])
df = df.groupby('dates').mean()
df.plot()

# Mean shift on (day index, average rainfall) pairs. DataFrame.as_matrix()
# no longer exists in pandas; .values yields the same ndarray.
x = np.vstack((np.arange(1, len(df) + 1), df.values.ravel()))
x = x.T
ms = cluster.MeanShift()
ms.fit(x)
labels = ms.predict(x)

# Draw each cluster with its own line width and shade the area beneath it.
plt.figure()
grays = ['0', '0.5', '0.75']
# The column views do not depend on the cluster, so take them once.
x0 = x[:, 0]
x1 = x[:, 1]
for gray, label in zip(grays, np.unique(labels)):
    match = labels == label
    plt.plot(x0[match], x1[match], lw=label + 1, label=label)
    plt.fill_between(x0, x1, where=match, color=gray)
plt.grid(True)
plt.legend()
plt.show()
运行结果如下:
9 遗传算法
可用于搜索和优化方面
示例代码如下:
运行结果如下:
gen(代数) nevals(本代评估的个体数) max(本代最大适应度)
0 400 0.000484774
1 222 0.000656187
2 246 0.00745961
3 239 0.00745961
4 240 0.0184182
5 216 0.0309736
6 237 0.06957
7 243 0.06957
8 231 0.224381
9 226 0.224381
10 247 0.224381
11 228 0.247313
12 241 0.28318
13 242 0.354144
14 246 0.46282
15 239 0.46282
16 266 0.480937
17 233 0.648529
....
76 230 0.998861
77 252 0.998861
78 232 0.998861
79 243 0.998861
80 235 0.998861
0.9988605380058289
10 神经网络
人工神经网络 ANN: 由神经元组成的网络,每个神经元都有输入和输出功能
程序报错,需要调试
11 决策树
示例代码如下:
"""Tune a decision tree on the rain data and export the best tree as a PNG."""
# The helpers once imported from sklearn.cross_validation and
# sklearn.grid_search are taken from sklearn.model_selection, and StringIO
# comes from io.
from io import StringIO
from tempfile import NamedTemporaryFile

import numpy as np
import pydot
from scipy.stats import randint as sp_randint
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Rescale rainfall by 0.1 and replace negative readings with 0.025.
rainfall = np.load('rain.npy') * .1
rainfall[rainfall < 0] = .05 / 2
day_of_year = np.load('doy.npy').astype(int)

# Features: day of year and the sign of that day's rainfall.
# Target: the sign of the following day's rainfall.
rain_state = np.sign(rainfall)
features = np.vstack((day_of_year[:-1], rain_state[:-1])).T
target = rain_state[1:]

# Split into training and test sets.
train_x, test_x, train_y, test_y = train_test_split(
    features, target, random_state=37)

# Randomised search over the tree's hyper-parameters.
estimator = tree.DecisionTreeClassifier(random_state=37)
search_space = {
    "max_depth": [2, None],
    "min_samples_leaf": sp_randint(1, 5),
    "criterion": ["gini", "entropy"],
}
search = RandomizedSearchCV(estimator, search_space)
search.fit(train_x, train_y)

# Export the best tree in DOT format and render it to a temporary PNG file.
dot_buffer = StringIO()
tree.export_graphviz(search.best_estimator_, out_file=dot_buffer,
                     feature_names=['day-of-year', 'yest'])
graphs = pydot.graph_from_dot_data(dot_buffer.getvalue())
with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as png_file:
    # graph_from_dot_data returns a sequence here; the first graph is rendered.
    graphs[0].write_png(png_file.name)
    print("Written figure to", png_file.name)

print('Best Train Score', search.best_score_)
print('Test Score', search.score(test_x, test_y))
print("Best params", search.best_params_)
运行结果如下:
Written figure to C:\Users\ADMINI~1\AppData\Local\Temp\rainmys2nqfh.png
Best Train Score 0.703164923517
Test Score 0.705058763413
Best params {'min_samples_leaf': 1,'criterion': 'entropy', 'max_depth': 2}
- python数据分析学习笔记十
- python数据分析学习笔记
- Python数据分析学习笔记一
- Python数据分析学习笔记二
- Python数据分析学习笔记三
- Python数据分析学习笔记四
- Python数据分析学习笔记五
- Python数据分析学习笔记六
- python数据分析入门学习笔记儿
- python数据分析入门学习笔记
- python数据分析入门学习笔记儿
- python数据分析入门学习笔记儿
- python数据分析学习笔记一
- python数据分析学习笔记二
- python数据分析学习笔记三
- python数据分析学习笔记六
- python数据分析入门学习笔记
- # Python数据分析学习笔记(一)
- VirtualBox安装VBoxLinuxAdditions错误:unable to find the sources of your current linux kernel
- cloudera add host
- 后台时间控件,用js动态加载时,要初始化——项目问题
- C++图形学之线性插值
- 诶呀 居然可以写微博了 哈哈哈
- python数据分析学习笔记十
- Android Studio 打开源码项目,配置启动运行
- QPushButton的鼠标事件处理及EventFilter事件过滤器的用法
- 树莓派Debian支持ll
- DapperLambda发布
- python数据分析学习笔记九
- jquery Ajax操作
- python数据分析学习笔记八
- 剖析淘宝 TDDL ( TAOBAO DISTRIBUTE DATA LAYER )