scikit-learn 常用模型介绍及使用(下)
来源:互联网 发布:windows 10桌面太亮 编辑:程序博客网 时间:2024/05/16 03:10
scikit-learn 常用模型介绍及使用
在上一篇文章主要介绍了机器学习中常用的模型使用方法,比如线性回归、逻辑回归、决策树、随机森林、bagging、random forest、xgboost、adaboost、svm、k-means、密度聚类、谱和谱聚类等,这些在很多的比赛中是比较常用的算法,前几天看一篇文章还介绍xgboost为何在很多比赛中都有较好的效果,速度快、精度也较高,xgboost 与深度学习到底孰优孰劣
以后有空要单独写一次xgboost的算法的使用。
今天开始介绍相对比较难一点的算法,包括EM、贝叶斯网络、HMM、LDA等,记得上课时,邹博老师说过到了EM算法就是机器学习里比较难、深入的部分了。
EM 算法
算法概述
EM算法简单来说就是分为E、M两步,其核心是利用Jensen不等式;
E步骤:根据参数初始值或者上一次迭代的模型参数来计算出隐变量的后验概率,其实就是隐变量的期望。
M步骤:将似然函数最大化以获得新的参数值
不断迭代,使得似然函数最大化的参数,比较典型的有高斯混合模型。
代码
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""EM-algorithm demo.

Fit a 2-component 3-D Gaussian mixture to synthetic data, either with
scikit-learn's GaussianMixture or with a hand-written EM loop, then plot
the original data next to the EM classification.
"""
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3-D projection
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    style = 'sklearn'  # anything else runs the hand-written EM branch
    np.random.seed(0)
    # Synthetic data: 400 samples around (0,0,0) plus 100 around (2,2,1).
    mu1_fact = (0, 0, 0)
    cov_fact = np.identity(3)
    data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400)
    mu2_fact = (2, 2, 1)
    cov_fact = np.identity(3)
    data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100)
    data = np.vstack((data1, data2))
    y = np.array([True] * 400 + [False] * 100)

    if style == 'sklearn':
        g = GaussianMixture(n_components=2, covariance_type='full',
                            tol=1e-6, max_iter=1000)
        g.fit(data)
        print('类别概率:\t', g.weights_[0])
        print('均值:\n', g.means_, '\n')
        print('方差:\n', g.covariances_, '\n')
        mu1, mu2 = g.means_
        sigma1, sigma2 = g.covariances_
    else:
        num_iter = 100
        n, d = data.shape
        # Initialise the two means at the data extremes, unit covariances.
        mu1 = data.min(axis=0)
        mu2 = data.max(axis=0)
        sigma1 = np.identity(d)
        sigma2 = np.identity(d)
        pi = 0.5
        for i in range(num_iter):
            # E step: responsibility of component 1 for every sample.
            norm1 = multivariate_normal(mu1, sigma1)
            norm2 = multivariate_normal(mu2, sigma2)
            tau1 = pi * norm1.pdf(data)
            tau2 = (1 - pi) * norm2.pdf(data)
            gamma = tau1 / (tau1 + tau2)
            # M step: responsibility-weighted re-estimation of the parameters.
            mu1 = np.dot(gamma, data) / np.sum(gamma)
            mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))
            sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)
            sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)
            pi = np.sum(gamma) / n
            print(i, ":\t", mu1, mu2)
        print('类别概率:\t', pi)
        print('均值:\t', mu1, mu2)
        print('方差:\n', sigma1, '\n\n', sigma2, '\n')

    # Classify every sample by comparing the two fitted component densities.
    norm1 = multivariate_normal(mu1, sigma1)
    norm2 = multivariate_normal(mu2, sigma2)
    tau1 = norm1.pdf(data)
    tau2 = norm2.pdf(data)

    fig = plt.figure(figsize=(13, 7), facecolor='w')
    ax = fig.add_subplot(121, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title(u'原始数据', fontsize=18)

    ax = fig.add_subplot(122, projection='3d')
    # Match estimated components to the true means so accuracy is well defined.
    order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
    if order[0] == 0:
        c1 = tau1 > tau2
    else:
        c1 = tau1 < tau2
    c2 = ~c1
    acc = np.mean(y == c1)
    print(u'准确率:%.2f%%' % (100 * acc))
    ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True)
    ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title(u'EM算法分类', fontsize=18)
    # plt.suptitle(u'EM算法的实现', fontsize=20)
    # plt.subplots_adjust(top=0.92)
    plt.tight_layout()
    plt.show()
高斯混合模型GMM
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Fit a 2-component GMM to the height/weight data set and plot the
decision regions, posterior contours and train/test accuracy."""
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False


def expand(a, b):
    """Widen the interval [a, b] by 5% on each side (plot margins)."""
    d = (b - a) * 0.05
    return a - d, b + d


if __name__ == '__main__':
    # Column 0 is the label; columns 1-2 are height/weight.
    # NOTE: np.float was removed from modern NumPy -- use the builtin float.
    data = np.loadtxt('18.HeightWeight.csv', dtype=float, delimiter=',', skiprows=1)
    print(data.shape)
    y, x = np.split(data, [1, ], axis=1)
    x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0)
    gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
    x_min = np.min(x, axis=0)
    x_max = np.max(x, axis=0)
    gmm.fit(x)
    print('均值 = \n', gmm.means_)
    print('方差 = \n', gmm.covariances_)
    y_hat = gmm.predict(x)
    y_test_hat = gmm.predict(x_test)
    # GMM component order is arbitrary: flip the predicted labels when
    # component 0 has the larger first-feature mean, so they align with y.
    change = (gmm.means_[0][0] > gmm.means_[1][0])
    if change:
        z = y_hat == 0
        y_hat[z] = 1
        y_hat[~z] = 0
        z = y_test_hat == 0
        y_test_hat[z] = 1
        y_test_hat[~z] = 0
    acc = np.mean(y_hat.ravel() == y.ravel())
    acc_test = np.mean(y_test_hat.ravel() == y_test.ravel())
    acc_str = u'训练集准确率:%.2f%%' % (acc * 100)
    acc_test_str = u'测试集准确率:%.2f%%' % (acc_test * 100)
    print(acc_str)
    print(acc_test_str)

    cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0'])
    cm_dark = mpl.colors.ListedColormap(['r', 'g'])
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    if change:
        z = grid_hat == 0
        grid_hat[z] = 1
        grid_hat[~z] = 0
    plt.figure(figsize=(9, 7), facecolor='w')
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], s=50, c=y, marker='o', cmap=cm_dark, edgecolors='k')
    plt.scatter(x_test[:, 0], x_test[:, 1], s=60, c=y_test, marker='^', cmap=cm_dark, edgecolors='k')
    # Posterior-probability contours of component 0.
    p = gmm.predict_proba(grid_test)
    p = p[:, 0].reshape(x1.shape)
    CS = plt.contour(x1, x2, p, levels=(0.2, 0.5, 0.8), colors=list('rgb'), linewidths=2)
    plt.clabel(CS, fontsize=15, fmt='%.1f', inline=True)
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    xx = 0.9 * ax1_min + 0.1 * ax1_max
    yy = 0.1 * ax2_min + 0.9 * ax2_max
    plt.text(xx, yy, acc_str, fontsize=18)
    yy = 0.15 * ax2_min + 0.85 * ax2_max
    plt.text(xx, yy, acc_test_str, fontsize=18)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.xlabel(u'身高(cm)', fontsize='large')
    plt.ylabel(u'体重(kg)', fontsize='large')
    plt.title(u'EM算法估算GMM的参数', fontsize=20)
    plt.grid()
    plt.show()
DPGMM
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Compare a plain GMM with a Dirichlet-process (Bayesian) GMM on 800 2-D
samples from two Gaussians: with n_components=3 the DPGMM should leave the
surplus component unused while the plain GMM splits a cluster."""
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import scipy.linalg  # BUGFIX: `import scipy as sp` alone does not load sp.linalg
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse


def expand(a, b, rate=0.05):
    """Widen the interval [a, b] by `rate` on each side (plot margins)."""
    d = (b - a) * rate
    return a - d, b + d


mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)  # shear the first cluster so it is strongly correlated
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0] * N1 + [1] * N2)

    n_components = 3  # deliberately one more than the true cluster count
    colors = '#A0FFA0', '#2090E0', '#FF8080'
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    plt.figure(figsize=(9, 9), facecolor='w')
    plt.suptitle(u'GMM/DPGMM比较', fontsize=23)

    # ---- plain GMM ------------------------------------------------------
    ax = plt.subplot(211)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print('GMM均值 = \n', centers)
    print('GMM方差 = \n', covs)
    y_hat = gmm.predict(x)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
    clrs = list('rgbmy')
    for i, cc in enumerate(zip(centers, covs)):
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        # Orientation of the first eigenvector, in degrees.
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height, angle=angle,
                    color=clrs[i], alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # ---- DPGMM ----------------------------------------------------------
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full',
                                    max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process',
                                    weight_concentration_prior=10)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM均值 = \n', centers)
    print('DPGMM方差 = \n', covs)
    y_hat = dpgmm.predict(x)
    # print(y_hat)
    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:  # skip components the DPGMM never assigns
            continue
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height, angle=angle,
                    color='m', alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('DPGMM', fontsize=20)
    plt.grid(True)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
贝叶斯网络
贝叶斯网络是在朴素贝叶斯方法的基础上拓展构建而来的概率图模型,其基本原理仍然是贝叶斯定理。
示例
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""GaussianNB on the first two iris features: plot the decision regions and
report training-set accuracy."""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier


def iris_type(s):
    """Map an iris class name to an integer label 0/1/2."""
    if isinstance(s, bytes):  # np.loadtxt hands converters bytes under Python 3
        s = s.decode()
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    data = np.loadtxt('..\8.Regression\8.iris.data', dtype=float, delimiter=',',
                      converters={4: iris_type})
    print(data)
    x, y = np.split(data, (4,), axis=1)
    x = x[:, :2]  # keep only sepal length / sepal width
    print(x)
    print(y)
    gnb = Pipeline([
        ('sc', StandardScaler()),
        ('clf', GaussianNB())])
    gnb.fit(x, y.ravel())
    # gnb = MultinomialNB().fit(x, y.ravel())
    # gnb = KNeighborsClassifier(n_neighbors=5).fit(x, y.ravel())

    # Decision surface over a dense grid.
    N, M = 500, 500  # sampling resolution along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of feature 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of feature 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # grid of test points
    x_test = np.stack((x1.flat, x2.flat), axis=1)
    # To use all 4 features, pad the other two dimensions with their means:
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)

    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = gnb.predict(x_test)        # grid predictions
    y_hat = y_hat.reshape(x1.shape)    # back to grid shape
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)  # predicted regions
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=50, cmap=cm_dark)  # samples
    plt.xlabel(u'花萼长度', fontsize=14)
    plt.ylabel(u'花萼宽度', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'GaussianNB对鸢尾花数据的分类结果', fontsize=18)
    plt.grid(True)
    plt.show()

    # Training-set accuracy.
    y_hat = gnb.predict(x)
    y = y.reshape(-1)
    result = y_hat == y
    print(y_hat)
    print(result)
    acc = np.mean(result)
    print('准确度: %.2f%%' % (100 * acc))
主题模型 LDA
主题模型可以用生成模型来看文档和主题这两件事。所谓生成模型,就是说,我们认为一篇文章的每个词都是通过“以一定概率选择了某个主题,并从这个主题中以一定概率选择某个词语”这样一个过程得到的。
代码
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Topic-model walkthrough with gensim: tokenize a small corpus, build
TF-IDF vectors, then fit and inspect LSI, LDA and HDP models."""
from gensim import corpora, models, similarities
from pprint import pprint
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if __name__ == '__main__':
    stop_list = set('for a of the and to in'.split())
    # One document per line; lowercase, split on whitespace, drop stop words.
    # (The original left the file handle open -- use a context manager.)
    with open('22.LDA_test.txt') as f:
        texts = [[word for word in line.strip().lower().split() if word not in stop_list]
                 for line in f]
    print('Text = ')
    pprint(texts)

    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)  # vocabulary size
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print('TF-IDF:')
    for c in corpus_tfidf:
        print(c)

    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=2, num_words=5))
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])  # similarities.Similarity()
    print('Similarity:')
    pprint(list(similarity))

    print('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print('Document-Topic:\n')
    pprint(doc_topic)
    for doc_topic in lda.get_document_topics(corpus_tfidf):
        print(doc_topic)
    for topic_id in range(num_topics):
        print('Topic', topic_id)
        # pprint(lda.get_topic_terms(topicid=topic_id))
        pprint(lda.show_topic(topic_id))
    similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
    print('Similarity:')
    pprint(list(similarity))

    # HDP infers the topic count itself; results vary between runs.
    hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
    topic_result = [a for a in hda[corpus_tfidf]]
    print('\n\nUSE WITH CARE--\nHDA Model:')
    pprint(topic_result)
    print('HDA Topics:')
    print(hda.print_topics(num_topics=2, num_words=5))
LDA包使用
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""LDA on the Reuters corpus bundled with the `lda` package: fit 20 topics,
print the top words per topic and the dominant topic per document, and plot
topic-word / document-topic distributions."""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import lda
import lda.datasets
from pprint import pprint

if __name__ == "__main__":
    # Document-term matrix.
    X = lda.datasets.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(X[:10, :10])
    # Vocabulary.
    vocab = lda.datasets.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))
    print(vocab[:10])
    # Titles for each story.
    titles = lda.datasets.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))
    pprint(titles[:10])

    print('LDA start ----')
    topic_num = 20
    model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
    model.fit(X)

    # Topic-word distribution.
    topic_word = model.topic_word_
    print("type(topic_word): {}".format(type(topic_word)))
    print("shape: {}".format(topic_word.shape))
    print(vocab[:5])
    print(topic_word[:, :5])
    # Top-n words of each topic (argsort ascending, take the last n reversed).
    n = 7
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1]
        print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

    # Document-topic distribution.
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    for i in range(10):
        topic_most_pr = doc_topic[i].argmax()
        print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, doc_topic[i][topic_most_pr]))

    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Topic-word plots for a few topics.
    plt.figure(figsize=(8, 9))
    for i, k in enumerate([0, 5, 9, 14, 19]):
        ax = plt.subplot(5, 1, i + 1)
        ax.plot(topic_word[k, :], 'r-')
        ax.set_xlim(-50, 4350)  # vocabulary index range [0, 4258]
        ax.set_ylim(0, 0.08)
        ax.set_ylabel(u"概率")
        ax.set_title(u"主题 {}".format(k))
    plt.xlabel(u"词", fontsize=14)
    plt.tight_layout()
    plt.suptitle(u'主题的词分布', fontsize=18)
    plt.subplots_adjust(top=0.9)
    plt.show()

    # Document-topic plots for a few documents.
    plt.figure(figsize=(8, 9))
    for i, k in enumerate([1, 3, 4, 8, 9]):
        ax = plt.subplot(5, 1, i + 1)
        ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, topic_num + 1)
        ax.set_ylim(0, 1)
        ax.set_ylabel(u"概率")
        ax.set_title(u"文档 {}".format(k))
    plt.xlabel(u"主题", fontsize=14)
    plt.suptitle(u'文档的主题分布', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
HMM
隐马尔可夫模型(Hidden Markov Model)可用于标注问题,在语音识别、NLP、生物信息、模式识别等领域被实践证明是有效的算法。
HMM是关于时序的概率模型,描述由一个隐藏的马尔科夫链生成不可观测的状态随机序列,再由各个状态生成观测随机序列的过程。
隐马模型随机生成的状态随机序列,称为状态序列,每个状态生成一个观测,由此产生的观测随机序列,称为观测序列。
HMM主要包括三部分信息:初始概率分布π,状态转移概率分布A,观测概率分布B三部分。
隐马模型主要包括三部分的内容:
- 概率计算:前向-后向算法,给定模型和观测序列O,计算该模型下观测序列出现的概率
- 参数估计:baum-welch算法-EM,已知观测序列,估计模型
- 模型预测:viterbi算法-动态规划,已知模型和观测序列,预测观测序列下状态序列I
实例
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Hand-written HMM for Chinese word segmentation.

Four hidden states: 0=B(egin), 1=M(iddle), 2=E(nd), 3=S(ingle).
All probabilities are stored in log space; `infinite` stands in for log(0).
Training uses Baum-Welch (EM); decoding uses Viterbi.
"""
import math
import random

infinite = -(2 ** 31)  # stand-in for log(0)


def log_normalize(a):
    """Normalise list `a` in place, then replace each entry by its log."""
    s = 0
    for x in a:
        s += x
    s = math.log(s)
    for i in range(len(a)):
        if a[i] == 0:
            a[i] = infinite
        else:
            a[i] = math.log(a[i]) - s


def log_sum(a):
    """Numerically stable log(sum(exp(a))); `infinite` for an empty list."""
    if not a:
        return infinite
    m = max(a)
    s = 0
    for t in a:
        s += math.exp(t - m)
    return m + math.log(s)


def calc_alpha(pi, A, B, o, alpha):
    """Forward pass: alpha[t][i] = log P(o[0..t], state_t = i | model)."""
    for i in range(4):
        alpha[0][i] = pi[i] + B[i][ord(o[0])]
    T = len(o)
    temp = [0 for _ in range(4)]
    for t in range(1, T):
        for i in range(4):
            for j in range(4):
                temp[j] = alpha[t - 1][j] + A[j][i]
            alpha[t][i] = log_sum(temp)
            alpha[t][i] += B[i][ord(o[t])]


def calc_beta(pi, A, B, o, beta):
    """Backward pass: beta[t][i] = log P(o[t+1..T-1] | state_t = i, model)."""
    T = len(o)
    for i in range(4):
        beta[T - 1][i] = 1  # NOTE(review): 0 is the usual log-space init -- kept as authored
    temp = [0 for _ in range(4)]
    for t in range(T - 2, -1, -1):
        for i in range(4):
            beta[t][i] = 0
            for j in range(4):
                temp[j] = A[i][j] + B[j][ord(o[t + 1])] + beta[t + 1][j]
            beta[t][i] += log_sum(temp)


def calc_gamma(alpha, beta, gamma):
    """gamma[t][i] = log P(state_t = i | O, model), normalised per step."""
    for t in range(len(alpha)):
        for i in range(4):
            gamma[t][i] = alpha[t][i] + beta[t][i]
        s = log_sum(gamma[t])
        for i in range(4):
            gamma[t][i] -= s


def calc_ksi(alpha, beta, A, B, o, ksi):
    """ksi[t][i][j] = log P(state_t = i, state_{t+1} = j | O, model)."""
    T = len(alpha)
    temp = [0 for _ in range(16)]
    for t in range(T - 1):
        k = 0
        for i in range(4):
            for j in range(4):
                ksi[t][i][j] = alpha[t][i] + A[i][j] + B[j][ord(o[t + 1])] + beta[t + 1][j]
                temp[k] = ksi[t][i][j]
                k += 1
        s = log_sum(temp)
        for i in range(4):
            for j in range(4):
                ksi[t][i][j] -= s


def bw(pi, A, B, alpha, beta, gamma, ksi, o):
    """One Baum-Welch M step: re-estimate pi, A and B from the statistics."""
    T = len(alpha)
    for i in range(4):
        pi[i] = gamma[0][i]
    s1 = [0 for _ in range(T - 1)]
    s2 = [0 for _ in range(T - 1)]
    for i in range(4):
        for j in range(4):
            for t in range(T - 1):
                s1[t] = ksi[t][i][j]
                s2[t] = gamma[t][i]
            A[i][j] = log_sum(s1) - log_sum(s2)
    s1 = [0 for _ in range(T)]
    s2 = [0 for _ in range(T)]
    for i in range(4):
        for k in range(65536):  # one emission entry per BMP code point
            if k % 5000 == 0:
                print(i, k)  # progress indicator -- this loop is slow
            valid = 0
            for t in range(T):
                if ord(o[t]) == k:
                    s1[valid] = gamma[t][i]
                    valid += 1
                s2[t] = gamma[t][i]
            if valid == 0:
                B[i][k] = -log_sum(s2)  # smoothing for unseen characters
            else:
                B[i][k] = log_sum(s1[:valid]) - log_sum(s2)


def baum_welch(pi, A, B):
    """Run 100 EM iterations on the training text, saving parameters each time."""
    # utf-8-sig strips the BOM that the original code skipped via read()[3:].
    with open(".\\2.txt", encoding='utf-8-sig') as f:
        sentence = f.read()
    T = len(sentence)  # length of the observation sequence
    alpha = [[0 for _ in range(4)] for _ in range(T)]
    beta = [[0 for _ in range(4)] for _ in range(T)]
    gamma = [[0 for _ in range(4)] for _ in range(T)]
    ksi = [[[0 for _ in range(4)] for _ in range(4)] for _ in range(T - 1)]
    for time in range(100):
        print("time:", time)
        calc_alpha(pi, A, B, sentence, alpha)
        calc_beta(pi, A, B, sentence, beta)
        calc_gamma(alpha, beta, gamma)
        calc_ksi(alpha, beta, A, B, sentence, ksi)
        bw(pi, A, B, alpha, beta, gamma, ksi, sentence)
        save_parameter(pi, A, B, time)


def list_write(f, v):
    """Write the values of `v` space-separated on one line of file `f`."""
    for a in v:
        f.write(str(a))
        f.write(' ')
    f.write('\n')


def save_parameter(pi, A, B, time):
    """Dump pi, A, B to text files tagged with the iteration number."""
    with open(".\\pi%d.txt" % time, "w") as f_pi:
        list_write(f_pi, pi)
    with open(".\\A%d.txt" % time, "w") as f_A:
        for a in A:
            list_write(f_A, a)
    with open(".\\B%d.txt" % time, "w") as f_B:
        for b in B:
            list_write(f_B, b)


def train():
    """Randomly initialise pi/A/B under the B/M/E/S scheme and run Baum-Welch."""
    pi = [random.random() for _ in range(4)]  # initial distribution
    log_normalize(pi)
    A = [[random.random() for _ in range(4)] for _ in range(4)]  # transitions: B/M/E/S
    # Transitions impossible under B/M/E/S (e.g. Begin cannot follow Begin).
    A[0][0] = A[0][3] = A[1][0] = A[1][3] \
        = A[2][1] = A[2][2] = A[3][1] = A[3][2] = 0
    B = [[random.random() for _ in range(65536)] for _ in range(4)]
    for i in range(4):
        log_normalize(A[i])
        log_normalize(B[i])
    baum_welch(pi, A, B)
    return pi, A, B


def load_train():
    """Load pi, A, B from the text files written by a previous training run."""
    with open(".\\pi.txt", mode="r") as f:
        for line in f:
            pi = list(map(float, line.split(' ')[:-1]))
    A = [[] for _ in range(4)]  # transition matrix: B/M/E/S
    with open(".\\A.txt", mode="r") as f:
        for i, line in enumerate(f):
            A[i] = list(map(float, line.split(' ')[:-1]))
    B = [[] for _ in range(4)]
    with open(".\\B.txt", mode="r") as f:
        for i, line in enumerate(f):
            B[i] = list(map(float, line.split(' ')[:-1]))
    return pi, A, B


def viterbi(pi, A, B, o):
    """Return the most likely state path (list of 0..3) for string `o`."""
    T = len(o)
    delta = [[0 for _ in range(4)] for _ in range(T)]
    # pre[t][i]: best predecessor state of state i at time t.
    pre = [[0 for _ in range(4)] for _ in range(T)]
    for i in range(4):
        delta[0][i] = pi[i] + B[i][ord(o[0])]
    for t in range(1, T):
        for i in range(4):
            delta[t][i] = delta[t - 1][0] + A[0][i]
            for j in range(1, 4):
                vj = delta[t - 1][j] + A[j][i]
                if delta[t][i] < vj:
                    delta[t][i] = vj
                    pre[t][i] = j
            delta[t][i] += B[i][ord(o[t])]
    # Backtrack from the best final state.
    decode = [-1 for _ in range(T)]
    q = 0
    for i in range(1, 4):
        if delta[T - 1][i] > delta[T - 1][q]:
            q = i
    decode[T - 1] = q
    for t in range(T - 2, -1, -1):
        q = pre[t + 1][q]
        decode[t] = q
    return decode


def segment(sentence, decode):
    """Print `sentence` split into words according to the decoded states."""
    N = len(sentence)
    i = 0
    while i < N:  # B/M/E/S
        if decode[i] == 0 or decode[i] == 1:  # Begin/Middle: scan to the End state
            j = i + 1
            while j < N:
                if decode[j] == 2:
                    break
                j += 1
            print(sentence[i:j + 1], "|", end=' ')
            i = j + 1
        elif decode[i] == 3 or decode[i] == 2:  # single-character word
            print(sentence[i:i + 1], "|", end=' ')
            i += 1
        else:
            print('Error:', i, decode[i])
            i += 1


if __name__ == "__main__":
    pi, A, B = load_train()
    with open(".\\24.mybook.txt", encoding='utf-8-sig') as f:
        data = f.read()
    decode = viterbi(pi, A, B, data)
    segment(data, decode)
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Sample from a hand-specified 5-state GaussianHMM, re-estimate the model
from the samples with hmmlearn, align the recovered states with the true
ones and report accuracy."""
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import pairwise_distances_argmin
import warnings


def expand(a, b):
    """Widen the interval [a, b] by 5% on each side (plot margins)."""
    d = (b - a) * 0.05
    return a - d, b + d


if __name__ == "__main__":
    warnings.filterwarnings("ignore")  # hmmlearn(0.2.0) < sklearn(0.18)
    np.random.seed(0)
    n = 5  # number of hidden states
    n_samples = 1000
    pi = np.random.rand(n)
    pi /= pi.sum()
    print('初始概率:', pi)
    A = np.random.rand(n, n)
    # Forbid selected transitions so the chain has structure.
    # NOTE: np.bool was removed from modern NumPy -- use the builtin bool.
    mask = np.zeros((n, n), dtype=bool)
    mask[0][1] = mask[0][4] = True
    mask[1][0] = mask[1][2] = True
    mask[2][1] = mask[2][3] = True
    mask[3][2] = mask[3][4] = True
    mask[4][0] = mask[4][3] = True
    A[mask] = 0
    for i in range(n):
        A[i] /= A[i].sum()
    print('转移概率:\n', A)
    means = np.array(((30, 30), (0, 50), (-25, 30), (-15, 0), (15, 0)))
    print('均值:\n', means)
    covars = np.empty((n, 2, 2))
    for i in range(n):
        # covars[i] = np.diag(np.random.randint(1, 5, size=2))
        covars[i] = np.diag(np.random.rand(2) + 0.001) * 10  # np.random.rand ∈ [0,1)
    print('方差:\n', covars)

    # Build the generating model and draw samples from it.
    model = hmm.GaussianHMM(n_components=n, covariance_type='full')
    model.startprob_ = pi
    model.transmat_ = A
    model.means_ = means
    model.covars_ = covars
    sample, labels = model.sample(n_samples=n_samples, random_state=0)

    # Re-estimate the parameters from the samples alone.
    model = hmm.GaussianHMM(n_components=n, covariance_type='full', n_iter=10)
    model = model.fit(sample)
    y = model.predict(sample)
    np.set_printoptions(suppress=True)
    print('##估计初始概率:', model.startprob_)
    print('##估计转移概率:\n', model.transmat_)
    print('##估计均值:\n', model.means_)
    print('##估计方差:\n', model.covars_)

    # State labels are arbitrary: align estimated states to the true ones
    # by nearest mean, then relabel the predictions accordingly.
    order = pairwise_distances_argmin(means, model.means_, metric='euclidean')
    print(order)
    pi_hat = model.startprob_[order]
    A_hat = model.transmat_[order]
    A_hat = A_hat[:, order]
    means_hat = model.means_[order]
    covars_hat = model.covars_[order]
    change = np.empty((n, n_samples), dtype=bool)
    for i in range(n):
        change[i] = y == order[i]
    for i in range(n):
        y[change[i]] = i
    print('估计初始概率:', pi_hat)
    print('估计转移概率:\n', A_hat)
    print('估计均值:\n', means_hat)
    print('估计方差:\n', covars_hat)
    print(labels)
    print(y)
    acc = np.mean(labels == y) * 100
    print('准确率:%.2f%%' % acc)

    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.scatter(sample[:, 0], sample[:, 1], s=50, c=labels, cmap=plt.cm.Spectral,
                marker='o', label=u'观测值', linewidths=0.5, zorder=20)
    plt.plot(sample[:, 0], sample[:, 1], 'r-', zorder=10)
    plt.scatter(means[:, 0], means[:, 1], s=100, c=np.random.rand(n), marker='D',
                label=u'中心', alpha=0.8, zorder=30)
    x1_min, x1_max = sample[:, 0].min(), sample[:, 0].max()
    x2_min, x2_max = sample[:, 1].min(), sample[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Decompose daily SH600000 stock features (price change, volume, amount,
daily range) into 5 hidden states with a GaussianHMM and plot the posterior
state probabilities."""
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings


def expand(a, b):
    """Widen the interval [a, b] by 5% on each side (plot margins)."""
    d = (b - a) * 0.05
    return a - d, b + d


if __name__ == "__main__":
    warnings.filterwarnings("ignore")  # hmmlearn(0.2.0) < sklearn(0.18)
    # Input columns: 0 date 1 open 2 high 3 low 4 close 5 volume 6 amount
    x = np.loadtxt('24.SH600000.txt', delimiter='\t', skiprows=2, usecols=(4, 5, 6, 2, 3))
    close_price = x[:, 0]
    volumn = x[:, 1]
    amount = x[:, 2]
    amplitude_price = x[:, 3] - x[:, 4]  # daily high-low spread
    diff_price = np.diff(close_price)    # day-over-day close change
    # np.diff shortens the series by one; drop the first day everywhere.
    volumn = volumn[1:]                  # volume
    amount = amount[1:]                  # turnover
    amplitude_price = amplitude_price[1:]
    sample = np.column_stack((diff_price, volumn, amount, amplitude_price))  # observations

    n = 5
    model = hmm.GaussianHMM(n_components=n, covariance_type='full')
    model.fit(sample)
    y = model.predict_proba(sample)
    np.set_printoptions(suppress=True)
    print(y)

    t = np.arange(len(diff_price))
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')
    plt.subplot(421)
    plt.plot(t, diff_price, 'r-')
    plt.grid(True)
    plt.title(u'涨跌幅')
    plt.subplot(422)
    plt.plot(t, volumn, 'g-')
    plt.grid(True)
    plt.title(u'交易量')
    clrs = plt.cm.terrain(np.linspace(0, 0.8, n))
    plt.subplot(423)
    for i, clr in enumerate(clrs):
        plt.plot(t, y[:, i], '-', color=clr, alpha=0.7)
    plt.title(u'所有组分')
    plt.grid(True)
    for i, clr in enumerate(clrs):
        axes = plt.subplot(4, 2, i + 4)
        plt.plot(t, y[:, i], '-', color=clr)
        plt.title(u'组分%d' % (i + 1))
        plt.grid(True)
    plt.suptitle(u'SH600000股票:GaussianHMM分解隐变量', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
- scikit-learn 常用模型介绍及使用(下)
- scikit-learn 常用模型介绍及使用(上)
- python scikit learn 常用模型汇总
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习 |
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 【Scikit-Learn 中文文档】使用 scikit-learn 介绍机器学习
- 互联网DSP广告系统架构及关键技术解析 | 广告行业资深架构师亲述
- DIV+CSS上中下左中右布局案例
- Python-set函数
- opencv实现人脸眼睛的检测
- 一些关于logging部分的代码笔记以及讲解
- scikit-learn 常用模型介绍及使用(下)
- Android ScrollView去掉滚动条及ScrollView属性
- Android流失布局
- Oracle数据库整理笔记
- idea使用maven构建mybatis程序遇到的几个问题
- 现代操作系统在之死锁
- 见微知著:语义分割中的弱监督学习
- LeetCode 540. Single Element in a Sorted Array
- cookie/sessionStorage/localStorage 的区别及用法