python数据分析学习笔记七
来源:互联网 发布:小米手环 知乎 编辑:程序博客网 时间:2024/03/29 04:51
第七章 信号处理与时间序列
(需要统计学知识)
1 statsmodels 子库
示例代码如下
import pkgutil as puimport pydocimport statsmodels as sm# statmodels版本号print("statmodels version", sm.__version__)def clean(astr): s = astr # remove multiple spaces s = ' '.join(s.split()) s = s.replace('=', '') return sdef print_desc(prefix, pkg_path): print("pkg_path", pkg_path) ''' :param prefix: 模块名称 :param pkg_path:模块包路径 :return: ''' for pkg in pu.iter_modules(path=pkg_path): '''Yields (module_loader,模块 name,子库名 ispkg)是否''' print("pkg", pkg) name = prefix + "." + pkg[1] # 输出子库名,帮助文档信息 if pkg[2] == True: try: docstr = pydoc.plain(pydoc.render_doc(name)) docstr = clean(docstr) start = docstr.find("DESCRIPTION") docstr = docstr[start: start + 140] print(name, docstr) except: continueprint_desc("statsmodels", sm.__path__)
运行结果如下:
statmodels version 0.8.0rc1
statsmodels.base
statsmodels.compat
statsmodels.datasets
statsmodels.discrete
statsmodels.distributions
statsmodels.duration
statsmodels.emplike
statsmodels.formula
statsmodels.genmod
statsmodels.graphics
statsmodels.imputation
statsmodels.interface
statsmodels.iolib
statsmodels.miscmodels
statsmodels.multivariate
statsmodels.nonparametric DESCRIPTION Foran overview of this module, see docs/source/nonparametric.rst PACKAGE CONTENTS_kernel_base _smoothers_lowess api bandwidths
statsmodels.regression
statsmodels.resampling
statsmodels.robust
statsmodels.sandbox
statsmodels.src
statsmodels.stats
statsmodels.tools
statsmodels.tsa
2 移动平均值
示例代码如下:
import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_meandata_loader = sm.datasets.sunspots.load_pandas()df = data_loader.datayear_range = df['YEAR'].valuesplt.plot(year_range, df["SUNACTIVITY"].values, label="Original")plt.plot(year_range, rolling_mean(df, 11)["SUNACTIVITY"].values, label="SMA 11")plt.plot(year_range, rolling_mean(df, 22)["SUNACTIVITY"].values, label="SMA 22")plt.legend()plt.show()
方法二:
import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_meanimport pandas.core.genericdata_loader = sm.datasets.sunspots.load_pandas()df = data_loader.datadf11 = data_loader.data.rolling(window=11, center=False).mean()df22 = data_loader.data.rolling(window=22, center=False).mean()year_range = df['YEAR'].valuesplt.plot(year_range, df["SUNACTIVITY"].values, label="Original")# plt.plot(year_range, rolling_mean(df, window=11, center=False)["SUNACTIVITY"].values, label="SMA 11")# plt.plot(year_range, rolling_mean(df, window=22, center=False)["SUNACTIVITY"].values, label="SMA 22")plt.plot(year_range, df11["SUNACTIVITY"].values, label="SMA 11")plt.plot(year_range, df22["SUNACTIVITY"].values, label="SMA 22")plt.legend()plt.show()
运行结果如下:
3 窗口函数
是定义在一个区间(窗口)上的函数,超出定义,函数值取零,可以用来分析频谱,设计滤波器
import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_windowimport pandas as pddata_loader = sm.datasets.sunspots.load_pandas()# 从尾部取150行df = data_loader.data.tail(150)df = pd.DataFrame({"SUNACTIVITY": df['SUNACTIVITY'].values}, index=df['YEAR'])ax = df.plot()def plot_window(win_type): # df2 = rolling_window(df, 22, win_type) df2 = df.rolling(window=22, win_type=win_type).mean() df2.columns = [win_type] # 显示原始数据 df2.plot(ax=ax)# 矩形窗口plot_window("boxcar")# 三角形窗口plot_window("triang")# 布莱克曼窗口plot_window("blackman")# 汉宁窗plot_window("hanning")# 巴特莱特窗plot_window("bartlett")# 显示网格plt.grid()plt.show()
运行结果如下:
4 协整的定义
示例代码如下:
import statsmodels.api as smfrom pandas.stats.moments import rolling_windowimport pandas as pdimport statsmodels.tsa.stattools as tsimport numpy as np# 用来计算ADF统计量def calc_adf(x, y): result = sm.OLS(x, y).fit() return ts.adfuller(result.resid)# 太阳黑子数据载入到numpy数组data_loader = sm.datasets.sunspots.load_pandas()data = data_loader.data.valuesN = len(data)# 计算正弦值,求出该值与自身的协整关系t = np.linspace(-2 * np.pi, 2 * np.pi, N)sine = np.sin(np.sin(t))print('Self ADF', calc_adf(sine, sine))# 给正弦波添加噪音noise = np.random.normal(0, .01, N)print('ADF sine with noise', calc_adf(sine, sine + noise))# 生成一个幅值和偏移量更大的余弦波,并混入噪音cosine = 100 * np.cos(t) + 10print('ADF sine vs cosine with noise', calc_adf(sine, cosine + noise))# 正弦与太阳黑子print('Sine vs sunspots', calc_adf(sine, data))
运行结果如下:
Self ADF (2.1752959320935576e-16,
0.95853208606005602,
0,
308,
{'10%': -2.5717944160060719,
'1%': -3.4517611601803702,
'5%': -2.8709700936076912},
-21598.896016765088)
ADF sine with noise (-11.80572756306368,
9.1062110841151392e-22,
2,
306,
{'10%': -2.5718274501260199,
'1%': -3.4519023023726696,
'5%': -2.8710320399170537},
-1857.1417094083959)
ADF sine vs cosine with noise(-6.9222457355201135,
1.1386106445203264e-09,
16,
292,
{'10%': -2.5720714378870331,
'1%': -3.4529449243622383,
'5%': -2.8714895534256861},
-10180.957513197414)
Sine vs sunspots (-6.7242691810700963,
3.4210811915549913e-09,
16,
292,
{'10%': -2.5720714378870331,
'1%': -3.4529449243622383,
'5%': -2.8714895534256861},
-1102.5867415291168)
5 自相关
数据集内部的相关性,可以用来指明趋势
示例代码如下:
import numpy as npimport pandas as pdimport statsmodels.api as smimport matplotlib.pyplot as pltfrom pandas.tools.plotting import autocorrelation_plot# 读入测试数据data_loader = sm.datasets.sunspots.load_pandas()data = data_loader.data['SUNACTIVITY'].values# 计算自相关值y = data - np.mean(data)norm = np.sum(y ** 2)correlated = np.correlate(y, y, mode='full') / normres = correlated[len(correlated) / 2:]# 关联度最高值的索引print(np.argsort(res)[-5:])# 结果为[ 9 11 10 1 0]# 绘制图形plt.plot(res)plt.grid(True)plt.xlabel('Lag')plt.ylabel('Autocorrelation')plt.show()# 使用pandas绘制autocorrelation_plot(data)plt.show()
运行结果如下:
使用pandas绘制
6 自回归模型
可用于预测时间序列将来的值
一元线性回归的计算公式(因变量y与自变量x之间的线性关系)
β0和β1为模型的参数
ε误差项一个期望值为0的随机变量
回归方程
E(y)=β0+β1x
自加归模型的数学公式
示例代码如下:
from scipy.optimize import leastsqimport statsmodels.api as smimport matplotlib.pyplot as pltimport numpy as np# 搭建模型代码def model(p, x1, x10): p1, p10 = p return p1 * x1 + p10 * x10def error(p, data, x1, x10): return data - model(p, x1, x10)# 给参数表赋值def fit(data): p0 = [.5, 0.5] params = leastsq(error, p0, args=(data[10:], data[9:-1], data[:-10]))[0] return params# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data['SUNACTIVITY'].valuesprint(sunspots)# 训练模型cutoff = .9 * len(sunspots)params = fit(sunspots[:cutoff])print('Params', params)# 取得各个指标的值pred = params[0] * sunspots[cutoff - 1:-1] + params[1] * sunspots[cutoff - 10:-10]actual = sunspots[cutoff:]print('Root mean square error', np.sqrt(np.mean(actual - pred) ** 2))print('Mean absolute error ', np.mean(np.abs(actual - pred)))print('Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / actual))mid = (actual + pred) / 2print('Symmetric Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / mid))print('Cofficient of determination', 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())year_range = data_loader.data['YEAR'].values[cutoff:]# 绘制图像# 太阳黑子活动数值plt.plot(year_range, actual, 'o', label='Sunspots')# 预测值plt.plot(year_range, pred, 'x', label='Prediction')plt.grid(True)plt.xlabel('YEAR')plt.ylabel('SUNACTIVITY')plt.legend()plt.show()
运行结果如下:
Params [ 0.67172672 0.33626295]
Root mean square error 1.02884848439
Mean absolute error 17.6515446503
Mean absolute percentage error60.7817800736
Symmetric Mean absolute percentage error34.9843386176
Cofficient of determination 0.799940292779
7 ARMA模型
由自回归模型和移动平均模型结合而成,用于时间序列的预测
示例代码如下:
import pandas as pdimport matplotlib.pyplot as pltimport statsmodels.api as smimport datetime# 加载数据data_loader = sm.datasets.sunspots.load_pandas()df = data_loader.data# 拟合数据years = df['YEAR'].values.astype(int)str(years[0])df.index = pd.Index(sm.tsa.datetools.dates_from_range(str(years[0]), str(years[-1])))del df['YEAR']# 预测数据model = sm.tsa.ARMA(df, (10, 1)).fit()prediction = model.predict('1975', str(years[-1]), dynamic=True)# 绘制数据# 太阳黑子活动数据df['1975':].plot()# 预测数据prediction.plot(style='--', label='Prediction')plt.grid(True)plt.legend()plt.show()
运行结果如下:
8 生成周期信号
示例代码如下:
from scipy.optimize import leastsqimport statsmodels.api as smimport matplotlib.pyplot as pltimport numpy as np# 搭建模型def model(p, t): C, p1, f1, phi1, p2, f2, phi2, p3, f3, phi3 = p return C + p1 * np.sin(f1 * t + phi1) + p2 * np.sin(f2 * t + phi2) + p3 * np.sin(f3 * t + phi3)def error(p, y, t): return y - model(p, t)# 给参数表赋值def fit(y, t): p0 = [y.mean(), 0, 2 * np.pi / 11, 0, 0, 2 * np.pi / 22, 0, 0, 2 * np.pi / 100, 0] params = leastsq(error, p0, args=(y, t))[0] return params# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].valuesyears = data_loader.data["YEAR"].values# 训练模型cutoff = .9 * len(sunspots)params = fit(sunspots[:cutoff], years[:cutoff])print("Params", params)# 取得各个指标的值pred = model(params, years[cutoff:])actual = sunspots[cutoff:]print("Root mean square error", np.sqrt(np.mean((actual - pred) ** 2)))print("Mean absolute error", np.mean(np.abs(actual - pred)))print("Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / actual))mid = (actual + pred) / 2print("Symmetric Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / mid))print("Coefficient of determination", 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())year_range = data_loader.data["YEAR"].values[cutoff:]# 绘制图形plt.plot(year_range, actual, 'o', label="Sunspots")plt.plot(year_range, pred, 'x', label="Prediction")plt.grid(True)plt.xlabel("YEAR")plt.ylabel("SUNACTIVITY")plt.legend()plt.show()
运行结果如下:
Params [ 47.18800156 28.89947466 0.56827281 6.51174621 4.55214731
0.29372076 -14.3092358 -18.16524066 0.06574835 -4.37789397]
Root mean square error 59.5619988827
Mean absolute error 44.581532168 #平均绝结误差
Mean absolute percentage error65.1643378479
Symmetric Mean absolute percentage error78.4479302694
Coefficient of determination-0.363528934815 #判定系数应尽量接近于1
9 傅量叶分析
FFT (fast fourier transform)快速傅里叶变换
示例代码如下:
import numpy as npimport statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.fftpack import rfftfrom scipy.fftpack import fftshift# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].values# 创建一个正弦波t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))mid = np.ptp(sunspots) / 2sine = mid + mid * np.sin(np.sin(t))sine_fft = np.abs(fftshift(rfft(sine)))# 最大振幅的相应索引print("Index of max sine FFT", np.argsort(sine_fft)[-5:])# 对太阳黑子进行 ffttransformed = np.abs(fftshift(rfft(sunspots)))print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])# 太阳黑子活动数据和sine函数plt.subplot(311)plt.plot(sunspots, label="Sunspots")plt.plot(sine, lw=2, label="Sine")plt.grid(True)plt.legend()# 傅里叶变换后的太阳黑子活动数据plt.subplot(312)plt.plot(transformed, label="Transformed Sunspots")plt.grid(True)plt.legend()# 傅里叶变换后的sine函数plt.subplot(313)plt.plot(sine_fft, lw=2, label="Transformed Sine")plt.grid(True)plt.legend()plt.show()
运行结果如下:
Index of max sine FFT [160 157 166 158 154]
Indices of max sunspots FFT [205 212 215209 154]
10 谱分析
功率频谱
示例代码如下:
import numpy as npimport statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.fftpack import rfftfrom scipy.fftpack import fftshift# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].values# 创建一个正弦波t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))mid = np.ptp(sunspots) / 2sine = mid + mid * np.sin(np.sin(t))# 对正弦波进行FFTsine_fft = np.abs(fftshift(rfft(sine)))# 最大振幅的相应索引print("Index of max sine FFT", np.argsort(sine_fft)[-5:])# 对太阳黑子进行 ffttransformed = fftshift(rfft(sunspots))print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])# 太阳黑子活动数据和sine函数plt.subplot(311)plt.plot(sunspots, label="Sunspots")plt.plot(sine, lw=2, label="Sine")plt.grid(True)plt.legend()# 绘制功率频谱plt.subplot(312)plt.plot(transformed ** 2, label="Power Spectrum")plt.grid(True)plt.legend()# 相位谱,正弦函数的起始角plt.subplot(313)plt.plot(np.angle(transformed), label="Phase Spectrum")plt.grid(True)plt.legend()plt.show()
运行结果如下:
11 滤波
是一种信号处理技术,可以对信号的某些部分进行删减或抑制,可以对高频或是低频进行滤波
中值滤波
Wiener滤波
Detrend滤波
示例代码如下:
import statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.signal import medfiltfrom scipy.signal import wienerfrom scipy.signal import detrend# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data['SUNACTIVITY'].valuesyears = data_loader.data['YEAR'].values# 绘制图像# 原始数据plt.plot(years, sunspots, label='SUNATCIVITY')# 中值滤波plt.plot(years, medfilt(sunspots, 11), lw=2, label='Meadian')# wiener滤波plt.plot(years, wiener(sunspots, 11), '--', lw=2, label='Wiener')# detrend滤波plt.plot(years, detrend(sunspots), lw=3, label='Detrend')plt.xlabel('YEAR')plt.grid(True)plt.legend()plt.show()
运行结果如下:
- python数据分析学习笔记七
- python数据分析学习笔记
- Python数据分析学习笔记一
- Python数据分析学习笔记二
- Python数据分析学习笔记三
- Python数据分析学习笔记四
- Python数据分析学习笔记五
- Python数据分析学习笔记六
- python数据分析入门学习笔记儿
- python数据分析入门学习笔记
- python数据分析入门学习笔记儿
- python数据分析入门学习笔记儿
- python数据分析学习笔记一
- python数据分析学习笔记二
- python数据分析学习笔记三
- python数据分析学习笔记六
- python数据分析入门学习笔记
- # Python数据分析学习笔记(一)
- jquery Ajax操作
- python数据分析学习笔记八
- 剖析淘宝 TDDL ( TAOBAO DISTRIBUTE DATA LAYER )
- darwin streaming server 6.0.3 Linux编译
- LNMP添加、删除虚拟主机
- python数据分析学习笔记七
- HTTP、 HTTP1.1、 HTTP/2的区别
- h5中的列表
- 关于在高并发下生成订单号的策略
- [笔记]C++的空类
- 由python2.7的UnicodeEncodeError说Python的编码问题
- windows安装node.js
- 设计模式之---中介者模式
- hadoop测试wordcount出现的问题