python数据分析学习笔记七

来源:互联网 发布:小米手环 知乎 编辑:程序博客网 时间:2024/03/29 04:51

第七章 信号处理与时间序列

(需要统计学知识)

1 statsmodels 子库

示例代码如下

import pkgutil as puimport pydocimport statsmodels as sm# statmodels版本号print("statmodels version", sm.__version__)def clean(astr):    s = astr    # remove multiple spaces    s = ' '.join(s.split())    s = s.replace('=', '')    return sdef print_desc(prefix, pkg_path):    print("pkg_path", pkg_path)    '''    :param prefix: 模块名称    :param pkg_path:模块包路径    :return:    '''    for pkg in pu.iter_modules(path=pkg_path):        '''Yields (module_loader,模块         name,子库名         ispkg)是否'''        print("pkg", pkg)        name = prefix + "." + pkg[1]        # 输出子库名,帮助文档信息        if pkg[2] == True:            try:                docstr = pydoc.plain(pydoc.render_doc(name))                docstr = clean(docstr)                start = docstr.find("DESCRIPTION")                docstr = docstr[start: start + 140]                print(name, docstr)            except:                continueprint_desc("statsmodels", sm.__path__)

运行结果如下:

statmodels version 0.8.0rc1

statsmodels.base

statsmodels.compat

statsmodels.datasets

statsmodels.discrete

statsmodels.distributions

statsmodels.duration

statsmodels.emplike

statsmodels.formula

statsmodels.genmod

statsmodels.graphics

statsmodels.imputation

statsmodels.interface

statsmodels.iolib

statsmodels.miscmodels

statsmodels.multivariate

statsmodels.nonparametric DESCRIPTION Foran overview of this module, see docs/source/nonparametric.rst PACKAGE CONTENTS_kernel_base _smoothers_lowess api bandwidths

statsmodels.regression

statsmodels.resampling

statsmodels.robust

statsmodels.sandbox

statsmodels.src

statsmodels.stats

statsmodels.tools

statsmodels.tsa

 

2 移动平均值

示例代码如下:

import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_meandata_loader = sm.datasets.sunspots.load_pandas()df = data_loader.datayear_range = df['YEAR'].valuesplt.plot(year_range, df["SUNACTIVITY"].values, label="Original")plt.plot(year_range, rolling_mean(df, 11)["SUNACTIVITY"].values, label="SMA 11")plt.plot(year_range, rolling_mean(df, 22)["SUNACTIVITY"].values, label="SMA 22")plt.legend()plt.show()

 

 

 

 

方法二:

import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_meanimport pandas.core.genericdata_loader = sm.datasets.sunspots.load_pandas()df = data_loader.datadf11 = data_loader.data.rolling(window=11, center=False).mean()df22 = data_loader.data.rolling(window=22, center=False).mean()year_range = df['YEAR'].valuesplt.plot(year_range, df["SUNACTIVITY"].values, label="Original")# plt.plot(year_range, rolling_mean(df, window=11, center=False)["SUNACTIVITY"].values, label="SMA 11")# plt.plot(year_range, rolling_mean(df, window=22, center=False)["SUNACTIVITY"].values, label="SMA 22")plt.plot(year_range, df11["SUNACTIVITY"].values, label="SMA 11")plt.plot(year_range, df22["SUNACTIVITY"].values, label="SMA 22")plt.legend()plt.show()

 

运行结果如下:

 

3 窗口函数

 是定义在一个区间(窗口)上的函数,超出定义,函数值取零,可以用来分析频谱,设计滤波器

import matplotlib.pyplot as pltimport statsmodels.api as smfrom pandas.stats.moments import rolling_windowimport pandas as pddata_loader = sm.datasets.sunspots.load_pandas()# 从尾部取150df = data_loader.data.tail(150)df = pd.DataFrame({"SUNACTIVITY": df['SUNACTIVITY'].values}, index=df['YEAR'])ax = df.plot()def plot_window(win_type):    # df2 = rolling_window(df, 22, win_type)    df2 = df.rolling(window=22, win_type=win_type).mean()    df2.columns = [win_type]    # 显示原始数据    df2.plot(ax=ax)# 矩形窗口plot_window("boxcar")# 三角形窗口plot_window("triang")# 布莱克曼窗口plot_window("blackman")# 汉宁窗plot_window("hanning")# 巴特莱特窗plot_window("bartlett")# 显示网格plt.grid()plt.show()

运行结果如下:

4 协整的定义

示例代码如下:

import statsmodels.api as smfrom pandas.stats.moments import rolling_windowimport pandas as pdimport statsmodels.tsa.stattools as tsimport numpy as np# 用来计算ADF统计量def calc_adf(x, y):    result = sm.OLS(x, y).fit()    return ts.adfuller(result.resid)# 太阳黑子数据载入到numpy数组data_loader = sm.datasets.sunspots.load_pandas()data = data_loader.data.valuesN = len(data)# 计算正弦值,求出该值与自身的协整关系t = np.linspace(-2 * np.pi, 2 * np.pi, N)sine = np.sin(np.sin(t))print('Self ADF', calc_adf(sine, sine))# 给正弦波添加噪音noise = np.random.normal(0, .01, N)print('ADF sine with noise', calc_adf(sine, sine + noise))# 生成一个幅值和偏移量更大的余弦波,并混入噪音cosine = 100 * np.cos(t) + 10print('ADF sine vs cosine with noise', calc_adf(sine, cosine + noise))# 正弦与太阳黑子print('Sine vs sunspots', calc_adf(sine, data))

运行结果如下:

Self ADF (2.1752959320935576e-16,

 0.95853208606005602,

0,

308,

 {'10%': -2.5717944160060719,

'1%': -3.4517611601803702,

'5%': -2.8709700936076912},

 -21598.896016765088)

 

ADF sine with noise (-11.80572756306368,

 9.1062110841151392e-22,

 2,

306,

{'10%': -2.5718274501260199,

'1%': -3.4519023023726696,

 '5%': -2.8710320399170537},

 -1857.1417094083959)

 

ADF sine vs cosine with noise(-6.9222457355201135,

1.1386106445203264e-09,

 16,

292,

{'10%': -2.5720714378870331,

 '1%': -3.4529449243622383,

'5%': -2.8714895534256861},

-10180.957513197414)

 

Sine vs sunspots (-6.7242691810700963,

 3.4210811915549913e-09,

 16,

292,

{'10%': -2.5720714378870331,

'1%': -3.4529449243622383,

'5%': -2.8714895534256861},

-1102.5867415291168)

 

 

5 自相关

数据集内部的相关性,可以用来指明趋势

示例代码如下:

import numpy as npimport pandas as pdimport statsmodels.api as smimport matplotlib.pyplot as pltfrom pandas.tools.plotting import autocorrelation_plot# 读入测试数据data_loader = sm.datasets.sunspots.load_pandas()data = data_loader.data['SUNACTIVITY'].values# 计算自相关值y = data - np.mean(data)norm = np.sum(y ** 2)correlated = np.correlate(y, y, mode='full') / normres = correlated[len(correlated) / 2:]# 关联度最高值的索引print(np.argsort(res)[-5:])# 结果为[ 9 11 10  1  0]# 绘制图形plt.plot(res)plt.grid(True)plt.xlabel('Lag')plt.ylabel('Autocorrelation')plt.show()# 使用pandas绘制autocorrelation_plot(data)plt.show()

 

运行结果如下:

 

使用pandas绘制

6 自回归模型

可用于预测时间序列将来的值

一元线性回归的计算公式(因变量y与自变量x之间的线性关系)

β0和β1为模型的参数

ε误差项一个期望值为0的随机变量

回归方程

E(y)=β0+β1x

 

自加归模型的数学公式

示例代码如下:

from scipy.optimize import leastsqimport statsmodels.api as smimport matplotlib.pyplot as pltimport numpy as np# 搭建模型代码def model(p, x1, x10):    p1, p10 = p    return p1 * x1 + p10 * x10def error(p, data, x1, x10):    return data - model(p, x1, x10)# 给参数表赋值def fit(data):    p0 = [.5, 0.5]    params = leastsq(error, p0, args=(data[10:], data[9:-1], data[:-10]))[0]    return params# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data['SUNACTIVITY'].valuesprint(sunspots)# 训练模型cutoff = .9 * len(sunspots)params = fit(sunspots[:cutoff])print('Params', params)# 取得各个指标的值pred = params[0] * sunspots[cutoff - 1:-1] + params[1] * sunspots[cutoff - 10:-10]actual = sunspots[cutoff:]print('Root mean square error', np.sqrt(np.mean(actual - pred) ** 2))print('Mean absolute error ', np.mean(np.abs(actual - pred)))print('Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / actual))mid = (actual + pred) / 2print('Symmetric Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / mid))print('Cofficient of determination', 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())year_range = data_loader.data['YEAR'].values[cutoff:]# 绘制图像# 太阳黑子活动数值plt.plot(year_range, actual, 'o', label='Sunspots')# 预测值plt.plot(year_range, pred, 'x', label='Prediction')plt.grid(True)plt.xlabel('YEAR')plt.ylabel('SUNACTIVITY')plt.legend()plt.show()

 

运行结果如下:

Params [ 0.67172672  0.33626295]

 

Root mean square error 1.02884848439

Mean absolute error  17.6515446503

Mean absolute percentage error60.7817800736

Symmetric Mean absolute percentage error34.9843386176

Cofficient of determination 0.799940292779

 

 

7 ARMA模型

由自回归模型和移动平均模型结合而成,用于时间序列的预测

 

示例代码如下:

import pandas as pdimport matplotlib.pyplot as pltimport statsmodels.api as smimport datetime# 加载数据data_loader = sm.datasets.sunspots.load_pandas()df = data_loader.data# 拟合数据years = df['YEAR'].values.astype(int)str(years[0])df.index = pd.Index(sm.tsa.datetools.dates_from_range(str(years[0]), str(years[-1])))del df['YEAR']# 预测数据model = sm.tsa.ARMA(df, (10, 1)).fit()prediction = model.predict('1975', str(years[-1]), dynamic=True)# 绘制数据# 太阳黑子活动数据df['1975':].plot()# 预测数据prediction.plot(style='--', label='Prediction')plt.grid(True)plt.legend()plt.show()

 

运行结果如下:

 

 

 

8 生成周期信号

示例代码如下:

from scipy.optimize import leastsqimport statsmodels.api as smimport matplotlib.pyplot as pltimport numpy as np# 搭建模型def model(p, t):    C, p1, f1, phi1, p2, f2, phi2, p3, f3, phi3 = p    return C + p1 * np.sin(f1 * t + phi1) + p2 * np.sin(f2 * t + phi2) + p3 * np.sin(f3 * t + phi3)def error(p, y, t):    return y - model(p, t)# 给参数表赋值def fit(y, t):    p0 = [y.mean(), 0, 2 * np.pi / 11, 0, 0, 2 * np.pi / 22, 0, 0, 2 * np.pi / 100, 0]    params = leastsq(error, p0, args=(y, t))[0]    return params# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].valuesyears = data_loader.data["YEAR"].values# 训练模型cutoff = .9 * len(sunspots)params = fit(sunspots[:cutoff], years[:cutoff])print("Params", params)# 取得各个指标的值pred = model(params, years[cutoff:])actual = sunspots[cutoff:]print("Root mean square error", np.sqrt(np.mean((actual - pred) ** 2)))print("Mean absolute error", np.mean(np.abs(actual - pred)))print("Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / actual))mid = (actual + pred) / 2print("Symmetric Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / mid))print("Coefficient of determination", 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())year_range = data_loader.data["YEAR"].values[cutoff:]# 绘制图形plt.plot(year_range, actual, 'o', label="Sunspots")plt.plot(year_range, pred, 'x', label="Prediction")plt.grid(True)plt.xlabel("YEAR")plt.ylabel("SUNACTIVITY")plt.legend()plt.show()

运行结果如下:

Params [ 47.18800156  28.89947466  0.56827281   6.51174621   4.55214731

  0.29372076 -14.3092358 -18.16524066   0.06574835  -4.37789397]

Root mean square error 59.5619988827

Mean absolute error 44.581532168  #平均绝结误差

Mean absolute percentage error65.1643378479

Symmetric Mean absolute percentage error78.4479302694

Coefficient of determination-0.363528934815  #判定系数应尽量接近于1

9 傅量叶分析

FFT (fast fourier transform)快速傅里叶变换

示例代码如下:

import numpy as npimport statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.fftpack import rfftfrom scipy.fftpack import fftshift# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].values# 创建一个正弦波t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))mid = np.ptp(sunspots) / 2sine = mid + mid * np.sin(np.sin(t))sine_fft = np.abs(fftshift(rfft(sine)))# 最大振幅的相应索引print("Index of max sine FFT", np.argsort(sine_fft)[-5:])# 对太阳黑子进行 ffttransformed = np.abs(fftshift(rfft(sunspots)))print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])# 太阳黑子活动数据和sine函数plt.subplot(311)plt.plot(sunspots, label="Sunspots")plt.plot(sine, lw=2, label="Sine")plt.grid(True)plt.legend()# 傅里叶变换后的太阳黑子活动数据plt.subplot(312)plt.plot(transformed, label="Transformed Sunspots")plt.grid(True)plt.legend()# 傅里叶变换后的sine函数plt.subplot(313)plt.plot(sine_fft, lw=2, label="Transformed Sine")plt.grid(True)plt.legend()plt.show()

运行结果如下:

Index of max sine FFT [160 157 166 158 154]

Indices of max sunspots FFT [205 212 215209 154]

 

10 谱分析

功率频谱

示例代码如下:

import numpy as npimport statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.fftpack import rfftfrom scipy.fftpack import fftshift# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data["SUNACTIVITY"].values# 创建一个正弦波t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))mid = np.ptp(sunspots) / 2sine = mid + mid * np.sin(np.sin(t))# 对正弦波进行FFTsine_fft = np.abs(fftshift(rfft(sine)))# 最大振幅的相应索引print("Index of max sine FFT", np.argsort(sine_fft)[-5:])# 对太阳黑子进行 ffttransformed = fftshift(rfft(sunspots))print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])# 太阳黑子活动数据和sine函数plt.subplot(311)plt.plot(sunspots, label="Sunspots")plt.plot(sine, lw=2, label="Sine")plt.grid(True)plt.legend()# 绘制功率频谱plt.subplot(312)plt.plot(transformed ** 2, label="Power Spectrum")plt.grid(True)plt.legend()# 相位谱,正弦函数的起始角plt.subplot(313)plt.plot(np.angle(transformed), label="Phase Spectrum")plt.grid(True)plt.legend()plt.show()

 

运行结果如下:

11 滤波

是一种信号处理技术,可以对信号的某些部分进行删减或抑制,可以对高频或是低频进行滤波

中值滤波

Wiener滤波

Detrend滤波

 

示例代码如下:

import statsmodels.api as smimport matplotlib.pyplot as pltfrom scipy.signal import medfiltfrom scipy.signal import wienerfrom scipy.signal import detrend# 加载数据data_loader = sm.datasets.sunspots.load_pandas()sunspots = data_loader.data['SUNACTIVITY'].valuesyears = data_loader.data['YEAR'].values# 绘制图像# 原始数据plt.plot(years, sunspots, label='SUNATCIVITY')# 中值滤波plt.plot(years, medfilt(sunspots, 11), lw=2, label='Meadian')# wiener滤波plt.plot(years, wiener(sunspots, 11), '--', lw=2, label='Wiener')# detrend滤波plt.plot(years, detrend(sunspots), lw=3, label='Detrend')plt.xlabel('YEAR')plt.grid(True)plt.legend()plt.show()

 

运行结果如下:

 

0 0
原创粉丝点击
热门问题 老师的惩罚 人脸识别 我在镇武司摸鱼那些年 重生之率土为王 我在大康的咸鱼生活 盘龙之生命进化 天生仙种 凡人之先天五行 春回大明朝 姑娘不必设防,我是瞎子 眼周围淤血肿了怎么办 每到秋季就咳嗽怎么办 左肾泥沙样结石怎么办 双肾泥沙样结石怎么办 温州市民卡丢了怎么办 上眼皮过敏肿了怎么办 上眼皮又痒又肿怎么办 上眼皮红肿痛是怎么办 上眼皮肿的厉害怎么办 眼皮肿了还痒痒怎么办 眼睛被手指戳到怎么办 打球眼睛撞肿了怎么办 打球时眼睛被戳怎么办 狗眼睛被打充血怎么办 一只眼睛磨的慌怎么办 5个月宝宝结膜炎怎么办 金毛眼屎多白色怎么办 金毛眼红有眼屎怎么办 狗狗眼睛上火了怎么办 狗上火了眼屎多怎么办 金毛走路扭腰怎么办 金毛流鼻涕微黄怎么办 狗狗下眼皮红了怎么办 金毛眼睛打肿了怎么办 金毛的眼睛红怎么办 眼睛干涩有红血丝怎么办 小孩子眼睛红有眼屎怎么办 狗狗的肉垫粗糙怎么办 狗狗眼睛变蓝色怎么办 脸被太阳晒伤了怎么办 皮肤晒伤红肿痒怎么办 3岁儿童频繁眨眼怎么办 狗狗的眼睛红肿怎么办 脸过敏发红怎么办不痒 上眼皮红肿痒是怎么办 眼睛痒了几天了怎么办 眼睛肿了还痒怎么办 孩子脸上有红血丝怎么办 脸上长了红血丝怎么办 指甲受创出血了怎么办 手指被挤压紫了怎么办