python数据分析学习笔记三

来源:互联网 发布:originlab mac 编辑:程序博客网 时间:2024/04/26 13:46

第三章 统计学与线性代数

1 numpy和scipy模块

示例代码如下:

import pkgutil as puimport numpy as npimport matplotlib as mplimport scipy as spimport pydoc# 输出库的版本信息print('Numpy version: ', np.__version__)print('Scip version: ', sp.__version__)print('Matplotlib: ', mpl.__version__)# 移除multiple 中的空格def clean(astr):    s = astr    s = ' '.join(s.split())    s = s.replace('=', '')    return sdef print_desc(prefix, pkg_path):    # 取得指定模块下的所有子库    for pkg in pu.iter_modules(path=pkg_path):        name = prefix + '.' + pkg[1]        if pkg[2] == True:            try:                # render_doc取得子库或函数的文字说明                #                docstr = pydoc.plain(pydoc.render_doc(name))                docstr = clean(docstr)                # 设定所取内容的起始位置                 start = docstr.find('DESCRIPTION')                docstr = docstr[start:start + 40]                print(name, docstr)            except:                continueprint_desc('numpy', np.__path__)print()print()print()print_desc('scipy', sp.__path__)

 

运行结果如下:

Numpy version:  1.11.2

Scip version:  0.18.1

Matplotlib: 1.5.3

numpy.compat DESCRIPTION This modulecontains duplica

numpy.core DESCRIPTION Functions - array -NumPy Ar

numpy.distutils

numpy.doc DESCRIPTION Topicaldocumentation  The f

numpy.f2py

numpy.fft DESCRIPTION Discrete FourierTransform (

numpy.lib DESCRIPTION Basic functions usedby seve

numpy.linalg DESCRIPTION Core LinearAlgebra Tools --

numpy.ma DESCRIPTION  Masked Arrays Arrays somet

numpy.matrixlib

numpy.polynomial DESCRIPTION Within thedocumentation for

numpy.random DESCRIPTION  Random Number Generation  

numpy.testing DESCRIPTION This singlemodule should pr

 

 

 

scipy._build_utils

scipy._lib DESCRIPTION Module containing privateut

scipy.cluster DESCRIPTION  Clustering package (:mod:`s

scipy.constants DESCRIPTION  Constants (:mod:`scipy.cons

scipy.fftpack DESCRIPTION  Discrete Fourier transforms

scipy.integrate DESCRIPTION  Integration and ODEs (:mod:

scipy.interpolate DESCRIPTION  Interpolation (:mod:`scipy.

scipy.io DESCRIPTION  Input and output (:mod:`sci

scipy.linalg DESCRIPTION  Linear algebra (:mod:`scipy

scipy.misc DESCRIPTION  Miscellaneous routines (:mo

scipy.ndimage DESCRIPTION  Multi-dimensional image pro

scipy.odr DESCRIPTION  Orthogonal distance regress

scipy.optimize DESCRIPTION  Optimization and root findi

scipy.signal DESCRIPTION  Signal processing (:mod:`sc

scipy.sparse DESCRIPTION  Sparse matrices (:mod:`scip

scipy.spatial DESCRIPTION  Spatial algorithms and data

scipy.special DESCRIPTION  Special functions (:mod:`sc

scipy.stats DESCRIPTION  Statistical functions (:mod

2 使用numpy进行简单的描述性统计计算

示例代码如下:

 

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2016/12/29 17:16# @Author  : Retacn# @Site    : 使用numpy进行统计计算# @File    : basic_stats.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npfrom scipy.stats import scoreatpercentile# 加载csv文件#文件中Curaao,0  Cte d'Ivoire,2.5 两个字符器有问题,修改后正常data = np.loadtxt('mdrtb_2012.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)#data = np.loadtxt('MLB2008.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)# 最大值print('Max method', data.max())print('Max function', np.max(data))# 最小值print('Min method', data.min())print('Min function', np.min(data))# 平均值print('Mean method', data.mean())print('Mean function', np.mean(data))# 标准差print('Std method', data.std())print('Std function', np.std(data))# 中位数print('Median', np.median(data))print('Score at percentile 50', scoreatpercentile(data, 50))

 

运行结果如下:

Max method 50.0

Max function 50.0

Min method 0.0

Min function 0.0

Mean method 3.2787037037

Mean function 3.2787037037

Std method 5.76332073654

Std function 5.76332073654

Median 1.8

Score at percentile 50 1.8

 

3 使用numpy进行线性代数运算

A 求矩阵逆

示例代码如下:

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2016/12/29 17:53# @Author  : Retacn# @Site    : 使用numpy来求矩阵的逆# @File    : inversion.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as np# 创建示例矩阵A = np.mat('2 4 6;4 2 6;10 -4 18')#使用mit公开课中老师用的矩阵例子进行测试#A = np.mat('1 0 0;-3 1 0;0 0 1')print('A\n', A)# 求矩阵的逆inverse = np.linalg.inv(A)print('inverse of A\n', inverse)# 利用乘法进行验算print('Check\n', A * inverse)print('Error\n', A * inverse - np.eye(3))

 

运行结果如下:

A

 [[2  4 6]

 [4  2 6]

 [10-4 18]]

inverse of A

 [[-0.41666667 0.66666667 -0.08333333]

 [0.08333333  0.16666667 -0.08333333]

 [0.25       -0.33333333  0.08333333]]

Check

 [[ 1.00000000e+00 -2.22044605e-16   0.00000000e+00]

 [ 5.55111512e-17  1.00000000e+00   0.00000000e+00]

 [ 3.88578059e-16 -4.44089210e-16   1.00000000e+00]]

Error

 [[ 0.00000000e+00 -2.22044605e-16   0.00000000e+00]

 [ 5.55111512e-17 -4.44089210e-16   0.00000000e+00]

 [ 3.88578059e-16 -4.44089210e-16   0.00000000e+00]]

 

#可逆矩阵的列子

A

 [[1  0 0]

 [-3 1  0]

 [0  0 1]]

inverse of A

 [[1.  0. 0.]

 [3.  1. 0.]

 [0.  0. 1.]]

Check

 [[1.  0. 0.]

 [0.  1. 0.]

 [0.  0. 1.]]

Error

 [[0.  0. 0.]

 [0.  0. 0.]

 [0.  0. 0.]]

 

B 解线性方程组

示例代码如下:

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/20 13:52# @Author  : Retacn# @Site    : 用numpy解线性方程# @File    : linner_equation.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as np# 创建矩阵A和数组bA = np.mat("1 -2 1;0 2 -8;-4 5 9")print("A\n", A)b = np.array([0, 8, -9])print("b\n", b)# 调用solve()函数,解线性方程组x = np.linalg.solve(A, b)print("solution", x)# 使用dot()函数进行验算print("check\n", np.dot(A, x))

 

运行结果如下:

A

 [[ 1-2  1]

 [0  2 -8]

 [-4 5  9]]

b

 [0  8 -9]

solution [ 29.  16.  3.]

check

 [[0.  8. -9.]]

 

 

4 使用numpy计算特征值和特征向量

示例代码如下:

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 12:47# @Author  : Retacn# @Site    : 计算特征值和特征向量# @File    : eigenvalues.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npA = np.mat("3 -2;1 0")print("A\n", A)print("Eigenvalues", np.linalg.eigvals(A))eigenvalues, eigenvectors = np.linalg.eig(A)print("first tuple of eig", eigenvalues)print("second tuple of eig\n", eigenvectors)for i in range(len(eigenvalues)):    print("left", np.dot(A, eigenvectors[:, i]))    print("right", eigenvalues[i] * eigenvectors[:, i])    print()

 

 

运行结果如下:

A

 [[ 3-2]

 [1  0]]

Eigenvalues [ 2.  1.]

first tuple of eig [ 2.  1.]

second tuple of eig

 [[0.89442719  0.70710678]

 [0.4472136   0.70710678]]

left [[ 1.78885438]

 [0.89442719]]

right [[ 1.78885438]

 [0.89442719]]

 

Numpy随机数

用二项式分布进行博弈

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 13:17# @Author  : Retacn# @Site    : # @File    : headortail.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npfrom matplotlib.pyplot import plot, showcash = np.zeros(10000)cash[0] = 1000outcome = np.random.binomial(9, 0.5, size=len(cash))for i in range(1, len(cash)):    if outcome[i] < 5:        cash[i] = cash[i - 1] - 1    elif outcome[i] < 10:        cash[i] = cash[i - 1] + 1    else:        raise AssertionError("Unexpected outcome" + outcome)print(outcome.min(), outcome.max())plot(np.arange(len(cash)), cash)show()

 

 

 

 

正态分面采样

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 13:30# @Author  : Retacn# @Site    : 正态分布采样# @File    : normaldist.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npimport matplotlib.pyplot as pltN = 10000normal_values = np.random.normal(size=N)dummy, bins, dummy = plt.hist(normal_values, np.sqrt(N), normed=True, lw=1)sigma = 1mu = 0plt.plot(bins, 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)), lw=2)plt.show()

 

 

使用scipy 进行正态检验

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 13:41# @Author  : Retacn# @Site    : 用scipy进行正态检验# @File    : normality_test.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npfrom scipy.stats import shapirofrom scipy.stats import andersonfrom scipy.stats import normaltest# 读入流感趋势数据flutrends = np.loadtxt("goog_flutrends.csv", delimiter=',', usecols=(1,), skiprows=1,                       converters={1: lambda s: float(s or 0)}, unpack=True)N = len(flutrends)normal_values = np.random.normal(size=N)zero_values = np.zeros(N)print("Normal Value Shapiro", shapiro(normal_values))print("Zero Shapiro", shapiro(zero_values))print("Flu Shapiro", shapiro(flutrends))print()print("Normal Value Anderson", anderson(normal_values))print("Zero Anderson", anderson(zero_values))print("Flu Anderson", anderson(flutrends))print()print("Normal Value normaltest", normaltest(normal_values))print("Zero normaltest", normaltest(zero_values))print("Flu normaltest", normaltest(flutrends))print()

 

运行结果如下:

Normal Value Shapiro (0.9982795119285583,0.8293641209602356)

Zero Shapiro (1.0, 1.0)

Flu Shapiro (0.9351992011070251,2.2946666759787607e-15)

 

Normal Value AndersonAndersonResult(statistic=0.25335473134487074, critical_values=array([ 0.572,  0.652, 0.782,  0.912,  1.085]), significance_level=array([ 15. ,  10. ,  5. ,   2.5,   1. ]))

 

Zero Anderson AndersonResult(statistic=nan,critical_values=array([ 0.572, 0.652,  0.782,  0.912, 1.085]), significance_level=array([ 15. ,  10. ,  5. ,   2.5,   1. ]))

Flu AndersonAndersonResult(statistic=8.258614154768793, critical_values=array([ 0.572,  0.652, 0.782,  0.912,  1.085]), significance_level=array([ 15.,  10. ,  5. ,   2.5,   1. ]))

 

Normal Value normaltestNormaltestResult(statistic=0.18497418424851336, pvalue=0.91166097849514516)

Zero normaltestNormaltestResult(statistic=1.0095473240349975, pvalue=0.60364218712103535)

Flu normaltestNormaltestResult(statistic=99.643733363569538, pvalue=2.3048264115368721e-22)

 

 

创建掩码式numpy数组

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 14:00# @Author  : Retacn# @Site    : 掩码式numpy数组# @File    : masked.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npfrom scipy.misc import ascent, faceimport matplotlib.pyplot as pltface = face()random_mask = np.random.randint(0, 2, size=face.shape)plt.subplot(221), plt.title("Original"), plt.imshow(face), plt.axis('off')masked_array = np.ma.array(face, mask=random_mask)print(masked_array)plt.subplot(222)plt.title("Masked")plt.imshow(masked_array)plt.axis("off")plt.subplot(223)plt.title("Log")plt.imshow(np.log(face))plt.axis("off")plt.subplot(224)plt.title("Log Masked")plt.imshow(np.log(masked_array))plt.axis("off")plt.show()

 

运行结果如下:

[[[-- -- --]

 [138 129 --]

 [153 -- 165]

 ...,

  [--126 74]

 [131 136 --]

  [--144 90]]

 

 [[89-- 100]

  [---- 121]

  [--122 --]

 ...,

 [118 125 71]

 [134 141 87]

 [146 153 99]]

 

 [[73-- 84]

  [---- --]

  [---- 126]

 ...,

 [117 126 --]

  [---- 87]

 [144 153 98]]

 

 ...,

 [[87106 76]

  [94110 81]

 [107 124 --]

 ...,

  [---- 97]

  [--157 --]

  [--158 --]]

 

 [[---- --]

  [---- --]

  [---- --]

 ...,

 [121 157 --]

  [--156 94]

  [--156 94]]

 

 [[---- 74]

  [---- --]

 [111 126 --]

 ...,

  [---- --]

  [--155 --]

 [118 -- 92]]]

 

忽略负值和极值

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time    : 2017/1/21 14:46# @Author  : Retacn# @Site    : 忽略负值和极值# @File    : masked_funcs.py# @Software: PyCharm__author__ = "retacn"__copyright__ = "property of mankind."__license__ = "CN"__version__ = "0.0.1"__maintainer__ = "retacn"__email__ = "zhenhuayue@sina.com"__status__ = "Development"import numpy as npfrom matplotlib.finance import _quotes_historical_yahoofrom datetime import dateimport sysimport matplotlib.pyplot as pltsalary = np.loadtxt("MLB2008.csv", delimiter=",", usecols=(1,), skiprows=1, unpack=True)triples = np.arange(0, len(salary), 3)print("Triples", triples[:10], "...")signs = np.ones(len(salary))print("Signs", signs[:10], "...")signs[triples] = -1print("Sings", signs[:10], "...")ma_log = np.ma.log(salary * signs)print("Masked logs", ma_log[:10], "...")dev = salary.std()avg = salary.mean()inside = np.ma.masked_outside(salary, avg - dev, avg + dev)print("Inside", inside[:10], "...")plt.subplot(311)plt.title("Original")plt.plot(salary)plt.subplot(312)plt.title("Log Masked")plt.plot(np.exp(ma_log))plt.subplot(313)plt.title("Not Extreme")plt.plot(inside)plt.show()

 

 

运行结果如下:

Triples [ 0 3  6  9 12 15 18 21 24 27] ...

Signs [ 1. 1.  1.  1. 1.  1.  1.  1. 1.  1.] ...

Sings [-1. 1.  1. -1.  1.  1.-1.  1. 1. -1.] ...

Masked logs [-- 14.97081819030892915.830413578506539 -- 13.458835614025542

 15.319587954740548 -- 15.64809202171258413.864300722133706 --] ...

Inside [3750000.0 3175000.0 7500000.0 3000000.0700000.0 4500000.0 3000000.0

 6250000.0 1050000.0 4600000.0] ...

 

0 0
原创粉丝点击