使用Python预测黄金AU9999收盘价

来源:互联网 发布:约拍用什么软件最好 编辑:程序博客网 时间:2024/04/28 01:21

1、加载Python包

# -*- coding: utf-8 -*-
# Dependencies for the AU9999 closing-price forecast walkthrough.
import datetime
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import style
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

# scikit-learn deprecated ``sklearn.cross_validation`` in 0.18 and removed it
# in 0.20; ``sklearn.model_selection`` offers the same ``train_test_split``.
# Bind whichever module is available to the name the rest of the script uses.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn older than 0.18
    from sklearn import cross_validation

# Apply the ggplot look to every figure drawn below.
style.use('ggplot')

2、读入数据集

# Load the AU9999 historical trading data from a GBK-encoded CSV file.
# Open question left by the author: how to convert the .xls export to .csv.
# NOTE: the path is absolute and specific to the author's machine.
finance_data = pd.read_csv(
    '/Users/MichaelDeng/PycharmProjects/Python_finance/AU9999历史交易数据(2011-12-01至2017-08-18).csv',
    encoding="gbk",
)

3、查看数据集

# Quick look at what was loaded: container type, row index, column labels
# and summary statistics. Single-argument print(...) behaves the same on
# Python 2 and Python 3.
print(type(finance_data))
print(finance_data.index)
print(finance_data.columns)
print(finance_data.describe())  # optionally append .astype(np.float64)
finance_data.head()  # notebook-style preview of the first rows

4、对数据集进行处理

# Keep the date, open/high/low/close and volume columns.
# .copy() makes df an independent frame, so the assignments below do not
# write into a slice of finance_data (pandas SettingWithCopyWarning).
df = finance_data[[u'日期', u'开盘价', u'最高价', u'最低价', u'收盘价', u'成交量(公斤)']].copy()
# NOTE: 'Adj. Hign' (sic) is the spelling the feature step looks up later;
# renaming it only here would break that lookup.
df.columns = ['Date', 'Adj. Open', 'Adj. Hign', 'Adj. Low', 'Adj. Close', 'Adj. Volume']
df['Date'] = pd.to_datetime(df['Date'])
# Index by date (this also removes the 'Date' column) and sort oldest first.
# If 'Date' were kept as a column, df.sort_values('Date') would be the
# equivalent ordering step.
df = df.set_index('Date').sort_index(ascending=True)
df.head()
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume Date 2011-12-01 353.38 359.00 353.00 357.50 27,800.00 2011-12-02 358.50 358.70 356.00 357.74 35,874.00 2011-12-05 360.00 360.00 356.00 357.50 43,370.00 2011-12-06 357.50 357.60 351.21 351.50 45,684.00 2011-12-07 353.48 354.99 350.00 354.60 39,684.00

5、处理异常行数据

# Remove the 2014-05-02 row, which this walkthrough treats as anomalous.
# The labels to drop come from a boolean mask, so nothing is raised when
# that date is absent from the index.
# (If the date were an ordinary column named 'date' instead of the index:
#  df[df['date'] == '20161111'] selects the matching rows, and
#  df[df['date'] == '20161111'].index[0] is the first match's index label.)
anomalous_rows = df[df.index == '2014-05-02'].index
df.drop(anomalous_rows, inplace=True)
print(df.index)
print(df.columns)
print(df.describe())  # optionally append .astype(np.float64)
df.head()
DatetimeIndex([‘2011-12-01’, ‘2011-12-02’, ‘2011-12-05’, ‘2011-12-06’, ‘2011-12-07’, ‘2011-12-08’, ‘2011-12-09’, ‘2011-12-12’, ‘2011-12-13’, ‘2011-12-14’, … ‘2017-08-07’, ‘2017-08-08’, ‘2017-08-09’, ‘2017-08-10’, ‘2017-08-11’, ‘2017-08-14’, ‘2017-08-15’, ‘2017-08-16’, ‘2017-08-17’, ‘2017-08-18’], dtype=’datetime64[ns]’, name=u’Date’, length=1397, freq=None)Index([u’Adj. Open’, u’Adj. Hign’, u’Adj. Low’, u’Adj. Close’, u’Adj. Volume’], dtype=’object’) Adj. Open Adj. Hign Adj. Low Adj. Close count 1397.000000 1397.000000 1397.000000 1397.000000 mean 276.691918 277.820988 274.407158 276.071790 std 38.338578 38.187169 38.099712 38.020006 min 210.790000 218.500000 167.500000 216.900000 25% 245.990000 247.000000 244.000000 245.700000 50% 265.980000 267.000000 264.000000 265.350000 75% 291.990000 293.990000 288.200000 290.800000 max 362.990000 363.000000 360.500000 362.000000
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume Date 2011-12-01 353.38 359.00 353.00 357.50 27,800.00 2011-12-02 358.50 358.70 356.00 357.74 35,874.00 2011-12-05 360.00 360.00 356.00 357.50 43,370.00 2011-12-06 357.50 357.60 351.21 351.50 45,684.00 2011-12-07 353.48 354.99 350.00 354.60 39,684.00

6、对部分列进行数据类型转换

# 'Adj. Volume' is text with thousands separators (e.g. "27,800.00").
# Strip the commas for the whole column at once and convert to float64.
# The vectorised .str.replace avoids two problems of a per-row loop:
#   * chained assignment (df[col][i] = ...), which may write to a temporary
#     copy instead of df;
#   * the unicode -> byte-string .encode() step, which only works on Python 2.
# Precondition: the column holds nothing but digits, '.' and ','.
df['Adj. Volume'] = df['Adj. Volume'].str.replace(',', '').astype(np.float64)
len(df)
df.describe()  # optionally append .astype(np.float64)
df.head()
# Alternative for converting several text columns at once:
# df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'), axis=0)
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume Date 2011-12-01 353.38 359.00 353.00 357.50 27800.0 2011-12-02 358.50 358.70 356.00 357.74 35874.0 2011-12-05 360.00 360.00 356.00 357.50 43370.0 2011-12-06 357.50 357.60 351.21 351.50 45684.0 2011-12-07 353.48 354.99 350.00 354.60 39684.0

7、计算其他特征指标

# Derived features.
# Background: True Range (TR) is built from three differences: today's high
# minus today's low (H-L), today's high minus yesterday's close (H-PC) and
# today's low minus yesterday's close (L-PC). The two percentages below use
# same-day prices only.
# HL_PCT: gap between the day's high and the close, as a % of the close.
df['HL_PCT'] = (df['Adj. Hign'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
# CO_PCT: open-to-close change, as a % of the open.
df['CO_PCT'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
# Keep only the model inputs. .copy() so that later in-place edits and new
# columns land in this frame rather than in a slice of the wider one.
df = df[['Adj. Close', 'HL_PCT', 'CO_PCT', 'Adj. Volume']].copy()
df.head()
Adj. Close HL_PCT CO_PCT Adj. Volume Date 2011-12-01 357.50 0.419580 1.165884 27800.0 2011-12-02 357.74 0.268351 -0.211994 35874.0 2011-12-05 357.50 0.699301 -0.694444 43370.0 2011-12-06 351.50 1.735420 -1.678322 45684.0 2011-12-07 354.60 0.109983 0.316850 39684.0

8、构造预测指标

forecast_col = 'Adj. Close'
# Replace missing values with an extreme sentinel instead of dropping rows.
df.fillna(-99999, inplace=True)
# Forecast horizon, counted in rows. A data-relative alternative would be
# int(math.ceil(0.01 * len(df))) (math.ceil rounds up to the next integer).
forecast_out = 7
# label[t] = close[t + forecast_out]: shift the close upwards by
# forecast_out rows. The final forecast_out rows have no future close and
# therefore get NaN; they could be discarded with df.dropna(inplace=True).
df['label'] = df[forecast_col].shift(-forecast_out)
df.head()
Adj. Close HL_PCT CO_PCT Adj. Volume label Date 2011-12-01 357.50 0.419580 1.165884 27800.0 346.35 2011-12-02 357.74 0.268351 -0.211994 35874.0 339.30 2011-12-05 357.50 0.699301 -0.694444 43370.0 338.00 2011-12-06 351.50 1.735420 -1.678322 45684.0 322.30 2011-12-07 354.60 0.109983 0.316850 39684.0 329.30

9、特征指标数据缩放,预测数据分割

# Feature matrix: every column except the target. axis is passed by keyword
# because recent pandas releases no longer accept it positionally.
X = np.array(df.drop(['label'], axis=1))
# preprocessing.scale standardises each column to zero mean and unit variance.
X = preprocessing.scale(X)
# The last forecast_out rows are the ones whose label is still unknown; they
# are the inputs to forecast from. They must be sliced off BEFORE X is
# truncated; taking X[-forecast_out:] afterwards would pick labelled rows
# that are also handed to the model for fitting.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
# Labels for exactly the rows kept in X. Slicing (rather than
# df.dropna(inplace=True)) leaves the most recent rows in df, so the dates
# appended for the forecast follow the last real trading day.
y = np.array(df['label'])[:-forecast_out]
print(X)
print(y)
[[ 2.14248705 -0.23638346  1.47967723 -1.47731955] [ 2.14880178 -0.39770846 -0.00316564 -1.38496861] [ 2.14248705  0.06201065 -0.52236799 -1.29922888] ...,  [-0.07082465  0.18229388 -0.60315191 -0.4283689 ] [-0.07713937 -0.52775306  0.08725379 -0.62432583] [-0.0505649   0.30437763  0.09167618 -0.34186281]][ 346.35  339.3   338.   ...,  274.67  277.    278.5 ]

10、模型建立与模型预测

# Random split: 80% of the labelled rows for fitting, 20% held out.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)  # n_jobs=-1: use all CPU cores
# Alternative model: clf = svm.SVR(kernel='poly')
clf.fit(X_train, y_train)
# For a regressor, score() is the R^2 coefficient on the held-out rows,
# not a classification accuracy.
accuracy = clf.score(X_test, y_test)
print(accuracy)
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)
# Empty column; the plotting step fills it in for the forecast dates.
df['Forecast'] = np.nan
0.96681672249(array([ 275.02644769,  274.79433411,  273.48581788,  275.07236144,        273.26221787,  273.02901971,  274.17157259]), 0.96681672249024297, 7)

11、预测结果可视化

# Restrict the plot to 2017. Row selection by partial date string goes
# through .loc; .copy() so the rows appended below are not written into a
# slice of the full frame.
df = df.loc['2017'].copy()
last_date = df.iloc[-1].name
# Append one row per forecast value, dated on the consecutive calendar days
# after the last existing row. Timestamp arithmetic replaces the
# time.mktime()/fromtimestamp() round trip, which depends on the local
# time zone.
# NOTE: these are calendar days, so weekend dates are included even though
# the data rows themselves skip weekends.
for offset, predicted in enumerate(forecast_set, start=1):
    next_date = last_date + pd.Timedelta(days=offset)
    # Every column is NaN except the last one, which holds the forecast.
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [predicted]
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

(图:2017年实际收盘价与预测值曲线)

历史数据

# Historical part only: the rows that carry no forecast value.
df_his = df[df['Forecast'].isnull()]
df_his['Adj. Close'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

(图:2017年历史收盘价曲线)

预测数据

# Forecast part only: the rows that carry a forecast value.
df_for = df[df['Forecast'].notnull()]
df_for['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

(图:预测收盘价曲线)

阅读全文
0 0