CNTK API文档翻译(8)——使用Pandas和金融数据进行时序数据基本分析

本期将带来使用CNTK处理时间序列数据的教程。本教程会展示怎样为深度学习算法准备时序数据、训练神经网络并评估神经网络。具体来说,我们会探究对交易型开放式指数基金(Exchange-Traded Fund,ETF)的涨跌进行分类预测是否靠谱,进而通过这种简单的分类来决定是买是卖。本教程仅仅是用CNTK分析时序数据的例子,不保证训练的结果可以用于基金买卖的决策:股票市场过于复杂,非常难以预测,目前为止做得最好的依然是该领域的专家。


from __future__ import print_function

# Standard library
import datetime
import os

# Third-party
import numpy as np
import pandas as pd
import cntk as C

# Silence pandas' chained-assignment warning (the pandas default is 'warn');
# later cells add columns to slices of the main DataFrame.
pd.options.mode.chained_assignment = None


# Select the right target device when this notebook is being tested.
# Nothing happens unless TEST_DEVICE is present in the environment:
# 'cpu' selects the CPU, any other value selects GPU 0.
test_device = os.environ.get('TEST_DEVICE')
if test_device is not None:
    target_device = C.device.cpu() if test_device == 'cpu' else C.device.gpu(0)
    C.device.try_set_default_device(target_device)



# A helper which obtains daily stock data through pandas_datareader.
# Requires an internet connection; the "google" data source is queried.
import time

try:
    from pandas_datareader import data
except ImportError:
    # The notebook-only `!pip install pandas_datareader` shell escape is not
    # valid Python; run the same installation through the current interpreter.
    import subprocess
    import sys
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pandas_datareader"])
    from pandas_datareader import data

# Private handle to the datareader module. Later cells rebind the global name
# `data` to a DataFrame, which would otherwise break get_stock_data().
_datareader = data

# Set a random seed
np.random.seed(123)


def get_stock_data(contract, s_year, s_month, s_day, e_year, e_month, e_day):
    """Fetch daily bars for one symbol, retrying a few times on failure.

    Args:
        contract (str): the name of the stock/etf
        s_year (int): start year for data
        s_month (int): start month
        s_day (int): start day
        e_year (int): end year
        e_month (int): end month
        e_day (int): end day

    Returns:
        Pandas Dataframe: Daily OHLCV bars

    Raises:
        Exception: if every attempt to reach the data source failed.
    """
    start = datetime.datetime(s_year, s_month, s_day)
    end = datetime.datetime(e_year, e_month, e_day)

    max_num_retry = 3
    for attempt in range(1, max_num_retry + 1):
        try:
            return _datareader.DataReader(contract, "google", start, end)
        except Exception as err:
            # `except Exception` (not a bare except) keeps Ctrl-C working.
            print("Attempt {0}/{1} failed: {2}".format(
                attempt, max_num_retry, err))
            # Back off for a random 1-9 seconds, but only if we will retry.
            if attempt < max_num_retry:
                time.sleep(np.random.randint(1, 10))

    print("Google Finance is not reachable")
    raise Exception('Google Finance is not reachable')


import pickle as pkl

# We search in cached stock data set with symbol SPY.
# Check for an environment variable defined in CNTK's test infrastructure
envvar = 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'


def is_test():
    """Return True when the CNTK test-data environment variable is set."""
    return envvar in os.environ


def download(data_file):
    """Download SPY bars and cache them as a pickle at `data_file`.

    Args:
        data_file (str): path of the pickle cache to create.

    Returns:
        The object returned by get_stock_data for "SPY",
        2000-01-02 through 2017-01-01.

    Raises:
        Exception: if the data could not be downloaded.
    """
    try:
        bars = get_stock_data("SPY", 2000, 1, 2, 2017, 1, 1)
    except Exception:
        raise Exception("Data could not be downloaded")

    target_dir = os.path.dirname(data_file)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.isfile(data_file):
        print("Saving", data_file)
        with open(data_file, 'wb') as f:
            pkl.dump(bars, f, protocol=2)
    return bars


data_file = os.path.join("data", "Stock", "stock_SPY.pkl")

# NOTE: pd.read_pickle unpickles arbitrary objects; only point it at cache
# files that come from a trusted location.

# Check for data in local cache
if os.path.exists(data_file):
    print("File already exists", data_file)
    data = pd.read_pickle(data_file)
elif is_test():
    # Not cached locally, but we are running in CNTK's test infrastructure
    test_file = os.path.join(
        os.environ[envvar], 'Tutorials', 'data', 'stock', 'stock_SPY.pkl')
    if os.path.isfile(test_file):
        print("Reading data from test data directory")
        data = pd.read_pickle(test_file)
    else:
        print("Test data directory missing file", test_file)
        print("Downloading data from Google Finance")
        data = download(data_file)
else:
    # Local cache is not present and not test env:
    # download the data from Google Finance and cache it in a local directory.
    # Please check if there is trade data for the chosen stock symbol
    # during this period.
    data = download(data_file)




  • 股市接下来一天的数据比当天的数据高还是低。


  • 当天收盘价相对于前 1~8 天中每一天的收盘价是涨(1)还是跌(0)。
  • 成交量相对前一天的变化百分比。
  • 收盘价相对前一天的变化百分比。


# Names of every feature column that will be fed to the model
predictor_names = []

close = data["Close"]
volume = data["Volume"]

# Price feature: absolute change versus the previous row, relative to the
# current close (the first row has no predecessor and becomes 0)
data["diff"] = np.abs((close - close.shift(1)) / close).fillna(0)
predictor_names.append("diff")

# Volume feature: the same computation applied to the traded volume
data["v_diff"] = np.abs((volume - volume.shift(1)) / volume).fillna(0)
predictor_names.append("v_diff")

# Direction features: 1 when the current close is above the close
# `lookback` rows earlier, else 0
num_days_back = 8
for lookback in range(1, num_days_back + 1):
    column = "p_" + str(lookback)
    data[column] = np.where(close > close.shift(lookback), 1, 0)
    predictor_names.append(column)

# If you want to save the file to your local drive
#data.to_csv("PATH_TO_SAVE.csv")
data.head(10)



# Label: 1 when the next row's close is above the current close, else 0.
data["next_day"] = np.where(data["Close"].shift(-1) > data["Close"], 1, 0)
# The label must be one-hot encoded, so also store the complement column.
data["next_day_opposite"] = np.where(data["next_day"] == 1, 0, 1)

# Establish the start and end date of our training timeseries
# (picked 2000 days before the market crash).
training_data = data.loc["2001-02-05":"2009-01-20"]

# Label-based slices include BOTH end points, so the test window has to start
# the day after the training window ends; otherwise 2009-01-20 would be part
# of both sets. The end date can be moved up to the most recent date available.
# A copy is taken because later cells add columns to test_data.
test_data = data.loc["2009-01-21":"2016-12-29"].copy()

training_features = np.asarray(training_data[predictor_names], dtype="float32")
training_labels = np.asarray(
    training_data[["next_day", "next_day_opposite"]], dtype="float32")


# Network dimensions: the two change features plus one flag per look-back day
input_dim = 2 + num_days_back
# Two outputs because the up/down label is one-hot encoded
num_output_classes = 2
num_hidden_layers = 2
hidden_layers_dim = 2 + num_days_back

input_dynamic_axes = [C.Axis.default_batch_axis()]
input = C.input_variable(input_dim, dynamic_axes=input_dynamic_axes)
label = C.input_variable(num_output_classes, dynamic_axes=input_dynamic_axes)


def create_model(input, num_output_classes):
    """Stack the hidden ReLU Dense layers and a linear Dense output layer.

    The number and width of the hidden layers come from the module-level
    `num_hidden_layers` and `hidden_layers_dim`.
    """
    with C.layers.default_options(init=C.glorot_uniform()):
        hidden = input
        for _ in range(num_hidden_layers):
            dense = C.layers.Dense(hidden_layers_dim, activation=C.relu)
            hidden = dense(hidden)
        output_layer = C.layers.Dense(num_output_classes, activation=None)
        return output_layer(hidden)


z = create_model(input, num_output_classes)
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.classification_error(z, label)

lr_per_minibatch = C.learning_rate_schedule(0.125, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr=lr_per_minibatch)
trainer = C.Trainer(z, (loss, label_error), [learner])

# Trainer parameters: we train in large minibatches, in sequential order
minibatch_size = 100
num_minibatches = len(training_data.index) // minibatch_size

# Report training progress after every minibatch
training_progress_output_freq = 1

# Collected per-minibatch values for plotting the loss and error
plotdata = {"batchsize": [], "loss": [], "error": []}



# Cut the training set into `num_minibatches` consecutive chunks.
# np.array_split is used instead of np.split so that a row count which is not
# an exact multiple of num_minibatches no longer raises; when the count does
# divide evenly the chunks are the same as with np.split.
tf = np.array_split(training_features, num_minibatches)
tl = np.array_split(training_labels, num_minibatches)

print("Number of mini batches")
print(len(tf))

print("The shape of the training feature minibatch")
print(tf[0].shape)

# It is key that we make only one pass through the data linearly in time
num_passes = 1


def print_training_progress(trainer, mb, frequency, verbose=1):
    """Report the trainer's loss and error for the previous minibatch.

    Args:
        trainer: object exposing previous_minibatch_loss_average and
            previous_minibatch_evaluation_average.
        mb (int): index of the minibatch that was just trained.
        frequency (int): values are read only when mb is a multiple of this.
        verbose: when truthy, the values are also printed.

    Returns:
        tuple: (mb, training_loss, eval_error); the last two are the string
        "NA" for minibatches that are not a multiple of `frequency`.
    """
    training_loss = "NA"
    eval_error = "NA"

    if mb % frequency == 0:
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if verbose:
            print("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(
                mb, training_loss, eval_error * 100))

    return mb, training_loss, eval_error


# Train our neural network
for i in range(num_minibatches * num_passes):
    # Wrap around so that additional passes reuse the same chunks in order
    batch = i % num_minibatches
    features = np.ascontiguousarray(tf[batch])
    labels = np.ascontiguousarray(tl[batch])

    # Specify the mapping of input variables in the model to actual
    # minibatch data to be trained with
    trainer.train_minibatch({input: features, label: labels})

    batchsize, loss, error = print_training_progress(
        trainer, i, training_progress_output_freq, verbose=1)
    if not (loss == "NA" or error == "NA"):
        plotdata["batchsize"].append(batchsize)
        plotdata["loss"].append(loss)
        plotdata["error"].append(error)


Minibatch: 0, Loss: 0.7874, Error: 54.00%
Minibatch: 1, Loss: 0.7570, Error: 51.00%
Minibatch: 2, Loss: 0.7579, Error: 61.00%
Minibatch: 3, Loss: 0.6916, Error: 47.00%
Minibatch: 4, Loss: 0.7127, Error: 54.00%
Minibatch: 5, Loss: 0.7286, Error: 59.00%
Minibatch: 6, Loss: 0.7056, Error: 50.00%
Minibatch: 7, Loss: 0.6975, Error: 48.00%
Minibatch: 8, Loss: 0.7059, Error: 56.00%
Minibatch: 9, Loss: 0.7037, Error: 54.00%
Minibatch: 10, Loss: 0.7567, Error: 60.00%
Minibatch: 11, Loss: 0.8480, Error: 52.00%
Minibatch: 12, Loss: 0.6917, Error: 45.00%
Minibatch: 13, Loss: 0.7526, Error: 58.00%
Minibatch: 14, Loss: 0.6823, Error: 47.00%
Minibatch: 15, Loss: 0.8856, Error: 40.00%
Minibatch: 16, Loss: 0.8299, Error: 48.00%
Minibatch: 17, Loss: 1.1737, Error: 51.00%
Minibatch: 18, Loss: 0.7951, Error: 53.00%
Minibatch: 19, Loss: 0.7809, Error: 48.00%


import matplotlib.pyplot as plt

plt.figure(1)

# Upper panel: training loss per minibatch
plt.subplot(211)
plt.plot(plotdata["batchsize"], plotdata["loss"], 'b--')
plt.xlabel('Minibatch number')
plt.ylabel('Loss')
plt.title('Minibatch run vs. Training loss ')

# Lower panel: label prediction error per minibatch.
# The subplot/plot calls for this panel were missing from the cell, leaving
# only a dangling argument list; they are restored to mirror the panel above.
plt.subplot(212)
plt.plot(plotdata["batchsize"], plotdata["error"], 'r--')
plt.xlabel('Minibatch number')
plt.ylabel('Label Prediction Error')
plt.title('Minibatch run vs. Label Prediction Error ')




# The net is trained; run it on the out-of-sample window to see how it does.
label_columns = ["next_day", "next_day_opposite"]
test_features = np.ascontiguousarray(test_data[predictor_names], dtype="float32")
test_labels = np.ascontiguousarray(test_data[label_columns], dtype="float32")

# The whole test window is evaluated as one minibatch
test_arguments = {input: test_features, label: test_labels}
avg_error = trainer.test_minibatch(test_arguments)
print("Average error: {0:2.2f}%".format(avg_error * 100))


# Turn the network output into class probabilities for the test window
out = C.softmax(z)
predicted_label_prob = out.eval({input: test_features})

# Column 0 pairs with the "next_day" (up) label, column 1 with its complement
test_data["p_up"] = pd.Series(predicted_label_prob[:, 0], index=test_data.index)
test_data["p_down"] = predicted_label_prob[:, 1]

# Enter long (+1) / short (-1) only when the probability exceeds 0.55
is_long = test_data.p_up > 0.55
is_short = test_data.p_down > 0.55
test_data['long_entries'] = np.where(is_long, 1, 0)
test_data['short_entries'] = np.where(is_short, -1, 0)

# Net position per row: +1 long, -1 short, 0 flat
long_leg = test_data['long_entries'].fillna(0)
short_leg = test_data['short_entries'].fillna(0)
test_data['positions'] = long_leg + short_leg






def create_drawdowns(equity_curve):
    """Compute the largest peak-to-trough drawdown and the longest drawdown.

    The high water mark starts at 0 and is the running maximum of the curve,
    so a curve that opens with a loss is already in drawdown at its first
    point. Values are read by position, so the index type does not matter.

    Parameters:
        equity_curve: a pandas Series (or any 1-D sequence) holding the
            cumulative returns per period.

    Returns:
        (drawdown, duration): the largest drawdown, and the largest number of
        consecutive periods spent below the high water mark, both as floats.
        An empty curve yields (0.0, 0.0).
    """
    values = np.asarray(equity_curve, dtype="float64")
    if values.size == 0:
        return 0.0, 0.0

    # Running high water mark, floored at the starting level of 0
    hwm = np.maximum(np.maximum.accumulate(values), 0.0)
    drawdown = hwm - values

    # Count consecutive periods under water; the count resets to 0 whenever
    # the curve is back at its high water mark.
    duration = np.zeros(len(values))
    run = 0
    for t, depth in enumerate(drawdown):
        run = run + 1 if depth > 0 else 0
        duration[t] = run

    return float(drawdown.max()), float(duration.max())


# Histogram of the predicted up/down probabilities on the test window
plt.figure()
test_data["p_up"].hist(bins=20, alpha=0.4)
test_data["p_down"].hist(bins=20, alpha=0.4)
plt.title("Distribution of Probabilities")
plt.legend(["p_up", "p_down"])
plt.ylabel("Frequency")
plt.xlabel("Probablity")


# Per-row strategy return: the next row's price change times the position
# held, relative to the current close (a zero close is replaced by 1 to
# avoid dividing by zero).
test_data["pnl"] = (test_data["Close"].diff().shift(-1).fillna(0)
                    * test_data["positions"]
                    / np.where(test_data["Close"] != 0, test_data["Close"], 1))

# Per-row return of simply holding the instrument
test_data["perc"] = ((test_data["Close"] - test_data["Close"].shift(1))
                     / test_data["Close"].shift(1))

monthly = test_data.pnl.resample("M").sum()
monthly_spy = test_data["perc"].resample("M").sum()

# Use the pandas mean/std for the strategy as well, so that the strategy and
# SPY figures below use the same standard-deviation definition.
avg_return = monthly.mean()
std_return = monthly.std()
sharpe = np.sqrt(12) * avg_return / std_return

drawdown = create_drawdowns(monthly.cumsum())
spy_drawdown = create_drawdowns(monthly_spy.cumsum())

# NOTE(review): the correlation below pairs pnl with the "diff" column rather
# than with "perc" -- confirm that this is the intended SPY series.
pnl_correlation = np.corrcoef(test_data["pnl"], test_data["diff"])[0][1]

print("TRADING STATS")
print("AVG Monthly Return :: " + "{0:.2f}".format(round(avg_return*100,2))+ "%")
print("STD Monthly        :: " + "{0:.2f}".format(round(std_return*100,2))+ "%")
print("SHARPE             :: " + "{0:.2f}".format(round(sharpe,2)))
print("MAX DRAWDOWN       :: " + "{0:.2f}".format(round(drawdown[0]*100,2)) + "%, " + str(drawdown[1]) + " months" )
print("Correlation to SPY :: " + "{0:.2f}".format(round(pnl_correlation,2)))
print("NUMBER OF TRADES   :: " + str(np.sum(test_data.positions.abs())))
# Count the days of the traded (test) window, not of the whole data set
print("TOTAL TRADING DAYS :: " + str(len(test_data)))
print("SPY MONTHLY RETURN :: " + "{0:.2f}".format(round(monthly_spy.mean()*100,2)) + "%")
print("SPY STD RETURN     :: " + "{0:.2f}".format(round(monthly_spy.std()*100,2)) + "%")
print("SPY SHARPE         :: " + "{0:.2f}".format(round(monthly_spy.mean()/monthly_spy.std()*np.sqrt(12),2)))
print("SPY DRAWDOWN       :: " + "{0:.2f}".format(round(spy_drawdown[0]*100,2)) + "%, "  + str(spy_drawdown[1]) + " months" )

print(drawdown[0])

# Cumulative monthly return of the strategy versus holding SPY, in percent
(monthly.cumsum()*100).plot()
(monthly_spy.cumsum()*100).plot()
plt.legend(["NN", "SPY"],loc=2)
plt.ylabel("% Return")
plt.title("TRADING SPY OUT OF SAMPLE")


TRADING STATS
AVG Monthly Return :: -0.45%
STD Monthly        :: 3.17%
SHARPE             :: -0.49
MAX DRAWDOWN       :: 48.20%, nan months
Correlation to SPY :: -0.01
NUMBER OF TRADES   :: 1175
TOTAL TRADING DAYS :: 4000
SPY MONTHLY RETURN :: 1.19%
SPY STD RETURN     :: 3.92%
SPY SHARPE         :: 1.05
SPY DRAWDOWN       :: 17.25%, 11.0 months
0.482027152898



0 0