Predicting Financial Market Movements from Daily News, Version 3


The previous two versions did not use word2vec, which is a powerhouse of natural language processing. Now, on to the code.

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date
import os

Import the data

##### Import the data #####
os.chdir(r'D:/.../.../利用每日新闻预测金融市场变化')
data = pd.read_csv('Combined_News_DJIA.csv')
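Before splitting, a quick look at the layout helps. The inspection below assumes the standard Kaggle Combined_News_DJIA.csv schema (a Date column, a binary Label for whether the DJIA rose, and 25 headline columns Top1..Top25), which is consistent with the columns[2:] slicing used later:

print(data.shape)
print(data.columns[:5].tolist())     # expected: ['Date', 'Label', 'Top1', 'Top2', 'Top3']
print(data['Label'].value_counts())  # distribution of up (1) vs. down (0) days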

######## Split into training/test sets
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

# Keep each headline as its own "sentence" for training word2vec (corpus),
# and join each day's headlines into one long document (X_train / X_test)
X_train = train[train.columns[2:]]
corpus = X_train.values.flatten().astype(str)
X_train = X_train.values.astype(str)
X_train = np.array([' '.join(x) for x in X_train])
X_test = test[test.columns[2:]]
X_test = X_test.values.astype(str)
X_test = np.array([' '.join(x) for x in X_test])
y_train = train['Label'].values
y_test = test['Label'].values
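To see what this produced: corpus holds one string per headline, while X_train holds one string per trading day. A quick check (counts assume the 25-headline layout):

print(len(corpus))      # roughly number of training days x 25 headlines
print(len(X_train))     # number of training days
print(X_train[0][:80])  # the first day's headlines, joined into one string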

Tokenization: split each sentence into individual words

from nltk.tokenize import word_tokenize

corpus = [word_tokenize(x) for x in corpus]
X_train = [word_tokenize(x) for x in X_train]
X_test = [word_tokenize(x) for x in X_test]
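As a quick illustration of what word_tokenize returns (the headline here is made up for this example):

# Hypothetical headline, for illustration only
tokens = word_tokenize("b'Oil prices fall sharply after OPEC meeting'")
print(tokens)  # a list of word and punctuation tokens; note that the b'
               # byte-string prefix survives as stray tokens, which the
               # preprocessing step below removes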
Preprocessing

# Preprocessing: lowercase, drop stop words, drop numbers and symbols, lemmatize

# Stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Numbers
import re
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

# Special symbols
def isSymbol(inputString):
    return bool(re.match(r'[^\w]', inputString))

# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """
    Return True if the word should be kept,
    False if it should be removed.
    """
    word = word.lower()
    if word in stop:
        return False
    elif hasNumbers(word) or isSymbol(word):
        return False
    else:
        return True

# Combine the steps above
def preprocessing(sen):
    res = []
    for word in sen:
        if check(word):
            # This only strips the markers Python leaves behind when a str was
            # stored as bytes; the raw data was not cleaned beforehand, so
            # other datasets will not need this step
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res

# Preprocess all three datasets
corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
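A quick sanity check on a made-up headline (the input is hypothetical, for illustration):

sample = word_tokenize("b'The 3 largest banks reported falling profits'")
print(preprocessing(sample))
# Stop words, tokens containing digits, and pure-symbol tokens are dropped;
# the rest are lowercased and lemmatized (e.g. 'banks' -> 'bank')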

Train the NLP model

# Train the word2vec model
from gensim.models.word2vec import Word2Vec
model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)
vocab = model.wv.vocab

# Turn any text into a vector by averaging its word vectors
def get_vector(word_list):
    # Start from an all-zeros array
    res = np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:
            res += model[word]
            count += 1
    # Guard against texts with no in-vocabulary words
    return res / count if count > 0 else res

wordlist_train = X_train
wordlist_test = X_test
X_train = [get_vector(x) for x in X_train]
X_test = [get_vector(x) for x in X_test]
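With the embedding trained, it is worth a quick sanity check. A minimal probe, assuming a frequent headline word such as 'war' survived the min_count cutoff:

# Nearest neighbours of a common headline term
# ('war' is an assumption -- any frequent in-vocabulary word will do)
print(model.wv.most_similar('war', topn=5))

# Each day is now a single 128-dimensional feature vector
print(X_train[0].shape)  # (128,)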

Build the ML model

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

params = [0.1, 0.5, 1, 3, 5, 7, 10, 12, 16, 20, 25, 30, 35, 40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(np.mean(test_score))

import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("Param vs CV AUC Score");
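The curve only shows cross-validated AUC for each gamma; the post never refits a final model on the full training set. A minimal follow-up sketch (selecting the best gamma by CV score is my addition, not part of the original):

# Refit with the gamma that scored best in cross-validation
best_gamma = params[int(np.argmax(test_scores))]
clf = SVR(gamma=best_gamma)
clf.fit(X_train, y_train)

# SVR outputs continuous scores, which is exactly what ROC AUC expects
print('Test AUC:', roc_auc_score(y_test, clf.predict(X_test)))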
Stepping it up with a CNN

######## Stepping it up with a CNN ########
# Represent each text as a big matrix of word vectors,
# and let the CNN handle "dimensionality reduction + attention"
def transform_to_matrix(x, padding_size=256, vec_size=128):
    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(model[sen[i]].tolist())
            except:
                # Two possible exceptions here:
                # 1. the word is not in the vocabulary
                # 2. the sentence is shorter than padding_size
                # Either way, just append an all-zeros vector
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res

X_train = transform_to_matrix(wordlist_train)
X_test = transform_to_matrix(wordlist_test)
print(X_train[123])

# Convert to numpy arrays for easier handling
X_train = np.array(X_train)
X_test = np.array(X_test)

# Check the array shapes
print(X_train.shape)
print(X_test.shape)

# Add a channel dimension for the 2D convolutions
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1], X_test.shape[2])
print(X_train.shape)
print(X_test.shape)

#### Define the CNN model
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten

# Hyperparameters
batch_size = 32
n_filter = 16
filter_length = 4
nb_epoch = 5
n_pool = 2

# Build a Sequential model
model = Sequential()
model.add(Convolution2D(n_filter, filter_length, filter_length,
                        input_shape=(1, 256, 128)))
model.add(Activation('relu'))
model.add(Convolution2D(n_filter, filter_length, filter_length))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))
model.add(Dropout(0.25))
model.add(Flatten())

# Followed by a fully connected classifier
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
# sigmoid, not softmax: softmax over a single unit always outputs 1
model.add(Activation('sigmoid'))

# Compile and train the model
model.compile(loss='mse',
              optimizer='adadelta',
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=0)

score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
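For comparability with the SVR above, the CNN can also be scored by ROC AUC. A short sketch using the sigmoid output as a probability (this evaluation step is my addition):

# Predicted probabilities from the single sigmoid output unit
y_prob = model.predict(X_test, verbose=0).ravel()
print('Test AUC:', roc_auc_score(y_test, y_prob))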


