利用每日新闻预测金融市场的变化_版本3
来源:互联网 发布:multisim mac 编辑:程序博客网 时间:2024/05/29 15:11
之前两个版本未使用 word2vec——word2vec 是自然语言处理的利器。接下来看代码。
# Libraries used throughout the script.
import os
from datetime import date

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
导入数据
##### Load the data #####
# NOTE(review): the path below is a placeholder from the original post —
# point it at the directory that actually holds Combined_News_DJIA.csv.
os.chdir(r'D:/.../.../利用每日新闻预测金融市场变化')
data = pd.read_csv('Combined_News_DJIA.csv')
######## Split into train / test sets ########
# Everything dated before 2015 trains the model; 2015 onward is held out.
dates = data['Date']
train = data[dates < '2015-01-01']
test = data[dates > '2014-12-31']
# Each individual headline becomes one "sentence" for the word2vec corpus,
# while each day's headlines joined together form one classification sample.
def _to_samples(frame):
    # Columns 0/1 are Date and Label; the rest are the Top1..Top25 headlines.
    raw = frame[frame.columns[2:]].values.astype(str)
    return raw, np.array([' '.join(row) for row in raw])

_raw_train, X_train = _to_samples(train)
corpus = _raw_train.flatten()
_raw_test, X_test = _to_samples(test)
y_train = train['Label'].values
y_test = test['Label'].values
将每个单词给分隔开
# Split every string into individual word tokens.
from nltk.tokenize import word_tokenize

corpus = list(map(word_tokenize, corpus))
X_train = list(map(word_tokenize, X_train))
X_test = list(map(word_tokenize, X_test))

# --- Preprocessing ---
# --- Preprocessing ---
# Lowercase, drop stopwords, drop tokens containing digits, drop tokens that
# start with a symbol, then lemmatize.
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# A set makes the per-token stopword test O(1) instead of O(len(stop)).
stop = set(stopwords.words('english'))

wordnet_lemmatizer = WordNetLemmatizer()


def hasNumbers(inputString):
    """True if the token contains at least one digit anywhere."""
    return bool(re.search(r'\d', inputString))


def isSymbol(inputString):
    """True if the token STARTS with a non-word character.

    NOTE(review): re.match only inspects the first character, so a token
    like "n't" is still kept.  Left as-is to preserve original behaviour.
    """
    return bool(re.match(r'[^\w]', inputString))


def check(word):
    """Return True when the (lowercased) token should be kept."""
    word = word.lower()
    if word in stop:
        return False
    if hasNumbers(word) or isSymbol(word):
        return False
    return True


def preprocessing(sen):
    """Clean one tokenized sentence and return its lemmatized tokens."""
    res = []
    for word in sen:
        if check(word):
            # Strip the b'/b" markers left over from str(bytes) in the raw
            # data, plus stray quote characters (data-specific cleanup).
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res


# Apply the cleaning to all three datasets.
corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
训练NLP模型
##### Train the NLP model #####
# 128-dim embeddings; old-style gensim API (size=, model.wv.vocab, model[w]).
from gensim.models.word2vec import Word2Vec

model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)
vocab = model.wv.vocab


def get_vector(word_list):
    """Average the word2vec vectors of the in-vocabulary tokens.

    Returns a 128-dim vector; the zero vector when no token is in the
    vocabulary.  (BUG FIX: the original divided unconditionally and
    raised ZeroDivisionError on all-OOV sentences.  The scrape had also
    duplicated the SVR sweep into the middle of this function; removed.)
    """
    res = np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:
            res += model[word]
            count += 1
    return res / count if count else res


# Keep the raw token lists around — the CNN section below re-uses them.
wordlist_train = X_train
wordlist_test = X_test
X_train = [get_vector(x) for x in X_train]
X_test = [get_vector(x) for x in X_test]

##### Baseline ML model: SVR gamma sweep scored by 3-fold CV AUC #####
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

params = [0.1, 0.5, 1, 3, 5, 7, 10, 12, 16, 20, 25, 30, 35, 40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(np.mean(test_score))

import matplotlib.pyplot as plt
# %matplotlib inline  — notebook magic, not valid in a plain .py file
plt.plot(params, test_scores)
plt.title("Param vs CV AUC Score")

######## CNN on padded word-vector matrices ########
def transform_to_matrix(x, padding_size=256, vec_size=128):
    """Turn each tokenized sentence into a padding_size x vec_size matrix.

    OOV words (KeyError) and positions past the end of the sentence
    (IndexError) become all-zero rows.  (Narrowed the original bare
    `except:` to exactly those two expected cases.)
    """
    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(model[sen[i]].tolist())
            except (KeyError, IndexError):
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res


X_train = transform_to_matrix(wordlist_train)
X_test = transform_to_matrix(wordlist_test)
print(X_train[123])

# To numpy arrays, then insert the single "channel" axis Keras expects
# for channels-first 2D convolutions: (samples, 1, 256, 128).
X_train = np.array(X_train)
X_test = np.array(X_test)
print(X_train.shape)
print(X_test.shape)
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1], X_test.shape[2])
print(X_train.shape)
print(X_test.shape)

#### Define the CNN model ####
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten

# Hyper-parameters
batch_size = 32
n_filter = 16
filter_length = 4
nb_epoch = 5
n_pool = 2

# NOTE: this rebinds `model`, shadowing the Word2Vec model above — safe only
# because transform_to_matrix has already been called by this point.
# Two conv+relu layers, max-pool, dropout, then flatten for the dense head.
model = Sequential()
model.add(Convolution2D(n_filter, filter_length, filter_length, input_shape=(1, 256, 128)))
model.add(Activation('relu'))
model.add(Convolution2D(n_filter, filter_length, filter_length))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
# Fully-connected (ANN) head on top of the CNN features.
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
# BUG FIX: softmax over a single output unit always emits 1.0, so the
# original network could never learn anything.  A one-unit binary
# classifier needs a sigmoid, paired with binary cross-entropy (not MSE).
model.add(Activation('sigmoid'))

# Compile, train, and evaluate on the held-out 2015 data.
model.compile(loss='binary_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size,
          nb_epoch=nb_epoch, verbose=0)
score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
阅读全文
0 0
- 利用每日新闻预测金融市场的变化_版本3
- 每日新闻预测金融市场的变化_版本1
- 每日新闻预测金融市场的变化_版本2
- 基于分类使用深度神经网络的金融市场预测
- 每日新闻
- 每日新闻
- 每日新闻
- 每日新闻
- 每日新闻
- 每日新闻
- 金融市场的智能交易
- SCAMPI1.3版本的主要变化
- Framework各个版本的变化
- jQuery各个版本的变化
- 判断数据变化的标准以及预测、异常预警
- CLR版本变化导致的context的内容的变化
- 5_透明度变化的动画
- 金融市场的广度、深度与弹性
- https请求数据,ca机构安全证书
- 模块之内聚性
- 【codevs 1116】四色问题
- 如何通过C#调用OpenCV函数(自制OpenCV的c++ dll文件)
- 软件测试人员必备网络知识(一):什么是cookie?
- 利用每日新闻预测金融市场的变化_版本3
- NOIP 2009 题解+代码
- 疯狂Spring Cloud连载(1)Spring Cloud概述
- Git 强制推送 push force 推送多个仓库
- Ubuntu 安装Oracle JDK 1.8
- 实用又强大,6 款 Python 时间日期库推荐
- 在滴滴,我是如何指数级提升开发技术的?
- 刘强东演讲:从穷到一年只能吃两次猪肉到京东掌门人
- 这个程序媛在万圣节居然 COS 这个……