每日新闻预测金融市场的变化_版本2

来源:互联网 发布:培训机构网站源码 编辑:程序博客网 时间:2024/05/20 05:09
##### Load packages #####
import os
from datetime import date

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

##### Load the data #####
# The CSV is read relative to this hard-coded working directory.
os.chdir(r'D:/夏俊红/数据分析/利用每日新闻预测金融市场变化')
data = pd.read_csv('Combined_News_DJIA.csv')

##### Merge the headlines #####
# Join every "Top*" headline cell of a row into one space-separated document.
# Joining the cell values themselves (rather than str(x.values)) keeps the
# NumPy array repr -- brackets, quote characters, repr line wraps -- out of
# the text. Missing cells (NaN) are skipped instead of contributing a token.
data["combined_news"] = data.filter(regex=("Top.*")).apply(
    lambda x: ' '.join(str(v) for v in x.values if pd.notna(v)),
    axis=1,
)

##### Split into training / test sets #####
# 'Date' is compared as a string; this is only chronological if the column
# holds ISO 'YYYY-MM-DD' text -- TODO confirm against the CSV.
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

##### Extract features #####
feature_extraction = TfidfVectorizer()
# Fit the TF-IDF model on the training text only, then reuse the fitted
# vocabulary / IDF weights to transform the test text.
X_train = feature_extraction.fit_transform(train["combined_news"].values)
X_test = feature_extraction.transform(test["combined_news"].values)

# Labels as NumPy arrays.
y_train = train["Label"].values
y_test = test["Label"].values

#### Advanced version: manual text cleaning before TF-IDF ####
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lower-case, strip both quote characters, then split on whitespace so each
# row becomes a list of tokens.
X_train = train["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()
X_test = test["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()
print(X_test[1611])

#### Stop words
stop = stopwords.words('english')
# Frozen-set copy so the per-token membership test in check() is a hash
# lookup instead of a scan over the whole stop-word list.
_STOP_WORDS = frozenset(stop)


## Detect tokens that contain digits
def hasNumbers(inputString):
    """Return True if *inputString* contains at least one digit."""
    return bool(re.search(r'\d', inputString))


## Lemmatizer used to normalise word forms
wordnet_lemmatizer = WordNetLemmatizer()


## Combine the individual filters into one predicate
def check(word):
    """Return True if *word* should be kept, False if it should be removed.

    A word is removed when it is an English stop word or contains a digit.
    """
    return word not in _STOP_WORDS and not hasNumbers(word)


## Apply the filter and lemmatize every surviving token
X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
print(X_test[1611])

# Re-join the token lists into strings, the input form the vectorizer expects.
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
print(X_test[1611])

# The text is already lower-cased above, so the vectorizer's own lowercasing
# is switched off. The vectorizer must be bound to `feature_extraction`,
# because that is the name fitted on the next line; binding it to any other
# name would leave this configuration unused.
feature_extraction = TfidfVectorizer(lowercase=False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)

# RBF-kernel SVM; probability=True is required for predict_proba below.
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
# Column 1 holds the probability of the positive class (label 1).
print('ROC-AUC yields' + str(roc_auc_score(y_test, predictions[:, 1])))