科创之股票数据挖掘（6）

来源：互联网发布：淘宝学生春秋外套编辑：程序博客网时间：2024/04/30 06:34

    已经很久没有更新我的博客了，最近做了一些小项目，事情比较多，数据挖掘的进度慢了下来，一是因为技术上遇到了一些瓶颈，需要我们解决；二是因为我在想办法提高我们数据挖掘的质量，神经网络上遇到的麻烦一直困扰着我。最近我们的工作需要写一些总结去参加学校的一个论坛，虽然我不觉得有什么意义，但是可以作为一个整理思绪的理由，加上一些朋友也在关注着我们，于是我决定继续写下去。这一篇不会涉及太多的技术问题，更多的是工程问题，也就是我们在实际操作过程中的麻烦事，我会从头开始讲起，写出来给大家分享。从下一篇开始，我会开始写写算法，我会尽量还是以股票为实例，不做太多抽象。

1.数据预处理的选择。
    在拿到数据的时候，我们得到的是TXT，花了些钱网上买的，我先把数据导入到了数据库里，这时有一个选择问题，是把数据留在数据库里做预处理，还是提取出来在外面做。我们可以预想对数据的一些操作，离散化，归一化，计算指标等等，前者应该更为专业一些，但是我们查看指标的各种公式后发现，我们涉及的指标计算量十分大，有些极其复杂，如果在数据库里操作，很多SQL语句恐怕会很不好写。但是如果在数据库外面操作，编程量会提高几倍，另外有一个数据量的问题，200多万行数据，低版本的EXCEL是装不下的，还好EXCEL2007最多可以支持104万行左右，把原表分成3份可以解决。最后我们选择了在外面处理，看上去像是绕了弯子，但是实际上风险是相对小一些的，因为对于计算一个指标的数据，在数据库外面编程的灵活性显然是要比用SQL语句操作要大的。至于为什么要计算指标，因为指标的完整数据是买不到的，至少我没找到买的地方，于是我们自己动手算，这也是被逼的。

2.预处理程序。
    BM同学为我们写了决策树以及神经网络的预处理程序，它的整个过程在之前的文章已经写过，这里就不再重复了，写得很不错，贴上代码分享一下。需要注意的是，BM同学要求读入数据要为TXT格式的，他好用C语言读，于是我在这之前使用JAVA的POI扩展包把数据给整理成了TXT格式，后来我们一切对EXCEL的操作都是用POI做的。对于输出的格式，是WEKA的ARFF格式，其实就是TXT的前面加了一些声明。

<code>

/** 决策树数据预处理 **/

#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
/* Size of each dimension of the global matrix below. */
#define maxn 10010
/*
 * Date-by-stock matrix shared by every function of this program:
 *   data[i][0], i>=1 : calendar date as the integer yyyymmdd (written by readDate,
 *                      reset to 0 by filtData for rows it filters out)
 *   data[0][j], j>=1 : stock code of column j (written by putData)
 *   data[i][j]       : value read for stock j on date i; 100 is stored for dates
 *                      before the stock's first record and is printed as "?" by main
 * NOTE(review): maxn*maxn doubles is roughly 800 MB of static storage.
 */
double data[maxn][maxn];

/* Stock code of the record just scanned, and of the record scanned before it. */
int stock,old_stock;
/* Date of the record just scanned, converted to an integer by transDate. */
int c_date;
/* Last numeric column of the record just scanned; stored into data[][]. */
double c_ans;
/* Not referenced anywhere in the visible code. */
double unused;
/* s: scratch buffer for columns that are skipped; tmp_date: raw date text of a record. */
char s[50],tmp_date[50];
/* date_num: one past the last calendar row filled by readDate (rows start at 1).
   stock_num: index of the last stock column filled by putData (columns start at 1). */
int date_num,stock_num;
/*
 * Convert a date string whose numeric fields are separated by '/' or '\'
 * into one integer: first*10000 + second*100 + third.
 * For text such as "1990/12/19" the result is 19901219.
 *
 * Characters that are neither digits nor separators are ignored.
 * Missing leading fields count as 0, and separators beyond the second one
 * only reset the running number, so no input can write outside the local
 * storage. A NULL pointer yields 0.
 */
int transDate(const char *date){
    int lead[2]={0,0};      /* first and second numeric field */
    int filled=0;           /* how many entries of lead[] are set */
    int num=0;              /* field currently being accumulated */
    if(date==NULL)return 0;
    for(const char *p=date;*p;++p){
        if(*p>='0' && *p<='9'){
            num=num*10+(*p-'0');
        }else if(*p=='/' || *p=='\\'){
            if(filled<2)lead[filled++]=num;
            num=0;
        }
    }
    return lead[0]*10000+lead[1]*100+num;
}

/*
 * Return true when (year, month, day) names a real calendar date.
 *
 * month must be 1..12 and day must be 1..length of that month. February has
 * 29 days in leap years, decided by the full Gregorian rule (divisible by 4,
 * except centuries that are not divisible by 400), so 1900 and 2100 are
 * correctly treated as common years.
 */
bool check(int year,int month,int day){
    static const int month_len[12]={31,28,31,30,31,30,31,31,30,31,30,31};
    if(month<1 || month>12)return false;
    if(day<1)return false;
    int limit=month_len[month-1];
    if(month==2){
        const bool leap=(year%4==0 && year%100!=0) || year%400==0;
        if(leap)limit=29;
    }
    return day<=limit;
}
/*
 * Build the calendar column of the global matrix.
 *
 * Walks the integers from 19901219 up to (not including) 20101016, reading
 * each as yyyymmdd, and stores every value accepted by check() into
 * data[row][0], starting at row 1. date_num ends up one past the last row
 * that was filled.
 */
void readDate(){
    int next_row=1;
    int stamp=19901219;
    while(stamp<20101016){
        const int yy=stamp/10000;
        const int mm=(stamp/100)%100;
        const int dd=stamp%100;
        if(check(yy,mm,dd)){
            data[next_row][0]=stamp;
            next_row++;
        }
        /* Past the longest possible month: move to day 0 of the next month;
           the increment below then lands on day 1. */
        if(dd==32)stamp+=100-dd;
        /* Month 13 does not exist: move to month 1, day 0 of the next year. */
        if(mm==13)stamp=(yy+1)*10000+100;
        stamp++;
    }
    date_num=next_row;
}

/*
 * Fill the global matrix from the input files sh_01.txt, sh_02.txt, sh_03.txt.
 *
 * Each file starts with nine header tokens, which are discarded. Every later
 * row has nine whitespace-separated fields: stock code, date text, six fields
 * that are skipped, and one number. For each row the number is stored at
 * data[row of that date][column of that stock].
 *
 * A change of stock code opens a new column: the code goes into data[0][col]
 * and every calendar row earlier than the stock's first date is set to the
 * sentinel 100. stock_num is set to the index of the last column used.
 *
 * Each file is read through its own FILE handle. A file that cannot be opened
 * and a row whose date is not in the calendar are both reported on stderr and
 * skipped. Token widths and all matrix indices are bounded.
 */
void putData(){
    /* Capacities come from the matrix itself so every index below stays in range. */
    const int row_cap=(int)(sizeof data/sizeof data[0]);
    const int col_cap=(int)(sizeof data[0]/sizeof data[0][0]);
    int row=1;      /* next calendar row to try for the current stock */
    int col=0;      /* column of the current stock; 0 = no stock seen yet */
    char filename[50];
    for(int m=1;m<4;m++){
        sprintf(filename,"sh_0%d.txt",m);
        FILE *in=fopen(filename,"r");
        if(in==NULL){
            fprintf(stderr,"putData: cannot open %s, skipping\n",filename);
            continue;
        }
        /* header line: nine column titles */
        fscanf(in,"%49s%49s%49s%49s%49s%49s%49s%49s%49s",s,s,s,s,s,s,s,s,s);
        while(fscanf(in,"%d%49s%49s%49s%49s%49s%49s%49s%lf",
                     &stock,tmp_date,s,s,s,s,s,s,&c_ans)==9){
            c_date=transDate(tmp_date);
            if(col==0 || stock!=old_stock){
                if(col+1>=col_cap){
                    fprintf(stderr,"putData: too many stocks, stopping at %d\n",stock);
                    fclose(in);
                    stock_num=col;
                    return;
                }
//                system("cls");
                printf("数据处理中...\n%d/1999\n",stock-600000);
                col++;
                row=1;
                data[0][col]=stock;
                /* dates before the first record of this stock are marked missing */
                while(row<row_cap && data[row][0]<c_date){
                    data[row][col]=100;
                    row++;
                }
            }
            /* find the calendar row of this record without running off the matrix */
            int hit=row;
            while(hit<row_cap && data[hit][0]!=c_date)hit++;
            if(hit<row_cap){
                data[hit][col]=c_ans;
                row=hit+1;
            }else{
                fprintf(stderr,"putData: stock %d has unknown date %s, row skipped\n",
                        stock,tmp_date);
            }
            old_stock=stock;
        }
        fclose(in);
    }
    stock_num=col;
}

/*
 * Mark calendar rows that carry no data.
 *
 * For every row below date_num, if all stock columns 1..stock_num hold 0,
 * the row's date cell data[row][0] is set to 0; main() skips such rows when
 * writing the output. The column loop includes column stock_num, the last
 * stock that putData filled.
 */
void filtData(){
    system("cls");
    printf("正在筛选信息...");
    for(int row=0;row<date_num;row++){
        bool all_zero=true;
        for(int col=1;col<=stock_num;col++){
            if(data[row][col]!=0){
                all_zero=false;
                break;
            }
        }
        if(all_zero)data[row][0]=0;
    }
}

/*
 * Decision-tree preprocessing driver.
 *
 * Builds the calendar (readDate), loads the per-stock values (putData),
 * filters empty rows (filtData) and writes the matrix to out.txt as
 * tab-separated text: one header line of stock codes, then one line per
 * kept date. Cells holding the sentinel 100 are written as "?".
 * Returns 0 on success and 1 when out.txt cannot be opened.
 */
int main(){
    system("title 整合信息");
    system("color f0");
    readDate();
    /* To sanity-check the calendar, print date_num and data[i][0] for
       0 <= i < date_num at this point. */
    putData();
    filtData();

    system("cls");
    printf("正在输出信息...");
    if(freopen("out.txt","w",stdout)==NULL){
        fprintf(stderr,"main: cannot open out.txt for writing\n");
        return 1;
    }
    /* header: corner cell followed by every stock code, columns 1..stock_num */
    for(int j=0;j<=stock_num;j++)printf("%.0lf\t",data[0][j]);
    printf("\n");
    for(int i=0;i<date_num;i++){
        /* a zero date cell means the row was filtered out (or is the header row) */
        if(data[i][0]==0)continue;
        printf("%.0lf\t",data[i][0]);
        for(int j=1;j<=stock_num;j++){
            if(data[i][j]!=100)printf("%.2lf\t",data[i][j]);
            else printf("?\t");
        }
        printf("\n");
    }
    return 0;
}

/** 神经网络数据预处理 **/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#define maxn 10010

/*
 * One input row for a single stock, in the field order read by readRec.
 * The typedef name makes plain `Rec` usable as a type name.
 */
typedef struct Rec{
    int code;                /* stock code */
    double open;             /* opening price */
    double close;            /* closing price */
    double high;             /* highest price */
    double low;              /* lowest price */
    double vol;              /* trading volume */
    double money;            /* turnover (traded amount) */
    double ans;              /* result column; putData only uses values within [-10, 10] */
} Rec;

/* Record buffer: readData fills it through readRec, putData reads it back. */
Rec rec[maxn];
/* Per-bucket sample counters: cleared by putData on every call and indexed
   there by int((ans+10)*2). */
int count[maxn];

/*
 * Read one record from stdin into r: an integer code followed by seven
 * floating-point fields (open, close, high, low, vol, money, ans).
 * Returns true only when all eight fields were converted.
 */
bool readRec(Rec &r){
    const int wanted=8;
    const int got=scanf("%d%lf%lf%lf%lf%lf%lf%lf",
                        &r.code,&r.open,&r.close,&r.high,
                        &r.low,&r.vol,&r.money,&r.ans);
    return got==wanted;
}

/*
 * Write the ARFF file stocks/stock_<stock>.arff for one stock.
 *
 * dataNum: number of records of this stock, held in rec[0..dataNum-1]
 * stock:   stock code, used for the file name and the @relation line
 *
 * The header declares 20 numeric attributes: open/close/high/low/vol/money
 * for three consecutive records (suffix 1..3), then c_ans and c_next.
 * Each data row joins records i-2, i-1 and i, the ans of record i, and the
 * ans of record i+1 rescaled by (ans+10)/20.
 *
 * A row is emitted only when rec[i+1].ans lies in [-10, 10] and fewer than
 * four rows have been written for its bucket int((ans+10)*2). Rows are
 * produced from the newest index downwards. The newest usable i is
 * dataNum-2, so rec[i+1] is always a record of this stock.
 *
 * If the file cannot be created, the function prints a message, waits via
 * "pause", and terminates the program with a failure status.
 */
void putData(int dataNum,int stock){
    static const char *const field[6]={"open","close","high","low","vol","money"};
    char path[100];
    sprintf(path,"stocks/stock_%d.arff",stock);
    FILE *fp=fopen(path,"w");
    if(fp==NULL){
        printf("File open error\n");
        system("pause");
        exit(EXIT_FAILURE);
    }

    fprintf(fp,"@relation %d\n\n",stock);
    for(int day=1;day<=3;day++){
        for(int f=0;f<6;f++){
            fprintf(fp,"@attribute c_%s%d numeric\n",field[f],day);
        }
    }
    fprintf(fp,"@attribute c_ans numeric\n");
    fprintf(fp,"@attribute c_next numeric\n\n");
    fprintf(fp,"@data\n");

    memset(count,0,sizeof(count));
    for(int i=dataNum-2;i>=2;i--){
        const double next=rec[i+1].ans;
        /* written as a negated range test so that NaN is rejected too */
        if(!(next>=-10 && next<=10))continue;
        const int bucket=(int)((next+10)*2);
        if(count[bucket]>=4)continue;
        /* three consecutive records, oldest first, each followed by a comma */
        for(int back=2;back>=0;back--){
            fprintf(fp,"%.2lf,%.2lf,%.2lf,%.2lf,%.0lf,%.0lf,",
                    rec[i-back].open,rec[i-back].close,rec[i-back].high,
                    rec[i-back].low,rec[i-back].vol,rec[i-back].money);
        }
        fprintf(fp,"%.2lf,%.2lf\n",rec[i].ans,(next+10)/20);
        count[bucket]++;
    }

    fclose(fp);
}

/*
 * Split the input files sh_01.txt, sh_02.txt, sh_03.txt into one ARFF file
 * per stock.
 *
 * Each file is attached to stdin, its eight header tokens are discarded, and
 * records are read with readRec. Consecutive records with the same code are
 * collected in rec[]; when the code changes, the collected run is handed to
 * putData and the new record becomes element 0 of the next run. The final
 * run is written after the last file has been read.
 *
 * If a file cannot be opened, reading stops there (stdin is no longer usable
 * after a failed freopen) and whatever has been collected is still written.
 * Records beyond the capacity of rec[] are dropped with one warning per stock.
 */
void readData(){
    const int cap=(int)(sizeof rec/sizeof rec[0]);
    int used=0;                 /* rec[0..used-1] belong to stock `old` */
    int old=0;                  /* code of the run being collected */
    bool have_stock=false;      /* false until the first record is seen */
    bool warned_full=false;
    char word[100];
    char inputfile[50];
    for(int m=1;m<4;m++){
        sprintf(inputfile,"sh_0%d.txt",m);
        if(freopen(inputfile,"r",stdin)==NULL){
            fprintf(stderr,"readData: cannot open %s\n",inputfile);
            break;
        }
        /* header line: eight column titles */
        scanf("%99s%99s%99s%99s%99s%99s%99s%99s",word,word,word,word,word,word,word,word);
        while(readRec(rec[used])){
            if(have_stock && rec[used].code==old){
                if(used+1<cap){
                    used++;
                }else if(!warned_full){
                    fprintf(stderr,"readData: too many records for stock %d, extra rows dropped\n",old);
                    warned_full=true;
                }
                continue;
            }
            if(have_stock){
                putData(used,old);
                system("cls");
                printf("数据拆分中...\n%d/1999\n",old-600000);
            }
            /* start a new run and keep the record that triggered the change */
            old=rec[used].code;
            rec[0]=rec[used];
            used=1;
            have_stock=true;
            warned_full=false;
        }
    }
    if(have_stock)putData(used,old);
}

/*
 * Neural-network preprocessing driver: runs three shell commands (window
 * title, console colours, creation of the "stocks" directory) and then
 * splits the input with readData().
 */
int main(){
    static const char *const setup[]={
        "title 神经网络数据拆分 copyright:beast",
        "color f0",
        "md stocks"
    };
    for(unsigned k=0;k<sizeof setup/sizeof setup[0];k++){
        system(setup[k]);
    }
    readData();
    return 0;
}

</code>

3.指标计算。
    指标计算的工作量非常大，由于在EXCEL中编程操作，需要编写各种不同的程序来处理不同的指标。有一个比较郁闷的地方，在于数据处理完导出的时候，需要把XLS格式转化为CSV格式，如果我们手动，自然是很简单，不过文件太多，要编程做，POI也没有这个功能。后来我们聪明的ZYN和CXF两位女同志在网上找到了一个开源的小项目，叫OPENCSV（看清楚哦，不是大名鼎鼎的OPENCV，也不是做版本管理的OPENCVS），这个项目开发的包可以实现我们想要的功能，但是悲剧就悲剧在这里，生成的CSV文件从表面看上去没问题，但是WEKA无法识别，后面我们查错觉得可能是OPENCSV这个项目本身的BUG，我们也没有多余的时间去查看项目的源码，所以最后还是换了一种方式：直接把XLS转换为ARFF格式。

4.关于决策树生成的规则。
    决策树生成的规则是很难看的，我们做技术的也许会看得比较明白，但是用户肯定是不懂的，所以我们在完善规则的可读性上做了一些工作。每支股票我们选出了几十到几百个可供索引的规则，加在一起应该有近万条，在WEKA中跑出来的规则形式如下：
600036 = '(8.5355-inf)': '(8.0135-inf)' (6.0/1.0) （这是600000号股票的规则之一）
这个规则的意思是当600036这支股票涨幅是在8.5355到涨停（或者更高，因为最早是没有涨停限制的）时，600000号股票是8.0135到涨停这个涨幅，这样的情况历史上发生了6次，但是有一次是被错误分类的。（如果不明白什么叫错误分类，可以去看看决策树的C4.5算法）换个容易理解的说法，即这条信息的可信度为6分之5。对于怎样处理规则会让用户觉得友好，我们觉得这是对我们的一大挑战。

5.关于神经网络。
    我实在不想说太多，因为我们的神经网络做得真的够糟，用户至少需要输入20几个浮点数才能进行一次预测，这实在很考验人的耐心，但是我们也很无奈，手里的信息实在有限，输入的麻烦没办法解决，而且神经网络出的结果平均误差也是较大的。（涨跌幅相差2%-3%，但是请注意，这是一个平均值，所以我对此并不乐观）对于预测的结果到底如何，那真的只有看人品了，我们的方法最多只能给个大概的变化方向参考。还好我对于我们的其他两部分较乐观，否则我会考虑放弃这个项目。