利用Logistic回归预测疝气病症的病马的死亡率

来源:互联网 发布:淘宝花呗店铺出售 编辑:程序博客网 时间:2024/04/30 17:44

数据来源

http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic

处理过程

由于该数据集约有30%的数据缺失,必须首先对数据集进行预处理:这里把缺失值用所在列的平均值来代替,同时舍弃数据集中没用的几列属性。之后利用sklearn库进行Logistic回归。


代码与数据已经打包上传,如果有需要请移步:
http://download.csdn.net/detail/qq_30091945/9822726


结果:
由于约有30%的数据缺失,预测误差不可避免地偏大。


Python代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/4/23 0023 7:59
# @Author  : Aries
# @Site    :
# @File    : 疝气症预测病马死亡率.py
# @Software: PyCharm Community Edition
"""Predict whether a horse with colic survives, using logistic regression.

The script loads the space-separated horse-colic train/test files, replaces
missing values with per-column means, min-max normalizes every feature and
fits a scikit-learn ``LogisticRegression`` model.
"""

import numpy as np


def GetData(path):
    """Load one horse-colic data file.

    Each non-blank line is split on whitespace.  Columns 2, 24, 25, 26 and 27
    are dropped, column 22 is turned into the label, and every remaining
    column becomes a float feature.

    :param path: path of the whitespace-separated data file
    :return: ``(Data, Label)`` as NumPy arrays; ``Label`` is 1 when column 22
        equals 1 and 0 otherwise (including when it is the missing marker
        ``?``); a missing feature value (``?``) is stored as ``0.0``
    """
    skipped_columns = (2, 24, 25, 26, 27)
    label_column = 22

    rows = []
    labels = []
    with open(path) as f:
        for line in f:
            # split() without arguments tolerates repeated spaces and tabs;
            # blank lines produce no tokens and are skipped.
            tokens = line.split()
            if not tokens:
                continue
            row = []
            for i, token in enumerate(tokens):
                if i in skipped_columns:
                    continue
                if i == label_column:
                    survived = token != '?' and int(token) == 1
                    labels.append(1 if survived else 0)
                elif token == '?':
                    # Missing values are encoded as 0 first; ZeroProcess
                    # later replaces them with the column mean.
                    row.append(0.0)
                else:
                    row.append(float(token))
            rows.append(row)
    return np.array(rows), np.array(labels)


def ZeroProcess(data):
    """Replace zero entries (the missing-value marker) with column means.

    For every column, each entry equal to 0 is replaced by the mean of the
    non-zero entries of that column.  A column that is entirely zero is left
    unchanged because there is nothing to estimate the mean from.

    Because GetData encodes ``?`` as ``0.0``, a genuine zero measurement
    cannot be told apart from a missing one and is imputed as well.

    :param data: 2-D array-like of feature values
    :return: a new float array with the zero entries filled in; the input is
        not modified
    """
    filled = np.array(data, dtype=float)
    missing = filled == 0
    for col in range(filled.shape[1]):
        mask = missing[:, col]
        if mask.any() and not mask.all():
            filled[mask, col] = filled[~mask, col].mean()
    return filled


def autoNorm(Data):
    """Min-max normalize every column to the range [0, 1].

    :param Data: 2-D array-like of feature values
    :return: ``(Data - column_min) / (column_max - column_min)``; a constant
        column (zero range) is mapped to 0 instead of dividing by zero
    """
    Data = np.asarray(Data, dtype=float)
    data_min = Data.min(0)
    data_range = Data.max(0) - data_min
    # A zero range would produce NaN/inf; dividing by 1 keeps the column at 0.
    safe_range = np.where(data_range == 0, 1.0, data_range)
    return (Data - data_min) / safe_range


def PreProcess(data):
    """Run the full preprocessing pipeline on a feature matrix.

    :param data: 2-D array-like of raw feature values
    :return: the data after zero-value imputation and min-max normalization
    """
    return autoNorm(ZeroProcess(data))


def run_main():
    """Train on the training file, then report training and test error."""
    # Third-party imports are local so that the preprocessing helpers above
    # can be imported with NumPy alone.
    import matplotlib as mpl
    import matplotlib.pyplot as plt  # noqa: F401  (kept from the original script; unused)
    from sklearn.linear_model import LogisticRegression

    # Load the training and test sets.
    path1 = "./horse_colic_train.txt"
    path2 = "./horse_colic_test.txt"
    Train_Data, Train_Label = GetData(path1)
    Test_Data, Test_Label = GetData(path2)

    # NOTE(review): each set is imputed and normalized with its own
    # statistics, so the test features are not scaled with the training
    # min/max -- confirm whether that is intended.
    Train_Data_Process = PreProcess(Train_Data)
    Test_Data_Process = PreProcess(Test_Data)

    # Configure matplotlib so that it can render Chinese text.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Fit the logistic regression model.
    logistiic_regression = LogisticRegression()
    logistiic_regression.fit(Train_Data_Process, Train_Label)
    print("Logistic回归的系数为:", logistiic_regression.coef_, "常数项为:", logistiic_regression.intercept_)

    # Evaluate on the training samples: misclassification rate and R^2.
    Train_Label_Predict = logistiic_regression.predict(Train_Data_Process)
    error = float(np.sum(Train_Label != Train_Label_Predict)) / len(Train_Label)
    avg = np.average(Train_Label)
    TSS = np.sum((Train_Label - avg) ** 2)
    RSS = np.sum((Train_Label_Predict - Train_Label) ** 2)
    R2 = 1 - RSS / TSS
    print("训练样本的误差为:",error)
    print("R2为:",R2)

    # Evaluate on the unseen test samples, printing actual vs. predicted.
    Test_Label_Predict = logistiic_regression.predict(Test_Data_Process)
    for actual, predicted in zip(Test_Label, Test_Label_Predict):
        print(actual, predicted)
    err = float(np.sum(Test_Label_Predict != Test_Label)) / len(Test_Label)
    print("误差为:",err)


if __name__ == "__main__":
    run_main()
1 0