Kaggle Digit Recognizer 数字识别

来源:互联网 发布:淘宝店铺退款率在哪看 编辑:程序博客网 时间:2024/05/16 07:58

https://www.kaggle.com/c/digit-recognizer

首先看一下提供的训练文件train.csv

import pandas as pd

# Load the Kaggle training set and look at its first rows.
trainingFile = pd.read_csv('C:/Users/Administrator/Desktop/train.csv')
print(trainingFile.head())
# Output:
#    label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
# 0      1       0       0       0       0       0       0       0       0
# 1      0       0       0       0       0       0       0       0       0
# 2      1       0       0       0       0       0       0       0       0
# 3      4       0       0       0       0       0       0       0       0
# 4      0       0       0       0       0       0       0       0       0
#
#    pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
# 0       0    ...            0         0         0         0         0
# 1       0    ...            0         0         0         0         0
# 2       0    ...            0         0         0         0         0
# 3       0    ...            0         0         0         0         0
# 4       0    ...            0         0         0         0         0
#
#    pixel779  pixel780  pixel781  pixel782  pixel783
# 0         0         0         0         0         0
# 1         0         0         0         0         0
# 2         0         0         0         0         0
# 3         0         0         0         0         0
# 4         0         0         0         0         0
#
# [5 rows x 785 columns]

# Number of rows (samples) in the training file.
print(len(trainingFile))
# Output:
# 42000
根据数据说明可以知道：label 是该图片对应的数字，pixel0～pixel783 是 784 个像素点（28×28），共 42000 条数据。


首先尝试用kNN算法

点击打开kNN.py

先让前 41900 个数据当训练集，后 100 个用作测试，看看正确率。

import numpy as np
import pandas as pd

import kNN

# Location of the Kaggle training file.
TRAIN_CSV = 'C:/Users/Administrator/Desktop/train.csv'
# Rows [0, N_TRAIN) are used for training; the remaining rows are held out.
N_TRAIN = 41900
# Number of neighbours passed to kNN.kNNClassify.
K_NEIGHBORS = 3


# Load data
def loadDataSet():
    """Split train.csv into a training part and a held-out validation part.

    The file is read once. Every pixel value greater than 0 is set to 1, so
    the features become binary. The first ``N_TRAIN`` rows form the training
    set and the remaining rows form the test set.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of numpy arrays:
        binarized pixel rows and their ``label`` values for each part.
    """
    # Training set
    print('获取训练集...')
    frame = pd.read_csv(TRAIN_CSV)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # `drop('label', 1)` because `axis` became keyword-only.
    pixels = np.array(frame.drop(columns='label'))
    pixels[pixels > 0] = 1
    labels = np.array(frame['label'])
    train_x, train_y = pixels[:N_TRAIN], labels[:N_TRAIN]

    # Test set (held-out tail of the same file)
    print('获取测试集...')
    test_x, test_y = pixels[N_TRAIN:], labels[N_TRAIN:]
    return train_x, train_y, test_x, test_y


# Handwritten digit test
def testHandWritingClass():
    """Classify the held-out rows with kNN and print the accuracy."""
    # Load data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training: nothing to do here, the classifier is called per sample below.
    print('训练中...')

    # Testing
    print('测试中...')
    numTestSamples = len(test_x)
    matchCount = sum(
        1
        for sample, label in zip(test_x, test_y)
        if kNN.kNNClassify(sample, train_x, train_y, K_NEIGHBORS) == label
    )
    accuracy = float(matchCount) / numTestSamples

    # Report
    print('输出结果...')
    print('分类准确率为: %.2f%%' % (accuracy * 100))


if __name__ == '__main__':
    testHandWritingClass()
输出结果:
加载数据...
获取训练集...
获取测试集...
训练中...
测试中...
输出结果...
分类准确率为: 99.00%

正确率看起来还不错，于是直接把整个 train.csv 作为训练集，预测 test.csv 中的每个数字，并按照 sample_submission.csv 的格式把答案存到 result.csv。
import numpy as np
import pandas as pd

import kNN

# Input and output files of the Kaggle competition.
TRAIN_CSV = 'C:/Users/Administrator/Desktop/train.csv'
TEST_CSV = 'C:/Users/Administrator/Desktop/test.csv'
RESULT_CSV = 'C:/Users/Administrator/Desktop/result.csv'
# Number of neighbours passed to kNN.kNNClassify.
K_NEIGHBORS = 4


# Load data
def loadDataSet():
    """Load the full training set and the unlabeled competition test set.

    Every pixel value greater than 0 is set to 1 in both sets, so the
    features become binary.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``train_x`` and
        ``train_y`` are numpy arrays built from train.csv. ``test_x`` is
        built from every column of test.csv (assumed to hold only pixel
        columns). ``test_y`` is an empty list, because no labels are read
        for the test set.
    """
    # Training set
    print('获取训练集...')
    trainingFile = pd.read_csv(TRAIN_CSV)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # `drop('label', 1)` because `axis` became keyword-only.
    train_x = np.array(trainingFile.drop(columns='label'))
    train_x[train_x > 0] = 1
    train_y = np.array(trainingFile['label'])

    # Test set
    print('获取测试集...')
    testingFile = pd.read_csv(TEST_CSV)
    test_x = np.array(testingFile)
    test_x[test_x > 0] = 1
    test_y = []
    return train_x, train_y, test_x, test_y


# Handwritten digit test
def testHandWritingClass():
    """Predict every test.csv row with kNN and write the submission file.

    The result is written to ``RESULT_CSV`` with the columns ``ImageId``
    (1-based row number) and ``Label`` (predicted value).
    """
    # Load data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training: nothing to do here, the classifier is called per sample below.
    print('训练中...')

    # Testing
    print('测试中...')
    numTestSamples = len(test_x)
    result = []
    for i, sample in enumerate(test_x):
        predict = kNN.kNNClassify(sample, train_x, train_y, K_NEIGHBORS)
        result.append([i + 1, predict])
        # Progress report every 100 samples.
        if i % 100 == 0:
            print('进度:', i, '/', numTestSamples)

    # Write the submission file
    print('输出结果...')
    submission = pd.DataFrame(result, columns=['ImageId', 'Label'])
    submission.to_csv(RESULT_CSV, index=False)


if __name__ == '__main__':
    testHandWritingClass()
最后提交 result.csv，得分为 0.96543。

使用scikit-learn库的kNN
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

# Input and output files of the Kaggle competition.
TRAIN_CSV = 'C:/Users/Administrator/Desktop/train.csv'
TEST_CSV = 'C:/Users/Administrator/Desktop/test.csv'
RESULT_CSV = 'C:/Users/Administrator/Desktop/result.csv'


# Load data
def loadDataSet():
    """Load the full training set and the unlabeled competition test set.

    Both feature matrices are binarized with scikit-learn's ``Binarizer``
    (default threshold 0: values greater than 0 become 1, all others 0).

    NOTE: the originally posted version called ``Binarizer().fit(x)`` and
    discarded the result, so the data was never actually binarized. Any
    score reported for that version was presumably obtained on the raw
    pixel values.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. ``train_x`` and
        ``train_y`` are numpy arrays built from train.csv. ``test_x`` is
        built from every column of test.csv (assumed to hold only pixel
        columns). ``test_y`` is an empty list, because no labels are read
        for the test set.
    """
    binarizer = preprocessing.Binarizer()

    # Training set
    print('获取训练集...')
    trainingFile = pd.read_csv(TRAIN_CSV)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects
    # `drop('label', 1)` because `axis` became keyword-only.
    train_x = np.array(trainingFile.drop(columns='label'))
    # fit() alone only validates the input; fit_transform() returns the
    # binarized matrix, which has to be kept.
    train_x = binarizer.fit_transform(train_x)
    train_y = np.array(trainingFile['label'])

    # Test set
    print('获取测试集...')
    testingFile = pd.read_csv(TEST_CSV)
    test_x = binarizer.fit_transform(np.array(testingFile))
    test_y = []
    return train_x, train_y, test_x, test_y


# Handwritten digit test
def testHandWritingClass():
    """Fit a KNeighborsClassifier and write predictions for test.csv.

    The result is written to ``RESULT_CSV`` with the columns ``ImageId``
    (1-based row number) and ``Label`` (predicted value).
    """
    # Load data
    print('加载数据...')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Training
    print('训练中...')
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)

    # Testing
    print('测试中...')
    predict = model.predict(test_x)

    # Write the submission file
    print('输出结果...')
    submission = pd.DataFrame({
        'ImageId': np.arange(1, len(predict) + 1),
        'Label': predict,
    })
    submission.to_csv(RESULT_CSV, index=False)


if __name__ == '__main__':
    testHandWritingClass()
最后提交 result.csv，得分为 0.96800。

原创粉丝点击