python机器学习及实战-Python基础综合实践

来源:互联网 发布:淘宝八十字评论 编辑:程序博客网 时间:2024/06/07 04:50
"""Breast-cancer demo: scatter plots and logistic-regression boundary lines.

Reads ``breast-cancer-train.csv`` and ``breast-cancer-test.csv`` from the
current working directory and shows five figures in turn:

1. the negative test samples only;
2. a straight line built from random coefficients;
3. the negative and positive test samples;
4. the line learned from the first 10 training rows, over the test samples;
5. the line learned from all training rows, over the test samples.

The test accuracy of both fitted classifiers is printed to stdout.
"""

import numpy as np
import pandas as pd

# Column names used throughout; they must match the CSV headers exactly.
FEATURES = ['Clump Thickness', 'Cell Size']
LABEL = 'Type'


def boundary_line(intercept, coef, xs):
    """Return the y values of ``intercept + coef[0]*x + coef[1]*y = 0`` at *xs*.

    Args:
        intercept: Scalar or length-1 array holding the constant term.
        coef: Two coefficients; ``coef[0]`` multiplies x, ``coef[1]`` multiplies y.
        xs: Scalar or array of x positions.

    Returns:
        The matching y positions, broadcast to the shape of *xs*.
    """
    return (-intercept - xs * coef[0]) / coef[1]


def split_by_label(df):
    """Split *df* into ``(negative, positive)`` frames holding only FEATURES.

    Rows whose ``Type`` equals 0 go to the first frame, rows whose ``Type``
    equals 1 go to the second. A single ``.loc[rows, columns]`` lookup is used
    instead of chained indexing.
    """
    negative = df.loc[df[LABEL] == 0, FEATURES]
    positive = df.loc[df[LABEL] == 1, FEATURES]
    return negative, positive


def show_figure(negative=None, positive=None, positive_size=200,
                line=None, line_color=None):
    """Draw the requested layers on the current figure and show it.

    Args:
        negative: Frame of negative samples, drawn as red circles (size 200).
        positive: Frame of positive samples, drawn as black crosses.
        positive_size: Marker size for the positive samples.
        line: Optional ``(xs, ys)`` pair drawn before the scatter layers.
        line_color: Colour of *line*.

    Axis labels are only set when at least one scatter layer is drawn.
    """
    # Imported here so the pure helpers above stay importable on machines
    # without a plotting backend.
    import matplotlib.pyplot as plt

    if line is not None:
        xs, ys = line
        plt.plot(xs, ys, c=line_color)
    if negative is not None:
        plt.scatter(negative['Clump Thickness'], negative['Cell Size'],
                    marker='o', s=200, c='red')
    if positive is not None:
        plt.scatter(positive['Clump Thickness'], positive['Cell Size'],
                    marker='x', s=positive_size, c='black')
    if negative is not None or positive is not None:
        plt.xlabel('Clump Thickness')
        plt.ylabel('Cell Size')
    plt.show()


def fit_and_report(train, test, message, n_rows=None):
    """Fit a logistic regression, print its test accuracy, return its line.

    Args:
        train: Training frame containing FEATURES and LABEL columns.
        test: Test frame containing FEATURES and LABEL columns.
        message: Text printed in front of the accuracy value.
        n_rows: Number of leading training rows to fit on; ``None`` uses all.

    Returns:
        ``(intercept, coef)`` where ``intercept`` is ``intercept_`` and
        ``coef`` is the first row of ``coef_`` of the fitted model.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression()
    model.fit(train[FEATURES].iloc[:n_rows], train[LABEL].iloc[:n_rows])
    print(message, model.score(test[FEATURES], test[LABEL]))
    return model.intercept_, model.coef_[0, :]


def main():
    """Load the data and show the five figures in order."""
    # Load the data.
    df_train = pd.read_csv('breast-cancer-train.csv')
    df_test = pd.read_csv('breast-cancer-test.csv')
    negative, positive = split_by_label(df_test)

    # Figure 1: scatter plot of the negative test samples.
    show_figure(negative=negative)

    # Figure 2: a straight line from random intercept and coefficients.
    intercept = np.random.random([1])
    coef = np.random.random([2])
    lx = np.arange(0, 12)
    show_figure(line=(lx, boundary_line(intercept, coef, lx)),
                line_color='yellow')

    # Figure 3: negative and positive test samples together.
    show_figure(negative=negative, positive=positive, positive_size=150)

    # Figure 4: linear classifier trained on the first ten training rows.
    intercept, coef = fit_and_report(
        df_train, df_test, 'Testing accuracy (10 training samples):',
        n_rows=10)
    show_figure(negative=negative, positive=positive,
                line=(lx, boundary_line(intercept, coef, lx)),
                line_color='green')

    # Figure 5: linear classifier trained on all training rows.
    intercept, coef = fit_and_report(
        df_train, df_test, 'Testing accuracy (all traning samples):')
    show_figure(negative=negative, positive=positive,
                line=(lx, boundary_line(intercept, coef, lx)),
                line_color='blue')


if __name__ == '__main__':
    main()

运行结果:

Testing accuracy (10 training samples): 0.868571428571
Testing accuracy (all traning samples): 0.937142857143
效果图:


所用到的训练数据和测试数据链接:http://pan.baidu.com/s/1c30cDS 密码:h8a0