import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Load the training set (has a `label` column) and the test set (pixels only).
# The two assignments were fused on one line, which is a syntax error.
dataset = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
# Show per-column summary statistics (count, mean, std, min, quartiles, max)
# for the training frame.
dataset.describe()
| label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | count | 42000.000000 42000 42000 42000 42000 42000 42000 42000 42000 42000 … 42000.000000 42000.000000 42000.000000 42000.00000 42000.000000 42000.000000 42000 42000 42000 42000 mean | 4.456643 0 0 0 0 0 0 0 0 0 … 0.219286 0.117095 0.059024 0.02019 0.017238 0.002857 0 0 0 0 std | 2.887730 0 0 0 0 0 0 0 0 0 … 6.312890 4.633819 3.274488 1.75987 1.894498 0.414264 0 0 0 0 min | 0.000000 0 0 0 0 0 0 0 0 0 … 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0 0 0 0 25% | 2.000000 0 0 0 0 0 0 0 0 0 … 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0 0 0 0 50% | 4.000000 0 0 0 0 0 0 0 0 0 … 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0 0 0 0 75% | 7.000000 0 0 0 0 0 0 0 0 0 … 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0 0 0 0 max | 9.000000 0 0 0 0 0 0 0 0 0 … 254.000000 254.000000 253.000000 253.00000 254.000000 62.000000 0 0 0 0 8 rows × 785 columns
# Preview the first rows of the training frame.
dataset.head()
| label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | 0 | 1 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 1 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 2 | 1 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 3 | 4 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 5 rows × 785 columns
# Preview the first rows of the test frame.
test.head()
| pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | 0 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 1 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 2 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 3 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 5 rows × 784 columns
# Pull out the first column (the labels) by position, keeping a 2-D
# (n_rows, 1) array. `dataset[[0]]` is a label-based lookup; the columns are
# named with strings, so on current pandas it raises KeyError.
target = dataset.iloc[:, [0]].values
target
array([[1], [0], [1], …, [7], [6], [9]])
# Flatten the label array to one dimension.
target=target.ravel()
# Bare name: echoes the value when run as a notebook cell.
target
array([1, 0, 1, …, 7, 6, 9])
# Every column after the first, as a NumPy array.
train=dataset.iloc[:,1:].values
# Bare name: echoes the value when run as a notebook cell.
train
array([[0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], …, [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0]])
File "", line 1 print train.dtype ^ SyntaxError: Missing parentheses in call to 'print'
# Bare name: echoes the test frame when run as a notebook cell.
test
| pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | 0 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 1 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 2 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 3 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 5 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 6 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 7 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 8 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 10 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 11 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 12 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 13 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 14 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 15 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 16 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 18 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 19 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 20 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 21 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 22 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 23 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 24 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 25 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 26 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 28 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 29 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 … | … … … … … … … … … … … … … … … … … … … … … 27970 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27971 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27972 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27973 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27974 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27975 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27976 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 
0 27977 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27978 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27979 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27980 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27981 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27982 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27983 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27984 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27985 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27986 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27987 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27988 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27989 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27990 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27991 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27992 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27993 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27994 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27995 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27996 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27997 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27998 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 27999 | 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 28000 rows × 784 columns
# Convert the test frame to a NumPy array under the same name.
# np.asarray yields the same array as `.values` for a DataFrame, and is a
# no-op if `test` is already an ndarray, so re-running this statement no
# longer fails with AttributeError.
test = np.asarray(test)
test
array([[0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], …, [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0], [0, 0, 0, …, 0, 0, 0]])
# Build a 100-tree random forest and fit it on the training array and labels.
# The constructor call and the fit call were fused on one line, which is a
# syntax error.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train, target)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Predict a label for every row of the test array with the fitted forest.
pred=rf.predict(test)
# Bare name: echoes the value when run as a notebook cell.
pred
array([2, 0, 9, …, 3, 9, 2])
# Load the sample submission file. `subdemo` is not used again in the visible
# code; presumably it is only inspected for the expected output layout.
subdemo=pd.read_csv('input/sample_submission.csv')
# Bare name: echoes the value when run as a notebook cell.
subdemo
| ImageId | Label | 0 | 1 0 1 | 2 0 2 | 3 0 3 | 4 0 4 | 5 0 5 | 6 0 6 | 7 0 7 | 8 0 8 | 9 0 9 | 10 0 10 | 11 0 11 | 12 0 12 | 13 0 13 | 14 0 14 | 15 0 15 | 16 0 16 | 17 0 17 | 18 0 18 | 19 0 19 | 20 0 20 | 21 0 21 | 22 0 22 | 23 0 23 | 24 0 24 | 25 0 25 | 26 0 26 | 27 0 27 | 28 0 28 | 29 0 29 | 30 0 … | … … 27970 | 27971 0 27971 | 27972 0 27972 | 27973 0 27973 | 27974 0 27974 | 27975 0 27975 | 27976 0 27976 | 27977 0 27977 | 27978 0 27978 | 27979 0 27979 | 27980 0 27980 | 27981 0 27981 | 27982 0 27982 | 27983 0 27983 | 27984 0 27984 | 27985 0 27985 | 27986 0 27986 | 27987 0 27987 | 27988 0 27988 | 27989 0 27989 | 27990 0 27990 | 27991 0 27991 | 27992 0 27992 | 27993 0 27993 | 27994 0 27994 | 27995 0 27995 | 27996 0 27996 | 27997 0 27997 | 27998 0 27998 | 27999 0 27999 | 28000 0 28000 rows × 2 columns
# Number of test rows; compared by eye with len(pred).
len(test)
28000
# Number of predictions; compared by eye with len(test).
len(pred)
28000
# Write the submission file: a 1-based ImageId column next to the predicted
# Label, as integers, with a plain (uncommented) CSV header.
image_ids = np.arange(1, len(test) + 1)
submission_rows = np.column_stack((image_ids, pred))
np.savetxt(
    'submission_rand_forest.csv',
    submission_rows,
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)