Kaggle Titanic 入门实例:逻辑回归与随机森林的使用

来源:互联网 发布:网络破案电视剧 编辑:程序博客网 时间:2024/05/29 02:09
# coding: utf-8
"""Kaggle Titanic starter: logistic regression and random forest baselines.

Run as a script, it reads ./csv/train.csv and ./csv/test.csv, prints the mean
3-fold cross-validation score of each model, and writes the random forest
predictions to run-01.csv.
"""
import numpy as np
import pandas as pd

# Integer codes used in place of the string categories.
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _encode(column, codes):
    """Return *column* with every value found in *codes* replaced by its code.

    Values that are not keys of *codes* (including missing values) are passed
    through unchanged.
    """
    return column.map(lambda value: codes.get(value, value))


def harmonize_data(titanic):
    """Fill in missing values and convert string columns to integer codes.

    The frame is modified in place and also returned.

    - "Age" and "Fare": missing values become the median of that column
      within the given frame.
    - "Sex": "male" -> 0, "female" -> 1.
    - "Embarked": missing values become "S"; then "S" -> 0, "C" -> 1,
      "Q" -> 2.

    Args:
        titanic: DataFrame with "Age", "Sex", "Embarked" and "Fare" columns.

    Returns:
        The same DataFrame object, after cleaning.
    """
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    # Map the whole column in one pass rather than assigning integers into a
    # string column mask by mask; this yields a proper integer column when
    # every value is recognised.
    titanic["Sex"] = _encode(titanic["Sex"], SEX_CODES)
    titanic["Embarked"] = _encode(titanic["Embarked"].fillna("S"), EMBARKED_CODES)
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    return titanic


def create_submission(alg, train, test, predictors, filename):
    """Fit *alg* on the training data and write test predictions to a CSV.

    Args:
        alg: Estimator exposing fit(X, y) and predict(X).
        train: Training DataFrame containing *predictors* and "Survived".
        test: Test DataFrame containing *predictors* and "PassengerId".
        predictors: List of column names used as model inputs.
        filename: Path of the CSV file to write; it gets the columns
            "PassengerId" and "Survived" and no index column.
    """
    alg.fit(train[predictors], train["Survived"])
    predictions = alg.predict(test[predictors])
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions,
    })
    submission.to_csv(filename, index=False)


def main():
    """Cross-validate both models and write the random forest submission."""
    # scikit-learn is imported here so the data helpers above stay importable
    # on their own. sklearn.cross_validation was removed from scikit-learn;
    # cross_val_score now lives in sklearn.model_selection.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    train = pd.read_csv("./csv/train.csv", dtype={"Age": np.float64})
    test = pd.read_csv("./csv/test.csv", dtype={"Age": np.float64})

    # NOTE: each frame is cleaned independently, so the test set is imputed
    # with its own medians rather than the training medians.
    train_data = harmonize_data(train)
    test_data = harmonize_data(test)

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

    # For cross_val_score see:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
    alg = LogisticRegression(random_state=1)
    scores = cross_val_score(
        alg,
        train_data[predictors],
        train_data["Survived"],
        cv=3,
    )
    print(scores.mean())

    alg = RandomForestClassifier(
        random_state=1,
        n_estimators=150,
        min_samples_split=4,
        min_samples_leaf=2,
    )
    scores = cross_val_score(
        alg,
        train_data[predictors],
        train_data["Survived"],
        cv=3,
    )
    print(scores.mean())

    create_submission(alg, train_data, test_data, predictors, "run-01.csv")


if __name__ == "__main__":
    main()
0 0
原创粉丝点击