Kaggle房价预测案例分享
来源:互联网 发布:淘宝运费险退到哪里了 编辑:程序博客网 时间:2024/05/09 15:59
在Jupyter Notebook运行可以显示图(--开头是输出的内容,需要注意下)
参考:https://github.com/AliceDudu/Kaggle-projects/blob/master/house-prices-advanced-regression-techniques/house-1-feature.ipynb
参考:https://github.com/AliceDudu/Kaggle-projects/blob/master/house-prices-advanced-regression-techniques/house-1-feature.ipynb
# 导入需要的包import pandas as pdimport numpy as npfrom sklearn.model_selection import cross_val_score, train_test_splitfrom sklearn.preprocessing import StandardScalerfrom sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCVfrom sklearn.metrics import mean_squared_error, make_scorerfrom scipy.stats import skewfrom IPython.display import displayimport matplotlib.pyplot as pltimport seaborn as sns%matplotlib inline# 获取训练数据train = pd.read_csv("train.csv")print("train : " + str(train.shape))--train : (1460, 81)# 获取测试数据test = pd.read_csv("test.csv")print("test : " + str(test.shape))--test : (1459, 80)train.head()
test.head()
# 合并数据 从'MSSubClass'列到'SaleCondition'列all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition']))all_data.head()
# 查看重复项idsUnique = len(set(train.Id))idsTotal = train.shape[0]idsDupli = idsTotal - idsUniqueprint("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries")# 删除Id列train.drop("Id", axis = 1, inplace = True)--There are 0 duplicate IDs for 1460 total entries# 查看重复项idsUnique = len(set(test.Id))idsTotal = test.shape[0]idsDupli = idsTotal - idsUniqueprint("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries")# 删除Id列test.drop("Id", axis = 1, inplace = True)--There are 0 duplicate IDs for 1459 total entries# 寻找异常值,如 https://ww2.amstat.org/publications/jse/v19n3/decock.pdfplt.scatter(train.GrLivArea, train.SalePrice, c = "blue", marker = "s")plt.title("Looking for outliers")plt.xlabel("GrLivArea")plt.ylabel("SalePrice")plt.show()train = train[train.GrLivArea < 4000]
print("train : " + str(train.shape))--train : (1456, 80)all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition']))all_data.head()
# 转换官方得分的目标train.SalePrice = np.log1p(train.SalePrice)y = train.SalePricedef fill_missing(dataset): # 处理中值/平均值或最常见值无意义的特征的缺失值 # Alley : data description says NA means "no alley access" dataset.loc[:, "Alley"] = dataset.loc[:, "Alley"].fillna("None") # BedroomAbvGr : NA most likely means 0 dataset.loc[:, "BedroomAbvGr"] = dataset.loc[:, "BedroomAbvGr"].fillna(0) # BsmtQual etc : data description says NA for basement features is "no basement" dataset.loc[:, "BsmtQual"] = dataset.loc[:, "BsmtQual"].fillna("No") dataset.loc[:, "BsmtCond"] = dataset.loc[:, "BsmtCond"].fillna("No") dataset.loc[:, "BsmtExposure"] = dataset.loc[:, "BsmtExposure"].fillna("No") dataset.loc[:, "BsmtFinType1"] = dataset.loc[:, "BsmtFinType1"].fillna("No") dataset.loc[:, "BsmtFinType2"] = dataset.loc[:, "BsmtFinType2"].fillna("No") dataset.loc[:, "BsmtFullBath"] = dataset.loc[:, "BsmtFullBath"].fillna(0) dataset.loc[:, "BsmtHalfBath"] = dataset.loc[:, "BsmtHalfBath"].fillna(0) dataset.loc[:, "BsmtUnfSF"] = dataset.loc[:, "BsmtUnfSF"].fillna(0) # CentralAir : NA most likely means No dataset.loc[:, "CentralAir"] = dataset.loc[:, "CentralAir"].fillna("N") # Condition : NA most likely means Normal dataset.loc[:, "Condition1"] = dataset.loc[:, "Condition1"].fillna("Norm") dataset.loc[:, "Condition2"] = dataset.loc[:, "Condition2"].fillna("Norm") # EnclosedPorch : NA most likely means no enclosed porch dataset.loc[:, "EnclosedPorch"] = dataset.loc[:, "EnclosedPorch"].fillna(0) # External stuff : NA most likely means average dataset.loc[:, "ExterCond"] = dataset.loc[:, "ExterCond"].fillna("TA") dataset.loc[:, "ExterQual"] = dataset.loc[:, "ExterQual"].fillna("TA") # Fence : data description says NA means "no fence" dataset.loc[:, "Fence"] = dataset.loc[:, "Fence"].fillna("No") # FireplaceQu : data description says NA means "no fireplace" dataset.loc[:, "FireplaceQu"] = dataset.loc[:, "FireplaceQu"].fillna("No") dataset.loc[:, "Fireplaces"] = dataset.loc[:, "Fireplaces"].fillna(0) # Functional : data description says NA means typical dataset.loc[:, "Functional"] = dataset.loc[:, "Functional"].fillna("Typ") # GarageType etc : data description says NA for garage features is "no garage" dataset.loc[:, "GarageType"] = dataset.loc[:, "GarageType"].fillna("No") dataset.loc[:, "GarageFinish"] = dataset.loc[:, "GarageFinish"].fillna("No") dataset.loc[:, "GarageQual"] = dataset.loc[:, "GarageQual"].fillna("No") dataset.loc[:, "GarageCond"] = dataset.loc[:, "GarageCond"].fillna("No") dataset.loc[:, "GarageArea"] = dataset.loc[:, "GarageArea"].fillna(0) dataset.loc[:, "GarageCars"] = dataset.loc[:, "GarageCars"].fillna(0) # HalfBath : NA most likely means no half baths above grade dataset.loc[:, "HalfBath"] = dataset.loc[:, "HalfBath"].fillna(0) # HeatingQC : NA most likely means typical dataset.loc[:, "HeatingQC"] = dataset.loc[:, "HeatingQC"].fillna("TA") # KitchenAbvGr : NA most likely means 0 dataset.loc[:, "KitchenAbvGr"] = dataset.loc[:, "KitchenAbvGr"].fillna(0) # KitchenQual : NA most likely means typical dataset.loc[:, "KitchenQual"] = dataset.loc[:, "KitchenQual"].fillna("TA") # LotFrontage : NA most likely means no lot frontage dataset.loc[:, "LotFrontage"] = dataset.loc[:, "LotFrontage"].fillna(0) # LotShape : NA most likely means regular dataset.loc[:, "LotShape"] = dataset.loc[:, "LotShape"].fillna("Reg") # MasVnrType : NA most likely means no veneer dataset.loc[:, "MasVnrType"] = dataset.loc[:, "MasVnrType"].fillna("None") dataset.loc[:, "MasVnrArea"] = dataset.loc[:, "MasVnrArea"].fillna(0) # MiscFeature : data description says NA means "no misc feature" dataset.loc[:, "MiscFeature"] = dataset.loc[:, "MiscFeature"].fillna("No") dataset.loc[:, "MiscVal"] = dataset.loc[:, "MiscVal"].fillna(0) # OpenPorchSF : NA most likely means no open porch dataset.loc[:, "OpenPorchSF"] = dataset.loc[:, "OpenPorchSF"].fillna(0) # PavedDrive : NA most likely means not paved dataset.loc[:, "PavedDrive"] = dataset.loc[:, "PavedDrive"].fillna("N") # PoolQC : data description says NA means "no pool" dataset.loc[:, "PoolQC"] = dataset.loc[:, "PoolQC"].fillna("No") dataset.loc[:, "PoolArea"] = dataset.loc[:, "PoolArea"].fillna(0) # SaleCondition : NA most likely means normal sale dataset.loc[:, "SaleCondition"] = dataset.loc[:, "SaleCondition"].fillna("Normal") # ScreenPorch : NA most likely means no screen porch dataset.loc[:, "ScreenPorch"] = dataset.loc[:, "ScreenPorch"].fillna(0) # TotRmsAbvGrd : NA most likely means 0 dataset.loc[:, "TotRmsAbvGrd"] = dataset.loc[:, "TotRmsAbvGrd"].fillna(0) # Utilities : NA most likely means all public utilities dataset.loc[:, "Utilities"] = dataset.loc[:, "Utilities"].fillna("AllPub") # WoodDeckSF : NA most likely means no wood deck dataset.loc[:, "WoodDeckSF"] = dataset.loc[:, "WoodDeckSF"].fillna(0) return datasettrain = fill_missing(train)train.head()用test得到test newtest_new = fill_missing(test)test_new.head()all_data = fill_missing(all_data)all_data.head()def num_to_cat(dataset): # 一些数字特征实际上是真正的类别 dataset = dataset.replace({"MSSubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"}, "MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun", 7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"} }) return datasettrain = num_to_cat(train)train.head()test_new = num_to_cat(test_new)test_new.head()all_data = num_to_cat(all_data)all_data.head()def order_cat(dataset): # 当订单中有信息时,将一些分类功能编码为有序数字 dataset = dataset.replace({"Alley" : {"Grvl" : 1, "Pave" : 2}, "BsmtCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3}, "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6}, "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6}, "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5}, "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5}, "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5}, "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, "Min2" : 6, "Min1" : 7, "Typ" : 8}, "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3}, "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4}, "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2}, "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4}, "Street" : {"Grvl" : 1, "Pave" : 2}, "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}} ) return datasettrain = order_cat(train)train.head()test_new = order_cat(test_new)test_new.head()all_data = order_cat(all_data)all_data.head()#增加新特征def new_feature(dataset): # Create new features # 1* Simplifications of existing features dataset["SimplOverallQual"] = dataset.OverallQual.replace({1 : 1, 2 : 1, 3 : 1, # bad 4 : 2, 5 : 2, 6 : 2, # average 7 : 3, 8 : 3, 9 : 3, 10 : 3 # good }) dataset["SimplOverallCond"] = dataset.OverallCond.replace({1 : 1, 2 : 1, 3 : 1, # bad 4 : 2, 5 : 2, 6 : 2, # average 7 : 3, 8 : 3, 9 : 3, 10 : 3 # good }) dataset["SimplPoolQC"] = dataset.PoolQC.replace({1 : 1, 2 : 1, # average 3 : 2, 4 : 2 # good }) dataset["SimplGarageCond"] = dataset.GarageCond.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplGarageQual"] = dataset.GarageQual.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplFireplaceQu"] = dataset.FireplaceQu.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplFireplaceQu"] = dataset.FireplaceQu.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplFunctional"] = dataset.Functional.replace({1 : 1, 2 : 1, # bad 3 : 2, 4 : 2, # major 5 : 3, 6 : 3, 7 : 3, # minor 8 : 4 # typical }) dataset["SimplKitchenQual"] = dataset.KitchenQual.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplHeatingQC"] = dataset.HeatingQC.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplBsmtFinType1"] = dataset.BsmtFinType1.replace({1 : 1, # unfinished 2 : 1, 3 : 1, # rec room 4 : 2, 5 : 2, 6 : 2 # living quarters }) dataset["SimplBsmtFinType2"] = dataset.BsmtFinType2.replace({1 : 1, # unfinished 2 : 1, 3 : 1, # rec room 4 : 2, 5 : 2, 6 : 2 # living quarters }) dataset["SimplBsmtCond"] = dataset.BsmtCond.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplBsmtQual"] = dataset.BsmtQual.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplExterCond"] = dataset.ExterCond.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) dataset["SimplExterQual"] = dataset.ExterQual.replace({1 : 1, # bad 2 : 1, 3 : 1, # average 4 : 2, 5 : 2 # good }) # 2* Combinations of existing features # 2* Combinations of existing features # Overall quality of the house dataset["OverallGrade"] = dataset["OverallQual"] * dataset["OverallCond"] # Overall quality of the garage dataset["GarageGrade"] = dataset["GarageQual"] * dataset["GarageCond"] # Overall quality of the exterior dataset["ExterGrade"] = dataset["ExterQual"] * dataset["ExterCond"] # Overall kitchen score dataset["KitchenScore"] = dataset["KitchenAbvGr"] * dataset["KitchenQual"] # Overall fireplace score dataset["FireplaceScore"] = dataset["Fireplaces"] * dataset["FireplaceQu"] # Overall garage score dataset["GarageScore"] = dataset["GarageArea"] * dataset["GarageQual"] # Overall pool score dataset["PoolScore"] = dataset["PoolArea"] * dataset["PoolQC"] # Simplified overall quality of the house dataset["SimplOverallGrade"] = dataset["SimplOverallQual"] * dataset["SimplOverallCond"] # Simplified overall quality of the exterior dataset["SimplExterGrade"] = dataset["SimplExterQual"] * dataset["SimplExterCond"] # Simplified overall pool score dataset["SimplPoolScore"] = dataset["PoolArea"] * dataset["SimplPoolQC"] # Simplified overall garage score dataset["SimplGarageScore"] = dataset["GarageArea"] * dataset["SimplGarageQual"] # Simplified overall fireplace score dataset["SimplFireplaceScore"] = dataset["Fireplaces"] * dataset["SimplFireplaceQu"] # Simplified overall kitchen score dataset["SimplKitchenScore"] = dataset["KitchenAbvGr"] * dataset["SimplKitchenQual"] # Total number of bathrooms dataset["TotalBath"] = dataset["BsmtFullBath"] + (0.5 * dataset["BsmtHalfBath"]) + \ dataset["FullBath"] + (0.5 * dataset["HalfBath"]) # Total SF for house (incl. basement) dataset["AllSF"] = dataset["GrLivArea"] + dataset["TotalBsmtSF"] # Total SF for 1st + 2nd floors dataset["AllFlrsSF"] = dataset["1stFlrSF"] + dataset["2ndFlrSF"] # Total SF for porch dataset["AllPorchSF"] = dataset["OpenPorchSF"] + dataset["EnclosedPorch"] + \ dataset["3SsnPorch"] + dataset["ScreenPorch"] # Has masonry veneer or not dataset["HasMasVnr"] = dataset.MasVnrType.replace({"BrkCmn" : 1, "BrkFace" : 1, "CBlock" : 1, "Stone" : 1, "None" : 0}) # House completed before sale or not dataset["BoughtOffPlan"] = dataset.SaleCondition.replace({"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, "Family" : 0, "Normal" : 0, "Partial" : 1}) return datasettrain = new_feature(train)train.head()test_new = new_feature(test_new)test_new.head()all_data = new_feature(all_data)all_data.head()# 找到最重要的字段# Find most important features relative to targetprint("Find most important features relative to target")corr = train.corr()corr.sort_values(["SalePrice"], ascending = False, inplace = True)print(corr.SalePrice)输出内容:Find most important features relative to targetSalePrice 1.000OverallQual 0.819AllSF 0.817AllFlrsSF 0.729GrLivArea 0.719SimplOverallQual 0.708ExterQual 0.681GarageCars 0.680TotalBath 0.673KitchenQual 0.667GarageArea 0.655BsmtQual 0.653TotalBsmtSF 0.642SimplExterQual 0.636GarageScore 0.6181stFlrSF 0.614SimplKitchenQual 0.610OverallGrade 0.604FullBath 0.591YearBuilt 0.589ExterGrade 0.587SimplGarageScore 0.585SimplBsmtQual 0.578YearRemodAdd 0.569GarageYrBlt 0.544TotRmsAbvGrd 0.533SimplOverallGrade 0.527SimplKitchenScore 0.523SimplExterGrade 0.488Fireplaces 0.487 ... GarageQual 0.192GarageGrade 0.184GarageCond 0.172PoolQC 0.164SimplFunctional 0.137Functional 0.136ScreenPorch 0.124SimplPoolQC 0.122SimplBsmtCond 0.104Street 0.0583SsnPorch 0.056ExterCond 0.051SimplGarageQual 0.047PoolArea 0.041Utilities 0.013BsmtFinSF2 0.006SimplGarageCond -0.015BsmtHalfBath -0.015MiscVal -0.020SimplBsmtFinType2 -0.020SimplOverallCond -0.028BsmtFinType2 -0.030YrSold -0.034OverallCond -0.037LowQualFinSF -0.038LandSlope -0.040SimplExterCond -0.042KitchenAbvGr -0.148EnclosedPorch -0.149LotShape -0.286Name: SalePrice, dtype: float64def poly_feature(dataset): # Create new features # 3* Polynomials on the top 10 existing features dataset["OverallQual-s2"] = dataset["OverallQual"] ** 2 dataset["OverallQual-s3"] = dataset["OverallQual"] ** 3 dataset["OverallQual-Sq"] = np.sqrt(dataset["OverallQual"]) dataset["AllSF-2"] = dataset["AllSF"] ** 2 dataset["AllSF-3"] = dataset["AllSF"] ** 3 dataset["AllSF-Sq"] = np.sqrt(dataset["AllSF"]) dataset["AllFlrsSF-2"] = dataset["AllFlrsSF"] ** 2 dataset["AllFlrsSF-3"] = dataset["AllFlrsSF"] ** 3 dataset["AllFlrsSF-Sq"] = np.sqrt(dataset["AllFlrsSF"]) dataset["GrLivArea-2"] = dataset["GrLivArea"] ** 2 dataset["GrLivArea-3"] = dataset["GrLivArea"] ** 3 dataset["GrLivArea-Sq"] = np.sqrt(dataset["GrLivArea"]) dataset["SimplOverallQual-s2"] = dataset["SimplOverallQual"] ** 2 dataset["SimplOverallQual-s3"] = dataset["SimplOverallQual"] ** 3 dataset["SimplOverallQual-Sq"] = np.sqrt(dataset["SimplOverallQual"]) dataset["ExterQual-2"] = dataset["ExterQual"] ** 2 dataset["ExterQual-3"] = dataset["ExterQual"] ** 3 dataset["ExterQual-Sq"] = np.sqrt(dataset["ExterQual"]) dataset["GarageCars-2"] = dataset["GarageCars"] ** 2 dataset["GarageCars-3"] = dataset["GarageCars"] ** 3 dataset["GarageCars-Sq"] = np.sqrt(dataset["GarageCars"]) dataset["TotalBath-2"] = dataset["TotalBath"] ** 2 dataset["TotalBath-3"] = dataset["TotalBath"] ** 3 dataset["TotalBath-Sq"] = np.sqrt(dataset["TotalBath"]) dataset["KitchenQual-2"] = dataset["KitchenQual"] ** 2 dataset["KitchenQual-3"] = dataset["KitchenQual"] ** 3 dataset["KitchenQual-Sq"] = np.sqrt(dataset["KitchenQual"]) dataset["GarageScore-2"] = dataset["GarageScore"] ** 2 dataset["GarageScore-3"] = dataset["GarageScore"] ** 3 dataset["GarageScore-Sq"] = np.sqrt(dataset["GarageScore"]) return datasettrain = poly_feature(train)train.head()test_new = poly_feature(test_new)test_new.head()all_data = poly_feature(all_data)all_data.head()# train# 区分数值特征(减去目标)和分类特征 # Differentiate numerical features (minus the target) and categorical featurescategorical_features = train.select_dtypes(include = ["object"]).columnsnumerical_features = train.select_dtypes(exclude = ["object"]).columnsnumerical_features = numerical_features.drop("SalePrice")print("Numerical features : " + str(len(numerical_features)))print("Categorical features : " + str(len(categorical_features)))train_num = train[numerical_features]train_cat = train[categorical_features]--Numerical features : 118--Categorical features : 25因为有第一列ID# test_new# Differentiate numerical features (minus the target) and categorical featurescategorical_features = test_new.select_dtypes(include = ["object"]).columnsnumerical_features = test_new.select_dtypes(exclude = ["object"]).columns# numerical_features = numerical_features.drop("SalePrice")print("Numerical features : " + str(len(numerical_features)))print("Categorical features : " + str(len(categorical_features)))test_new_num = test_new[numerical_features]test_new_cat = test_new[categorical_features]--Numerical features : 120--Categorical features : 24# all_data# Differentiate numerical features (minus the target) and categorical featurescategorical_features = all_data.select_dtypes(include = ["object"]).columnsnumerical_features = all_data.select_dtypes(exclude = ["object"]).columns# numerical_features = numerical_features.drop("SalePrice")print("Numerical features : " + str(len(numerical_features)))print("Categorical features : " + str(len(categorical_features)))all_data_num = all_data[numerical_features]all_data_cat = all_data[categorical_features]--Numerical features : 117--Categorical features : 26# Handle remaining missing values for numerical features by using median as replacementprint("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))train_num = train_num.fillna(train_num.median())print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))--NAs for numerical features in train : 11431--Remaining NAs for numerical features in train : 0# Handle remaining missing values for numerical features by using median as replacementprint("NAs for numerical features in test_new : " + str(test_new_num.isnull().values.sum()))test_new_num = test_new_num.fillna(test_new_num.median())print("Remaining NAs for numerical features in test_new : " + str(test_new_num.isnull().values.sum()))--NAs for numerical features in test_new : 85--Remaining NAs for numerical features in test_new : 0# Handle remaining missing values for numerical features by using median as replacementprint("NAs for numerical features in all_data : " + str(all_data_num.isnull().values.sum()))all_data_num = all_data_num.fillna(all_data_num.median())print("Remaining NAs for numerical features in all_data : " + str(all_data_num.isnull().values.sum()))--NAs for numerical features in all_data : 166--Remaining NAs for numerical features in all_data : 0# 对变量数值特征进行Log变换,以减少异常值的影响# 由Alexandru Papiu的脚本启发:https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models# 作为一般的经验法则,绝对值> 0.5的偏度被认为至少适度地偏斜# Log transform of the skewed numerical features to lessen impact of outliers# Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models# As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewedskewness = train_num.apply(lambda x: skew(x))skewness = skewness[abs(skewness) > 0.5]print(str(skewness.shape[0]) + " skewed numerical features to log transform")skewed_features = skewness.indextrain_num[skewed_features] = np.log1p(train_num[skewed_features])--87 skewed numerical features to log transform# Log transform of the skewed numerical features to lessen impact of outliers# Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models# As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewedskewness = test_new_num.apply(lambda x: skew(x))skewness = skewness[abs(skewness) > 0.5]print(str(skewness.shape[0]) + " skewed numerical features to log transform")skewed_features = skewness.indextest_new_num[skewed_features] = np.log1p(test_new_num[skewed_features])--87 skewed numerical features to log transform# Log transform of the skewed numerical features to lessen impact of outliers# Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models# As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewedskewness = all_data_num.apply(lambda x: skew(x))skewness = skewness[abs(skewness) > 0.5]print(str(skewness.shape[0]) + " skewed numerical features to log transform")skewed_features = skewness.indexall_data_num[skewed_features] = np.log1p(all_data_num[skewed_features])--86 skewed numerical features to log transform# 通过单热编码为分类值创建虚拟特征# Create dummy features for categorical values via one-hot encodingprint("NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))train_cat = pd.get_dummies(train_cat)print("Remaining NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))--NAs for categorical features in train : 2749--Remaining NAs for categorical features in train : 0# Create dummy features for categorical values via one-hot encodingprint("NAs for categorical features in test_new : " + str(test_new_cat.isnull().values.sum()))test_new_cat = pd.get_dummies(test_new_cat)print("Remaining NAs for categorical features in test_new : " + str(test_new_cat.isnull().values.sum()))--NAs for categorical features in test_new : 7--Remaining NAs for categorical features in test_new : 0# Create dummy features for categorical values via one-hot encodingprint("NAs for categorical features in all_data : " + str(all_data_cat.isnull().values.sum()))all_data_cat = pd.get_dummies(all_data_cat)print("Remaining NAs for categorical features in all_data : " + str(all_data_cat.isnull().values.sum()))--NAs for categorical features in all_data : 8--Remaining NAs for categorical features in all_data : 0test_new.head()# 合并分类和数值特性# Join categorical and numerical featurestrain = pd.concat([train_num, train_cat], axis = 1)print("New number of features : " + str(train.shape[1]))--New number of features : 295# Join categorical and numerical featurestest_new = pd.concat([test_new_num, test_new_cat], axis = 1)print("New number of features : " + str(test_new.shape[1]))--New number of features : 281# Join categorical and numerical featuresall_data = pd.concat([all_data_num, all_data_cat], axis = 1)print("New number of features : " + str(all_data.shape[1]))--New number of features : 320test_new.head()train.shape[0]--1456train = all_data[:train.shape[0]]test = all_data[train.shape[0]:]y.head()输出:0 12.2481 12.1092 12.3173 11.8494 12.429Name: SalePrice, dtype: float64#分割数据集# Partition the dataset in train + validation setsX_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.3, random_state = 0)print("X_train : " + str(X_train.shape))print("X_test : " + str(X_test.shape))print("y_train : " + str(y_train.shape))print("y_test : " + str(y_test.shape))X_train : (1022, 295)X_test : (438, 295)y_train : (1022,)y_test : (438,)# Partition the dataset in train + validation setsX_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.3, random_state = 0)print("X_train : " + str(X_train.shape))print("X_test : " + str(X_test.shape))print("y_train : " + str(y_train.shape))print("y_test : " + str(y_test.shape))X_train : (1019, 320)X_test : (437, 320)y_train : (1019,)y_test : (437,)# 标准化数值特性# Standardize numerical featuresstdSc = StandardScaler()X_train.loc[:, numerical_features] = stdSc.fit_transform(X_train.loc[:, numerical_features])X_test.loc[:, numerical_features] = stdSc.transform(X_test.loc[:, numerical_features])# 定义官方评分的误差度量:RMSE# Define error measure for official scoring : RMSEscorer = make_scorer(mean_squared_error, greater_is_better = False)def rmse_cv_train(model): rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring = scorer, cv = 10)) return(rmse)def rmse_cv_test(model): rmse= np.sqrt(-cross_val_score(model, X_test, y_test, scoring = scorer, cv = 10)) return(rmse)# Linear Regressionlr = LinearRegression()lr.fit(X_train, y_train)# 看看训练和验证集的预测# Look at predictions on training and validation setprint("RMSE on Training set :", rmse_cv_train(lr).mean())print("RMSE on Test set :", rmse_cv_test(lr).mean())y_train_pred = lr.predict(X_train)y_test_pred = lr.predict(X_test)# 绘制残差# Plot residualsplt.scatter(y_train_pred, y_train_pred - y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_pred, y_test_pred - y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression")plt.xlabel("Predicted values")plt.ylabel("Residuals")plt.legend(loc = "upper left")plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")plt.show()# 绘制预测# Plot predictionsplt.scatter(y_train_pred, y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_pred, y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression")plt.xlabel("Predicted values")plt.ylabel("Real values")plt.legend(loc = "upper left")plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")plt.show()--('RMSE on Training set :', 0.2994379109222855)--('RMSE on Test set :', 0.38449735311625766)
# 2* Ridgeridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])ridge.fit(X_train, y_train)alpha = ridge.alpha_print("Best alpha :", alpha)print("Try again for more precision with alphas centered around " + str(alpha))ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], cv = 10)ridge.fit(X_train, y_train)alpha = ridge.alpha_print("Best alpha :", alpha)print("Ridge RMSE on Training set :", rmse_cv_train(ridge).mean())print("Ridge RMSE on Test set :", rmse_cv_test(ridge).mean())y_train_rdg = ridge.predict(X_train)y_test_rdg = ridge.predict(X_test)# Plot residualsplt.scatter(y_train_rdg, y_train_rdg - y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_rdg, y_test_rdg - y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression with Ridge regularization")plt.xlabel("Predicted values")plt.ylabel("Residuals")plt.legend(loc = "upper left")plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")plt.show()# Plot predictionsplt.scatter(y_train_rdg, y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_rdg, y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression with Ridge regularization")plt.xlabel("Predicted values")plt.ylabel("Real values")plt.legend(loc = "upper left")plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")plt.show()# 绘制重要系数# Plot important coefficientscoefs = pd.Series(ridge.coef_, index = X_train.columns)print("Ridge picked " + str(sum(coefs != 0)) + " features and eliminated the other " + \ str(sum(coefs == 0)) + " features")imp_coefs = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])imp_coefs.plot(kind = "barh")plt.title("Coefficients in the Ridge Model")plt.show()--('Best alpha :', 30.0)--Try again for more precision with alphas centered around 30.0--('Best alpha :', 22.5)--('Ridge RMSE on Training set :', 0.11517100992467055)--('Ridge RMSE on Test set :', 0.11636013229778504)
# 3* Lassolasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1], max_iter = 50000, cv = 10)lasso.fit(X_train, y_train)alpha = lasso.alpha_print("Best alpha :", alpha)print("Try again for more precision with alphas centered around " + str(alpha))lasso = LassoCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], max_iter = 50000, cv = 10)lasso.fit(X_train, y_train)alpha = lasso.alpha_print("Best alpha :", alpha)print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())y_train_las = lasso.predict(X_train)y_test_las = lasso.predict(X_test)# Plot residualsplt.scatter(y_train_las, y_train_las - y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_las, y_test_las - y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression with Lasso regularization")plt.xlabel("Predicted values")plt.ylabel("Residuals")plt.legend(loc = "upper left")plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")plt.show()# Plot predictionsplt.scatter(y_train_las, y_train, c = "blue", marker = "s", label = "Training data")plt.scatter(y_test_las, y_test, c = "lightgreen", marker = "s", label = "Validation data")plt.title("Linear regression with Lasso regularization")plt.xlabel("Predicted values")plt.ylabel("Real values")plt.legend(loc = "upper left")plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")plt.show()# Plot important coefficientscoefs = pd.Series(lasso.coef_, index = X_train.columns)print("Lasso picked " + str(sum(coefs != 0)) + " features and eliminated the other " + \ str(sum(coefs == 0)) + " features")imp_coefs = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])imp_coefs.plot(kind = "barh")plt.title("Coefficients in the Lasso Model")plt.show()--('Best alpha :', 0.00059999999999999995)--Try again for more precision with alphas centered around 0.0006--('Best alpha :', 0.00059999999999999995)--('Lasso RMSE on Training set :', 0.11403954212405014)--('Lasso RMSE on Test set :', 0.11585280771533035)
X_train.head()X_test.head()test.head()y_pred_lasso = lasso.predict(test)y_pred_lasso--array([ 90.59653003, 62.86892549, 74.88361726, ..., 81.39766545, 39.54093941, 87.80713916])y_pred_lasso = np.exp(y_pred_lasso)y_pred_lasso--array([ 2.21601691e+39, 2.01199745e+27, 3.32308738e+32, ..., 2.24159394e+35, 1.48734526e+17, 1.36193383e+38])pred_df = pd.DataFrame(y_pred_lasso, index=test_new["Id"], columns=["SalePrice"])pred_df.to_csv('output.csv', header=True, index_label='Id')# Get datatest_data = pd.read_csv("test.csv")print("test : " + str(test_data.shape))pred_df = pd.DataFrame(y_pred_lasso, index=test_data["Id"], columns=["SalePrice"])pred_df.to_csv('output_lasso.csv', header=True, index_label='Id')
阅读全文
0 0
- Kaggle房价预测案例分享
- kaggle 房价预测经典文章
- Kaggle入门实例-预测房价
- Story 2---Kaggle房价预测
- Kaggle房价预测:随机森林方法
- kaggle房价预测/Ridge/RandomForest/cross_validation
- Kaggle房价预测进阶版/bagging/boosting/AdaBoost/XGBoost
- Kaggle房价预测:数据探索——练习
- Kaggle房价预测:数据预处理——练习
- 【笔记】AI100-Kaggle竞赛_2017年房价预测
- 机器学习案例之二 房价预测
- Kaggle债务违约预测冠军经验分享
- spark--案例分享--性别预测
- 转:kaggle案例:员工离职预测 (附视频)
- 房价预测(HackerRank)
- python预测房价
- 波士顿房价预测
- 房价预测解决方案
- linux 解压命令 tar
- kubernetes中的中间件集群故障测试
- MySQL 源码导读 5.7
- Python时间序列LSTM预测系列教程(1)-单变量
- 「python」类定义
- Kaggle房价预测案例分享
- 仿真软件LTspice之《第三方spice模型导入方法》
- 将winmanager插件显示在右侧
- beanUtils用法
- Selenium之JDBC请求
- 算法 1.3.47 可连接的队列
- 先锋机器人实践(一)
- 条件随机场随记1
- Error:null value in entry: incrementalFolder=null