Variable Selection for Logistic Regression (LR)


Python has no ready-made forward-backward stepwise selection method.


Using RFE

How it works: you specify how many features you want via a parameter, and the least important feature is eliminated at each step.

Reference: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

Tip: consider using grid search to tune the n_features_to_select parameter (an example is sketched after the snippet below).


from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

reg = LogisticRegression(C=1, solver="newton-cg", max_iter=1000, penalty="l2")
model_select = RFE(estimator=reg, n_features_to_select=4)
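A minimal usage sketch following on from the snippet above (reg and model_select come from that snippet; the toy data from make_classification is made up purely for illustration):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

# Hypothetical toy data, only to illustrate the API.
X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)

model_select.fit(X_demo, y_demo)
print(model_select.support_)   # boolean mask: which 4 features were kept
print(model_select.ranking_)   # 1 = selected; larger = eliminated earlier

# The grid-search tip from above: tune n_features_to_select directly.
search = GridSearchCV(
    RFE(estimator=reg),
    param_grid={"n_features_to_select": [2, 3, 4, 5]},
    cv=5,
    scoring="roc_auc",
)
search.fit(X_demo, y_demo)
print(search.best_params_)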


I also wrote my own stepwise-selection function based on the AIC criterion:

import math

# AIC under a Gaussian-error approximation: AIC = 2k + n * log(RSS / n),
# where k is the number of features, n is the number of samples, and the
# residuals are y minus the predicted probability of the positive class
# (predict_proba(X)[:, 1]).
AIC = lambda estimator, X, y: (
    2 * X.shape[1]
    + X.shape[0] * math.log(((y - estimator.predict_proba(X)[:, 1]) ** 2).sum() / X.shape[0])
)


import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

def stepwise_selection(X, y, initial_list=[], verbose=True):
    """Perform a forward-backward (stepwise) feature selection
    based on the AIC criterion defined above.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features

    See https://en.wikipedia.org/wiki/Stepwise_regression for the details.
    """
    included = list(initial_list)
    model = LogisticRegression(C=1, solver="newton-cg", max_iter=1000, penalty="l2")
    # initialise AIC_current
    if len(initial_list) != 0:
        model.fit(pd.DataFrame(X[initial_list]), y)
        AIC_current = AIC(model, X[initial_list], y)
        print('current AIC = ', AIC_current)
    else:
        # no starting features: pick the single best feature first
        excluded = list(set(X.columns) - set(included))
        new_AIC = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model.fit(pd.DataFrame(X[[new_column]]), y)
            new_AIC[new_column] = AIC(model, X[[new_column]], y)
        AIC_current = new_AIC.min()
        print('Add  {:30} with AIC {:.6}'.format(new_AIC.idxmin(), AIC_current))
        print('----', new_AIC)
        included.append(new_AIC.idxmin())
    while True:
        changed = False
        # forward step: try adding each excluded feature
        excluded = list(set(X.columns) - set(included))
        new_AIC = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model.fit(pd.DataFrame(X[included + [new_column]]), y)
            new_AIC[new_column] = AIC(model, X[included + [new_column]], y)
        best_AIC = new_AIC.min()
        print('----', new_AIC)
        if best_AIC < AIC_current:
            AIC_current = best_AIC
            best_feature = new_AIC.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with AIC {:.6}'.format(best_feature, best_AIC))
        # backward step: try dropping each included feature
        if len(included) >= 2:
            old_AIC = pd.Series(index=included, dtype=float)
            for old_column in included:
                rest = list(set(included) - set([old_column]))
                model.fit(pd.DataFrame(X[rest]), y)
                old_AIC[old_column] = AIC(model, X[rest], y)
            best_AIC = old_AIC.min()
            print('----', old_AIC)
            if best_AIC < AIC_current:
                AIC_current = best_AIC
                changed = True
                worst_feature = old_AIC.idxmin()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with AIC {:.6}'.format(worst_feature, best_AIC))
        if not changed:
            break
    return included
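A quick usage sketch (the DataFrame and column names are made up for illustration; it assumes the AIC function above is already in scope):

import pandas as pd
from sklearn.datasets import make_classification

# Hypothetical toy data, only to show the call signature.
X_arr, y_demo = make_classification(n_samples=500, n_features=8,
                                    n_informative=4, random_state=0)
X_demo = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(8)])

selected = stepwise_selection(X_demo, y_demo)
print(selected)  # selected feature names, in the order they were added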

Note: an infinite loop can occur here, and this issue has not been solved yet. To swap in a different model or criterion you have to edit the function body directly.

There is another problem: for LR, AIC/BIC can go to negative infinity. When the model fits the training data almost perfectly, the residual sum of squares tends to zero, so the n * log(RSS / n) term diverges, as the quick check below illustrates.
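A one-line demonstration of the divergence (the RSS values are arbitrary):

import math

# As RSS -> 0, n * log(RSS / n) -> -inf, so the AIC above is unbounded below.
n = 100
for rss in (1e-2, 1e-8, 1e-16):
    print(n * math.log(rss / n))  # roughly -921.0, -2302.6, -4144.7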


This led me to another solution: use the KS statistic as the selection criterion (larger is better).

Code:


import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

def stepwise_selection(X, y, initial_list=[], verbose=True, max_iter=1000):
    """Perform a forward-backward (stepwise) feature selection
    based on the KS statistic (larger is better).

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        max_iter - maximum number of iterations, to avoid infinite looping
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    """
    included = list(initial_list)
    model = LogisticRegression(C=1, solver="newton-cg", max_iter=1000, penalty="l2")
    # initialise ks_current
    if len(initial_list) != 0:
        model.fit(pd.DataFrame(X[initial_list]), y)
        ks_current = get_ks_for_estimators(model, X[initial_list], y)
        print('current ks = ', ks_current)
    else:
        # no starting features: pick the single best feature first
        excluded = list(set(X.columns) - set(included))
        new_ks = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model.fit(pd.DataFrame(X[[new_column]]), y)
            new_ks[new_column] = get_ks_for_estimators(model, X[[new_column]], y)
        ks_current = new_ks.max()
        print('Add  {:30} with ks {:.6}'.format(new_ks.idxmax(), ks_current))
        included.append(new_ks.idxmax())
    for _ in range(max_iter):  # bounded loop instead of `while True`
        changed = False
        # forward step: try adding each excluded feature
        excluded = list(set(X.columns) - set(included))
        new_ks = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model.fit(pd.DataFrame(X[included + [new_column]]), y)
            new_ks[new_column] = get_ks_for_estimators(model, X[included + [new_column]], y)
        best_ks = new_ks.max()
        if best_ks > ks_current:
            ks_current = best_ks
            best_feature = new_ks.idxmax()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with ks {:.6}'.format(best_feature, best_ks))
        # backward step: try dropping each included feature
        if len(included) >= 2:
            old_ks = pd.Series(index=included, dtype=float)
            for old_column in included:
                rest = list(set(included) - set([old_column]))
                model.fit(pd.DataFrame(X[rest]), y)
                old_ks[old_column] = get_ks_for_estimators(model, X[rest], y)
            best_ks = old_ks.max()
            if best_ks > ks_current:
                ks_current = best_ks
                changed = True
                worst_feature = old_ks.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with ks {:.6}'.format(worst_feature, best_ks))
        if not changed:
            break
    return included



It is almost the same as the previous program; AIC is simply replaced by the KS statistic. The get_ks_for_estimators function itself is given in another blog post; one common definition is sketched below.
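For completeness, a minimal sketch of one common definition of the helper (KS as the maximum gap between TPR and FPR, computed with scikit-learn's roc_curve; the author's actual version in the other post may differ):

import numpy as np
from sklearn.metrics import roc_curve

def get_ks_for_estimators(estimator, X, y):
    """KS statistic of the fitted model's scores on (X, y): the maximum
    gap between the cumulative score distributions of positives and
    negatives, i.e. max(TPR - FPR)."""
    proba = estimator.predict_proba(X)[:, 1]  # score = P(y = 1)
    fpr, tpr, _ = roc_curve(y, proba)
    return np.max(tpr - fpr)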
