逻辑回归实战 — Kaggle_Titanic

来源：互联网 | 发布：php优化方案 | 编辑：程序博客网 | 时间：2024/06/11 22:04

数据来源：https://www.kaggle.com/c/titanic

Training

import pandas
import numpy
import time
import matplotlib.pyplot as plt

# IPython/Jupyter only: draw matplotlib figures inline (same effect as the
# ``%matplotlib inline`` magic). Outside IPython ``get_ipython`` is undefined,
# so the step is skipped and the file still imports as plain Python.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


def _min_max_scale(column):
    """Rescale a numeric Series to [0, 1]; a constant column maps to 0.0."""
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Avoid 0/0: every value equals the minimum, so every value scales to 0.
        return column * 0.0
    return (column - low) / span


def prepareData(filename):
    """Load a Titanic CSV file and turn it into model-ready features.

    Steps applied to the loaded frame:

    * ``Sex`` is encoded as female -> 0, male -> 1.
    * ``Embarked`` is encoded as S -> 1, C -> 2, Q -> 3; missing values take
      the most frequent observed port (0 if no port is recorded at all).
    * Missing ``Age`` and ``Fare`` values are replaced by the mean of the
      values that are present, then both columns are min-max scaled to
      [0, 1].
    * A constant column ``ones`` (the intercept term) is inserted first.

    Identifier/text columns (``PassengerId``, ``Name``, ``Ticket``,
    ``Cabin``) are left in the frame; callers drop what they do not need.

    NOTE(review): scaling uses the min/max of the file being loaded, so a
    train file and a test file are scaled independently of each other.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The transformed ``pandas.DataFrame``.
    """
    data = pandas.read_csv(filename)

    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

    embarked = data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
    known_ports = embarked.dropna()
    # The mode is taken over recorded ports only, so "missing" can never win.
    most_common_port = 0 if known_ports.empty else known_ports.mode().iloc[0]
    data['Embarked'] = embarked.fillna(most_common_port).astype(int)

    # Series.mean() skips NaN, so the fill value is not dragged towards zero
    # by the missing entries themselves.
    for column in ('Age', 'Fare'):
        filled = data[column].fillna(data[column].mean())
        data[column] = _min_max_scale(filled)

    data.insert(0, 'ones', 1)
    return data

def run(X, Y, theta, alpha, steps, log_path='titanic/model.txt'):
    """Fit logistic-regression coefficients with batch gradient descent.

    Every iteration applies ``theta -= alpha * getGradient(X, Y, theta)`` and
    appends one CSV row (all coefficients followed by the cost) to the log
    file, whose header row is ``theta_0,theta_1,...,cost``.

    Args:
        X: Feature rows, passed through to ``getCost`` / ``getGradient``.
        Y: Labels, passed through to ``getCost`` / ``getGradient``.
        theta: Initial coefficients. The caller's object is not modified;
            the optimisation works on a float copy.
        alpha: Learning rate (step size).
        steps: Number of gradient-descent iterations to perform.
        log_path: File that receives the per-step log. Its directory must
            already exist. Defaults to the original hard-coded location.

    Returns:
        A tuple ``(costs, theta, time_spent)``: the list of ``steps + 1``
        costs (the initial cost first), the fitted coefficients, and the
        elapsed wall-clock time in seconds (file writing included).
    """
    init_time = time.time()

    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array (and fail outright on an integer array).
    theta = numpy.array(theta, dtype=float)
    costs = [getCost(X, Y, theta)]

    with open(log_path, 'w') as f:
        header = ''.join('theta_' + str(i) + ',' for i in range(len(theta)))
        f.write(header + 'cost\n')
        for _ in range(steps):
            theta -= alpha * getGradient(X, Y, theta)
            cost = getCost(X, Y, theta)
            costs.append(cost)
            row = ''.join(str(item) + ',' for item in theta)
            f.write(row + str(cost) + '\n')

    time_spent = time.time() - init_time
    return costs, theta, time_spent

def getGradient(X, Y, theta):
    """Return the gradient of the mean logistic loss with respect to theta.

    Computes ``X^T (sigmoid(X theta) - Y) / len(Y)`` in one vectorised pass
    instead of re-evaluating the sigmoid of every sample once per
    coefficient.

    Args:
        X: 2-D array-like of feature rows, one row per sample.
        Y: 1-D array-like of labels, one per row of ``X``.
        theta: 1-D array-like of coefficients, one per column of ``X``.

    Returns:
        A 1-D float ``numpy.ndarray`` with one entry per coefficient.

    Raises:
        ValueError: If ``Y`` is empty (the mean gradient is undefined).
    """
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    weights = numpy.asarray(theta, dtype=float)
    if len(labels) == 0:
        raise ValueError('getGradient needs at least one sample')

    predicted = 1.0 / (1.0 + numpy.exp(-numpy.dot(features, weights)))
    return numpy.dot(features.T, predicted - labels) / len(labels)

def getCost(X, Y, theta):
    """Return the mean logistic loss (negative log-likelihood) of theta.

    For each sample with score ``z = theta . x`` the loss term is
    ``log(1 + exp(z)) - y * z``. The log term is evaluated with
    ``numpy.logaddexp(0, z)`` so that large scores do not overflow ``exp``
    and turn the cost into ``inf``.

    Args:
        X: 2-D array-like of feature rows, one row per sample.
        Y: 1-D array-like of labels, one per row of ``X``.
        theta: 1-D array-like of coefficients, one per column of ``X``.

    Returns:
        The loss averaged over all samples, as a float.

    Raises:
        ValueError: If ``Y`` is empty (the mean loss is undefined).
    """
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    if len(labels) == 0:
        raise ValueError('getCost needs at least one sample')

    scores = numpy.dot(features, numpy.asarray(theta, dtype=float))
    losses = numpy.logaddexp(0, scores) - labels * scores
    return numpy.sum(losses) / len(labels)

def getAccuracy(train_X, train_Y, theta):
    """Return the fraction of rows whose predicted class equals its label.

    A row is predicted as class 1 when ``sigmoid(theta . row) >= 0.5`` and as
    class 0 otherwise.

    Args:
        train_X: Iterable of feature rows.
        train_Y: Iterable of labels, paired with the rows of ``train_X``.
        theta: Coefficients, one per feature.

    Returns:
        Number of matching predictions divided by the number of rows.
    """
    predictions = [
        1 if 1 / (1 + numpy.exp(-numpy.dot(theta, row))) >= 0.5 else 0
        for row in train_X
    ]
    hits = sum((1 for guess, label in zip(predictions, train_Y) if guess == label), 0.0)
    return hits / len(predictions)

# Load and preprocess the training set.
train_data = prepareData('titanic/train.csv')
# Notebook preview: show the first five preprocessed rows.
train_data.head(n=5)

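|   | ones | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|------|-------------|----------|--------|------|-----|-----|-------|-------|--------|------|-------|----------|
| 0 | 1 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 0.271174 | 1 | 0 | A/5 21171 | 0.014151 | NaN | 1 |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | 0 | 0.472229 | 1 | 0 | PC 17599 | 0.139136 | C85 | 2 |
| 2 | 1 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 0.321438 | 0 | 0 | STON/O2. 3101282 | 0.015469 | NaN | 1 |
| 3 | 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 0.434531 | 1 | 0 | 113803 | 0.103644 | C123 | 1 |
| 4 | 1 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 0.434531 | 0 | 0 | 373450 | 0.015713 | NaN | 1 |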

# Drop the identifier/text columns once; what remains is the label column
# plus the numeric model inputs.
train_features = train_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
train_Y = train_features['Survived'].values
train_X = train_features.drop(['Survived'], axis=1).values

# One random starting coefficient per feature column.
theta = numpy.random.random(train_X.shape[1])
alpha = 0.001
steps = 10000
costs, theta, time_spent = run(train_X, train_Y, theta, alpha, steps)
accuracy = getAccuracy(train_X, train_Y, theta)

fig = plt.figure(figsize=(18, 5))

# Left panel: cost over the whole run (steps + 1 points, initial cost first).
ax1 = fig.add_subplot(121)
ax1.plot(range(steps + 1), costs)
ax1.set_title('Logistic Regression for Titanic Problem -- Time spent: %f\nAccuracy: %f' % (time_spent, accuracy))
ax1.set_xlabel('steps')
ax1.set_ylabel('cost')

# Right panel: zoom on the last 1000 points, final cost included.
ax2 = fig.add_subplot(122)
ax2.plot(range(steps + 1)[-1000:], costs[-1000:])
ax2.set_xlabel('steps')
ax2.set_ylabel('cost')

costs vs. steps

Testing

# Load and preprocess the test set.
test_data = prepareData('titanic/test.csv')
# Notebook preview: show the first five preprocessed rows.
test_data.head(n=5)

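|   | ones | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|------|-------------|--------|------|-----|-----|-------|-------|--------|------|-------|----------|
| 0 | 1 | 892 | 3 | Kelly, Mr. James | 1 | 0.452723 | 0 | 0 | 330911 | 0.015282 | NaN | 3 |
| 1 | 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | 0 | 0.617566 | 1 | 0 | 363272 | 0.013663 | NaN | 1 |
| 2 | 1 | 894 | 2 | Myles, Mr. Thomas Francis | 1 | 0.815377 | 0 | 0 | 240276 | 0.018909 | NaN | 3 |
| 3 | 1 | 895 | 3 | Wirz, Mr. Albert | 1 | 0.353818 | 0 | 0 | 315154 | 0.016908 | NaN | 1 |
| 4 | 1 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | 0 | 0.287881 | 1 | 1 | 3101298 | 0.023984 | NaN | 1 |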

# Keep only the model inputs: drop the identifier/text columns.
test_X = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1).values

# Predict class 1 when the sigmoid of the linear score reaches 0.5, else 0.
Y_hat = [
    1 if 1 / (1 + numpy.exp(-numpy.dot(theta, row))) >= 0.5 else 0
    for row in test_X
]

# Assemble the prediction table: PassengerId first, then the predicted class.
results = pandas.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to exactly the two columns above; by default
# to_csv would also write the row index as an extra unnamed first column.
results.to_csv('titanic/results.csv', index=False)

阅读全文

0 0