Dataquest Study Notes [9]


Step 6: Machine Learning 

Machine Learning In Python: Intermediate

>>Multiclass classification:

pandas.get_dummies()  converts the values of a DataFrame or Series column; it is especially useful when a column takes several discrete values, turning it into multiple binary (0/1) indicator columns.

When to dummy-encode: when the values have no meaningful numeric relationship to each other, so that even numerically close values carry completely separate meanings. For example, 70 and 71 representing model years are good candidates for dummy encoding.
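For illustration, a minimal sketch (my addition, not from the course) of what get_dummies produces for a year-like column:

import pandas as pd

s = pd.Series([70, 71, 70, 72])
print(pd.get_dummies(s, prefix="year"))
# Produces binary columns year_70, year_71, year_72; each row has a 1 in the
# column matching its original value and 0 elsewhere.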

# Read the data and print the unique categories of origin
import pandas as pd
import numpy as np
cars = pd.read_csv("auto.csv")
unique_regions = cars['origin'].unique()
print(unique_regions)

# Dummy-encode the numeric categorical columns
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_years], axis=1)
cars = cars.drop(['cylinders', 'year'], axis=1)
print(cars.head())

# Split into training and test sets
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]
highest_train_row = int(cars.shape[0] * .70)
train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

# Turn the 3-class problem into three binary (one-vs-rest) problems; two equivalent solutions

# Version 1: select the dummy columns by position
from sklearn.linear_model import LogisticRegression
unique_origins = cars["origin"].unique()
unique_origins.sort()
models = {}
for i in unique_origins:
    model = LogisticRegression()
    train_x = train.iloc[:, 6:]
    train_y = train['origin'] == i
    model.fit(train_x, train_y)
    models[i] = model

# Version 2: select the dummy columns by name (more robust)
from sklearn.linear_model import LogisticRegression
unique_origins = cars["origin"].unique()
unique_origins.sort()
models = {}
features = [c for c in train.columns if c.startswith("cyl") or c.startswith("year")]
for origin in unique_origins:
    model = LogisticRegression()
    X_train = train[features]
    y_train = train["origin"] == origin
    model.fit(X_train, y_train)
    models[origin] = model

# Build a DataFrame of per-class probabilities from each model
testing_probs = pd.DataFrame(columns=unique_origins)
for i in unique_origins:
    testing_probs[i] = models[i].predict_proba(test[features])[:, 1]

# Assign each row to the class with the highest probability
predicted_origins = testing_probs.idxmax(axis=1)
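As a quick sanity check (my addition, not part of the mission), the one-vs-rest predictions can be scored against the true test labels:

# Fraction of test rows whose predicted origin matches the true origin
accuracy = (predicted_origins.values == test["origin"].values).mean()
print(accuracy)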

>>Intermediate Linear Regression:

Evaluation metrics for linear models: Sum of Squared Errors (SSE), Regression Sum of Squares (RSS), and Total Sum of Squares (TSS)


TSS = RSS + SSE, where SSE = sum((y_i - yhat_i)^2), RSS = sum((yhat_i - ybar)^2), and TSS = sum((y_i - ybar)^2).

R-Squared: R^2 = 1 - SSE/TSS = RSS/TSS; the larger the value, the better the fit.
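A tiny made-up example (my addition) verifying the decomposition and R^2 on a least-squares fit:

import numpy as np

# Made-up data; fit a least-squares line so the identity TSS = RSS + SSE holds
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.2, 1.9, 3.2, 3.7])
b1, b0 = np.polyfit(x, y, 1)         # slope, intercept
yhat = b0 + b1 * x

SSE = np.sum((y - yhat)**2)          # error sum of squares
RSS = np.sum((yhat - y.mean())**2)   # regression sum of squares
TSS = np.sum((y - y.mean())**2)      # total sum of squares

print(SSE + RSS, TSS)                # equal up to floating point
print(1 - SSE/TSS, RSS/TSS)          # two ways to compute R^2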

# The lean of the Tower of Pisa over time
import pandas
import matplotlib.pyplot as plt
pisa = pandas.DataFrame({"year": range(1975, 1988),
                         "lean": [2.9642, 2.9644, 2.9656, 2.9667, 2.9673, 2.9688, 2.9696,
                                  2.9698, 2.9713, 2.9717, 2.9725, 2.9742, 2.9757]})
print(pisa)
plt.scatter(x=pisa['year'], y=pisa['lean'])
plt.show()

import statsmodels.api as sm
y = pisa.lean   # target
X = pisa.year   # features
X = sm.add_constant(X)   # add a column of 1's as the constant term

# OLS -- Ordinary Least Squares fit
linear = sm.OLS(y, X)
linearfit = linear.fit()
print(linearfit.summary())

# Our predicted values of y
yhat = linearfit.predict(X)
print(yhat)
residuals = yhat - y
plt.hist(residuals, bins=5)

import numpy as np
# Sum the (predicted - observed) squared
SSE = np.sum((y.values - yhat)**2)
RSS = np.sum((y.mean() - yhat)**2)
TSS = SSE + RSS

# The model's parameters
print("\n", linearfit.params)
delta = linearfit.params["year"] * 15   # slope times 15 years

SSE = np.sum((y.values - yhat)**2)
# Compute variance in X
xvar = np.sum((pisa.year - pisa.year.mean())**2)
# Compute variance of the slope estimate b1
s2b1 = SSE / ((y.shape[0] - 2) * xvar)
R2 = RSS / TSS

# Plot the t-distribution for different degrees of freedom
from scipy.stats import t
x = np.linspace(-3, 3, 100)   # 100 values between -3 and 3
tdist3 = t.pdf(x=x, df=3)
tdist30 = t.pdf(x=x, df=30)
plt.plot(x, tdist3)
plt.plot(x, tdist30)
plt.show()

# Significance test for the slope
tstat = linearfit.params["year"] / np.sqrt(s2b1)
# At the 95% confidence level for a two-sided t-test we must use a p-value of 0.975
pval = 0.975
df = pisa.shape[0] - 2          # degrees of freedom
p = t.cdf(tstat, df=df)         # probability to test against
beta1_test = p > pval
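As a cross-check (my addition), statsmodels already reports R^2 on the fitted results, and it should agree with the manual RSS/TSS value above:

# Compare statsmodels' built-in R^2 with the manually computed R2
print(linearfit.rsquared)
print(R2)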
>>Handling overfitting: aim for both low bias and low variance. The more features you select, the more complex the model and the more easily it overfits; use cross-validation to check.


Choosing the ideal model complexity:

The best model was around 50% more accurate than the simplest model. On the other hand, the overall variance increased around 25% as we increased the model complexity. 

# Our implementation of train_and_test; takes a list of column names.
# filtered_cars is assumed to have been loaded and cleaned earlier in the mission.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

def train_and_test(cols):
    # Split into features & target.
    features = filtered_cars[cols]
    target = filtered_cars["mpg"]
    # Fit model.
    lr = LinearRegression()
    lr.fit(features, target)
    # Make predictions on the training set.
    predictions = lr.predict(features)
    # Compute MSE and variance.
    mse = mean_squared_error(filtered_cars["mpg"], predictions)
    variance = np.var(predictions)
    return(mse, variance)

one_mse, one_var = train_and_test(["cylinders"])
two_mse, two_var = train_and_test(['cylinders', 'displacement'])
three_mse, three_var = train_and_test(['cylinders', 'displacement', 'horsepower'])
four_mse, four_var = train_and_test(['cylinders', 'displacement', 'horsepower', 'weight'])
five_mse, five_var = train_and_test(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'])
six_mse, six_var = train_and_test(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year'])
seven_mse, seven_var = train_and_test(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin'])

# K-fold cross validation (older sklearn API used by the course; newer versions use sklearn.model_selection)
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

def train_and_cross_val(cols):
    features = filtered_cars[cols]
    target = filtered_cars["mpg"]
    variance_values = []
    mse_values = []
    # KFold instance.
    kf = KFold(n=len(filtered_cars), n_folds=10, shuffle=True, random_state=3)
    # Iterate over each fold.
    for train_index, test_index in kf:
        # Training and test sets.
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        # Fit the model and make predictions.
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        predictions = lr.predict(X_test)
        # Calculate mse and variance values for this fold.
        mse = mean_squared_error(y_test, predictions)
        var = np.var(predictions)
        # Append so we can compute the overall average mse and variance.
        variance_values.append(var)
        mse_values.append(mse)
    # Compute average mse and variance values.
    avg_mse = np.mean(mse_values)
    avg_var = np.mean(variance_values)
    return(avg_mse, avg_var)

two_mse, two_var = train_and_cross_val(["cylinders", "displacement"])
three_mse, three_var = train_and_cross_val(["cylinders", "displacement", "horsepower"])
four_mse, four_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight"])
five_mse, five_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration"])
six_mse, six_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration", "model year"])
seven_mse, seven_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin"])
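To visualize the tradeoff described above, a small sketch (the plot is my addition, reusing the cross-validated results computed by train_and_cross_val):

import matplotlib.pyplot as plt

num_features = [2, 3, 4, 5, 6, 7]
mses = [two_mse, three_mse, four_mse, five_mse, six_mse, seven_mse]
variances = [two_var, three_var, four_var, five_var, six_var, seven_var]

# Red: average cross-validated MSE; blue: average prediction variance
plt.scatter(num_features, mses, c='red')
plt.scatter(num_features, variances, c='blue')
plt.xlabel("number of features")
plt.show()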
>>K-means clustering implemented from scratch, without calling sklearn

The biggest difference between unsupervised K-means and supervised classification is that the former has no labels, while the latter learns to classify from known labels.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

nba = pd.read_csv("nba_2013.csv")
point_guards = nba[nba['pos'] == 'PG']
point_guards['ppg'] = point_guards['pts'] / point_guards['g']
point_guards = point_guards[point_guards['tov'] != 0]
point_guards['atr'] = point_guards['ast'] / point_guards['tov']

# Initialization: pick 5 random centroids
num_clusters = 5
# Use numpy's random function to generate a list, length num_clusters, of indices
random_initial_points = np.random.choice(point_guards.index, size=num_clusters)
# Use the random indices to create the centroids
centroids = point_guards.loc[random_initial_points]

# Build a dict of the 5 centroids: cluster_id -> [ppg, atr]
def centroids_to_dict(centroids):
    dictionary = dict()
    counter = 0   # iterating counter used to generate a cluster_id
    # Iterate a pandas DataFrame row-wise using .iterrows()
    for index, row in centroids.iterrows():
        coordinates = [row['ppg'], row['atr']]
        dictionary[counter] = coordinates
        counter += 1
    return dictionary

centroids_dict = centroids_to_dict(centroids)

# Euclidean distance between a centroid and a player's values
def calculate_distance(centroid, player_values):
    root_distance = 0
    for x in range(0, len(centroid)):
        difference = centroid[x] - player_values[x]
        squared_difference = difference**2
        root_distance += squared_difference
    euclid_distance = math.sqrt(root_distance)
    return euclid_distance

# Assign each point to the nearest centroid
def assign_to_cluster(s):
    s_p = s.loc[['ppg', 'atr']]
    min_dis = None
    min_id = 0
    for k, v in centroids_dict.items():
        k_dis = calculate_distance(v, s_p)
        if min_dis is None or min_dis > k_dis:
            min_dis = k_dis
            min_id = k
    return min_id

point_guards['cluster'] = point_guards.apply(assign_to_cluster, axis=1)

# Visualization
def visualize_clusters(df, num_clusters):
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for n in range(num_clusters):
        clustered_df = df[df['cluster'] == n]
        plt.scatter(clustered_df['ppg'], clustered_df['atr'], c=colors[n])
        plt.xlabel('Points Per Game', fontsize=13)
        plt.ylabel('Assist Turnover Ratio', fontsize=13)
    plt.show()

visualize_clusters(point_guards, 5)

# Keep iterating: recompute each cluster's centroid as the mean of its members
def recalculate_centroids(df):
    new_centroids_dict = dict()
    for cluster_id in range(0, num_clusters):   # 0..4
        cluster_df = df[df['cluster'] == cluster_id]
        new_centroids_dict[cluster_id] = (cluster_df['ppg'].mean(), cluster_df['atr'].mean())
    return new_centroids_dict

centroids_dict = recalculate_centroids(point_guards)

# Repeat assignment, visualization and centroid updates
point_guards['cluster'] = point_guards.apply(lambda row: assign_to_cluster(row), axis=1)
visualize_clusters(point_guards, num_clusters)
centroids_dict = recalculate_centroids(point_guards)
point_guards['cluster'] = point_guards.apply(lambda row: assign_to_cluster(row), axis=1)
visualize_clusters(point_guards, num_clusters)

# The same clustering via sklearn
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(point_guards[['ppg', 'atr']])
point_guards['cluster'] = kmeans.labels_
visualize_clusters(point_guards, num_clusters)
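The mission applies the assign/update steps by hand a couple of times; as a hedged sketch (my addition), the from-scratch version can instead be looped until the centroids stop changing, reusing the functions defined above:

# Iterate assignment and centroid updates until convergence
max_iter = 100
for _ in range(max_iter):
    point_guards['cluster'] = point_guards.apply(assign_to_cluster, axis=1)
    new_centroids_dict = recalculate_centroids(point_guards)
    if new_centroids_dict == centroids_dict:   # centroids unchanged -> converged
        break
    centroids_dict = new_centroids_dict

visualize_clusters(point_guards, num_clusters)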
>>Gradient descent implemented from scratch, for linear regression:

import pandas
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D   # registers the '3d' projection

pga = pandas.read_csv("pga.csv")
# Normalize the features
pga.distance = (pga.distance - pga.distance.mean()) / pga.distance.std()
pga.accuracy = (pga.accuracy - pga.accuracy.mean()) / pga.accuracy.std()
print(pga.head())
plt.scatter(pga.distance, pga.accuracy)
plt.xlabel('normalized distance')
plt.ylabel('normalized accuracy')
plt.show()

# Linear model fit with sklearn, for comparison
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(pga[['distance']], pga['accuracy'])
theta1 = model.coef_

# Cost function
def cost(theta0, theta1, x, y):
    J = 0            # initialize cost
    m = len(x)       # number of observations
    for i in range(m):
        h = theta1 * x[i] + theta0   # hypothesis
        J += (h - y[i])**2           # add to cost
    J /= (2*m)       # average and normalize cost
    return J

# Plot the cost surface in 3D
theta0s = np.linspace(-2, 2, 100)
theta1s = np.linspace(-2, 2, 100)
COST = np.empty(shape=(100, 100))
T0S, T1S = np.meshgrid(theta0s, theta1s)
for i in range(100):
    for j in range(100):
        COST[i, j] = cost(T0S[0, i], T1S[j, 0], pga.distance, pga.accuracy)
fig2 = plt.figure()
ax = fig2.gca(projection='3d')
ax.plot_surface(X=T0S, Y=T1S, Z=COST)
plt.show()

# Partial derivatives of the cost with respect to the slope and the intercept
def partial_cost_theta1(theta0, theta1, x, y):
    h = theta0 + theta1*x                 # hypothesis
    diff = (h - y) * x                    # (hypothesis - observed) times x
    partial = diff.sum() / (x.shape[0])   # average to get the partial derivative
    return partial

def partial_cost_theta0(theta0, theta1, x, y):
    h = theta0 + theta1*x                 # hypothesis
    diff = (h - y)                        # difference between hypothesis and observation
    partial = diff.sum() / (x.shape[0])
    return partial

# Gradient descent
# x is the feature vector -- distance
# y is the target variable -- accuracy
# alpha is the learning rate
# theta0 and theta1 are the initial parameters
def gradient_descent(x, y, alpha=0.1, theta0=0, theta1=0):
    max_epochs = 1000   # maximum number of iterations
    counter = 0         # initialize a counter
    c = cost(theta0, theta1, pga.distance, pga.accuracy)   # initial cost
    costs = [c]         # store each update
    # Convergence threshold: when the difference between the previous cost and
    # the current cost is smaller than this, we say the parameters converged
    convergence_thres = 0.000001
    cprev = c + 10
    theta0s = [theta0]
    theta1s = [theta1]
    # Stop when the cost converges or when we hit the iteration limit
    while (np.abs(cprev - c) > convergence_thres) and (counter < max_epochs):
        cprev = c
        # Alpha times the partial derivative is the update
        update0 = alpha * partial_cost_theta0(theta0, theta1, x, y)
        update1 = alpha * partial_cost_theta1(theta0, theta1, x, y)
        # Update theta0 and theta1 at the same time: compute both partial
        # derivatives at the same parameters, then apply the updates
        theta0 -= update0
        theta1 -= update1
        # Store thetas
        theta0s.append(theta0)
        theta1s.append(theta1)
        # Compute and store the new cost
        c = cost(theta0, theta1, pga.distance, pga.accuracy)
        costs.append(c)
        counter += 1
    return {'theta0': theta0, 'theta1': theta1, "costs": costs}

print("Theta1 =", gradient_descent(pga.distance, pga.accuracy)['theta1'])
descend = gradient_descent(pga.distance, pga.accuracy, alpha=.01)
plt.scatter(range(len(descend["costs"])), descend["costs"])
plt.show()
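As a quick sanity check (my addition, not in the notes), the slope learned by gradient descent should be close to the coefficient from the sklearn fit earlier in this block:

# Compare the gradient-descent slope with sklearn's coefficient
result = gradient_descent(pga.distance, pga.accuracy, alpha=0.1)
print(result['theta1'])   # gradient-descent slope
print(theta1)             # sklearn's model.coef_ from above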
>>Neural networks

Pay attention to dimensions in matrix multiplication: a 1-D vector of shape (n,) behaves differently from an n*1 matrix.
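A minimal sketch (my addition) of the shape difference this note warns about:

import numpy as np

v = np.ones(5)        # 1-D vector, shape (5,)
m = np.ones((5, 1))   # column matrix, shape (5, 1)
print(v.shape, m.shape)

theta = np.random.normal(0, 0.01, size=(5, 1))
print(np.dot(theta.T, v).shape)   # (1, 5) dot (5,)   -> shape (1,)
print(np.dot(theta.T, m).shape)   # (1, 5) dot (5, 1) -> shape (1, 1)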

import numpy as np

x0 = [1., 7.4, 2.8, 6.1, 1.9]
# Initialize thetas randomly
theta_init = np.random.normal(0, 0.01, size=(5, 1))    # shape (5, 1)

# First version: wrap x in a list so the product has shape (1, 1)
def sigmoid_activation(x, theta):
    v = np.dot([x], theta)    # (1, 5) * (5, 1)
    v1 = 1 + np.exp(-v)
    return 1 / v1[0]          # take row 0 to get shape (1,)

# Second version (cleaner): use theta.T so the result is already shape (1,)
def sigmoid_activation(x, theta):
    x = np.asarray(x)          # shape (5,)
    theta = np.asarray(theta)  # shape (5, 1)
    return 1 / (1 + np.exp(-np.dot(theta.T, x)))   # (1, 5) * (5,) -> shape (1,)

a1 = sigmoid_activation(x0, theta_init)
Both sigmoid_activation functions above need to return a vector; the second version is written in the more standard way.

Computing the negative gradient (the update used in the code below): for each observation x_i, delta_i = (y_i - h_i) * h_i * (1 - h_i) * x_i, and the gradient is the average of these delta_i over all observations.



A three-layer network: the input layer feeds a hidden layer of sigmoid units (weights theta0), a bias unit is appended to the hidden activations, and these feed a sigmoid output unit (weights theta1), as implemented in feedforward() below.




The cost function of the multi-layer neural network (cross-entropy, as in multiplecost() below): J = -(1/m) * sum( y_i*log(h_i) + (1 - y_i)*log(1 - h_i) ).



# Compute h(x) for the first observation
# (iris is assumed to have been loaded earlier as a pandas DataFrame,
#  with numpy/matplotlib imported as np/plt)
iris["ones"] = np.ones(iris.shape[0])
X = iris[['ones', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values
y = (iris.species == 'Iris-versicolor').values.astype(int)
x0 = X[0]
theta_init = np.random.normal(0, 0.01, size=(5, 1))

def sigmoid_activation(x, theta):
    x = np.asarray(x)
    theta = np.asarray(theta)
    return 1 / (1 + np.exp(-np.dot(theta.T, x)))

a1 = sigmoid_activation(x0, theta_init)

# Compute the cost function J for a single observation
x0 = X[0]
y0 = y[0]
# Initialize parameters: 5 units and just 1 layer
theta_init = np.random.normal(0, 0.01, size=(5, 1))

def singlecost(X, y, theta):
    h = sigmoid_activation(X.T, theta)
    cost = -np.mean(y * np.log(h) + (1-y) * np.log(1-h))
    return cost

first_cost = singlecost(x0, y0, theta_init)

# Compute the gradient
theta_init = np.random.normal(0, 0.01, size=(5, 1))
grads = np.zeros(theta_init.shape)
n = X.shape[0]
for j, obs in enumerate(X):
    h = sigmoid_activation(obs, theta_init)
    delta = (y[j]-h) * h * (1-h) * obs
    grads += delta[:, np.newaxis] / X.shape[0]

# Two-layer network (single sigmoid unit) trained with gradient descent
theta_init = np.random.normal(0, 0.01, size=(5, 1))
learning_rate = 0.1           # learning rate
maxepochs = 10000             # maximum number of iterations for gradient descent
convergence_thres = 0.0001    # costs convergence threshold, i.e. (prevcost - cost) > convergence_thres

def learn(X, y, theta, learning_rate, maxepochs, convergence_thres):
    costs = []
    cost = singlecost(X, y, theta)               # compute initial cost
    costprev = cost + convergence_thres + 0.01   # initialize costprev so the loop runs at least once
    counter = 0
    # Loop until convergence
    for counter in range(maxepochs):
        grads = np.zeros(theta.shape)
        for j, obs in enumerate(X):
            h = sigmoid_activation(obs, theta)          # compute activation
            delta = (y[j]-h) * h * (1-h) * obs          # get delta
            grads += delta[:, np.newaxis] / X.shape[0]  # accumulate
        # Update parameters
        theta += grads * learning_rate
        counter += 1
        costprev = cost                  # store previous cost
        cost = singlecost(X, y, theta)   # compute new cost
        costs.append(cost)
        if np.abs(costprev-cost) < convergence_thres:
            break
    plt.plot(costs)
    plt.title("Convergence of the Cost Function")
    plt.ylabel("J($\Theta$)")
    plt.xlabel("Iteration")
    plt.show()
    return theta

theta = learn(X, y, theta_init, learning_rate, maxepochs, convergence_thres)

# Three-layer neural network: feedforward
theta0_init = np.random.normal(0, 0.01, size=(5, 4))
theta1_init = np.random.normal(0, 0.01, size=(5, 1))

def feedforward(X, theta0, theta1):
    a1 = sigmoid_activation(X.T, theta0).T
    a1 = np.column_stack([np.ones(a1.shape[0]), a1])   # add the bias unit
    out = sigmoid_activation(a1.T, theta1)
    return out

h = feedforward(X, theta0_init, theta1_init)

# Cost function of the multi-layer network
def multiplecost(x, y, theta0, theta1):
    h = feedforward(x, theta0, theta1)
    return -np.mean(y*np.log(h) + (1-y)*np.log(1-h))

c = multiplecost(X, y, theta0_init, theta1_init)
Build a three-layer neural network and make predictions

# Use a class for this model; it's good practice and condenses the code
class NNet3:
    def __init__(self, learning_rate=0.5, maxepochs=1e4, convergence_thres=1e-5, hidden_layer=4):
        self.learning_rate = learning_rate
        self.maxepochs = int(maxepochs)
        self.convergence_thres = convergence_thres
        self.hidden_layer = int(hidden_layer)

    def _multiplecost(self, X, y):
        # feed through the network
        l1, l2 = self._feedforward(X)
        # compute error
        inner = y * np.log(l2) + (1-y) * np.log(1-l2)
        # negative of the average error
        return -np.mean(inner)

    def _feedforward(self, X):
        # feedforward to the first layer
        l1 = sigmoid_activation(X.T, self.theta0).T
        # add a column of ones for the bias term
        l1 = np.column_stack([np.ones(l1.shape[0]), l1])
        # activation units are then fed into the output layer
        l2 = sigmoid_activation(l1.T, self.theta1)
        return l1, l2

    def predict(self, X):
        _, y = self._feedforward(X)
        return y

    def learn(self, X, y):
        nobs, ncols = X.shape
        self.theta0 = np.random.normal(0, 0.01, size=(ncols, self.hidden_layer))
        self.theta1 = np.random.normal(0, 0.01, size=(self.hidden_layer+1, 1))
        self.costs = []
        cost = self._multiplecost(X, y)
        self.costs.append(cost)
        costprev = cost + self.convergence_thres + 1  # initialize costprev so the loop runs at least once
        counter = 0
        # Loop until convergence
        for counter in range(self.maxepochs):
            # feedforward through the network
            l1, l2 = self._feedforward(X)
            # Start backpropagation: compute gradients
            l2_delta = (y-l2) * l2 * (1-l2)
            l1_delta = l2_delta.T.dot(self.theta1.T) * l1 * (1-l1)
            # Update parameters by averaging gradients and multiplying by the learning rate
            self.theta1 += l1.T.dot(l2_delta.T) / nobs * self.learning_rate
            self.theta0 += X.T.dot(l1_delta)[:, 1:] / nobs * self.learning_rate
            # Store costs and check for convergence
            counter += 1
            costprev = cost                   # store previous cost
            cost = self._multiplecost(X, y)   # get the next cost
            self.costs.append(cost)
            if np.abs(costprev-cost) < self.convergence_thres and counter > 500:
                break

learning_rate = 0.5          # learning rate
maxepochs = 10000            # maximum number of iterations for gradient descent
convergence_thres = 0.00001  # costs convergence threshold, i.e. (prevcost - cost) > convergence_thres
hidden_units = 4             # number of hidden units

# Initialize and train the model
model = NNet3(learning_rate=learning_rate, maxepochs=maxepochs,
              convergence_thres=convergence_thres, hidden_layer=hidden_units)
model.learn(X, y)

# Plot costs
plt.plot(model.costs)
plt.title("Convergence of the Cost Function")
plt.ylabel("J($\Theta$)")
plt.xlabel("Iteration")
plt.show()

# Split into training and test sets
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[-30:, :], y[-30:]

# Train and evaluate with AUC
from sklearn.metrics import roc_auc_score
learning_rate = 0.5          # learning rate
maxepochs = 10000            # maximum number of iterations for gradient descent
convergence_thres = 0.00001  # costs convergence threshold
hidden_units = 4             # number of hidden units

model = NNet3(learning_rate=learning_rate, maxepochs=maxepochs,
              convergence_thres=convergence_thres, hidden_layer=hidden_units)
model.learn(X_train, y_train)
yhat = model.predict(X_test)[0]
auc = roc_auc_score(y_test, yhat)
>>Guided Project: Predicting Board Game Reviews

Dataset: here

This guided project predicts the stock market. It mainly covers engineering new features, but the time ordering must be respected: information from the present or the future must not leak into training, or it will distort the model evaluation.
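For example, a minimal sketch of a leakage-free setup (the file and column names are assumptions for illustration, not from the notes): derived features only use past rows, and the split is by date rather than a random shuffle.

import pandas as pd

# Hypothetical stock dataset with Date and Close columns (names assumed)
stocks = pd.read_csv("sphist.csv", parse_dates=["Date"])
stocks = stocks.sort_values("Date")

# Derived features must only use past information, e.g. a rolling mean of the
# previous 5 days, shifted by one row so the current day is excluded
stocks["mean_5"] = stocks["Close"].rolling(5).mean().shift(1)

# Split by date instead of shuffling, so the test set lies strictly in the future
train = stocks[stocks["Date"] < "2013-01-01"]
test = stocks[stocks["Date"] >= "2013-01-01"]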



