Python之《机器学习系统设计》第五章

来源:互联网 发布:游戏交易网站源码 编辑:程序博客网 时间:2024/06/08 14:32

第五章:分类,检测劣质答案

对数据进行解析

# Parse the data
def fetch_posts():
    """Yield (post_id, text) pairs read from the tab-separated file data.tsv.

    Each line is expected to hold exactly two tab-separated fields: the post
    id (converted to int) and the post text (yielded with surrounding
    whitespace stripped). A line with a different number of fields raises
    ValueError from the tuple unpacking.
    """
    # The with-block closes the file once iteration finishes or the
    # generator is closed, instead of leaving the handle to the garbage
    # collector.
    with open("data.tsv", "r") as posts_file:
        for line in posts_file:
            post_id, text = line.split("\t")
            yield int(post_id), text.strip()

用KNN分类器进行分类

# KNN classifier
import numpy as np
from sklearn import neighbors

# Toy one-feature training set: samples 1-3 are labelled 0, samples 4-6 are
# labelled 1.
knn = neighbors.KNeighborsClassifier(n_neighbors=2)
print(knn)
knn.fit([[1], [2], [3], [4], [5], [6]], [0, 0, 0, 1, 1, 1])

# scikit-learn expects query data as a 2-D array of shape
# (n_samples, n_features), so every single query value is wrapped as [[value]]
# rather than passed as a bare scalar.
print(knn.predict([[1.5]]))
print(knn.predict([[3]]))
print(knn.predict([[12]]))
print(knn.predict_proba([[1.5]]))

结果为

[0]
[0]
[1]
[[ 1.  0.]]


将特征数组 X 和标签 Y 传入KNN分类器

# Collect one feature row per post whose id appears in all_answers, turn the
# rows into an array, and fit a default KNN classifier on it with labels Y.
feature_rows = [
    extract_features_from_body(text)
    for post_id, text in fetch_posts()
    if post_id in all_answers
]
X = np.asarray(feature_rows)

knn = neighbors.KNeighborsClassifier()
knn.fit(X, Y)

交叉验证

# Cross-validation
from sklearn.model_selection import KFold

scores = []
# The old sklearn.cross_validation module and the KFold(n=, k=, indices=)
# signature are no longer available in scikit-learn; model_selection.KFold
# takes n_splits and produces the train/test index arrays through split().
cv = KFold(n_splits=10)
for train, test in cv.split(X):
    x_train, y_train = X[train], Y[train]
    x_test, y_test = X[test], Y[test]
    clf = neighbors.KNeighborsClassifier()
    # The book has clf.fit(X, Y) here, which looks like a mistake: fitting on
    # the full data would include the fold that is scored just below.
    clf.fit(x_train, y_train)
    scores.append(clf.score(x_test, y_test))

print("Mean scores = %.5f\tStddev scores = %.5f" % (np.mean(scores), np.std(scores)))

逻辑回归

# Logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
print(clf)
clf.fit(X, y)
# Exponentiated intercept and coefficients of the fitted model.
print(np.exp(clf.intercept_), np.exp(clf.coef_.ravel()))


def lr_model(clf, X):
    """Evaluate the logistic function of clf's fitted linear term at X.

    Computes 1 / (1 + exp(-(intercept_ + coef_ * X))) using the intercept_
    and coef_ attributes of the fitted classifier.
    """
    linear_term = clf.intercept_ + clf.coef_ * X
    return 1 / (1 + np.exp(-linear_term))


print("P(x=-1)=%.2f\tp(x=7)=%.2f" % (lr_model(clf, -1), lr_model(clf, 7)))

准确率和召回率

thresh80 = thresholds[idx80][0]
probs_for_good = clf.predict_proba(answer_features)[:, 1]
answer_class = probs_for_good > thresh80

# classification_report can be used to confirm that the expected precision
# and recall were obtained.
from sklearn.metrics import classification_report

# predict_proba is a method, so it has to be called on the test features
# before column 1 can be sliced; subscripting the method itself raises
# TypeError.
# NOTE(review): x_test is assumed to be the feature matrix that matches
# y_test -- confirm against the code that produced the train/test split.
print(classification_report(
    y_test,
    clf.predict_proba(x_test)[:, 1] > 0.63,
    target_names=['not accepted', 'accepted'],
))

训练好的分类器可以进行保存

import pickle

# Pickle output is binary data, so the file is opened in "wb" mode; the
# with-block flushes and closes it as soon as the dump has finished.
with open("logreg.dat", "wb") as model_file:
    pickle.dump(clf, model_file)

需要使用的时候再使用即可

# Read the pickled classifier back in binary mode ("rb"); the with-block
# closes the file after loading.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("logreg.dat", "rb") as model_file:
    clf = pickle.load(model_file)







0 0
原创粉丝点击