import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# The competition metric is logloss, but LinearSVC does not support probabilities,
# so in this example we use accuracy_score as the metric for model selection.
# To convert LinearSVC outputs into probabilities, use the probability calibration tool CalibratedClassifierCV;
# SVC supports probability output through a similar mechanism.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
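As the comments above note, LinearSVC lacks predict_proba; a minimal calibration sketch (the method and cv values are illustrative assumptions, not settings used in this experiment):
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
# Wrap LinearSVC so predict_proba becomes available (Platt/sigmoid scaling with
# 3-fold internal cross-validation; both choices are illustrative assumptions)
calibrated_svc = CalibratedClassifierCV(LinearSVC(), method='sigmoid', cv=3)
# calibrated_svc.fit(X_train, y_train)
# proba = calibrated_svc.predict_proba(X_val)  # per-class probabilities for logloss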
# Read the data
dpath = './data/'
# Use the original features + tf-idf features
# Linear SVM still trains quickly on original + tf-idf features; the RBF kernel is already unbearably slow
# The RBF kernel uses only the tf-idf features
train1=pd.read_csv(dpath+"Otto_FE_train_org.csv")
train2=pd.read_csv(dpath+"Otto_FE_train_tfidf.csv")
# Drop the redundant id (and the duplicated target) from the tf-idf frame
train2 = train2.drop(['id', 'target'], axis=1)
train=pd.concat([train1,train2],axis=1,ignore_index=False)
train.head()
del train1
del train2
# Convert the class label strings to numbers
# drop ids and get labels
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)
# Save the feature names for later use (visualization)
feat_names = X_train.columns
# Most sklearn estimators accept sparse input, which makes training much faster
from scipy.sparse import csr_matrix
X_train=csr_matrix(X_train)
# With 60k+ training samples, cross validation is too slow; use train_test_split to estimate model performance
# SVM does not scale well to large datasets
from sklearn.model_selection import train_test_split
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train,
                                                            train_size=0.8, random_state=0)
print(X_train_part.shape)
Output:
from sklearn.svm import LinearSVC
# LinearSVC cannot produce per-class probabilities (it has predict but no predict_proba),
# while the Otto competition requires per-class probabilities
# 1. Create the estimator
SVC1=LinearSVC()
# 2. Train the model
SVC1.fit(X_train_part,y_train_part)
# 3. Test on the validation set to estimate model performance
y_predict=SVC1.predict(X_val)
print('accuracy is :',accuracy_score(y_val,y_predict))
print("Classification report for classifier %s:\n%s\n"%(SVC1,
classification_report(y_val,y_predict)))
print('Confusion matrix:\n%s'%confusion_matrix(y_val,y_predict))
Output:
accuracy is: 0.7643018745959922
Confusion matrix:
[[ 129 16 3 0 1 41 10 77 93]
[ 3 2762 361 10 12 12 23 14 8]
[ 1 943 535 8 2 2 40 7 8]
[ 0 343 88 86 5 25 15 3 1]
[ 0 17 1 0 519 1 0 3 1]
[ 22 32 5 6 1 2609 46 54 48]
[ 16 71 35 1 3 42 357 41 6]
[ 19 24 7 0 4 42 17 1569 21]
[ 27 20 3 2 2 37 10 55 893]]
Linear SVM classification performance with original + tf-idf features: accuracy is about 0.764.
class_1, class_3 and class_4 are classified poorly.
Is that because these classes have relatively few samples? We can try class_weight='balanced' later, as in the sketch below.
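A minimal sketch of that idea on the same split (assumed here; this variant was not run in the original experiment):
# class_weight='balanced' reweights classes inversely to their frequencies,
# so rare classes such as class_1/class_3/class_4 count more in the hinge loss
SVC_bal = LinearSVC(class_weight='balanced')
SVC_bal.fit(X_train_part, y_train_part)
print('accuracy with balanced class weights:', SVC_bal.score(X_val, y_val))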
The hyperparameters of the linear SVM LinearSVC that need tuning are C (the regularization coefficient; candidate values are usually spaced uniformly in the log domain) and the penalty function penalty (L2/L1).
The cross-validated grid-search procedure is the same as for the Logistic regression regularization parameters, so it is omitted here; a rough sketch follows.
Here we use the validation set (X_val, y_val) to estimate model performance.
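For reference, the omitted grid search would look roughly like this (GridSearchCV was imported at the top; the fold count and scoring choice are illustrative assumptions):
# Illustrative 3-fold grid search over C only; too slow on this dataset, hence skipped
param_grid = {'C': np.logspace(-1, 3, 5)}
grid = GridSearchCV(LinearSVC(), param_grid, scoring='accuracy', cv=3)
# grid.fit(X_train, y_train); print(grid.best_params_)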
# For a single hyperparameter setting: train on the training part, test on the validation set
def fit_grid_point_Linear(C, X_train, y_train, X_val, y_val):
    # Train a LinearSVC on the training part
    SVC2 = LinearSVC(C=C)
    SVC2 = SVC2.fit(X_train, y_train)
    # Return the accuracy on the validation set
    accuracy = SVC2.score(X_val, y_val)
    print("C={}: accuracy={}".format(C, accuracy))
    return accuracy
# Hyperparameters to tune
# SVM is slow, so tune one parameter at a time (only C here; penalty stays 'l2')
C_s = np.logspace(-1, 3, 5)  # logspace(a, b, N) splits the interval from 10^a to 10^b into N points
# penalty_s = ['l1', 'l2']
accuracy_s = []
for i, oneC in enumerate(C_s):
    tmp = fit_grid_point_Linear(oneC, X_train_part, y_train_part, X_val, y_val)
    accuracy_s.append(tmp)
X_axis = np.log10(C_s)
# for j, penalty in enumerate(penalty_s):
plt.plot(X_axis, np.array(accuracy_s), 'b-', label='validation accuracy')
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')
Output: a plot of validation accuracy against log(C).
The best hyperparameter is then:
# Best hyperparameter
index=np.argmax(accuracy_s,axis=None)
Best_C=C_s[index]
print(Best_C)
Output:
# Retrain LinearSVC on the full training set with the best hyperparameter
# (wrap it in CalibratedClassifierCV if probability output is needed)
Best_C = 100
SVC3 = LinearSVC(C=Best_C)
SVC3.fit(X_train, y_train)
# Save the model for later testing
import pickle
pickle.dump(SVC3, open("Otto_LinearSVC.pkl", "wb"))
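Reloading the saved model later is a one-liner; a sketch (only the file name comes from the text, X_test is an assumption):
import pickle
# Load the pickled LinearSVC back for scoring on the test set
SVC_loaded = pickle.load(open("Otto_LinearSVC.pkl", "rb"))
# y_test_pred = SVC_loaded.predict(X_test)  # X_test must use the same feature columns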
dpath = './data/'
# Linear SVM still trains quickly on original + tf-idf features; the RBF kernel is already unbearably slow
# The RBF kernel uses only the tf-idf features
train=pd.read_csv(dpath+"Otto_FE_train_tfidf.csv")
print(train.head())
# Convert the class label strings to numbers
# drop ids and get labels
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)
# Save the feature names for later use (visualization)
feat_names = X_train.columns
# Most sklearn estimators accept sparse input, which makes training much faster
from scipy.sparse import csr_matrix
X_train=csr_matrix(X_train)
# With 60k+ training samples, cross validation is too slow; use train_test_split to estimate model performance
# SVM does not scale well to large datasets
from sklearn.model_selection import train_test_split
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train,
                                                            train_size=0.8, random_state=0)
print(X_train_part.shape)
The RBF kernel is the most commonly used SVM kernel. An RBF-kernel SVM has two hyperparameters to tune: the regularization coefficient C and the kernel width gamma. The smaller C is, the smoother the decision boundary; likewise, the smaller gamma is, the smoother the decision boundary (the RBF kernel is K(x, x') = exp(-gamma*||x - x'||^2), so a small gamma gives each training sample a wide radius of influence).
The cross-validated grid-search procedure is the same as for the Logistic regression regularization parameters, so it is omitted here.
Here we use the validation set (X_val, y_val) to estimate model performance.
from sklearn.svm import SVC
# For a single (C, gamma) pair: train on the training part, test on the validation set
def fit_grid_point_RBF(C, gamma, X_train, y_train, X_val, y_val):
    SVC3 = SVC(C=C, kernel='rbf', gamma=gamma)
    SVC3 = SVC3.fit(X_train, y_train)
    # Return the accuracy on the validation set
    accuracy = SVC3.score(X_val, y_val)
    print("C={} and gamma={}: accuracy={}".format(C, gamma, accuracy))
    return accuracy
# Candidate hyperparameters: evaluate every (C, gamma) pair by hand
C_s = np.logspace(-1, 3, 5)
gamma_s = np.logspace(-1, 1, 3)
accuracy_s = np.zeros((len(C_s), len(gamma_s)))
for i, oneC in enumerate(C_s):
    for j, gamma in enumerate(gamma_s):
        accuracy_s[i, j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part,
                                              X_val, y_val)
Output:
These results show that gamma matters a great deal (performance is very poor at gamma=0.1 and gamma=10) and that the nonlinear model beats the linear one (note that only the tf-idf features are used here).
But it is slower by far more than a little (sklearn suggests keeping kernel SVMs under roughly 10,000 samples).
One option is to split the training data into several subsets, train one RBF-kernel SVM per subset, and fuse the models at the end, as sketched below.
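One way to realize that subset idea is bagging; the sketch below uses sklearn's BaggingClassifier, a suggestion of mine rather than the method of the original experiment (C and gamma are the best values found below):
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
# 10 RBF-kernel SVMs, each fit on a random 10% subset of the samples;
# their predicted probabilities are averaged at prediction time
bagged_svc = BaggingClassifier(SVC(C=100, kernel='rbf', gamma=1.0, probability=True),
                               n_estimators=10, max_samples=0.1, n_jobs=-1)
# bagged_svc.fit(X_train, y_train)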
# Reload the saved grid-search results for plotting
Otto_SVM_result = pd.read_csv("Otto_SVM_result.csv")
accuracy_s1 = Otto_SVM_result['accuracy']
C_s = np.logspace(-1, 3, 5)
gamma_s = np.logspace(-1, 1, 3)
accuracy_s1 = np.array(accuracy_s1).reshape(len(C_s), len(gamma_s))
x_axis=np.log10(C_s)
for j, gamma in enumerate(gamma_s):
    plt.plot(x_axis, np.array(accuracy_s1[:, j]),
             label='Test-log(gamma)=' + str(np.log10(gamma)))
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')
plt.savefig('RBF_SVM_Otto.png')
plt.show()
index = np.unravel_index(np.argmax(accuracy_s1, axis=None), accuracy_s1.shape)
Best_C=C_s[index[0]]
Best_gamma=gamma_s[index[1]]
print(Best_C)
print(Best_gamma)
Output:
# Retrain SVC with the best hyperparameters; SVC supports probability output
Best_C = 100
Best_gamma = 1.0
SVC4 = SVC(C=Best_C, kernel='rbf', gamma=Best_gamma, probability=True)
SVC4.fit(X_train, y_train)
# Save the model for later testing
import pickle
pickle.dump(SVC4, open("Otto_RBF_SVC.pkl", 'wb'))
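A minimal sketch of using the saved model to produce the per-class probabilities the competition scores with logloss (X_test is an assumption; it is not built in this section):
import pickle
# Reload the RBF-kernel SVC; probability=True above makes predict_proba available
rbf_svc = pickle.load(open("Otto_RBF_SVC.pkl", "rb"))
# proba = rbf_svc.predict_proba(X_test)  # shape: (n_samples, 9 classes)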