import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn import svm, tree
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

df = pd.read_csv('path/to/data.csv', encoding="utf-8", header=0)  # replace with the actual data path
pd.options.display.max_columns = None
#pd.options.display.max_rows = None
df.head(n=5)
percent_missing = df.isnull().sum() * 100 / len(df)
missing_values = pd.DataFrame({'percent_missing': percent_missing})
missing_values.sort_values(by='percent_missing', ascending=False)
sns.set(style="ticks")
f = sns.countplot(x="class", data=df, palette="bwr")
plt.show()
df['class'].value_counts(ascending=False)
df.shape
One-hot encoding is the process of converting categorical variables into a form that machine learning algorithms can exploit more easily.
The problem with label encoding is that it implicitly assumes categories with higher encoded values are "better", which can degrade predictions. After one-hot encoding, the number of features increases.
An example: suppose a "flower" feature can take the values daffodil, lily, and rose. One-hot encoding converts it into three binary features, is_daffodil, is_lily, and is_rose. The three values then map to 100, 010, and 001 respectively, and the single original feature becomes three.
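A minimal sketch of this flower example with pandas (the column name and toy data are invented for illustration):
flowers = pd.DataFrame({'flower': ['daffodil', 'lily', 'rose']})
print(pd.get_dummies(flowers, prefix='is', dtype=int))  # columns: is_daffodil, is_lily, is_rose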
X = df.drop(['class'], axis=1)         # feature columns
Y = df['class']                        # target column
X = pd.get_dummies(X, prefix_sep='_')  # one-hot encode the categorical features
Y = LabelEncoder().fit_transform(Y)    # label-encode the binary target
#np.set_printoptions(threshold=np.inf)
Note: standardize the feature data before training.
On the difference between fit(), transform(), and fit_transform() in preprocessing: fit() learns the transformation parameters from the data (for StandardScaler, the per-column mean and standard deviation), transform() applies those learned parameters, and fit_transform() does both in a single call.
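A minimal sketch of the usual pattern (X_train/X_test are placeholder splits; fitting the scaler on the training split only keeps test statistics from leaking in):
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit() learns mean/std, transform() applies them
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics; no second fit()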
X2 = StandardScaler().fit_transform(X)
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X2, Y, test_size = 0.30, random_state = 101)
start = time.process_time()  # record process CPU time before training
trainedmodel = LogisticRegression().fit(X_Train, Y_Train)
print(time.process_time() - start)  # CPU time spent fitting the model
predictions = trainedmodel.predict(X_Test)
print(confusion_matrix(Y_Test,predictions))
print(classification_report(Y_Test,predictions))
trainedsvm = svm.LinearSVC().fit(X_Train, Y_Train)
predictionsvm = trainedsvm.predict(X_Test)
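The SVM predictions can be scored the same way as the logistic regression above; a minimal sketch:
print(confusion_matrix(Y_Test, predictionsvm))
print(classification_report(Y_Test, predictionsvm))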
trainedtree = tree.DecisionTreeClassifier().fit(X_Train, Y_Train)
predictionstree = trainedtree.predict(X_Test)
Ensemble decision-tree models such as random forests can be used to rank the importance of different features.
How a random forest measures feature importance: every split in every tree reduces node impurity (e.g. Gini impurity), and a feature's importance is the total impurity reduction it contributes, averaged across all trees in the forest, so features used in many strong splits rank highest.
An example:
trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Train,Y_Train)
predictionforest = trainedforest.predict(X_Test)
print(confusion_matrix(Y_Test,predictionforest))
print(classification_report(Y_Test,predictionforest))
figure(num=None, figsize=(20, 22), dpi=80, facecolor='w', edgecolor='k')
feat_importances = pd.Series(trainedforest.feature_importances_, index= X.columns)
feat_importances.nlargest(19).plot(kind='barh')
plt.show()  # display the feature-importance plot
The features at the top of this ranking are the ones the model relied on most to perform the classification. Keeping only those top features and discarding the rest can therefore still produce a model with very respectable accuracy.
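Instead of hard-coding column names, the top-k names can be read straight off the importance ranking; a sketch (k=4 is an arbitrary illustrative choice, and X_Top is an invented name):
top_features = feat_importances.nlargest(4).index.tolist()  # names of the 4 most important features
X_Top = X[top_features]  # reduced feature matrix built from the ranking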
start = time.process_time()
trainedtree = tree.DecisionTreeClassifier().fit(X_Train, Y_Train)
print(time.process_time() - start)
predictionstree = trainedtree.predict(X_Test)
print(confusion_matrix(Y_Test,predictionstree))
print(classification_report(Y_Test,predictionstree))
# Decision tree visualization
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
data = export_graphviz(trainedtree, out_file=None,
                       feature_names=X.columns,
                       class_names=['edible', 'poisonous'],
                       filled=True, rounded=True,
                       max_depth=2,
                       special_characters=True)
graph = graphviz.Source(data)
graph
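If the graph does not render inline, it can be written to disk; a sketch (the output filename is arbitrary; older versions of the graphviz package set graph.format = 'png' instead of passing format=):
graph.render("mushroom_tree", format="png", cleanup=True)  # writes mushroom_tree.png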
Training the model directly on just the few most important features yields results that differ only marginally from the full model.
X_Reduced = X[['odor_n','odor_f', 'gill-size_n','gill-size_b']]
X_Reduced = StandardScaler().fit_transform(X_Reduced)
X_Train2, X_Test2, Y_Train2, Y_Test2 = train_test_split(X_Reduced, Y, test_size = 0.30, random_state = 101)
start = time.process_time()
trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Train2,Y_Train2)
print(time.process_time() - start)
predictionforest = trainedforest.predict(X_Test2)
print(confusion_matrix(Y_Test2,predictionforest))
print(classification_report(Y_Test2,predictionforest))
If the underlying model RFE uses is unstable, RFE itself is unstable; avoid an unregularized logistic regression and prefer a regularized (ridge) model instead. In this problem we use a random forest as the underlying model.
from sklearn.feature_selection import RFE
# Evaluate RFE on a holdout split (see the cross-validated RFECV sketch below)
model = RandomForestClassifier(n_estimators=700)
rfe = RFE(model, n_features_to_select=4)
start = time.process_time()
rfe.fit(X_Train, Y_Train)  # rank the features and refit the estimator on the 4 selected ones
print(time.process_time() - start)
print("Overall Accuracy using RFE: ", rfe.score(X_Test, Y_Test))
# Fit a model on the RFE-reduced data
model = RandomForestClassifier(n_estimators=700)
rfe = RFE(model, n_features_to_select=4)
RFE_X_Train = rfe.fit_transform(X_Train,Y_Train)
model.fit(RFE_X_Train,Y_Train)
print("Number of Features: ", rfe.n_features_)
print("Selected Features: ")
colcheck = pd.Series(rfe.support_, index=list(X.columns))
colcheck[colcheck].index  # names of the selected columns
Numeric_df = pd.DataFrame(X)
Numeric_df['Y'] = Y
corr= Numeric_df.corr()
corr_y = abs(corr["Y"])
highest_corr = corr_y[corr_y >0.5]
highest_corr.sort_values(ascending=True)
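The hand-picked column list below can also be derived programmatically from highest_corr by dropping the target's self-correlation; a sketch:
selected_features = highest_corr.index.drop('Y')
print(list(selected_features))  # should correspond to the columns hard-coded below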
X_Reduced2 = X[['bruises_f', 'bruises_t', 'gill-color_b', 'gill-size_b', 'gill-size_n',
                'ring-type_p', 'stalk-surface-below-ring_k', 'stalk-surface-above-ring_k',
                'odor_f', 'odor_n']]
X_Reduced2 = StandardScaler().fit_transform(X_Reduced2)
X_Train3, X_Test3, Y_Train3, Y_Test3 = train_test_split(X_Reduced2, Y, test_size = 0.30, random_state = 101)
start = time.process_time()
trainedsvm = svm.LinearSVC().fit(X_Train3, Y_Train3)
print(time.process_time() - start)
predictionsvm = trainedsvm.predict(X_Test3)
print(confusion_matrix(Y_Test3,predictionsvm))
print(classification_report(Y_Test3,predictionsvm))
regr = LassoCV(cv=5, random_state=101)
regr.fit(X_Train,Y_Train)
print("LassoCV Best Alpha Scored: ", regr.alpha_)
print("LassoCV Model Accuracy: ", regr.score(X_Test, Y_Test))
model_coef = pd.Series(regr.coef_, index = list(X.columns))
print("Variables Eliminated: ", str(sum(model_coef == 0)))
print("Variables Kept: ", str(sum(model_coef != 0)))
figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='k')
top_coef = model_coef.sort_values()
top_coef[top_coef != 0].plot(kind = "barh")
plt.title("Most Important Features Identified using Lasso (!0)")