由于刚刚接触编程深深体会到一行一行理解代码的痛苦,现在在尝试去复刻一些很基础的东西,因此里面一些比较复杂的我都详细注释了,希望能够帮助到和我一样正在痛苦中的人,也欢迎大家和我一起交流学习!
以下代码中用到的模型有随机森林回归、线性回归、随机森林分类、决策树分类和 K-means 聚类(严格来说,前两个是回归模型,K-means 是聚类算法,并不都是分类器)。其中随机森林分类和决策树的效果最好。
如何将数据中的字符串 “nan”(注意它不是真正的缺失值 NaN)替换为所在列的均值?
这一部分在代码中有体现;如果数据里是真正的 NaN,可以直接调用 fillna。
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn import tree
# DATA PREPROCESSING
# Feature columns used as model inputs.
names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope",
         "ca", "thal"]
# Parse "?" as a real missing value (NaN) while reading. Keeping it as the
# string "nan" would leave the affected columns object-typed and would force
# string comparisons everywhere downstream.
dataset = pd.read_csv("data.csv", na_values=["?"])
# print(dataset.isnull().sum())  # how many NaN values are in each column
# Drop rows in which every field is missing, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column).
dataset.dropna(axis=0, how='all', inplace=True)
dataset.reset_index(drop=True, inplace=True)
# Mean imputation: fill each feature's missing entries with the mean of that
# column's observed values. DataFrame.mean() skips NaN, so missing entries do
# not pull the mean towards zero, and no integer cast truncates decimals.
# NOTE: the means are computed over all rows of the loaded file.
column_means = dataset[names].mean()
dataset[names] = dataset[names].fillna(column_means)
# Optional (disabled): collapse labels 1-4 into the single positive label 1.
# dataset['num'].replace(to_replace=[1, 2, 3, 4], value=1, inplace=True)
target_column = dataset.iloc[:, -1]  # the label is the last column
a = (target_column == 1).sum()  # number of rows labelled 1
b = (target_column == 0).sum()  # number of rows labelled 0
# Pair each count with its own label. The original plotted the label-1 count
# under 'No diesease' and the label-0 count under 'Disease'.
# NOTE(review): assumes 1 means disease and 0 means no disease, as the
# disabled relabelling above suggests -- confirm against the dataset docs.
plot2 = pd.DataFrame({'Target': ['No diesease', 'Disease'], 'Count': [b, a]})
ax = plot2.plot.bar(x='Target', y='Count')  # bar chart of the class balance
ax.set_xlabel("Target", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_title("People with heart disease")
# Independent variables: the thirteen feature columns listed in `names`.
X = dataset.loc[:, names]
# Dependent variable: the last column, kept as a one-column DataFrame.
Y = dataset.iloc[:, [-1]]
# RANDOM FOREST REGRESSION MODEL
# Hold out 30% of the rows for testing; fixed seeds keep the run reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=60)
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# Flatten the one-column target to 1-D, the shape the estimator expects.
rf.fit(x_train, np.ravel(y_train))
predictions = rf.predict(x_test)
# The score is deterministic for fixed predictions, so compute it once.
# The original added it 99 times and divided by 100, reporting 0.99x the
# real value.
r2 = r2_score(y_test, predictions)
print("R2 score for Random Forest regression",r2*100)
# 1) LINEAR REGRESSION
# Same 70/30 split and seed as the other models so results are comparable.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y, test_size=0.3, random_state=60)
lr = LinearRegression()
lr.fit(X_train2, Y_train2)
predicted2 = lr.predict(X_test2)
# Compute R2 once. The original summed the identical score 99 times and
# divided by 100, under-reporting it by 1%.
r1 = r2_score(Y_test2, predicted2)
print("R2 score for Linear regression",r1*100)
# 2) RANDOM FOREST CLASSIFIER
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y, test_size=0.3, random_state=60)
model = RandomForestClassifier()
# Flatten the one-column target to 1-D, the shape the estimator expects.
model.fit(X_train1, np.ravel(Y_train1))
predicted1 = model.predict(X_test1)
# Both accuracies are fixed once the model is fitted, so compute each once.
# The original accumulated them 99 times and divided by 100, scaling both
# accuracies by 0.99 and inflating the reported errors.
test_accuracy1 = accuracy_score(np.ravel(Y_test1), predicted1)
r3 = model.score(X_train1, Y_train1)  # training accuracy
print("Training Accuracy for random forest",r3*100)
print("Testing Accuracy for random forest",test_accuracy1*100)
print ("Training Error for random forest", (1 - r3)*100)
print ("Testing Error for random forest", (1-test_accuracy1)*100)
# DECISION TREE CLASSIFIER
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X, Y, test_size=0.3, random_state=60)
clf = tree.DecisionTreeClassifier()
clf.fit(X_train3, Y_train3)
predictions3 = clf.predict(X_test3)
# Compute each accuracy once; the original summed 99 identical values and
# divided by 100.
test_accuracy3 = accuracy_score(Y_test3, predictions3)
# Score the decision tree itself. The original called `model.score`, which
# reported the random forest's training accuracy under the decision-tree
# heading.
score1 = clf.score(X_train3, Y_train3)
print("\nTraining Accuracy for Descision Tree:",score1*100)
print("Testing Accuracy for Descision Tree:",test_accuracy3*100)
print ("Training Error for Descision Tree", (1 - score1)*100)
print ("Testing Error for Descision Tree", (1-test_accuracy3)*100)
# 3) K-MEANS USED AS A CLASSIFIER
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=60)
train_labels = np.ravel(Y_train)
# Use one cluster per class. With n_clusters=1 every sample is assigned to
# cluster 0, so the reported "accuracy" was just the share of label-0 rows.
k_means = KMeans(n_clusters=len(np.unique(train_labels)), random_state=0)
k_means.fit(X_train)  # clustering is unsupervised; the labels are not used here
# Cluster ids are arbitrary, so map each cluster to the most frequent training
# label among its members before comparing with the true labels.
label_values, label_counts = np.unique(train_labels, return_counts=True)
overall_majority = label_values[np.argmax(label_counts)]
cluster_to_class = {}
for cluster_id in range(k_means.n_clusters):
    members = train_labels[k_means.labels_ == cluster_id]
    if members.size == 0:
        # No training row landed in this cluster: fall back to the overall majority.
        cluster_to_class[cluster_id] = overall_majority
        continue
    member_values, member_counts = np.unique(members, return_counts=True)
    cluster_to_class[cluster_id] = member_values[np.argmax(member_counts)]
predicted = np.array([cluster_to_class[c] for c in k_means.predict(X_test)])
test_accuracy = accuracy_score(np.ravel(Y_test), predicted)
print("\nTesting Accuracy for Kmean",test_accuracy*100)