import string

import numpy as np
import pandas as pd
import seaborn as sns
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; sklearn.model_selection is its replacement.
from sklearn import linear_model
from sklearn import metrics, cross_validation
from sklearn import model_selection
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
# Read the training data into a pandas DataFrame.
data = pd.read_csv("./dataset.csv", delimiter=';')
# The 'TARGET' column holds the labels (assumed to be the last column).
t = data['TARGET']
# All column names, including the target.
features = data.columns
# Feature matrix: every column except the last one.
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
X = data[features[:-1]].values
# Drop the first column because it is not needed in the analysis.
X = np.delete(X, 0, axis=1)
# Split into training and test sets (20% held out for testing).
X_train, X_test, t_train, t_test = model_selection.train_test_split(
    X, t, test_size=0.2, random_state=0)
# Define the method. The estimator must exist before it is passed to
# cross_val_predict below.
logreg = LogisticRegression()
# Cross-validated prediction on the training set.
predicted = model_selection.cross_val_predict(logreg, X_train, t_train, cv=10)
print(metrics.accuracy_score(t_train, predicted))
# Up to this point everything is fine:
# 20% of the data is kept aside for testing (test_size=0.2), and that test
# data must stay unseen by every step below.
# Define the method.
logreg = LogisticRegression()
# What follows is correct as long as nothing went wrong in the DataFrame
# operations above (which apparently has been solved).
# Cross-validated prediction: this returns the predicted values for 't_train'.
predicted = model_selection.cross_val_predict(logreg, X_train, t_train, cv=10)
# Internal working of cross_val_predict:
# 1. Get the data and estimator (logreg, X_train, t_train).
# 2. From here on, X_train is treated as X_cv and t_train as t_cv
#    (cross_val_predict does not know that this is our training data).
# 3. Split X_cv, t_cv into X_cv_train, X_cv_test, t_cv_train, t_cv_test
#    using its internal cv.
# 4. Use X_cv_train, t_cv_train for fitting 'logreg'.
# 5. Predict on X_cv_test (t_cv_test is not used).
# 6. Repeat steps 3 to 5 for cv=10 iterations, each time using different
#    data for training and different data for testing.
# So comparing 'predicted' with 't_train' here is correct.
print(metrics.accuracy_score(t_train, predicted))
# The metric above shows how the estimator 'logreg' works on the 'X_train'
# data. If the accuracies are very high it may be because of overfitting.
# Now, what to do about the X_test and t_test held out above?
# The preferred data for the final metrics is X_test and t_test.
# If you are satisfied with the accuracies on the training data, fit the
# estimator on the entire training data and then predict on X_test.
logreg.fit(X_train, t_train)
# An estimator is not callable; predictions come from its predict() method.
t_pred = logreg.predict(X_test)
# Here is the final accuracy.
print(metrics.accuracy_score(t_test, t_pred))
# If this accuracy is good, then your model is good.
# Alternatively, use cross_val_score on all of your data (features X,
# labels t).
scores = model_selection.cross_val_score(logreg, X, t, cv=10)
# 'cross_val_score' works almost the same way for steps 1 to 4, then:
# 5. t_cv_pred = logreg.predict(X_cv_test) and calculate the accuracy
#    against t_cv_test.
# 6. Repeat steps 3 to 5 for cv=10 iterations.
# 7. Return the array of accuracies calculated in step 5.
# Average the returned accuracies to see the model performance.
# NOTE: this rebinds 'scores' from the per-fold array to a single float.
scores = scores.mean()
# Logistic regression estimator built with the constructor's default arguments.
logreg = LogisticRegression()
# Alternative configuration: L1 penalty with the liblinear solver.
# This rebinds `logreg`, so the default-configured estimator created on the
# previous line is discarded.
logreg = LogisticRegression(penalty='l1', solver='liblinear')
# Create an example DataFrame of random integers with alphabetic column
# names ('A' .. 'Z').
alphabet_cols = list(string.ascii_uppercase)
df = pd.DataFrame(np.random.randint(1000, size=(1000, 26)),
                  columns=alphabet_cols)
# Use column 'A' as the target and remove it from the features.
df['Target'] = df['A']
df.drop(['A'], axis=1, inplace=True)
y = df.Target.values  # df['Target'] is a Series, not an np.array.
feature_cols = [col for col in df.columns if col != 'Target']
# `.loc` / `.values` replace `.ix` / `as_matrix()`, both removed in pandas 1.0.
X = df.loc[:, feature_cols].values
# Illustrated here for manual splitting of training and testing data.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=0)
# Initialize model.
# NOTE: despite the variable name, this is a LinearRegression (a regressor),
# so the scores below are the regressor's default score (R^2), not accuracy.
logreg = linear_model.LinearRegression()
# Use cross_val_score to automatically split, fit, and score.
scores = model_selection.cross_val_score(logreg, X, y, cv=10)
print('average score: {}'.format(scores.mean()))
B C D E F G H I J K ... Target
0 20 33 451 0 420 657 954 156 200 935 ... 253
1 427 533 801 183 894 822 303 623 455 668 ... 421
2 148 681 339 450 376 482 834 90 82 684 ... 903
3 289 612 472 105 515 845 752 389 532 306 ... 639
4 556 103 132 823 149 974 161 632 153 782 ... 347
[5 rows x 26 columns]
[-0.0367 -0.0874 -0.0094 -0.0469 -0.0279 -0.0694 -0.1002 -0.0399 0.0328 ...]
average score: -0.04258093018969249
逻辑回归对应线性回归,但旨在解决分类问题,即将模型的输出转换为从 0 到 1 之间的概率值。逻辑回归直接对分类的可能性进行建模,无需事先假设数据的分布。 最理想的转换函数为单位阶跃函数(也称Heaviside函数),但单位阶跃函数是不连续的,没法在实际计算中使用。故而,在分类过程中更常使用对数几率函数(即sigmoid函数): $$f(x)=\frac{1}{1+e^{-x}}$$ 这样,模型就变
交叉验证 那么什么时候才需要交叉验证呢?交叉验证用在数据不是很充足的时候。比如在我日常项目里面,对于普通适中问题,如果数据样本量小于一万条,我们就会采用交叉验证来训练优化选择模型。如果样本大于一万条的话,我们一般随机的把数据分成三份,一份为训练集(Training Set),一份为验证集(Validation Set),最后一份为测试集(Test Set)。用训练集来训练模型,用验证集来评估模型预
综述 “子非鱼,焉知鱼之乐” 本文采用编译器:jupyter 逻辑回归方法是从线性回归方法发展过来的,通常解决的是分类问题,读者或许有这样一个疑问:既然是回归算法又怎么解决分类问题的呢? 道理其实很简单,在我们求出线性回归系数a,b之后,对于每一个输入的x值,模型都可以输出对应的y值,如果把输出值y限制在0到1的范围内,那么这个y就非常像一个概率p,我们只用规定概率的不同取值范围对应不同的标记
