Learning Scikit-learn: Machine Learning in Python

怀浩大
2023-12-01


Chapter 4: Advanced Features - Feature Engineering and Selection

%pylab inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

titanic = pd.read_csv('data/titanic.csv')
print titanic

print titanic.head()[['pclass', 'survived', 'age', 'embarked', 'boat', 'sex']]  # note the syntax: the column list goes inside [[...]] after head()

titanic.describe()

from sklearn import feature_extraction

def one_hot_dataframe(data, cols, replace=False):  # one-hot feature extraction
    vec = feature_extraction.DictVectorizer()
    # with pandas apply over data[cols] (a DataFrame) and axis=1, each row
    # arrives as a Series; the lambda turns it into a {column: value} dict
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)  # join aligns on the index, appending the encoded columns
    return (data, vecData)
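
Under the hood, DictVectorizer turns each distinct string value into its own binary column, while numeric values pass through unchanged. A toy illustration (an addition, not part of the original notebook):

from sklearn.feature_extraction import DictVectorizer
demo_vec = DictVectorizer()
print demo_vec.fit_transform([{'sex': 'female'}, {'sex': 'male'}]).toarray()
# [[ 1.  0.]
#  [ 0.  1.]]
print demo_vec.get_feature_names()  # ['sex=female', 'sex=male']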

titanic, titanic_n = one_hot_dataframe(titanic, ['pclass', 'embarked', 'sex'], replace=True)


print titanic_n.head(5)
print titanic_n[titanic_n['embarked'] != 0].head()  # rows where the original 'embarked' value was missing (NaN != 0)


print titanic.head()
titanic, titanic_n = one_hot_dataframe(titanic, ['home.dest', 'room', 'ticket', 'boat'], replace=True)

print titanic['age'].describe()
mean = titanic['age'].mean()
titanic['age'].fillna(mean, inplace=True)
print titanic['age'].describe()

titanic.fillna(0, inplace=True)
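
A quick sanity check (an addition, not in the original) that the two fill steps left no missing values behind:

print titanic.isnull().any().any()  # should print False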

from sklearn.cross_validation import train_test_split
titanic_target = titanic['survived']
titanic_data = titanic.drop(['name', 'row.names', 'survived'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(titanic_data, titanic_target, test_size=0.25, random_state=33)


from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)

import pydot, StringIO
dot_data = StringIO.StringIO()
tree.export_graphviz(dt, out_file=dot_data, feature_names=titanic_data.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('titanic.png')
from IPython.core.display import Image
Image(filename='titanic.png')



from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)   # keep the top 20% of features ranked by chi2 score
X_train_fs = fs.fit_transform(X_train, y_train)
print titanic_data.columns[fs.get_support()]
print fs.scores_[2]
print titanic_data.columns[2]
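
measure_performance is not defined in this chapter; it is the evaluation helper carried over from the book's earlier chapters. A minimal sketch with the signature the calls below assume:

from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True,
                        show_classification_report=True,
                        show_confussion_matrix=True):
    # predict on the given set and print the requested metrics
    y_pred = clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n"
    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y, y_pred), "\n"
    if show_confussion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y, y_pred), "\n"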

dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)




from sklearn import cross_validation

percentiles = range(1, 100, 5)   # plot accuracy as a function of the percentage of features kept
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())

optimal_percentil = np.argmax(results)
print "Optimal percentage of features:{0}".format(percentiles[optimal_percentil]), "\n"

# Plot percentage of features selected vs. cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel("Percentage of features selected")
pl.ylabel("Cross-validation accuracy")
pl.plot(percentiles, results)
print "Mean scores:", results

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=percentiles[optimal_percentil])
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)

Model selection: choosing the tree criterion

dt = tree.DecisionTreeClassifier(criterion='entropy')
scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
print "Entropy criterion accuracy on cv: {0:.3f}".format(scores.mean())
dt = tree.DecisionTreeClassifier(criterion='gini')
scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
print "Gini criterion accuracy on cv: {0:.3f}".format(scores.mean())

dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)



Model Selection

from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

n_samples = 3000

X = news.data[:n_samples]
y = news.target[:n_samples]

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def get_stop_words():
    result = set()
    with open('stopwords_en.txt', 'r') as f:
        for line in f:
            result.add(line.strip())
    return result

stop_words = get_stop_words()

clf = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('nb', MultinomialNB(alpha=0.01)),
])
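
Pipeline exposes each step's parameters under the name <step>__<parameter>, which is how the tuning code below addresses them (e.g. 'nb__alpha'). For example, with the pipeline just defined:

print clf.get_params()['nb__alpha']  # 0.01, the value set above
clf.set_params(nb__alpha=0.01)       # the same convention GridSearchCV relies on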

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross-validation iterator with K folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))
		
		
evaluate_cross_validation(clf, X, y, 3)

def calc_params(X, y, clf, param_values, param_name, K):
    # initialize training and testing scores with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print param_name, ' = ', param_value
        
        # set classifier parameters
        clf.set_params(**{param_name: param_value})  # ** unpacks the dict into a keyword argument
        
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        
        # create KFold cross validation
        cv = KFold(n_samples, K, shuffle=True, random_state=0)
        
        # iterate over the K folds
        for j, (train, test) in enumerate(cv):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit([X[k] for k in train], y[train])
            k_train_scores[j] = clf.score([X[k] for k in train], y[train])
            k_test_scores[j] = clf.score([X[k] for k in test], y[test])
            
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
       
    # plot the training and testing scores in a log scale
    plt.semilogx(param_values, train_scores, alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, alpha=0.4, lw=2, c='g')
    
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores on each parameter value
    return train_scores, test_scores


alphas = np.logspace(-7, 0, 8)
print alphas

train_scores, test_scores = calc_params(X, y, clf, alphas, 'nb__alpha', 3)
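
From the returned arrays, the best-scoring alpha can be read off directly, for instance:

print "Best alpha: {0}".format(alphas[np.argmax(test_scores)])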

from sklearn.svm import SVC

clf = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('svc', SVC()),
])


from sklearn.grid_search import GridSearchCV

parameters = {
    'svc__gamma': np.logspace(-2, 1, 4),
    'svc__C': np.logspace(-1, 1, 3),
}


gs = GridSearchCV(clf, parameters, verbose=2, refit=False, cv=3)
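
Constructing the GridSearchCV object does not run the search; it still has to be fitted. A sketch of the remaining steps (with refit=False the best combination is reported, but no final model is refitted on the full set):

gs.fit(X, y)
print gs.best_params_, gs.best_score_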

Parallelizing

from sklearn.externals import joblib
from sklearn.cross_validation import ShuffleSplit
import os

def persist_cv_splits(X, y, K=3, name='data', suffix="_cv_%03d.pkl"):
    """Dump K folds to filesystem."""
    
    cv_split_filenames = []
    
    # create KFold cross validation
    cv = KFold(n_samples, K, shuffle=True, random_state=0)
    
    # iterate over the K folds
    for i, (train, test) in enumerate(cv):
        cv_fold = ([X[k] for k in train], y[train], [X[k] for k in test], y[test])
        cv_split_filename = name + suffix % i
        cv_split_filename = os.path.abspath(cv_split_filename)
        joblib.dump(cv_fold, cv_split_filename)
        cv_split_filenames.append(cv_split_filename)
    
    return cv_split_filenames
	
def compute_evaluation(cv_split_filename, clf, params):
    
    # All module imports should be executed in the worker namespace
    from sklearn.externals import joblib

    # load the fold training and testing partitions from the filesystem
    X_train, y_train, X_test, y_test = joblib.load(
        cv_split_filename, mmap_mode='c')
    
    clf.set_params(**params)
    clf.fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    return test_score
	
	
from sklearn.grid_search import IterGrid

def parallel_grid_search(lb_view, clf, cv_split_filenames, param_grid):
    
    all_tasks = []
    all_parameters = list(IterGrid(param_grid))
    
    # iterate over parameter combinations
    for i, params in enumerate(all_parameters):
        task_for_params = []
        
        # iterate over the K folds
        for j, cv_split_filename in enumerate(cv_split_filenames):    
            t = lb_view.apply(
                compute_evaluation, cv_split_filename, clf, params)
            task_for_params.append(t) 
        
        all_tasks.append(task_for_params)
        
    return all_parameters, all_tasks
	
from sklearn.svm import SVC
from IPython.parallel import Client

client = Client()
lb_view = client.load_balanced_view()
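
The cv_filenames passed below must point at folds already dumped to disk, so the persist_cv_splits helper above is presumably run first; for example (the 'news' prefix is an arbitrary choice):

cv_filenames = persist_cv_splits(X, y, name='news')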

all_parameters, all_tasks = parallel_grid_search(
    lb_view, clf, cv_filenames, parameters)
def print_progress(tasks):
    progress = np.mean([task.ready() for task_group in tasks
                                 for task in task_group])
    print "Tasks completed: {0}%".format(100 * progress)
	
	
def find_bests(all_parameters, all_tasks, n_top=5):
    """Compute the mean score of the completed tasks"""
    mean_scores = []
    
    for param, task_group in zip(all_parameters, all_tasks):
        scores = [t.get() for t in task_group if t.ready()]
        if len(scores) == 0:
            continue
        mean_scores.append((np.mean(scores), param))
                   
    return sorted(mean_scores, reverse=True)[:n_top]
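
Since lb_view.apply returns immediately, the tasks run asynchronously; while the cluster works, the two helpers above can be polled, for example:

print_progress(all_tasks)
print find_bests(all_parameters, all_tasks)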

