I. K-fold cross validation
from sklearn import datasets
from sklearn import ensemble
rf = ensemble.RandomForestRegressor(max_features='auto')
X, y = datasets.make_regression(10000, 10)
from sklearn import cross_validation
scores = cross_validation.cross_val_score(rf, X, y)  # cv defaults to 3 folds, hence three scores
print(scores)
# [ 0.86310151  0.87504517  0.86335846]
scores = cross_validation.cross_val_score(rf, X, y, verbose=3, cv=4)
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.867883 -   0.4s
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.878830 -   0.4s
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.869258 -   0.4s
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.872930 -   0.4s
# [Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.9s finished
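Note that the sklearn.cross_validation module used throughout this post was deprecated in scikit-learn 0.18 and removed in 0.20. As a minimal sketch (assuming scikit-learn >= 0.18), the same 4-fold run uses the replacement sklearn.model_selection module:

# Equivalent run on scikit-learn >= 0.18: cross_val_score moved
# to sklearn.model_selection, and cv selects the number of folds.
from sklearn import datasets, ensemble
from sklearn.model_selection import cross_val_score
rf = ensemble.RandomForestRegressor()
X, y = datasets.make_regression(10000, 10)
print(cross_val_score(rf, X, y, cv=4, verbose=3))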
II. Stratified k-fold
When the class representation in a dataset is unbalanced, ordinary k-fold can produce folds with skewed class proportions. Stratified k-fold is specifically designed to maintain the class proportions in every fold.
1. Create datasets
from sklearn import datasets
X, y = datasets.make_classification(n_samples=int(1e3), weights=[1./11])
y.mean()
# 0.90600000000000003
As the number above shows, 90.6% of the samples are positive and the rest are negative.
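A quick way to double-check the imbalance (a small sketch, not in the original) is to count the labels directly:

import numpy as np
print(np.bincount(y))  # roughly [ 94, 906]: class 1 dominates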
2. Comparing stratified k-fold and ShuffleSplit
from sklearn import cross_validation
n_folds = 50
strat_kfold = cross_validation.StratifiedKFold(y, n_folds=n_folds)
shuff_split = cross_validation.ShuffleSplit(n=len(y), n_iter=n_folds)
kfold_y_props = []
shuff_y_props = []
for (k_train, k_test), (s_train, s_test) in zip(strat_kfold, shuff_split):
    # record the proportion of positive samples in each training split
    kfold_y_props.append(y[k_train].mean())
    shuff_y_props.append(y[s_train].mean())
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(7, 5))
ax.plot(range(n_folds), shuff_y_props, label="ShuffleSplit")
ax.plot(range(n_folds), kfold_y_props, label="Stratified", color='k', ls='--')
ax.set_title("Comparing class proportions")
ax.legend(loc='best')
We can see that the class proportion for stratified k-fold stays stable across folds, while the ShuffleSplit proportions fluctuate from split to split.
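To put a number on that stability, a quick check (assuming the two lists from the loop above are still in scope):

import numpy as np
print("Stratified std:", np.std(kfold_y_props))    # close to 0
print("ShuffleSplit std:", np.std(shuff_y_props))  # noticeably larger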
3. For multiple classes
import numpy as np
three_classes = np.random.choice([1, 2, 3], p=[.1, .4, .5], size=1000)
for train, test in cross_validation.StratifiedKFold(three_classes, 5):
    # each training split keeps roughly the 1:4:5 class ratio
    print(np.bincount(three_classes[train]))
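For contrast, a plain unstratified KFold on the same data (a hedged sketch, not in the original recipe) lets the per-split counts drift from the 1:4:5 population ratio:

for train, test in cross_validation.KFold(len(three_classes), 5):
    # no stratification: counts can drift from the population ratio
    print(np.bincount(three_classes[train]))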
III. Poor man's grid search
1. Choose two parameter dimensions
criteria = {gini, entropy}
max_features = {auto, log2, None}
parameter space = criteria × max_features
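The Cartesian product of those two sets gives 2 × 3 = 6 candidate settings; itertools.product, used in the code below, enumerates exactly these pairs:

import itertools as it
# all 6 (criterion, max_features) pairs
print(list(it.product(['gini', 'entropy'], ['auto', 'log2', None])))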
from sklearn import datasets
X, y = datasets.make_classification(n_samples=2000, n_features=10)
criteria = {'gini', 'entropy'}
max_features = {'auto', 'log2', None}
import itertools as it
parameter_space = it.product(criteria, max_features)
import numpy as np
# random boolean mask: roughly half the samples for training, the rest held out
train_set = np.random.choice([True, False], size=len(y))
from sklearn.tree import DecisionTreeClassifier
accuracies = {}
for criterion, max_feature in parameter_space:
    print(criterion, max_feature)
    dt = DecisionTreeClassifier(criterion=criterion, max_features=max_feature)
    dt.fit(X[train_set], y[train_set])
    # held-out accuracy for this parameter combination
    accuracies[(criterion, max_feature)] = (dt.predict(X[~train_set]) == y[~train_set]).mean()
print(accuracies)
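As a small follow-up (not in the original), the best combination can be read straight off the dictionary:

best_params = max(accuracies, key=accuracies.get)  # key with the highest accuracy
print(best_params, accuracies[best_params])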
2. Visualize the results
from matplotlib import pyplot as plt
from matplotlib import cm
cmap = cm.RdBu_r
f, ax = plt.subplots(figsize=(7, 4))
ax.set_xticklabels([''] + list(criteria))
ax.set_yticklabels([''] + list(max_features))
plot_array = []
for max_feature in max_features:
    m = []
    for criterion in criteria:
        m.append(accuracies[(criterion, max_feature)])
    plot_array.append(m)
colors = ax.matshow(plot_array,
                    vmin=np.min(list(accuracies.values())) - 0.001,
                    vmax=np.max(list(accuracies.values())) + 0.001,
                    cmap=cmap)
f.colorbar(colors)
Note: in Python 3, accuracies.values() returns a dict view rather than a list, so it must be wrapped in list() before being passed to np.min/np.max; otherwise an error is raised.
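If you prefer numbers to colors, the same dictionary also prints as a small text table (a sketch, assuming the variables from the code above are still in scope):

for max_feature in max_features:
    row = [accuracies[(criterion, max_feature)] for criterion in criteria]
    print(max_feature, row)  # one row per max_features value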
IV. Brute force grid search
1. Create some classification data
from sklearn.datasets import make_classification
X, y = make_classification(1000, n_features=5)
2. Create a logistic regression object
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight='auto')  # 'auto' was later renamed 'balanced' in newer scikit-learn
3. Grid Search
grid_search_params = {'penalty': ['l1', 'l2'],
                      'C': [1, 2, 3, 4]}
import scipy.stats as st
# for randomized search, C is sampled from a distribution instead of a fixed list;
# st.randint(1, 4) draws uniformly from {1, 2, 3} (upper bound exclusive)
random_search_params = {'penalty': ['l1', 'l2'],
                        'C': st.randint(1, 4)}
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
gs = GridSearchCV(lr, grid_search_params)
gs.fit(X, y)
rs = RandomizedSearchCV(lr, random_search_params)
rs.fit(X, y)
gs.grid_scores_
rs.grid_scores_
# each entry of grid_scores_ holds (parameters, mean validation score, fold scores),
# so taking the max by the mean score picks the best setting
max(gs.grid_scores_, key=lambda x: x[1])
max(rs.grid_scores_, key=lambda x: x[1])
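Both searchers also expose the winner directly through best_params_ and best_score_, so the max() trick above is optional. (Like sklearn.cross_validation, the sklearn.grid_search module moved to sklearn.model_selection in scikit-learn 0.18.)

print(gs.best_params_, gs.best_score_)
print(rs.best_params_, rs.best_score_)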