Skip to content

Commit f1692db

Browse files
committed
Merge remote branch 'origin'
Conflicts: scikits/learn/datasets/lfw.py scikits/learn/datasets/tests/test_lfw.py
2 parents aaeadfb + 3057073 commit f1692db

File tree

3 files changed

+112
-76
lines changed

3 files changed

+112
-76
lines changed

scikits/learn/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ def _get_params(self, deep=True):
145145
"""
146146
out = dict()
147147
for key in self._get_param_names():
148+
if not hasattr(self, key):
149+
continue
148150
value = getattr (self, key)
149151
if deep and hasattr (value, '_get_params'):
150152
deep_items = value._get_params().items()

scikits/learn/datasets/tests/test_lfw.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ def setup_module():
6969
for i in range(n_faces):
7070
file_path = os.path.join(folder_name, name + '_%04d.jpg' % i)
7171
uniface = np_rng.randint(0, 255, size=(250, 250, 3))
72-
imsave(file_path, uniface)
72+
try:
73+
imsave(file_path, uniface)
74+
except ImportError:
75+
# PIL is not properly installed, skip those tests
76+
raise SkipTest
7377

7478
# add some random file pollution to test robustness
7579
f = open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb')

scikits/learn/grid_search.py

Lines changed: 105 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
# License: BSD Style.
66

77
import copy
8+
import time
89

910
import numpy as np
1011
import scipy.sparse as sp
1112

1213
from .externals.joblib import Parallel, delayed
14+
from .externals.joblib.logger import short_format_time
1315
from .cross_val import KFold, StratifiedKFold
1416
from .base import BaseEstimator, is_classifier, clone
1517

@@ -26,7 +28,7 @@ def product(*args, **kwds):
2628
yield tuple(prod)
2729

2830

29-
def iter_grid(param_grid):
31+
class IterGrid(object):
3032
"""Iterable over the combinations of the various parameter lists given
3133
3234
Parameters
@@ -43,79 +45,85 @@ def iter_grid(param_grid):
4345
4446
Examples
4547
---------
46-
>>> from scikits.learn.grid_search import iter_grid
48+
>>> from scikits.learn.grid_search import IterGrid
4749
>>> param_grid = {'a':[1, 2], 'b':[True, False]}
48-
>>> list(iter_grid(param_grid))
50+
>>> list(IterGrid(param_grid))
4951
[{'a': 1, 'b': True}, {'a': 1, 'b': False}, {'a': 2, 'b': True}, {'a': 2, 'b': False}]
5052
5153
"""
52-
if hasattr(param_grid, 'has_key'):
53-
param_grid = [param_grid]
54-
for p in param_grid:
55-
# Always sort the keys of a dictionary, for reproducibility
56-
items = sorted(p.items())
57-
keys, values = zip(*items)
58-
for v in product(*values):
59-
params = dict(zip(keys, v))
60-
yield params
61-
62-
63-
def fit_grid_point(X, y, base_clf, clf_params, cv, loss_func, score_func, iid,
64-
**fit_params):
54+
def __init__(self, param_grid):
55+
self.param_grid = param_grid
56+
57+
def __iter__(self):
58+
param_grid = self.param_grid
59+
if hasattr(param_grid, 'has_key'):
60+
param_grid = [param_grid]
61+
for p in param_grid:
62+
# Always sort the keys of a dictionary, for reproducibility
63+
items = sorted(p.items())
64+
keys, values = zip(*items)
65+
for v in product(*values):
66+
params = dict(zip(keys, v))
67+
yield params
68+
69+
70+
def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
71+
score_func, verbose, **fit_params):
6572
"""Run fit on one set of parameters
6673
6774
Returns the score, the instance of the classifier and the number of samples (used by the caller to weight the score when iid is set)
6875
"""
76+
if verbose > 1:
77+
start_time = time.time()
78+
msg = '%s' % (', '.join('%s=%s' % (k, v)
79+
for k, v in clf_params.iteritems()))
80+
print "[GridSearchCV] %s %s" % (msg, (64-len(msg))*'.')
6981
# update parameters of the classifier after a copy of its base structure
7082
clf = copy.deepcopy(base_clf)
7183
clf._set_params(**clf_params)
7284

73-
score = 0.
74-
n_test_samples = 0.
75-
for train, test in cv:
76-
if isinstance(X, list) or isinstance(X, tuple):
77-
X_train = [X[i] for i, cond in enumerate(train) if cond]
78-
X_test = [X[i] for i, cond in enumerate(test) if cond]
79-
else:
80-
if sp.issparse(X):
81-
# For sparse matrices, slicing only works with indices
82-
# (no masked array). Convert to CSR format for efficiency and
83-
# because some sparse formats don't support row slicing.
84-
X = sp.csr_matrix(X)
85-
ind = np.arange(X.shape[0])
86-
train = ind[train]
87-
test = ind[test]
88-
X_train = X[train]
89-
X_test = X[test]
90-
if y is not None:
91-
y_test = y[test]
92-
y_train = y[train]
93-
else:
94-
y_test = None
95-
y_train = None
96-
97-
clf.fit(X_train, y_train, **fit_params)
98-
99-
if loss_func is not None:
100-
y_pred = clf.predict(X_test)
101-
this_score = -loss_func(y_test, y_pred)
102-
elif score_func is not None:
103-
y_pred = clf.predict(X_test)
104-
this_score = score_func(y_test, y_pred)
105-
else:
106-
this_score = clf.score(X_test, y_test)
107-
if iid:
108-
if y is not None:
109-
this_n_test_samples = y.shape[0]
110-
else:
111-
this_n_test_samples = X.shape[0]
112-
this_score *= this_n_test_samples
113-
n_test_samples += this_n_test_samples
114-
score += this_score
115-
if iid:
116-
score /= n_test_samples
117-
118-
return score, clf
85+
if isinstance(X, list) or isinstance(X, tuple):
86+
X_train = [X[i] for i, cond in enumerate(train) if cond]
87+
X_test = [X[i] for i, cond in enumerate(test) if cond]
88+
else:
89+
if sp.issparse(X):
90+
# For sparse matrices, slicing only works with indices
91+
# (no masked array). Convert to CSR format for efficiency and
92+
# because some sparse formats don't support row slicing.
93+
X = sp.csr_matrix(X)
94+
ind = np.arange(X.shape[0])
95+
train = ind[train]
96+
test = ind[test]
97+
X_train = X[train]
98+
X_test = X[test]
99+
if y is not None:
100+
y_test = y[test]
101+
y_train = y[train]
102+
else:
103+
y_test = None
104+
y_train = None
105+
106+
clf.fit(X_train, y_train, **fit_params)
107+
108+
if loss_func is not None:
109+
y_pred = clf.predict(X_test)
110+
this_score = -loss_func(y_test, y_pred)
111+
elif score_func is not None:
112+
y_pred = clf.predict(X_test)
113+
this_score = score_func(y_test, y_pred)
114+
else:
115+
this_score = clf.score(X_test, y_test)
116+
117+
if y is not None:
118+
this_n_test_samples = y.shape[0]
119+
else:
120+
this_n_test_samples = X.shape[0]
121+
122+
if verbose > 1:
123+
end_msg = "%s -%s" % (msg,
124+
short_format_time(time.time() - start_time))
125+
print "[GridSearchCV] %s %s" % ((64-len(end_msg))*'.', end_msg)
126+
return this_score, clf, this_n_test_samples
119127

120128

121129
class GridSearchCV(BaseEstimator):
@@ -162,6 +170,9 @@ class GridSearchCV(BaseEstimator):
162170
refit: boolean
163171
refit the best estimator with the entire dataset
164172
173+
verbose: integer
174+
Controls the verbosity: the higher, the more messages.
175+
165176
Examples
166177
--------
167178
>>> from scikits.learn import svm, grid_search, datasets
@@ -170,8 +181,8 @@ class GridSearchCV(BaseEstimator):
170181
>>> svr = svm.SVR()
171182
>>> clf = grid_search.GridSearchCV(svr, parameters)
172183
>>> clf.fit(iris.data, iris.target) # doctest: +ELLIPSIS
173-
GridSearchCV(n_jobs=1, fit_params={}, loss_func=None, refit=True, cv=None,
174-
iid=True,
184+
GridSearchCV(n_jobs=1, verbose=0, fit_params={}, loss_func=None, refit=True,
185+
cv=None, iid=True,
175186
estimator=SVR(kernel='rbf', C=1.0, probability=False, ...
176187
...
177188
@@ -187,6 +198,7 @@ class GridSearchCV(BaseEstimator):
187198

188199
def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
189200
fit_params={}, n_jobs=1, iid=True, refit=True, cv=None,
201+
verbose=0,
190202
):
191203
assert hasattr(estimator, 'fit') and (hasattr(estimator, 'predict')
192204
or hasattr(estimator, 'score')), (
@@ -210,6 +222,7 @@ def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
210222
self.iid = iid
211223
self.refit = refit
212224
self.cv = cv
225+
self.verbose = verbose
213226

214227
def fit(self, X, y=None, **params):
215228
"""Run fit with all sets of parameters
@@ -242,20 +255,38 @@ def fit(self, X, y=None, **params):
242255
else:
243256
cv = KFold(n_samples, k=3)
244257

245-
grid = iter_grid(self.param_grid)
258+
grid = IterGrid(self.param_grid)
246259
base_clf = clone(self.estimator)
247-
out = Parallel(n_jobs=self.n_jobs)(
260+
# XXX: Need to make use of Parallel's new pre_dispatch
261+
out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
248262
delayed(fit_grid_point)(
249-
X, y, base_clf, clf_params, cv, self.loss_func,
250-
self.score_func, self.iid, **self.fit_params)
251-
for clf_params in grid)
252-
253-
# Out is a list of pairs: score, estimator
254-
263+
X, y, base_clf, clf_params, train, test, self.loss_func,
264+
self.score_func, self.verbose, **self.fit_params)
265+
for clf_params in grid for train, test in cv)
266+
267+
# Out is a list of triplets: score, estimator, n_test_samples
268+
n_grid_points = len(list(grid))
269+
n_fits = len(out)
270+
n_folds = n_fits//n_grid_points
271+
272+
scores = list()
273+
for grid_start in range(0, n_fits, n_folds):
274+
n_test_samples = 0
275+
score = 0
276+
for this_score, estimator, this_n_test_samples in \
277+
out[grid_start:grid_start+n_folds]:
278+
if self.iid:
279+
this_score *= this_n_test_samples
280+
score += this_score
281+
n_test_samples += this_n_test_samples
282+
if self.iid:
283+
score /= float(n_test_samples)
284+
scores.append((score, estimator))
285+
255286
# Note: we do not use max(out) to make ties deterministic even if
256287
# comparison on estimator instances is not deterministic
257288
best_score = None
258-
for score, estimator in out:
289+
for score, estimator in scores:
259290
if best_score is None:
260291
best_score = score
261292
best_estimator = estimator
@@ -277,11 +308,10 @@ def fit(self, X, y=None, **params):
277308
self.score = best_estimator.score
278309

279310
# Store the computed scores
280-
grid = iter_grid(self.param_grid)
281311
# XXX: the name is too specific, it shouldn't have
282312
# 'grid' in it. Also, we should be retrieving/storing variance
283313
self.grid_points_scores_ = dict((tuple(clf_params.items()), score)
284-
for clf_params, (score, _) in zip(grid, out))
314+
for clf_params, (score, _) in zip(grid, scores))
285315

286316
return self
287317

0 commit comments

Comments
 (0)