55# License: BSD Style.
66
77import copy
8+ import time
89
910import numpy as np
1011import scipy .sparse as sp
1112
1213from .externals .joblib import Parallel , delayed
14+ from .externals .joblib .logger import short_format_time
1315from .cross_val import KFold , StratifiedKFold
1416from .base import BaseEstimator , is_classifier , clone
1517
@@ -26,7 +28,7 @@ def product(*args, **kwds):
2628 yield tuple (prod )
2729
2830
29- def iter_grid ( param_grid ):
class IterGrid(object):
    """Iterate over every combination of the given parameter values.

    The object only stores the grid description; each call to ``iter()``
    starts a fresh pass, so the same instance can be iterated several times.

    Parameters
    ----------
    param_grid: dict or list of dicts
        Each dict maps a parameter name to the sequence of values to try
        for that parameter. A list of dicts describes several grids, which
        are explored one after the other.

    Returns
    -------
    params: dict
        **Yields** one dict per combination, mapping every parameter name
        of the current grid to one of its candidate values.

    Examples
    ---------
    >>> from scikits.learn.grid_search import IterGrid
    >>> param_grid = {'a':[1, 2], 'b':[True, False]}
    >>> list(IterGrid(param_grid))
    [{'a': 1, 'b': True}, {'a': 1, 'b': False}, {'a': 2, 'b': True}, {'a': 2, 'b': False}]

    """

    def __init__(self, param_grid):
        self.param_grid = param_grid

    def __iter__(self):
        param_grid = self.param_grid
        # A single mapping is shorthand for a one-element list of grids.
        # Checking for 'items' as well as 'has_key' keeps this working for
        # mappings that do not provide the legacy has_key method.
        if hasattr(param_grid, 'has_key') or hasattr(param_grid, 'items'):
            param_grid = [param_grid]
        for p in param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                # An empty grid has exactly one combination: no parameters.
                # (zip(*items) below cannot be unpacked in that case.)
                yield {}
                continue
            keys, values = zip(*items)
            for v in product(*values):
                params = dict(zip(keys, v))
                yield params
68+
69+
def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
                   score_func, verbose, **fit_params):
    """Fit one parameter setting on one train/test split and score it.

    Parameters
    ----------
    X: list, tuple, array or sparse matrix
        The samples. Lists and tuples are split element by element, sparse
        matrices are converted to CSR before row selection, anything else
        is indexed directly with ``train`` and ``test``.

    y: array or None
        The targets, indexed with ``train`` and ``test``. When None, the
        estimator is fitted and scored with ``None`` as target.

    base_clf: estimator object
        Template estimator. It is deep-copied and never modified. The copy
        must provide ``_set_params`` and ``fit``, plus ``predict`` (when a
        loss or score function is given) or ``score`` (otherwise).

    clf_params: dict
        Parameters set on the copied estimator before fitting.

    train, test: boolean masks
        Select the training and testing samples. The list/tuple branch
        relies on them being boolean masks.

    loss_func: callable or None
        Called as ``loss_func(y_test, y_pred)``; the returned score is the
        negated loss. Takes precedence over ``score_func``.

    score_func: callable or None
        Called as ``score_func(y_test, y_pred)`` when ``loss_func`` is None.

    verbose: integer
        Start and end messages are printed when greater than 1.

    **fit_params:
        Extra keyword arguments forwarded to ``clf.fit``.

    Returns
    -------
    this_score: the score obtained on the test split (higher is better)
    clf: the fitted copy of ``base_clf``
    this_n_test_samples: the number of samples in the test split
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                 for k, v in clf_params.items()))
        # Pad with dots so that messages line up on 64 columns.
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = copy.deepcopy(base_clf)
    clf._set_params(**clf_params)

    if isinstance(X, (list, tuple)):
        X_train = [X[i] for i, cond in enumerate(train) if cond]
        X_test = [X[i] for i, cond in enumerate(test) if cond]
    else:
        if sp.issparse(X):
            # For sparse matrices, slicing only works with indices
            # (no masked array). Convert to CSR format for efficiency and
            # because some sparse formats don't support row slicing.
            X = sp.csr_matrix(X)
            ind = np.arange(X.shape[0])
            train = ind[train]
            test = ind[test]
        X_train = X[train]
        X_test = X[test]

    if y is not None:
        y_test = y[test]
        y_train = y[train]
    else:
        y_test = None
        y_train = None

    clf.fit(X_train, y_train, **fit_params)

    if loss_func is not None:
        y_pred = clf.predict(X_test)
        # Negate so that a higher returned value is always better.
        this_score = -loss_func(y_test, y_pred)
    elif score_func is not None:
        y_pred = clf.predict(X_test)
        this_score = score_func(y_test, y_pred)
    else:
        this_score = clf.score(X_test, y_test)

    # Count the samples of the *test split* (not of the whole dataset), so
    # that the caller can weight each fold by its actual size.
    if y is not None:
        this_n_test_samples = y_test.shape[0]
    elif isinstance(X_test, list):
        this_n_test_samples = len(X_test)
    else:
        this_n_test_samples = X_test.shape[0]

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              short_format_time(time.time() - start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
    return this_score, clf, this_n_test_samples
119127
120128
121129class GridSearchCV (BaseEstimator ):
@@ -162,6 +170,9 @@ class GridSearchCV(BaseEstimator):
162170 refit: boolean
163171 refit the best estimator with the entire dataset
164172
173+ verbose: integer
174+ Controls the verbosity: the higher, the more messages.
175+
165176 Examples
166177 --------
167178 >>> from scikits.learn import svm, grid_search, datasets
@@ -170,8 +181,8 @@ class GridSearchCV(BaseEstimator):
170181 >>> svr = svm.SVR()
171182 >>> clf = grid_search.GridSearchCV(svr, parameters)
172183 >>> clf.fit(iris.data, iris.target) # doctest: +ELLIPSIS
173- GridSearchCV(n_jobs=1, fit_params={}, loss_func=None, refit=True, cv=None ,
174- iid=True,
184+ GridSearchCV(n_jobs=1, verbose=0, fit_params={}, loss_func=None, refit=True,
185+ cv=None, iid=True,
175186 estimator=SVR(kernel='rbf', C=1.0, probability=False, ...
176187 ...
177188
@@ -187,6 +198,7 @@ class GridSearchCV(BaseEstimator):
187198
188199 def __init__ (self , estimator , param_grid , loss_func = None , score_func = None ,
189200 fit_params = {}, n_jobs = 1 , iid = True , refit = True , cv = None ,
201+ verbose = 0 ,
190202 ):
191203 assert hasattr (estimator , 'fit' ) and (hasattr (estimator , 'predict' )
192204 or hasattr (estimator , 'score' )), (
@@ -210,6 +222,7 @@ def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
210222 self .iid = iid
211223 self .refit = refit
212224 self .cv = cv
225+ self .verbose = verbose
213226
214227 def fit (self , X , y = None , ** params ):
215228 """Run fit with all sets of parameters
@@ -242,20 +255,38 @@ def fit(self, X, y=None, **params):
242255 else :
243256 cv = KFold (n_samples , k = 3 )
244257
245- grid = iter_grid (self .param_grid )
258+ grid = IterGrid (self .param_grid )
246259 base_clf = clone (self .estimator )
247- out = Parallel (n_jobs = self .n_jobs )(
260+ # XXX: Need to make use of Parallel's new pre_dispatch
261+ out = Parallel (n_jobs = self .n_jobs , verbose = self .verbose )(
248262 delayed (fit_grid_point )(
249- X , y , base_clf , clf_params , cv , self .loss_func ,
250- self .score_func , self .iid , ** self .fit_params )
251- for clf_params in grid )
252-
253- # Out is a list of pairs: score, estimator
254-
263+ X , y , base_clf , clf_params , train , test , self .loss_func ,
264+ self .score_func , self .verbose , ** self .fit_params )
265+ for clf_params in grid for train , test in cv )
266+
267+ # Out is a list of triplets: score, estimator, n_test_samples
268+ n_grid_points = len (list (grid ))
269+ n_fits = len (out )
270+ n_folds = n_fits // n_grid_points
271+
272+ scores = list ()
273+ for grid_start in range (0 , n_fits , n_folds ):
274+ n_test_samples = 0
275+ score = 0
276+ for this_score , estimator , this_n_test_samples in \
277+ out [grid_start :grid_start + n_folds ]:
278+ if self .iid :
279+ this_score *= this_n_test_samples
280+ score += this_score
281+ n_test_samples += this_n_test_samples
282+ if self .iid :
283+ score /= float (n_test_samples )
284+ scores .append ((score , estimator ))
285+
255286 # Note: we do not use max(out) to make ties deterministic even if
256287 # comparison on estimator instances is not deterministic
257288 best_score = None
258- for score , estimator in out :
289+ for score , estimator in scores :
259290 if best_score is None :
260291 best_score = score
261292 best_estimator = estimator
@@ -277,11 +308,10 @@ def fit(self, X, y=None, **params):
277308 self .score = best_estimator .score
278309
279310 # Store the computed scores
280- grid = iter_grid (self .param_grid )
281311 # XXX: the name is too specific, it shouldn't have
282312 # 'grid' in it. Also, we should be retrieving/storing variance
283313 self .grid_points_scores_ = dict ((tuple (clf_params .items ()), score )
284- for clf_params , (score , _ ) in zip (grid , out ))
314+ for clf_params , (score , _ ) in zip (grid , scores ))
285315
286316 return self
287317
0 commit comments