4444
4545print (__doc__ )
4646
47- # Author: Peter Prettenhoer <[email protected] >
47+ # Author: Peter Prettenhofer <[email protected] >
4848# License: BSD Style.
4949
50- # $Id$
51-
52- from time import time
50+ import logging
5351import os
5452import sys
55- import numpy as np
53+ from time import time
5654from optparse import OptionParser
5755
56+ import numpy as np
57+
58+ from sklearn .datasets import fetch_covtype
5859from sklearn .svm import LinearSVC
5960from sklearn .linear_model import SGDClassifier
6061from sklearn .naive_bayes import GaussianNB
6162from sklearn .tree import DecisionTreeClassifier
6263from sklearn .ensemble import RandomForestClassifier , ExtraTreesClassifier
6364from sklearn import metrics
6465from sklearn .externals .joblib import Memory
65- from sklearn .utils import check_random_state
66+
67+ logging .basicConfig (level = logging .INFO ,
68+ format = '%(asctime)s %(levelname)s %(message)s' )
69+ logger = logging .getLogger (__name__ )
6670
6771op = OptionParser ()
6872op .add_option ("--classifiers" ,
8084# estimators.
8185op .add_option ("--random-seed" ,
8286 dest = "random_seed" , default = 13 , type = int ,
83- help = "Common seed used by random number generator."
84- )
87+ help = "Common seed used by random number generator." )
8588
8689op .print_help ()
8790
97100joblib_cache_folder = os .path .join (bench_folder , 'bench_covertype_data' )
98101m = Memory (joblib_cache_folder , mmap_mode = 'r' )
99102
100- # Set seed for rng
101- rng = check_random_state (opts .random_seed )
102-
103103
104104# Load the data, then cache and memmap the train/test split
105105@m .cache
106106def load_data (dtype = np .float32 , order = 'F' ):
107- ######################################################################
108- ## Download the data, if not already on disk
109- if not os .path .exists (original_archive ):
110- # Download the data
111- import urllib
112- print ("Downloading data, Please Wait (11MB)..." )
113- opener = urllib .urlopen (
114- 'http://archive.ics.uci.edu/ml/'
115- 'machine-learning-databases/covtype/covtype.data.gz' )
116- open (original_archive , 'wb' ).write (opener .read ())
117-
118107 ######################################################################
119108 ## Load dataset
120109 print ("Loading dataset..." )
121- import gzip
122- f = gzip .open (original_archive )
123- X = np .fromstring (f .read ().replace ("," , " " ), dtype = dtype , sep = " " ,
124- count = - 1 )
125- X = X .reshape ((581012 , 55 ))
110+ data = fetch_covtype (download_if_missing = True , shuffle = True ,
111+ random_state = opts .random_seed )
112+ X , y = data .data , data .target
126113 if order .lower () == 'f' :
127114 X = np .asfortranarray (X )
128- f .close ()
129115
130116 # class 1 vs. all others.
131- y = np .ones (X .shape [0 ]) * - 1
132- y [np .where (X [:, - 1 ] == 1 )] = 1
133- X = X [:, :- 1 ]
117+ y [np .where (y != 1 )] = - 1
134118
135119 ######################################################################
136120 ## Create train-test split (as [Joachims, 2006])
137- print ("Creating train-test split..." )
138- idx = np .arange (X .shape [0 ])
139- rng .shuffle (idx )
140- train_idx = idx [:522911 ]
141- test_idx = idx [522911 :]
121+ logger .info ("Creating train-test split..." )
122+ n_train = 522911
142123
143- X_train = X [train_idx ]
144- y_train = y [train_idx ]
145- X_test = X [test_idx ]
146- y_test = y [test_idx ]
147-
148- # free memory
149- del X
150- del y
124+ X_train = X [:n_train ]
125+ y_train = y [:n_train ]
126+ X_test = X [n_train :]
127+ y_test = y [n_train :]
151128
152129 ######################################################################
153130 ## Standardize first 10 features (the numerical ones)
@@ -206,7 +183,7 @@ def benchmark(clf):
206183 'dual' : False ,
207184 'tol' : 1e-3 ,
208185 "random_state" : opts .random_seed ,
209- }
186+ }
210187classifiers ['liblinear' ] = LinearSVC (** liblinear_parameters )
211188
212189######################################################################
@@ -220,7 +197,7 @@ def benchmark(clf):
220197 'n_iter' : 2 ,
221198 'n_jobs' : opts .n_jobs ,
222199 "random_state" : opts .random_seed ,
223- }
200+ }
224201classifiers ['SGD' ] = SGDClassifier (** sgd_parameters )
225202
226203######################################################################
0 commit comments