3 files changed: 8 additions, 4 deletions
@@ -88,6 +88,7 @@
 def is_interactive():
     return not hasattr(sys.modules['__main__'], '__file__')

+
 # work-around for Jupyter notebook and IPython console
 argv = [] if is_interactive() else sys.argv[1:]
 (opts, args) = op.parse_args(argv)
@@ -136,6 +137,7 @@ def is_interactive():
 def size_mb(docs):
     return sum(len(s.encode('utf-8')) for s in docs) / 1e6

+
 data_train_size_mb = size_mb(data_train.data)
 data_test_size_mb = size_mb(data_test.data)

@@ -27,8 +27,8 @@
 Two algorithms are demoed: ordinary k-means and its more scalable cousin
 minibatch k-means.

-Additionally, latent semantic analysis can also be used to reduce dimensionality
-and discover latent patterns in the data.
+Additionally, latent semantic analysis can also be used to reduce
+dimensionality and discover latent patterns in the data.

 It can be noted that k-means (and minibatch k-means) are very sensitive to
 feature scaling and that in this case the IDF weighting helps improve the
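For context on the docstring edited above, here is a minimal sketch of the kind of pipeline it describes, assuming scikit-learn's TfidfVectorizer, TruncatedSVD, Normalizer, KMeans and MiniBatchKMeans; the toy documents and parameter values are illustrative placeholders, not taken from the example itself.

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

docs = [
    "graphics card rendering images",
    "rendering 3d graphics on a gpu",
    "space shuttle launch orbit",
    "nasa orbit mission launch",
]

# IDF weighting: the docstring notes k-means is sensitive to feature scaling.
X = TfidfVectorizer(stop_words="english").fit_transform(docs)

# Optional LSA step: TruncatedSVD reduces dimensionality; re-normalizing
# afterwards keeps k-means working with cosine-like distances.
lsa = make_pipeline(TruncatedSVD(n_components=2), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X)

# Ordinary k-means and its more scalable cousin, minibatch k-means.
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_lsa)
mbkm = MiniBatchKMeans(n_clusters=2, n_init=10, random_state=0).fit(X_lsa)
print(km.labels_, mbkm.labels_)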
@@ -106,6 +106,7 @@
 def is_interactive():
     return not hasattr(sys.modules['__main__'], '__file__')

+
 # work-around for Jupyter notebook and IPython console
 argv = [] if is_interactive() else sys.argv[1:]
 (opts, args) = op.parse_args(argv)
@@ -138,7 +139,8 @@ def is_interactive():
 labels = dataset.target
 true_k = np.unique(labels).shape[0]

-print("Extracting features from the training dataset using a sparse vectorizer")
+print("Extracting features from the training dataset "
+      "using a sparse vectorizer")
 t0 = time()
 if opts.use_hashing:
     if opts.use_idf:
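The hunk above is truncated just as the script chooses its "sparse vectorizer". A hedged sketch of what the opts.use_hashing / opts.use_idf branches typically select between follows, assuming scikit-learn's HashingVectorizer, TfidfTransformer and TfidfVectorizer; the stand-in flags and parameter values are assumptions, not copied from the file.

from sklearn.feature_extraction.text import (HashingVectorizer,
                                              TfidfTransformer,
                                              TfidfVectorizer)
from sklearn.pipeline import make_pipeline

# Stand-ins for opts.use_hashing / opts.use_idf (assumed values).
use_hashing, use_idf = True, True

if use_hashing:
    if use_idf:
        # Hash the tokens first, then layer IDF weighting on top of the counts.
        vectorizer = make_pipeline(
            HashingVectorizer(n_features=2 ** 18, stop_words="english",
                              alternate_sign=False, norm=None),
            TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=2 ** 18,
                                       stop_words="english", norm="l2")
else:
    # Plain TF-IDF with an in-memory vocabulary.
    vectorizer = TfidfVectorizer(max_df=0.5, stop_words="english",
                                 use_idf=use_idf)

X = vectorizer.fit_transform(["some training document", "another document"])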
@@ -62,7 +62,7 @@ def token_freqs(doc):
     'talk.religion.misc',
 ]
 # Uncomment the following line to use a larger set (11k+ documents)
-#categories = None
+# categories = None

 print(__doc__)
 print("Usage: %s [n_features_for_hashing]" % sys.argv[0])