
Commit e14b716

Merge pull request scikit-learn#3328 from ogrisel/joblib-0.8.2
MAINT bump joblib to 0.8.2
2 parents 777123d + a00fffd

6 files changed: +109 −48 lines

sklearn/externals/joblib/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@

 """

-__version__ = '0.8.1'
+__version__ = '0.8.2'


 from .memory import Memory, MemorizedResult

sklearn/externals/joblib/parallel.py

Lines changed: 1 addition & 1 deletion

@@ -352,7 +352,7 @@ class Parallel(Logger):
     [Parallel(n_jobs=2)]: Done 6 out of 6 | elapsed: 0.0s finished
     '''
     def __init__(self, n_jobs=1, backend=None, verbose=0, pre_dispatch='all',
-                 temp_folder=None, max_nbytes=100e6, mmap_mode='c'):
+                 temp_folder=None, max_nbytes=100e6, mmap_mode='r'):
         self.verbose = verbose
         self._mp_context = None
         if backend is None:
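
Note: switching the default mmap_mode from 'c' (copy-on-write) to 'r' (read-only) means worker processes can no longer silently write to private copies of auto-memmapped input arrays. A minimal sketch of the new default, assuming a scikit-learn tree from this era where joblib is vendored under sklearn.externals (array size and threshold here are illustrative, not from the diff):

    import numpy as np
    from sklearn.externals.joblib import Parallel, delayed

    def column_mean(data, col):
        # With mmap_mode='r', workers receive `data` as a read-only
        # np.memmap once it exceeds max_nbytes; an in-place write here
        # would raise instead of landing in a private copy-on-write page.
        return data[:, col].mean()

    data = np.random.rand(2000, 100)  # ~1.6 MB, above the 1e6-byte threshold
    means = Parallel(n_jobs=2, max_nbytes=1e6)(
        delayed(column_mean)(data, c) for c in range(data.shape[1]))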

sklearn/externals/joblib/pool.py

Lines changed: 15 additions & 13 deletions

@@ -491,7 +491,7 @@ class MemmapingPool(PicklingPool):
     """

     def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6,
-                 mmap_mode='c', forward_reducers=None, backward_reducers=None,
+                 mmap_mode='r', forward_reducers=None, backward_reducers=None,
                  verbose=0, context_id=None, prewarm=False, **kwargs):
         if forward_reducers is None:
             forward_reducers = dict()
@@ -502,33 +502,35 @@ def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6,
         # pool instance (do not create in advance to spare FS write access if
         # no array is to be dumped):
         use_shared_mem = False
+        pool_folder_name = "joblib_memmaping_pool_%d_%d" % (
+            os.getpid(), id(self))
         if temp_folder is None:
             temp_folder = os.environ.get('JOBLIB_TEMP_FOLDER', None)
         if temp_folder is None:
             if os.path.exists(SYSTEM_SHARED_MEM_FS):
                 try:
-                    joblib_folder = os.path.join(
-                        SYSTEM_SHARED_MEM_FS, 'joblib')
-                    if not os.path.exists(joblib_folder):
-                        os.makedirs(joblib_folder)
+                    temp_folder = SYSTEM_SHARED_MEM_FS
+                    pool_folder = os.path.join(temp_folder, pool_folder_name)
+                    if not os.path.exists(pool_folder):
+                        os.makedirs(pool_folder)
                     use_shared_mem = True
                 except IOError:
-                    # Missing rights in the /dev/shm partition, ignore
-                    pass
+                    # Missing rights in the /dev/shm partition,
+                    # fall back to the regular temp folder.
+                    temp_folder = None
         if temp_folder is None:
             # Fallback to the default tmp folder, typically /tmp
             temp_folder = tempfile.gettempdir()
         temp_folder = os.path.abspath(os.path.expanduser(temp_folder))
-        self._temp_folder = temp_folder = os.path.join(
-            temp_folder, "joblib_memmaping_pool_%d_%d" % (
-                os.getpid(), id(self)))
+        pool_folder = os.path.join(temp_folder, pool_folder_name)
+        self._temp_folder = pool_folder

         # Register the garbage collector at program exit in case the caller
         # forgets to call terminate explicitly: note we do not pass any
         # reference to self to ensure that this callback won't prevent garbage
         # collection of the pool instance and related file handler resources
         # such as POSIX semaphores and pipes
-        atexit.register(lambda: delete_folder(temp_folder))
+        atexit.register(lambda: delete_folder(pool_folder))

         if np is not None:
             # Register smart numpy.ndarray reducers that detect memmap backed
@@ -537,7 +539,7 @@ def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6,
             if prewarm == "auto":
                 prewarm = not use_shared_mem
             forward_reduce_ndarray = ArrayMemmapReducer(
-                max_nbytes, temp_folder, mmap_mode, verbose,
+                max_nbytes, pool_folder, mmap_mode, verbose,
                 context_id=context_id, prewarm=prewarm)
             forward_reducers[np.ndarray] = forward_reduce_ndarray
             forward_reducers[np.memmap] = reduce_memmap
@@ -547,7 +549,7 @@ def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6,
             # to avoid confusing the caller and make it tricky to collect the
             # temporary folder
             backward_reduce_ndarray = ArrayMemmapReducer(
-                None, temp_folder, mmap_mode, verbose)
+                None, pool_folder, mmap_mode, verbose)
             backward_reducers[np.ndarray] = backward_reduce_ndarray
             backward_reducers[np.memmap] = reduce_memmap
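
Note: the net effect of this refactoring is that each MemmapingPool instance owns a private dump folder named joblib_memmaping_pool_<pid>_<id>, created directly under /dev/shm when that filesystem is usable and under the regular temp folder otherwise, and removed on terminate(). A rough sketch, assuming the vendored import path and relying on the private _temp_folder attribute shown in the diff:

    from sklearn.externals.joblib.pool import MemmapingPool

    pool = MemmapingPool(2, max_nbytes=10)  # tiny threshold to force dumping
    try:
        # One private folder per pool instance, named after the pid and the
        # object id, e.g. /dev/shm/joblib_memmaping_pool_12345_140... on a
        # Linux box with /dev/shm mounted, else under tempfile.gettempdir().
        print(pool._temp_folder)
    finally:
        pool.terminate()  # deletes the folder (also registered with atexit)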

sklearn/externals/joblib/test/test_hashing.py

Lines changed: 7 additions & 2 deletions

@@ -32,6 +32,11 @@
 unicode = lambda s: s


+def assert_less(a, b):
+    if a > b:
+        raise AssertionError("%r is not lower than %r" % (a, b))
+
+
 ###############################################################################
 # Helper functions for the tests
 def time_func(func, *args):
@@ -200,15 +205,15 @@ def test_hash_numpy_performance():
     md5_hash = lambda x: hashlib.md5(getbuffer(x)).hexdigest()

     relative_diff = relative_time(md5_hash, hash, a)
-    nose.tools.assert_true(relative_diff < 0.1)
+    assert_less(relative_diff, 0.3)

     # Check that hashing a tuple of 3 arrays takes approximately
     # 3 times as much as hashing one array
     time_hashlib = 3 * time_func(md5_hash, a)
     time_hash = time_func(hash, (a, a, a))
     relative_diff = 0.5 * (abs(time_hash - time_hashlib)
                            / (time_hash + time_hashlib))
-    nose.tools.assert_true(relative_diff < 0.2)
+    assert_less(relative_diff, 0.3)


 def test_bound_methods_hash():
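
Note: the second timing assertion compares a symmetric relative difference that is 0.0 for identical timings and at most 0.5. A standalone sketch of the formula used above (the numbers are illustrative):

    def symmetric_relative_diff(t1, t2):
        # 0.0 when the two timings match; at most 0.5 when one of them is zero
        return 0.5 * abs(t1 - t2) / (t1 + t2)

    assert symmetric_relative_diff(1.0, 1.0) == 0.0
    assert symmetric_relative_diff(1.0, 3.0) == 0.25  # inside the relaxed 0.3 bound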

sklearn/externals/joblib/test/test_logger.py

Lines changed: 2 additions & 2 deletions

@@ -59,8 +59,8 @@ def test_print_time():
     print_time(unicode('Foo'))
     printed_text = sys.stderr.getvalue()
     # Use regexps to be robust to time variations
-    match = r"Foo: 0\..s, 0\.0min\nFoo: 0\..s, 0.0min\nFoo: " + \
-            r".\..s, 0.0min\n"
+    match = r"Foo: 0\..s, 0\..min\nFoo: 0\..s, 0..min\nFoo: " + \
+            r".\..s, 0..min\n"
     if not re.match(match, printed_text):
         raise AssertionError('Expected %s, got %s' %
                              (match, printed_text))
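
Note: the loosened pattern now accepts any digit in the minutes field rather than a literal 0. A quick sanity check of the new regexp (the sample output string is made up):

    import re

    match = r"Foo: 0\..s, 0\..min\nFoo: 0\..s, 0..min\nFoo: " + \
            r".\..s, 0..min\n"
    sample = "Foo: 0.0s, 0.0min\nFoo: 0.1s, 0.1min\nFoo: 1.5s, 0.0min\n"
    assert re.match(match, sample)  # tolerates non-zero minute digits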

sklearn/externals/joblib/test/test_pool.py

Lines changed: 83 additions & 29 deletions

@@ -54,16 +54,40 @@ def teardown_temp_folder():
 with_temp_folder = with_setup(setup_temp_folder, teardown_temp_folder)


-def double(input):
-    """Dummy helper function to be executed in subprocesses"""
+def setup_if_has_dev_shm():
+    if not os.path.exists('/dev/shm'):
+        raise SkipTest("This test requires the /dev/shm shared memory fs.")
+
+
+with_dev_shm = with_setup(setup_if_has_dev_shm)
+
+
+def check_array(args):
+    """Dummy helper function to be executed in subprocesses
+
+    Check that the provided array has the expected values in the provided
+    range.
+
+    """
     assert_array_equal = np.testing.assert_array_equal
+    data, position, expected = args
+    assert_equal(data[position], expected)

-    data, position, expected = input
-    if expected is not None:
-        assert_equal(data[position], expected)
+
+def inplace_double(args):
+    """Dummy helper function to be executed in subprocesses
+
+    Check that the input array has the right values in the provided range
+    and perform an in-place modification to double the values in that range.
+
+    """
+    assert_array_equal = np.testing.assert_array_equal
+    data, position, expected = args
+    assert_equal(data[position], expected)
     data[position] *= 2
-    if expected is not None:
-        assert_array_equal(data[position], 2 * expected)
+    assert_equal(data[position], 2 * expected)


 @with_numpy
@@ -210,18 +234,18 @@ def test_pool_with_memmap():
         a = np.memmap(filename, dtype=np.float32, shape=(3, 5), mode='w+')
         a.fill(1.0)

-        p.map(double, [(a, (i, j), 1.0)
-                       for i in range(a.shape[0])
-                       for j in range(a.shape[1])])
+        p.map(inplace_double, [(a, (i, j), 1.0)
+                               for i in range(a.shape[0])
+                               for j in range(a.shape[1])])

         assert_array_equal(a, 2 * np.ones(a.shape))

         # Open a copy-on-write view on the previous data
         b = np.memmap(filename, dtype=np.float32, shape=(5, 3), mode='c')

-        p.map(double, [(b, (i, j), 2.0)
-                       for i in range(b.shape[0])
-                       for j in range(b.shape[1])])
+        p.map(inplace_double, [(b, (i, j), 2.0)
+                               for i in range(b.shape[0])
+                               for j in range(b.shape[1])])

         # Passing memmap instances to the pool should not trigger the creation
         # of new files on the FS
@@ -235,12 +259,12 @@ def test_pool_with_memmap():
         c = np.memmap(filename, dtype=np.float32, shape=(10,), mode='r',
                       offset=5 * 4)

-        assert_raises(AssertionError, p.map, double,
+        assert_raises(AssertionError, p.map, check_array,
                       [(c, i, 3.0) for i in range(c.shape[0])])

         # depending on the version of numpy one can either get a RuntimeError
         # or a ValueError
-        assert_raises((RuntimeError, ValueError), p.map, double,
+        assert_raises((RuntimeError, ValueError), p.map, inplace_double,
                       [(c, i, 2.0) for i in range(c.shape[0])])
     finally:
         # Clean all filehandlers held by the pool
@@ -270,9 +294,9 @@ def test_pool_with_memmap_array_view():
         assert_false(isinstance(a_view, np.memmap))
         assert_true(has_shareable_memory(a_view))

-        p.map(double, [(a_view, (i, j), 1.0)
-                       for i in range(a.shape[0])
-                       for j in range(a.shape[1])])
+        p.map(inplace_double, [(a_view, (i, j), 1.0)
+                               for i in range(a.shape[0])
+                               for j in range(a.shape[1])])

         # Both a and the a_view have been updated
         assert_array_equal(a, 2 * np.ones(a.shape))
@@ -307,24 +331,17 @@ def test_memmaping_pool_for_large_arrays():

         small = np.ones(5, dtype=np.float32)
         assert_equal(small.nbytes, 20)
-        p.map(double, [(small, i, 1.0) for i in range(small.shape[0])])
+        p.map(check_array, [(small, i, 1.0) for i in range(small.shape[0])])

         # Memory has been copied, the pool filesystem folder is unused
         assert_equal(os.listdir(TEMP_FOLDER), [])

         # Try with a file larger than the memmap threshold of 40 bytes
         large = np.ones(100, dtype=np.float64)
         assert_equal(large.nbytes, 800)
-        p.map(double, [(large, i, 1.0) for i in range(large.shape[0])])
+        p.map(check_array, [(large, i, 1.0) for i in range(large.shape[0])])

-        # By default, the mmap_mode is copy-on-write to make the pool
-        # processes able to modify their view individually as if they had
-        # received their own copy of the original array. The original array
-        # (which is not a shared memmap instance) is untouched
-        assert_false(has_shareable_memory(large))
-        assert_array_equal(large, np.ones(100))
-
-        # The data has been dump in a temp folder for subprocess to share it
+        # The data has been dumped in a temp folder for subprocesses to share it
         # without per-child memory copies
         assert_true(os.path.isdir(p._temp_folder))
         dumped_filenames = os.listdir(p._temp_folder)
@@ -352,7 +369,7 @@ def test_memmaping_pool_for_large_arrays_disabled():
         # Try with a file larger than the memmap threshold of 40 bytes
         large = np.ones(100, dtype=np.float64)
         assert_equal(large.nbytes, 800)
-        p.map(double, [(large, i, 1.0) for i in range(large.shape[0])])
+        p.map(check_array, [(large, i, 1.0) for i in range(large.shape[0])])

         # Check that the tempfolder is still empty
         assert_equal(os.listdir(TEMP_FOLDER), [])
@@ -363,6 +380,43 @@ def test_memmaping_pool_for_large_arrays_disabled():
     del p


+@with_numpy
+@with_multiprocessing
+@with_dev_shm
+def test_memmaping_on_dev_shm():
+    """Check that large arrays are memmaped to the /dev/shm shared fs"""
+    p = MemmapingPool(3, max_nbytes=10)
+    try:
+        # Check that the pool has correctly detected the presence of the
+        # shared memory filesystem.
+        pool_temp_folder = p._temp_folder
+        folder_prefix = '/dev/shm/joblib_memmaping_pool_'
+        assert_true(pool_temp_folder.startswith(folder_prefix))
+        assert_true(os.path.exists(pool_temp_folder))
+
+        # Try with a file larger than the memmap threshold of 10 bytes
+        a = np.ones(100, dtype=np.float64)
+        assert_equal(a.nbytes, 800)
+        p.map(id, [a] * 10)
+        # a should have been memmaped to the pool temp folder: the joblib
+        # pickling procedure generates a .pkl and a .npy file:
+        assert_equal(len(os.listdir(pool_temp_folder)), 2)
+
+        b = np.ones(100, dtype=np.float64)
+        assert_equal(b.nbytes, 800)
+        p.map(id, [b] * 10)
+        # A copy of both a and b is now stored in the shared memory folder
+        assert_equal(len(os.listdir(pool_temp_folder)), 4)
+
+    finally:
+        # Cleanup open file descriptors
+        p.terminate()
+        del p
+
+    # The temp folder is cleaned up upon pool termination
+    assert_false(os.path.exists(pool_temp_folder))
+
+
 @with_numpy
 @with_multiprocessing
 @with_temp_folder
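
Note: the new test's expectation of two files per dumped array comes from joblib's numpy-aware pickling, which writes a small .pkl plus one .npy side file per array. A sketch of that behaviour with the vendored dump/load helpers (exact side-file names may vary across joblib versions):

    import os
    import tempfile
    import numpy as np
    from sklearn.externals.joblib import dump, load

    folder = tempfile.mkdtemp()
    a = np.ones(100)
    dump(a, os.path.join(folder, 'a.pkl'))
    print(sorted(os.listdir(folder)))  # one .pkl plus one .npy per array
    b = load(os.path.join(folder, 'a.pkl'), mmap_mode='r')  # read-only memmap
    assert np.array_equal(a, b)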
