Merge pull request pytorch#702 from gchanan/conservativeAllocator

soumith · web-flow · commit 07f5b21ef1bd · 2017-02-15T08:26:48.000+05:30
Improve THCCachingHostAllocator performance by making it reclaim less aggressively
diff --git a/THCCachingHostAllocator.cpp b/THCCachingHostAllocator.cpp
@@ -6,6 +6,7 @@
 #include <set>
 #include <stdint.h>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 
@@ -23,11 +24,25 @@ struct Block : public BlockSize
 {
   bool  allocated;    // true if the block is currently allocated
   int   event_count;  // number of outstanding cuda events
+  std::unordered_set<THCStream *> streams;
 
   Block(size_t size, void* ptr, bool allocated) :
       BlockSize(size, ptr), allocated(allocated), event_count(0) { }
 };
 
+struct BlockStreamCleaner {  // on destruction, frees every non-NULL stream in the referenced set and empties it
+  std::unordered_set<THCStream *> &streams;  // reference to the caller's set; this struct does not own the container
+
+  BlockStreamCleaner(std::unordered_set<THCStream *> &streams) : streams(streams) {}  // only binds the reference
+  ~BlockStreamCleaner() {
+    for(auto it = streams.begin(); it != streams.end(); ++it) {
+      if (*it != NULL) {  // NULL entries are skipped: THCStream_free is only called on non-NULL pointers
+        THCStream_free(*it);
+      }
+    }
+    streams.clear();  // leave the set empty so no freed pointer stays reachable through it
+  }
+};
 static bool BlockComparator(const BlockSize& a, const BlockSize& b)
 {
   // sort by size, break ties with pointer
@@ -98,21 +113,49 @@ struct HostAllocator
       return cudaSuccess;
     }
 
+    // process outstanding cuda events which may have occurred
+    cudaError_t err = processEvents();
+    if (err != cudaSuccess) {
+      return err;
+    }
+
     auto it = blocks.find(ptr);
     THAssert(it != blocks.end());
 
     Block& block = it->second;
     THAssert(block.allocated);
 
+    // free (on valid memory) shouldn't fail, so mark unallocated before
+    // we process the streams.
     block.allocated = false;
+
+    // since the block has been deallocated, no point in keeping around the
+    // streams, even in case of error.
+    BlockStreamCleaner sc(block.streams);
+    for (auto it = block.streams.begin(); it != block.streams.end(); ++it) {
+      cudaEvent_t event;
+      err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+      if (err != cudaSuccess) {
+        return err;
+      }
+
+      err = cudaEventRecord(event, (*it) == NULL ? NULL : (*it)->stream);
+      if (err != cudaSuccess) {
+        return err;
+      }
+
+      // the block will not be re-used until all associated events have occurred
+      block.event_count++;
+      cuda_events.emplace_back(event, ptr);
+    }
     if (block.event_count == 0) {
       // the block can be re-used if there are no outstanding cuda events
       available.insert(block);
     }
     return cudaSuccess;
   }
 
-  cudaError_t recordEvent(void* ptr, cudaStream_t stream)
+  cudaError_t recordEvent(void* ptr, THCStream *stream)
   {
     std::lock_guard<std::mutex> lock(mutex);
     cudaError_t err;
@@ -125,27 +168,11 @@ struct HostAllocator
 
     Block& block = it->second;
     THAssert(block.allocated);
-
-    // process outstanding cuda events which may have occurred
-    err = processEvents();
-    if (err != cudaSuccess) {
-      return err;
+    auto res = block.streams.emplace(stream);
+    if (res.second == true && stream != NULL) {
+      THCStream_retain(stream);
     }
 
-    // create and record an event in the given stream
-    cudaEvent_t event;
-    err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
-    if (err != cudaSuccess) {
-      return err;
-    }
-    err = cudaEventRecord(event, stream);
-    if (err != cudaSuccess) {
-      return err;
-    }
-
-    // the block will not be re-used until all associated events have occured
-    block.event_count++;
-    cuda_events.emplace_back(event, ptr);
     return cudaSuccess;
   }
 
@@ -186,18 +213,17 @@ struct HostAllocator
     std::lock_guard<std::mutex> lock(mutex);
 
     // remove events for freed blocks
-    std::deque<std::pair<cudaEvent_t, void*>> new_events;
     for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
       cudaEvent_t event = it->first;
       Block& block = blocks.at(it->second);
       if (!block.allocated) {
         THCudaCheckWarn(cudaEventDestroy(event));
         block.event_count--;
-      } else {
-        new_events.push_back(*it);
       }
     }
-    cuda_events.swap(new_events);
+
+    // all cuda_events have been processed
+    cuda_events.clear();
 
     // clear list of available blocks
     available.clear();
@@ -232,7 +258,7 @@ static void THCCachingHostAllocator_free(void* ctx, void* ptr)
   allocator.free(ptr);
 }
 
-cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream)
+cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)  // thin wrapper: returns allocator.recordEvent(ptr, stream) unchanged
 {
   return allocator.recordEvent(ptr, stream);
 }
diff --git a/THCCachingHostAllocator.h b/THCCachingHostAllocator.h
@@ -2,6 +2,7 @@
 #define THC_CACHING_HOST_ALLOCATOR_INC
 
 #include "THCGeneral.h"
+#include "THCStream.h"
 
 //
 // A caching allocator for CUDA host allocations (pinned memory).
@@ -22,7 +23,7 @@ THC_API THAllocator THCCachingHostAllocator;
 
 // Records an event in the specified stream. The allocation 'ptr' will not be
 // re-used until the event has occured.
-THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream);
+THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream);
 
 // Releases cached pinned memory allocations via cudaHostFree
 THC_API void THCCachingHostAllocator_emptyCache(void);
diff --git a/THCTensorCopy.h b/THCTensorCopy.h
@@ -4,6 +4,7 @@
 #include "THCTensor.h"
 #include "THCGeneral.h"
 #include "THCHalf.h"
+#include "THCStream.h"
 
 #include "generic/THCTensorCopy.h"
 #include "THCGenerateAllTypes.h"
diff --git a/generic/THCTensorCopy.c b/generic/THCTensorCopy.c
@@ -118,12 +118,12 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor
     THCudaCheck(cudaSetDevice(tensorDevice));
   }
 
-  cudaStream_t stream = THCState_getCurrentStream(state);
+  THCStream *stream  = THCState_getStream(state);
   THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
                               THTensor_(data)(src),
                               THTensor_(nElement)(src) * sizeof(real),
                               cudaMemcpyHostToDevice,
-                              stream));
+                              stream == NULL ? NULL : stream->stream));
 
   THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
 
@@ -149,12 +149,12 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor
     THCudaCheck(cudaSetDevice(tensorDevice));
   }
 
-  cudaStream_t stream = THCState_getCurrentStream(state);
+  THCStream *stream = THCState_getStream(state);
   THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
                               THCTensor_(data)(state, src),
                               THCTensor_(nElement)(state, src) * sizeof(real),
                               cudaMemcpyDeviceToHost,
-                              stream));
+                              stream == NULL ? NULL : stream->stream));
 
   THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));