
Commit a76b473

zhaojuanmao authored and facebook-github-bot committed
clang format reducer and logger files (pytorch#53148)
Summary:
Pull Request resolved: pytorch#53148

clang format reducer and logger files

ghstack-source-id: 123453983

Test Plan: unit test

Reviewed By: SciPioneer

Differential Revision: D26764509

fbshipit-source-id: 711efcfd77420f912861cfd20c69e3af5086f4b9
1 parent d032287 commit a76b473

File tree: 4 files changed, +63 / -71 lines changed


torch/csrc/distributed/c10d/init.cpp

Lines changed: 20 additions & 24 deletions
@@ -1,9 +1,9 @@
 #include <torch/csrc/python_headers.h>
 
 #include <c10/util/intrusive_ptr.h>
-#include <c10d/Utils.hpp>
 #include <c10d/FileStore.hpp>
 #include <c10d/TCPStore.hpp>
+#include <c10d/Utils.hpp>
 #ifndef _WIN32
 #include <c10d/HashStore.hpp>
 #include <c10d/ProcessGroupRoundRobin.hpp>
@@ -301,10 +301,11 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
 "save_thread_local_state",
 &::c10d::Reducer::save_thread_local_state,
 py::call_guard<py::gil_scoped_release>())
-.def("_set_ddp_runtime_logging_sample_rate",
-&::c10d::Reducer::set_ddp_runtime_logging_sample_rate,
-py::arg("sample_rate"),
-py::call_guard<py::gil_scoped_release>());
+.def(
+"_set_ddp_runtime_logging_sample_rate",
+&::c10d::Reducer::set_ddp_runtime_logging_sample_rate,
+py::arg("sample_rate"),
+py::call_guard<py::gil_scoped_release>());
 
 shared_ptr_class_<::c10d::Logger>(module, "Logger")
 .def(
@@ -323,25 +324,25 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
 "set_runtime_stats_and_log",
 &::c10d::Logger::set_runtime_stats_and_log,
 py::call_guard<py::gil_scoped_release>())
-.def(
+.def(
 "_get_ddp_logging_data",
 &::c10d::Logger::get_ddp_logging_data,
 py::call_guard<py::gil_scoped_release>())
-.def(
-"_set_comm_hook_name",
-&::c10d::Logger::set_comm_hook,
-py::arg("comm_hook"),
-py::call_guard<py::gil_scoped_release>());
+.def(
+"_set_comm_hook_name",
+&::c10d::Logger::set_comm_hook,
+py::arg("comm_hook"),
+py::call_guard<py::gil_scoped_release>());
 
 py::enum_<::c10d::DistributedDebugLevel>(module, "_DistributedDebugLevel", R"(
 An enum whose values correspond to different debug settings of the
 torch.distributed package. Currently supporting settings are OFF, INFO,
 and DETAIL, which can be set via the TORCH_DISTRIBUTED_DEBUG environment
 variable.
 )")
-.value("OFF", ::c10d::DistributedDebugLevel::OFF)
-.value("INFO", ::c10d::DistributedDebugLevel::INFO)
-.value("DETAIL", ::c10d::DistributedDebugLevel::DETAIL);
+.value("OFF", ::c10d::DistributedDebugLevel::OFF)
+.value("INFO", ::c10d::DistributedDebugLevel::INFO)
+.value("DETAIL", ::c10d::DistributedDebugLevel::DETAIL);
 
 module.def(
 "_get_debug_mode",
@@ -1283,7 +1284,7 @@ that adds a prefix to each key inserted to the store.
 Note that ``fut.done()`` returns only whether the operation has been enqueued on the GPU.
 )");
 
-py::class_<c10::DDPLoggingData>(module, "DDPLoggingData")
+py::class_<c10::DDPLoggingData>(module, "DDPLoggingData")
 .def(py::init<>())
 .def_readwrite("world_size", &c10::DDPLoggingData::world_size)
 .def_readwrite("rank", &c10::DDPLoggingData::rank)
@@ -1344,18 +1345,13 @@ py::class_<c10::DDPLoggingData>(module, "DDPLoggingData")
 .def_readwrite(
 "avg_backward_compute_comm_overlap_time",
 &c10::DDPLoggingData::avg_backward_compute_comm_overlap_time)
+.def_readwrite("comm_hook", &c10::DDPLoggingData::comm_hook)
 .def_readwrite(
-"comm_hook",
-&c10::DDPLoggingData::comm_hook)
-.def_readwrite(
-"forward_compute_time",
-&c10::DDPLoggingData::forward_compute_time)
+"forward_compute_time", &c10::DDPLoggingData::forward_compute_time)
 .def_readwrite(
-"backward_compute_time",
-&c10::DDPLoggingData::backward_compute_time)
+"backward_compute_time", &c10::DDPLoggingData::backward_compute_time)
 .def_readwrite(
-"backward_comm_time",
-&c10::DDPLoggingData::backward_comm_time)
+"backward_comm_time", &c10::DDPLoggingData::backward_comm_time)
 .def_readwrite(
 "backward_compute_comm_overlap_time",
 &c10::DDPLoggingData::backward_compute_comm_overlap_time)
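
For context only: a minimal, self-contained pybind11 sketch of the .def(...) call style that clang-format enforces in the hunks above. The demo module name and the Reducer stub (with its sample_rate_ member and its default of 100) are invented for illustration; this is not PyTorch's actual c10d binding code.

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Stand-in class; the real ::c10d::Reducer has a far richer interface.
struct Reducer {
  void set_ddp_runtime_logging_sample_rate(int sample_rate) {
    sample_rate_ = sample_rate;
  }
  int sample_rate_ = 100; // illustrative default only
};

PYBIND11_MODULE(demo, m) {
  py::class_<Reducer>(m, "Reducer")
      .def(py::init<>())
      .def(
          "_set_ddp_runtime_logging_sample_rate",
          &Reducer::set_ddp_runtime_logging_sample_rate,
          py::arg("sample_rate"),
          // Release the GIL while the C++ body runs, as in the binding above.
          py::call_guard<py::gil_scoped_release>());
}

From Python this would be called as demo.Reducer()._set_ddp_runtime_logging_sample_rate(sample_rate=50); the py::call_guard releases the GIL for the duration of the call.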

torch/lib/c10d/logger.cpp

Lines changed: 19 additions & 26 deletions
@@ -1,5 +1,5 @@
-#include <c10d/logger.hpp>
 #include <c10d/Utils.hpp>
+#include <c10d/logger.hpp>
 #include <fmt/format.h>
 
 namespace c10d {
@@ -14,25 +14,21 @@ const int kMilliSecondToNanosSecond = 1000000;
 
 } // anonymous namespace
 
-std::ostream& operator<<(
-std::ostream& output,
-const Logger& logger
-) {
+std::ostream& operator<<(std::ostream& output, const Logger& logger) {
 auto& ddp_logging_data = logger.ddp_logging_data_;
 
 std::string loggerInfo = fmt::format(
-"[Rank {} / {}] Training {} unused_parameter_size={} \n "
-"Avg forward compute time: {} \n Avg backward compute time: {} \n"
-"Avg backward comm. time: {} \n Avg backward comm/comp overlap time: {}",
-ddp_logging_data->rank,
-ddp_logging_data->world_size,
-ddp_logging_data->module_name,
-ddp_logging_data->unused_parameter_size,
-ddp_logging_data->avg_forward_compute_time,
-ddp_logging_data->avg_backward_compute_time,
-ddp_logging_data->avg_backward_comm_time,
-ddp_logging_data->avg_backward_compute_comm_overlap_time
-);
+"[Rank {} / {}] Training {} unused_parameter_size={} \n "
+"Avg forward compute time: {} \n Avg backward compute time: {} \n"
+"Avg backward comm. time: {} \n Avg backward comm/comp overlap time: {}",
+ddp_logging_data->rank,
+ddp_logging_data->world_size,
+ddp_logging_data->module_name,
+ddp_logging_data->unused_parameter_size,
+ddp_logging_data->avg_forward_compute_time,
+ddp_logging_data->avg_backward_compute_time,
+ddp_logging_data->avg_backward_comm_time,
+ddp_logging_data->avg_backward_compute_comm_overlap_time);
 
 if (ddp_logging_data->comm_hook != "") {
 loggerInfo +=
@@ -56,7 +52,8 @@ void Logger::set_env_variables() {
 ddp_logging_data_->gloo_device_transport = parse_env("GLOO_DEVICE_TRANSPORT");
 ddp_logging_data_->nccl_socket_ifname = parse_env("NCCL_SOCKET_IFNAME");
 ddp_logging_data_->nccl_blocking_wait = parse_env("NCCL_BLOCKING_WAIT");
-ddp_logging_data_->nccl_async_error_handling = parse_env("NCCL_ASYNC_ERROR_HANDLING");
+ddp_logging_data_->nccl_async_error_handling =
+parse_env("NCCL_ASYNC_ERROR_HANDLING");
 ddp_logging_data_->nccl_debug = parse_env("NCCL_DEBUG");
 ddp_logging_data_->nccl_nthreads = parse_env("NCCL_NTHREADS");
 ddp_logging_data_->nccl_ib_timeout = parse_env("NCCL_IB_TIMEOUT");
@@ -124,9 +121,7 @@ void Logger::set_construction_data_and_log(
 
 if (parseDistDebugLevel() != DistributedDebugLevel::OFF) {
 std::string initInfo = fmt::format(
-"[Rank {}]: DDP Initialized with: \n",
-ddp_logging_data_->rank
-);
+"[Rank {}]: DDP Initialized with: \n", ddp_logging_data_->rank);
 LOG(INFO) << initInfo << *ddp_logging_data_;
 }
 
@@ -149,8 +144,7 @@ void Logger::calculate_avg_cpu_time(
 return;
 }
 time_duration = cpu_end_time - cpu_start_time;
-avg_time = (time_duration +
-avg_time * (num_iterations_stats_recorded_ - 1)) /
+avg_time = (time_duration + avg_time * (num_iterations_stats_recorded_ - 1)) /
 num_iterations_stats_recorded_;
 }
 
@@ -172,8 +166,7 @@ void Logger::calculate_avg_gpu_time(
 return;
 }
 time_duration = int64_t(milliseconds * kMilliSecondToNanosSecond);
-avg_time = (time_duration +
-avg_time * (num_iterations_stats_recorded_ - 1)) /
+avg_time = (time_duration + avg_time * (num_iterations_stats_recorded_ - 1)) /
 num_iterations_stats_recorded_;
 }
 #endif
@@ -267,7 +260,7 @@ void Logger::set_runtime_stats_and_log() {
 
 calculate_avg_cpu_time(
 ddp_logging_data_->avg_backward_compute_time,
-ddp_logging_data_->backward_compute_time,
+ddp_logging_data_->backward_compute_time,
 reducer_->cpu_timer_.backward_compute_start_time,
 reducer_->cpu_timer_.backward_compute_end_time);
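
The two hunks touching calculate_avg_cpu_time and calculate_avg_gpu_time above only re-wrap the running-average update; the arithmetic is unchanged. A standalone C++ sketch of that update, with illustrative names rather than the actual Logger members:

#include <cstdint>
#include <iostream>

// Fold the latest duration into a running average without storing all samples:
// avg_new = (latest + avg_old * (n - 1)) / n
void update_running_avg(int64_t& avg_time, int64_t time_duration, int64_t num_recorded) {
  avg_time = (time_duration + avg_time * (num_recorded - 1)) / num_recorded;
}

int main() {
  int64_t avg = 0;
  const int64_t samples[] = {100, 200, 300};
  for (int64_t n = 1; n <= 3; ++n) {
    update_running_avg(avg, samples[n - 1], n);
  }
  std::cout << avg << "\n"; // prints 200, the mean of the three samples
  return 0;
}

Note that with integer operands this division truncates.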

torch/lib/c10d/logger.hpp

Lines changed: 1 addition & 4 deletions
@@ -20,10 +20,7 @@ class Logger {
 
 // Stream insertion operator for logging data to stream under
 // TORCH_DISTRIBUTED_DEBUG.
-friend std::ostream& operator<<(
-std::ostream& output,
-const Logger& logger
-);
+friend std::ostream& operator<<(std::ostream& output, const Logger& logger);
 
 // Set environment variables.
 void set_env_variables();

torch/lib/c10d/reducer.cpp

Lines changed: 23 additions & 17 deletions
@@ -59,14 +59,14 @@ Reducer::Reducer(
 {
 std::set<int> unique_devices;
 for (const auto& v : replicas_[0]) {
-auto device_idx = int(v.device().index());
-if (unique_devices.find(device_idx) == unique_devices.end()) {
-unique_devices.insert(device_idx);
-if (unique_devices.size() > 1) {
-is_multi_device_module_ = true;
-break;
-}
+auto device_idx = int(v.device().index());
+if (unique_devices.find(device_idx) == unique_devices.end()) {
+unique_devices.insert(device_idx);
+if (unique_devices.size() > 1) {
+is_multi_device_module_ = true;
+break;
 }
+}
 }
 }
 
@@ -423,8 +423,8 @@ void Reducer::push_rebuilt_params(const VariableIndex& index) {
 void Reducer::autograd_hook(VariableIndex index) {
 std::lock_guard<std::mutex> lock(this->mutex_);
 
-// Carry over thread local state from main thread. This allows for thread-local
-// flags such as profiler enabled to be configure correctly.
+// Carry over thread local state from main thread. This allows for
+// thread-local flags such as profiler enabled to be configure correctly.
 at::ThreadLocalStateGuard g(thread_local_state_);
 // See Note [Skip allreducing local_used_maps_dev]
 if (find_unused_parameters_) {
@@ -970,7 +970,8 @@ void Reducer::prepare_for_backward(
 }
 }
 
-// Warn user about unnecessary perf hit if all parameters were used in forward.
+// Warn user about unnecessary perf hit if all parameters were used in
+// forward.
 if (unused_parameters_.empty()) {
 TORCH_WARN_ONCE(
 "find_unused_parameters=True was specified in DDP constructor, "
@@ -1388,14 +1389,17 @@ void Reducer::ensure_prior_reduction_finished() {
 // The variable `require_finalize_` is true until all gradients
 // have been computed and reduction of all buckets has been kicked off.
 if (require_finalize_) {
-std::string kBaseErrorMsg = "Expected to have finished reduction in the prior iteration before "
+std::string kBaseErrorMsg =
+"Expected to have finished reduction in the prior iteration before "
 "starting a new one. "
 ""
 "This error indicates that your module has parameters that were "
 "not used in producing loss. ";
-std::string kOutputsNotUsedInLossErrorMsg = "making sure all "
+std::string kOutputsNotUsedInLossErrorMsg =
+"making sure all "
 "`forward` function outputs participate in calculating loss. ";
-std::string kDDPBugErrorMsg = "\nIf you already have done the above, then the distributed "
+std::string kDDPBugErrorMsg =
+"\nIf you already have done the above, then the distributed "
 "data parallel module wasn't able to locate the output tensors in the "
 "return value of your module's `forward` function. "
 "Please include the loss function and the structure of the return "
@@ -1405,7 +1409,8 @@
 if (!find_unused_parameters_) {
 // Parameters may have been unused in forward pass, or not all outputs
 // were used in producing loss.
-kBaseErrorMsg += "You can enable unused parameter detection by passing the "
+kBaseErrorMsg +=
+"You can enable unused parameter detection by passing the "
 "keyword argument `find_unused_parameters=True` to "
 "`torch.nn.parallel.DistributedDataParallel`, and by \n";
 kBaseErrorMsg += kOutputsNotUsedInLossErrorMsg;
@@ -1414,7 +1419,8 @@
 // Note that it does not really matter whether unused_parameters_.empty(),
 // since user may have enabled detection but this particular iteration
 // could have used or not used all parameters.
-kBaseErrorMsg += "Since `find_unused_parameters=True` is enabled, this likely "
+kBaseErrorMsg +=
+"Since `find_unused_parameters=True` is enabled, this likely "
 " means that not all `forward` outputs participate in computing loss. You can fix this by ";
 kBaseErrorMsg += kOutputsNotUsedInLossErrorMsg;
 kBaseErrorMsg += kDDPBugErrorMsg;
@@ -1433,8 +1439,8 @@ int Reducer::get_ddp_runtime_logging_sample_rate() {
 
 bool Reducer::should_collect_runtime_stats() {
 if (num_iterations_ > 0 &&
-(num_iterations_ <= 10 ||
-num_iterations_ % get_ddp_runtime_logging_sample_rate() == 0)) {
+(num_iterations_ <= 10 ||
+num_iterations_ % get_ddp_runtime_logging_sample_rate() == 0)) {
 return true;
 }
 return false;
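
The final hunk above only re-indents the condition in should_collect_runtime_stats; the sampling logic (collect stats for the first 10 iterations, then for every sample-rate-th iteration) is unchanged. A small standalone sketch of that predicate, with a hard-coded rate standing in for get_ddp_runtime_logging_sample_rate():

#include <iostream>

// Collect runtime stats for the first 10 iterations, then every
// `sample_rate`-th iteration after that.
bool should_collect(long num_iterations, long sample_rate) {
  return num_iterations > 0 &&
      (num_iterations <= 10 || num_iterations % sample_rate == 0);
}

int main() {
  const long kSampleRate = 100; // illustrative value only
  const long iterations[] = {1, 10, 11, 100, 250, 300};
  for (long i : iterations) {
    std::cout << i << ": " << (should_collect(i, kSampleRate) ? "collect" : "skip") << "\n";
  }
  // Collects at 1, 10, 100, and 300; skips 11 and 250.
  return 0;
}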
