Commit 540e867

Wrappers for CUDA 9 warp-synchronous intrinsics.
PiperOrigin-RevId: 177799252
1 parent c02cfb0 commit 540e867
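
The wrappers themselves (CudaShuffleDownSync, CudaShuffleXorSync, CudaBallotSync, and the kCudaWarpAll mask used by the kernels below) are added in the new header util/cuda_device_functions.h, whose contents are not part of the hunks shown on this page. Below is a rough, non-authoritative sketch of the dispatch pattern such wrappers typically follow, assuming the usual CUDA_VERSION check; the real signatures, overload set, and any handling of 64-bit or half types may differ.

// Sketch only: util/cuda_device_functions.h is not shown in this diff, so
// everything here other than the wrapper names used by the changed kernels
// is an assumption about the general pattern, not actual TensorFlow code.
#include <cuda.h>

constexpr unsigned kCudaWarpAll = 0xffffffffu;  // full-warp member mask

template <typename T>
__device__ inline T CudaShuffleDownSync(unsigned mask, T value, int delta,
                                        int width = 32) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);  // warp-synchronous form
#else
  return __shfl_down(value, delta, width);  // legacy pre-CUDA-9 intrinsic
#endif
}

template <typename T>
__device__ inline T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
                                       int width = 32) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
}

__device__ inline unsigned CudaBallotSync(unsigned mask, int predicate) {
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, predicate);  // bit i set if lane i passes the predicate
#else
  return __ballot(predicate) & mask;
#endif
}

On CUDA 9 the *_sync forms take an explicit member mask and re-converge the named lanes before exchanging data, which is what keeps the warp-level reductions in the kernels below correct under Volta's independent thread scheduling.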

10 files changed, +851 −772 lines changed


tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc

Lines changed: 6 additions & 5 deletions
@@ -34,9 +34,9 @@ namespace functor {
   __global__ void ReduceSliceDeviceKernel##reduceop(                         \
       Cuda3DLaunchConfig config, Index indices_width, Index bound,           \
       const T begin, const Index *indices, const T *input, T *out) {         \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {               \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {             \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {           \
+    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {             \
+      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {           \
+        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {         \
           Index outidx = x * config.virtual_thread_count.y *                 \
                              config.virtual_thread_count.z +                 \
                          y * config.virtual_thread_count.z + z;              \
@@ -68,8 +68,9 @@ namespace functor {
     if (sizex * sizey * sizez == 0) {                                        \
       return;                                                                \
     }                                                                        \
-    Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(sizex, sizey, sizez, d,\
-        ReduceSliceDeviceKernel##reduceop<T, Index>, 0, 0);                  \
+    Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+        sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
+        0, 0);                                                               \
                                                                              \
     ReduceSliceDeviceKernel##reduceop<T, Index>                              \
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(    \
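
The fix in this hunk passes each axis its own scalar extent (virtual_thread_count.x/.y/.z) plus the upper-case axis tag to CUDA_AXIS_KERNEL_LOOP. The macro itself lives in TensorFlow's CUDA helper header and is not shown here; the kernel below is only an illustration, under that caveat, of the per-axis grid-stride loop this style of kernel is built on, using the same (x, y, z) flattening as outidx above. Fill3D and its parameters are illustration names, not TensorFlow code.

// Hypothetical kernel: fills a 3D volume using one grid-stride loop per axis.
__global__ void Fill3D(dim3 extents, float value, float* out) {
  for (unsigned x = blockIdx.x * blockDim.x + threadIdx.x; x < extents.x;
       x += gridDim.x * blockDim.x) {
    for (unsigned y = blockIdx.y * blockDim.y + threadIdx.y; y < extents.y;
         y += gridDim.y * blockDim.y) {
      for (unsigned z = blockIdx.z * blockDim.z + threadIdx.z; z < extents.z;
           z += gridDim.z * blockDim.z) {
        // Same flattening as outidx above: x * Y * Z + y * Z + z.
        unsigned idx = (x * extents.y + y) * extents.z + z;
        out[idx] = value;
      }
    }
  }
}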

tensorflow/core/BUILD

Lines changed: 7 additions & 0 deletions
@@ -1847,6 +1847,13 @@ cc_library(
     ],
 )
 
+tf_cuda_library(
+    name = "cuda_device_functions",
+    hdrs = ["util/cuda_device_functions.h"],
+    visibility = ["//visibility:public"],
+    deps = [":framework_lite"],
+)
+
 # TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"?
 cc_library(
     name = "protos_cc",

tensorflow/core/kernels/bias_op_gpu.cu.cc

Lines changed: 6 additions & 12 deletions
@@ -173,19 +173,13 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
   // Accumulate the results in the shared memory into the first element.
   // No syncthreads is needed since this is only in the same warp.
   int32 thread_index = threadIdx.x;
-  if (thread_index < 16) {
-    s_data[thread_index] += s_data[thread_index + 16];
-    __syncwarp(0xFFFF);
-    if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
-    __syncwarp(0xFF);
-    if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
-    __syncwarp(0xF);
-    if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
-    __syncwarp(0x3);
+  if (thread_index < 32) {
+    AccT data = s_data[thread_index];
+    for (int32 offset = warpSize / 2; offset > 0; offset /= 2) {
+      data += CudaShuffleDownSync(kCudaWarpAll, data, offset);
+    }
     if (thread_index == 0) {
-      T val = T(s_data[0] + s_data[1]);
-      // The first thread writes out the accumulated result to global location.
-      CudaAtomicAdd(bias_backprop + bias_index, val);
+      CudaAtomicAdd(bias_backprop + bias_index, T(data));
     }
   }
 }
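
The stepped shared-memory reduction with explicit __syncwarp barriers is replaced by a register shuffle-down tree: each lane adds the value held `offset` lanes above it, the offset halves every iteration, and lane 0 ends up with the sum of all 32 lanes, leaving a single atomic add to global memory. A standalone illustration of that reduction, written against the raw CUDA 9 intrinsic rather than the TensorFlow wrapper (WarpSum is an illustration name):

// Warp-wide sum via shuffle-down; a sketch of the pattern used above.
__device__ float WarpSum(float v) {
  // All 32 lanes of the warp must execute this loop, as they do above
  // (the surrounding branch is `thread_index < 32`).
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    v += __shfl_down_sync(0xffffffffu, v, offset);  // pull value from lane + offset
  }
  return v;  // lane 0 holds the full warp sum; other lanes hold partial sums
}

Keeping the partial sums in registers avoids the shared-memory round trips of the old code, and the full-warp mask is valid here precisely because every lane of the warp participates.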

tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc

Lines changed: 6 additions & 5 deletions
@@ -34,6 +34,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::GpuDevice GPUDevice;
 using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
@@ -1028,7 +1029,7 @@ __device__ __forceinline__ T WarpSumReduce(T val) {
   int zeros = sub_warp * kWidth;
   unsigned mask = ((1UL << kWidth) - 1) << zeros;
   for (int delta = kWidth / 2; delta > 0; delta /= 2) {
-    val += CudaShuffleXor(mask, val, delta);
+    val += CudaShuffleXorSync(mask, val, delta);
   }
   return val;
 }
@@ -1145,7 +1146,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, depth_in_range);
 
     if (depth_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1159,7 +1160,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
         T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
         // Warp-accumulate pixels of the same depth and write to accumulator.
         for (int delta = 16; delta >= kBlockSlices; delta /= 2) {
-          val += CudaShuffleDown(active_threads, val, delta);
+          val += CudaShuffleDownSync(active_threads, val, delta);
         }
         if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) {
           *accum_ptr = val;
@@ -1399,7 +1400,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, slice_in_range);
 
     if (slice_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1413,7 +1414,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
         T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
         // Warp-accumulate pixels of the same depth and write to accumulator.
         for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
-          val += CudaShuffleDown(active_threads, val, delta);
+          val += CudaShuffleDownSync(active_threads, val, delta);
        }
         if (!(thread_idx & 32 / kBlockSlices - 1)) {
           *accum_ptr = val;
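
Two distinct wrappers appear in this file. CudaBallotSync turns the depth_in_range / slice_in_range predicate into the bitmask of lanes that actually have work, and that mask is then handed to every CudaShuffleDownSync so the shuffles synchronize exactly those lanes. WarpSumReduce instead uses an XOR (butterfly) exchange over an aligned sub-warp, which leaves the group's sum in every participating lane. A minimal sketch of that butterfly reduction using the raw intrinsic (SubWarpSum and its parameterization are illustration choices, not TensorFlow code):

// Butterfly sum over an aligned kWidth-wide group of lanes; every lane of the
// group must call this, and every lane receives the group's total.
template <int kWidth>  // power of two, at most 32
__device__ float SubWarpSum(float v) {
  int lane = threadIdx.x & 31;
  int sub_warp = lane / kWidth;  // which aligned kWidth-wide group this lane is in
  unsigned mask =
      static_cast<unsigned>(((1UL << kWidth) - 1) << (sub_warp * kWidth));
  for (int delta = kWidth / 2; delta > 0; delta /= 2) {
    v += __shfl_xor_sync(mask, v, delta);  // exchange with lane ^ delta
  }
  return v;  // same value in every lane of the group
}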

tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc

Lines changed: 21 additions & 0 deletions
@@ -55,6 +55,27 @@ struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
   }
 };
 
+// Specializations for std::complex, updating real and imaginary part
+// individually. Even though this is not an atomic op anymore, it is safe
+// because there is only one type of op per kernel.
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    T* ptr = reinterpret_cast<T*>(out);
+    CudaAtomicAdd(ptr, val.real());
+    CudaAtomicAdd(ptr + 1, val.imag());
+  }
+};
+
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
+  }
+};
+
 }  // namespace
 
 template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
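
The complex specializations trade one atomic update for one scalar atomic per component. That is still correct for scatter-add because the real and imaginary sums never interact and addition is commutative, so any interleaving of the per-component adds yields the same totals; it would not be safe if the same kernel mixed ADD with a non-additive update on the same buffer. A minimal hypothetical kernel showing the same decomposition with plain CUDA atomics (AccumulateComplex and the float2 layout are illustration choices, not TensorFlow code):

// Many threads accumulate complex values into one slot, one float atomic per
// component; the real std::complex<T> layout is component-compatible with this.
#include <cuda_runtime.h>

__global__ void AccumulateComplex(const float2* updates, int n, float2* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float* ptr = reinterpret_cast<float*>(out);
    atomicAdd(ptr, updates[i].x);      // real part
    atomicAdd(ptr + 1, updates[i].y);  // imaginary part
  }
}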

tensorflow/core/kernels/svd_op_gpu.cu.cc

Lines changed: 2 additions & 2 deletions
@@ -63,8 +63,8 @@ __global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
-  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) {
+  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count.x, X) {
+    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count.y, Y) {
       Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch];
       CudaAtomicAdd(V + batch, v);
     }
