Skip to content

Commit 1d06d2d

Browse files
Thiemo Wiedemeyerxlz
authored andcommitted
opencl: Add optional profiling
Added (optional) profiling of OpenCL kernels. Reverted back to calculating sine and cosine on the GPU.
1 parent 4c699a2 commit 1d06d2d

File tree

2 files changed

+87
-63
lines changed

2 files changed

+87
-63
lines changed

src/opencl_depth_packet_processor.cl

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@
2424
* either License.
2525
*/
2626

27-
#define PHASE_SIN (float3)(PHASE_IN_RAD0_SIN, PHASE_IN_RAD1_SIN, PHASE_IN_RAD2_SIN)
28-
#define PHASE_COS (float3)(PHASE_IN_RAD0_COS, PHASE_IN_RAD1_COS, PHASE_IN_RAD2_COS)
27+
#define PHASE (float3)(PHASE_IN_RAD0, PHASE_IN_RAD1, PHASE_IN_RAD2)
2928
#define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2)
3029

3130
/*******************************************************************************
@@ -47,8 +46,8 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1
4746
return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047];
4847
}
4948

50-
void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_sin_table, global const float3 *p0_cos_table,
51-
global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
49+
void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data,
50+
global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out)
5251
{
5352
const uint i = get_global_id(0);
5453

@@ -59,8 +58,13 @@ void kernel processPixelStage1(global const short *lut11to16, global const float
5958
const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp);
6059

6160
const int3 invalid = (int)(0.0f >= z_table[i]);
62-
const float3 p0_sin = p0_sin_table[i];
63-
const float3 p0_cos = p0_cos_table[i];
61+
const float3 p0 = p0_table[i];
62+
float3 p0x_sin, p0y_sin, p0z_sin;
63+
float3 p0x_cos, p0y_cos, p0z_cos;
64+
65+
p0x_sin = -sincos(PHASE + p0.x, &p0x_cos);
66+
p0y_sin = -sincos(PHASE + p0.y, &p0y_cos);
67+
p0z_sin = -sincos(PHASE + p0.z, &p0z_cos);
6468

6569
int3 invalid_pixel = (int3)(invalid);
6670

@@ -74,12 +78,12 @@ void kernel processPixelStage1(global const short *lut11to16, global const float
7478
decodePixelMeasurement(data, lut11to16, 7, x, y_in),
7579
decodePixelMeasurement(data, lut11to16, 8, x, y_in));
7680

77-
float3 a = (float3)(dot(v0, PHASE_COS * p0_cos.x - PHASE_SIN * p0_sin.x),
78-
dot(v1, PHASE_COS * p0_cos.y - PHASE_SIN * p0_sin.y),
79-
dot(v2, PHASE_COS * p0_cos.z - PHASE_SIN * p0_sin.z)) * AB_MULTIPLIER_PER_FRQ;
80-
float3 b = (float3)(dot(v0, PHASE_COS * p0_sin.x + PHASE_SIN * p0_cos.x),
81-
dot(v1, PHASE_COS * p0_sin.y + PHASE_SIN * p0_cos.y),
82-
dot(v2, PHASE_COS * p0_sin.z + PHASE_SIN * p0_cos.z)) * AB_MULTIPLIER_PER_FRQ;
81+
float3 a = (float3)(dot(v0, p0x_cos),
82+
dot(v1, p0y_cos),
83+
dot(v2, p0z_cos)) * AB_MULTIPLIER_PER_FRQ;
84+
float3 b = (float3)(dot(v0, p0x_sin),
85+
dot(v1, p0y_sin),
86+
dot(v2, p0z_sin)) * AB_MULTIPLIER_PER_FRQ;
8387

8488
a = select(a, (float3)(0.0f), invalid_pixel);
8589
b = select(b, (float3)(0.0f), invalid_pixel);

src/opencl_depth_packet_processor.cpp

Lines changed: 71 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
#define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0)
5959
#define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err
6060

61+
#define WITH_PROFILING 0
62+
6163
namespace libfreenect2
6264
{
6365

@@ -167,8 +169,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
167169
size_t buf_packet_size;
168170

169171
cl::Buffer buf_lut11to16;
170-
cl::Buffer buf_p0_sin_table;
171-
cl::Buffer buf_p0_cos_table;
172+
cl::Buffer buf_p0_table;
172173
cl::Buffer buf_x_table;
173174
cl::Buffer buf_z_table;
174175
cl::Buffer buf_packet;
@@ -201,6 +202,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
201202
bool programInitialized;
202203
std::string sourceCode;
203204

205+
#if WITH_PROFILING
206+
std::vector<double> timings;
207+
int count;
208+
#endif
209+
204210
OpenCLDepthPacketProcessorImpl(const int deviceId = -1)
205211
: image_size(512 * 424)
206212
, lut_size(2048)
@@ -266,12 +272,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
266272
oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq[2] << "f";
267273
oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << "f";
268274

269-
oss << " -D PHASE_IN_RAD0_SIN=" << std::sin(-params.phase_in_rad[0]) << "f";
270-
oss << " -D PHASE_IN_RAD0_COS=" << std::cos(params.phase_in_rad[0]) << "f";
271-
oss << " -D PHASE_IN_RAD1_SIN=" << std::sin(-params.phase_in_rad[1]) << "f";
272-
oss << " -D PHASE_IN_RAD1_COS=" << std::cos(params.phase_in_rad[1]) << "f";
273-
oss << " -D PHASE_IN_RAD2_SIN=" << std::sin(-params.phase_in_rad[2]) << "f";
274-
oss << " -D PHASE_IN_RAD2_COS=" << std::cos(params.phase_in_rad[2]) << "f";
275+
oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad[0] << "f";
276+
oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad[1] << "f";
277+
oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad[2] << "f";
275278

276279
oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << "f";
277280
oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << "f";
@@ -430,7 +433,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
430433
bool initBuffers()
431434
{
432435
cl_int err = CL_SUCCESS;
436+
#if WITH_PROFILING
437+
count = 0;
438+
queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
439+
#else
433440
queue = cl::CommandQueue(context, device, 0, &err);
441+
#endif
434442
CHECK_CL_ERROR(err, "cl::CommandQueue");
435443

436444
//Read only
@@ -442,9 +450,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
442450

443451
buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err);
444452
CHECK_CL_ERROR(err, "cl::Buffer");
445-
buf_p0_sin_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err);
446-
CHECK_CL_ERROR(err, "cl::Buffer");
447-
buf_p0_cos_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err);
453+
buf_p0_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err);
448454
CHECK_CL_ERROR(err, "cl::Buffer");
449455
buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err);
450456
CHECK_CL_ERROR(err, "cl::Buffer");
@@ -471,7 +477,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
471477
CHECK_CL_ERROR(err, "cl::Buffer");
472478
buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err);
473479
CHECK_CL_ERROR(err, "cl::Buffer");
474-
buf_ir = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_size, NULL, &err);
480+
buf_ir = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_ir_size, NULL, &err);
475481
CHECK_CL_ERROR(err, "cl::Buffer");
476482
buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err);
477483
CHECK_CL_ERROR(err, "cl::Buffer");
@@ -507,19 +513,17 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
507513
CHECK_CL_ERROR(err, "setArg");
508514
err = kernel_processPixelStage1.setArg(1, buf_z_table);
509515
CHECK_CL_ERROR(err, "setArg");
510-
err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table);
516+
err = kernel_processPixelStage1.setArg(2, buf_p0_table);
511517
CHECK_CL_ERROR(err, "setArg");
512-
err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table);
518+
err = kernel_processPixelStage1.setArg(3, buf_packet);
513519
CHECK_CL_ERROR(err, "setArg");
514-
err = kernel_processPixelStage1.setArg(4, buf_packet);
520+
err = kernel_processPixelStage1.setArg(4, buf_a);
515521
CHECK_CL_ERROR(err, "setArg");
516-
err = kernel_processPixelStage1.setArg(5, buf_a);
522+
err = kernel_processPixelStage1.setArg(5, buf_b);
517523
CHECK_CL_ERROR(err, "setArg");
518-
err = kernel_processPixelStage1.setArg(6, buf_b);
524+
err = kernel_processPixelStage1.setArg(6, buf_n);
519525
CHECK_CL_ERROR(err, "setArg");
520-
err = kernel_processPixelStage1.setArg(7, buf_n);
521-
CHECK_CL_ERROR(err, "setArg");
522-
err = kernel_processPixelStage1.setArg(8, buf_ir);
526+
err = kernel_processPixelStage1.setArg(7, buf_ir);
523527
CHECK_CL_ERROR(err, "setArg");
524528

525529
kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err);
@@ -571,14 +575,14 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
571575
{
572576
cl_int err;
573577
std::vector<cl::Event> eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1);
574-
cl::Event event0, event1;
578+
cl::Event eventReadIr, eventReadDepth;
575579

576580
err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]);
577-
CHECK_CL_ERROR(err, "enqueueMapBuffer");
581+
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
578582

579583
err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]);
580584
CHECK_CL_ERROR(err, "enqueueNDRangeKernel");
581-
err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &event0);
585+
err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &eventReadIr);
582586
CHECK_CL_ERROR(err, "enqueueReadBuffer");
583587

584588
if(config.EnableBilateralFilter)
@@ -597,20 +601,50 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
597601
if(config.EnableEdgeAwareFilter)
598602
{
599603
err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]);
600-
CHECK_CL_ERROR(err, "enqueueWriteBuffer");
604+
CHECK_CL_ERROR(err, "enqueueNDRangeKernel");
601605
}
602606
else
603607
{
604608
eventFPS2[0] = eventPPS2[0];
605609
}
606610

607-
err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &event1);
611+
err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &eventReadDepth);
608612
CHECK_CL_ERROR(err, "enqueueReadBuffer");
609-
err = event0.wait();
613+
err = eventReadIr.wait();
610614
CHECK_CL_ERROR(err, "wait");
611-
err = event1.wait();
615+
err = eventReadDepth.wait();
612616
CHECK_CL_ERROR(err, "wait");
613617

618+
#if WITH_PROFILING
619+
if(count == 0)
620+
{
621+
timings.clear();
622+
timings.resize(7, 0.0);
623+
}
624+
625+
timings[0] += eventWrite[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventWrite[0].getProfilingInfo<CL_PROFILING_COMMAND_START>();
626+
timings[1] += eventPPS1[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventPPS1[0].getProfilingInfo<CL_PROFILING_COMMAND_START>();
627+
timings[2] += eventFPS1[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventFPS1[0].getProfilingInfo<CL_PROFILING_COMMAND_START>();
628+
timings[3] += eventPPS2[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventPPS2[0].getProfilingInfo<CL_PROFILING_COMMAND_START>();
629+
timings[4] += eventFPS2[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventFPS2[0].getProfilingInfo<CL_PROFILING_COMMAND_START>();
630+
timings[5] += eventReadIr.getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventReadIr.getProfilingInfo<CL_PROFILING_COMMAND_START>();
631+
timings[6] += eventReadDepth.getProfilingInfo<CL_PROFILING_COMMAND_END>() - eventReadDepth.getProfilingInfo<CL_PROFILING_COMMAND_START>();
632+
633+
if(++count == 100)
634+
{
635+
double sum = timings[0] + timings[1] + timings[2] + timings[3] + timings[4] + timings[5] + timings[6];
636+
LOG_INFO << "writing package: " << timings[0] / 100000000.0 << " ms.";
637+
LOG_INFO << "stage 1: " << timings[1] / 100000000.0 << " ms.";
638+
LOG_INFO << "filter 1: " << timings[2] / 100000000.0 << " ms.";
639+
LOG_INFO << "stage 2: " << timings[3] / 100000000.0 << " ms.";
640+
LOG_INFO << "filter 2: " << timings[4] / 100000000.0 << " ms.";
641+
LOG_INFO << "reading ir: " << timings[5] / 100000000.0 << " ms.";
642+
LOG_INFO << "reading depth: " << timings[6] / 100000000.0 << " ms.";
643+
LOG_INFO << "overall: " << sum / 100000000.0 << " ms.";
644+
count = 0;
645+
}
646+
#endif
647+
614648
return true;
615649
}
616650

@@ -665,46 +699,32 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
665699
return;
666700
}
667701

668-
cl_float3 *p0_sin_table = new cl_float3[image_size];
669-
cl_float3 *p0_cos_table = new cl_float3[image_size];
702+
cl_float3 *p0_table = new cl_float3[image_size];
670703

671704
for(int r = 0; r < 424; ++r)
672705
{
673-
cl_float3 *itS = &p0_sin_table[r * 512];
674-
cl_float3 *itC = &p0_cos_table[r * 512];
706+
cl_float3 *it = &p0_table[r * 512];
675707
const uint16_t *it0 = &p0table->p0table0[r * 512];
676708
const uint16_t *it1 = &p0table->p0table1[r * 512];
677709
const uint16_t *it2 = &p0table->p0table2[r * 512];
678-
for(int c = 0; c < 512; ++c, ++itS, ++itC, ++it0, ++it1, ++it2)
710+
for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2)
679711
{
680-
const float x = ((float)*it0) * 0.000031 * M_PI;
681-
const float y = ((float)*it1) * 0.000031 * M_PI;
682-
const float z = ((float)*it2) * 0.000031 * M_PI;
683-
itS->s[0] = std::sin(x);
684-
itS->s[1] = std::sin(y);
685-
itS->s[2] = std::sin(z);
686-
itS->s[3] = 0.0f;
687-
itC->s[0] = std::cos(-x);
688-
itC->s[1] = std::cos(-y);
689-
itC->s[2] = std::cos(-z);
690-
itC->s[3] = 0.0f;
712+
it->s[0] = -((float)*it0) * 0.000031 * M_PI;
713+
it->s[1] = -((float)*it1) * 0.000031 * M_PI;
714+
it->s[2] = -((float)*it2) * 0.000031 * M_PI;
715+
it->s[3] = 0.0f;
691716
}
692717
}
693718

694719
cl_int err = CL_SUCCESS;
695-
cl::Event event0, event1;
696-
err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event0);
697-
LOG_CL_ERROR(err, "enqueueWriteBuffer");
698-
err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event1);
720+
cl::Event event0;
721+
err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event0);
699722
LOG_CL_ERROR(err, "enqueueWriteBuffer");
700723

701724
err = event0.wait();
702725
LOG_CL_ERROR(err, "wait");
703-
err = event1.wait();
704-
LOG_CL_ERROR(err, "wait");
705726

706-
delete[] p0_sin_table;
707-
delete[] p0_cos_table;
727+
delete[] p0_table;
708728
}
709729

710730
void fill_xz_tables(const float *xtable, const float *ztable)

0 commit comments

Comments
 (0)