58
58
// Checks an OpenCL status code. When `err` is not CL_SUCCESS, logs
// "<str> failed: <err>" through LOG_ERROR and returns false from the
// enclosing function, so it may only be used inside functions returning bool.
// The do/while(0) wrapper makes the expansion a single statement that is safe
// in unbraced if/else. Note that `err` is evaluated twice on the failure path.
#define CHECK_CL_ERROR(err, str) do { if ((err) != CL_SUCCESS) { LOG_ERROR << (str) << " failed: " << (err); return false; } } while (0)
59
59
// Checks an OpenCL status code. When `err` is not CL_SUCCESS, logs
// "<str> failed: <err>" through LOG_ERROR and continues (no early return).
// Wrapped in do/while(0) so that a following `else` in the caller cannot bind
// to the macro's internal `if`. Note that `err` is evaluated twice on failure.
#define LOG_CL_ERROR(err, str) do { if ((err) != CL_SUCCESS) { LOG_ERROR << (str) << " failed: " << (err); } } while (0)
60
60
61
+ #define WITH_PROFILING 0 // Set to 1 to create the command queue with CL_QUEUE_PROFILING_ENABLE and log per-stage event timings every 100 processed packets.
62
+
61
63
namespace libfreenect2
62
64
{
63
65
@@ -167,8 +169,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
167
169
size_t buf_packet_size;
168
170
169
171
cl::Buffer buf_lut11to16;
170
- cl::Buffer buf_p0_sin_table;
171
- cl::Buffer buf_p0_cos_table;
172
+ cl::Buffer buf_p0_table;
172
173
cl::Buffer buf_x_table;
173
174
cl::Buffer buf_z_table;
174
175
cl::Buffer buf_packet;
@@ -201,6 +202,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
201
202
bool programInitialized;
202
203
std::string sourceCode;
203
204
205
+ #if WITH_PROFILING
206
+ std::vector<double > timings;
207
+ int count;
208
+ #endif
209
+
204
210
OpenCLDepthPacketProcessorImpl (const int deviceId = -1 )
205
211
: image_size(512 * 424 )
206
212
, lut_size(2048 )
@@ -266,12 +272,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
266
272
oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq [2 ] << " f" ;
267
273
oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << " f" ;
268
274
269
- oss << " -D PHASE_IN_RAD0_SIN=" << std::sin (-params.phase_in_rad [0 ]) << " f" ;
270
- oss << " -D PHASE_IN_RAD0_COS=" << std::cos (params.phase_in_rad [0 ]) << " f" ;
271
- oss << " -D PHASE_IN_RAD1_SIN=" << std::sin (-params.phase_in_rad [1 ]) << " f" ;
272
- oss << " -D PHASE_IN_RAD1_COS=" << std::cos (params.phase_in_rad [1 ]) << " f" ;
273
- oss << " -D PHASE_IN_RAD2_SIN=" << std::sin (-params.phase_in_rad [2 ]) << " f" ;
274
- oss << " -D PHASE_IN_RAD2_COS=" << std::cos (params.phase_in_rad [2 ]) << " f" ;
275
+ oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad [0 ] << " f" ;
276
+ oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad [1 ] << " f" ;
277
+ oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad [2 ] << " f" ;
275
278
276
279
oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << " f" ;
277
280
oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << " f" ;
@@ -430,7 +433,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
430
433
bool initBuffers ()
431
434
{
432
435
cl_int err = CL_SUCCESS;
436
+ #if WITH_PROFILING
437
+ count = 0 ;
438
+ queue = cl::CommandQueue (context, device, CL_QUEUE_PROFILING_ENABLE, &err);
439
+ #else
433
440
queue = cl::CommandQueue (context, device, 0 , &err);
441
+ #endif
434
442
CHECK_CL_ERROR (err, " cl::CommandQueue" );
435
443
436
444
// Read only
@@ -442,9 +450,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
442
450
443
451
buf_lut11to16 = cl::Buffer (context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL , &err);
444
452
CHECK_CL_ERROR (err, " cl::Buffer" );
445
- buf_p0_sin_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
446
- CHECK_CL_ERROR (err, " cl::Buffer" );
447
- buf_p0_cos_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
453
+ buf_p0_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
448
454
CHECK_CL_ERROR (err, " cl::Buffer" );
449
455
buf_x_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_x_table_size, NULL , &err);
450
456
CHECK_CL_ERROR (err, " cl::Buffer" );
@@ -471,7 +477,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
471
477
CHECK_CL_ERROR (err, " cl::Buffer" );
472
478
buf_n = cl::Buffer (context, CL_MEM_READ_WRITE, buf_n_size, NULL , &err);
473
479
CHECK_CL_ERROR (err, " cl::Buffer" );
474
- buf_ir = cl::Buffer (context, CL_MEM_READ_WRITE , buf_ir_size, NULL , &err);
480
+ buf_ir = cl::Buffer (context, CL_MEM_WRITE_ONLY , buf_ir_size, NULL , &err);
475
481
CHECK_CL_ERROR (err, " cl::Buffer" );
476
482
buf_a_filtered = cl::Buffer (context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL , &err);
477
483
CHECK_CL_ERROR (err, " cl::Buffer" );
@@ -507,19 +513,17 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
507
513
CHECK_CL_ERROR (err, " setArg" );
508
514
err = kernel_processPixelStage1.setArg (1 , buf_z_table);
509
515
CHECK_CL_ERROR (err, " setArg" );
510
- err = kernel_processPixelStage1.setArg (2 , buf_p0_sin_table );
516
+ err = kernel_processPixelStage1.setArg (2 , buf_p0_table );
511
517
CHECK_CL_ERROR (err, " setArg" );
512
- err = kernel_processPixelStage1.setArg (3 , buf_p0_cos_table );
518
+ err = kernel_processPixelStage1.setArg (3 , buf_packet );
513
519
CHECK_CL_ERROR (err, " setArg" );
514
- err = kernel_processPixelStage1.setArg (4 , buf_packet );
520
+ err = kernel_processPixelStage1.setArg (4 , buf_a );
515
521
CHECK_CL_ERROR (err, " setArg" );
516
- err = kernel_processPixelStage1.setArg (5 , buf_a );
522
+ err = kernel_processPixelStage1.setArg (5 , buf_b );
517
523
CHECK_CL_ERROR (err, " setArg" );
518
- err = kernel_processPixelStage1.setArg (6 , buf_b );
524
+ err = kernel_processPixelStage1.setArg (6 , buf_n );
519
525
CHECK_CL_ERROR (err, " setArg" );
520
- err = kernel_processPixelStage1.setArg (7 , buf_n);
521
- CHECK_CL_ERROR (err, " setArg" );
522
- err = kernel_processPixelStage1.setArg (8 , buf_ir);
526
+ err = kernel_processPixelStage1.setArg (7 , buf_ir);
523
527
CHECK_CL_ERROR (err, " setArg" );
524
528
525
529
kernel_filterPixelStage1 = cl::Kernel (program, " filterPixelStage1" , &err);
@@ -571,14 +575,14 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
571
575
{
572
576
cl_int err;
573
577
std::vector<cl::Event> eventWrite (1 ), eventPPS1 (1 ), eventFPS1 (1 ), eventPPS2 (1 ), eventFPS2 (1 );
574
- cl::Event event0, event1 ;
578
+ cl::Event eventReadIr, eventReadDepth ;
575
579
576
580
err = queue.enqueueWriteBuffer (buf_packet, CL_FALSE, 0 , buf_packet_size, packet.buffer , NULL , &eventWrite[0 ]);
577
- CHECK_CL_ERROR (err, " enqueueMapBuffer " );
581
+ CHECK_CL_ERROR (err, " enqueueWriteBuffer " );
578
582
579
583
err = queue.enqueueNDRangeKernel (kernel_processPixelStage1, cl::NullRange, cl::NDRange (image_size), cl::NullRange, &eventWrite, &eventPPS1[0 ]);
580
584
CHECK_CL_ERROR (err, " enqueueNDRangeKernel" );
581
- err = queue.enqueueReadBuffer (buf_ir, CL_FALSE, 0 , buf_ir_size, ir_frame->data , &eventPPS1, &event0 );
585
+ err = queue.enqueueReadBuffer (buf_ir, CL_FALSE, 0 , buf_ir_size, ir_frame->data , &eventPPS1, &eventReadIr );
582
586
CHECK_CL_ERROR (err, " enqueueReadBuffer" );
583
587
584
588
if (config.EnableBilateralFilter )
@@ -597,20 +601,50 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
597
601
if (config.EnableEdgeAwareFilter )
598
602
{
599
603
err = queue.enqueueNDRangeKernel (kernel_filterPixelStage2, cl::NullRange, cl::NDRange (image_size), cl::NullRange, &eventPPS2, &eventFPS2[0 ]);
600
- CHECK_CL_ERROR (err, " enqueueWriteBuffer " );
604
+ CHECK_CL_ERROR (err, " enqueueNDRangeKernel " );
601
605
}
602
606
else
603
607
{
604
608
eventFPS2[0 ] = eventPPS2[0 ];
605
609
}
606
610
607
- err = queue.enqueueReadBuffer (config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0 , buf_depth_size, depth_frame->data , &eventFPS2, &event1 );
611
+ err = queue.enqueueReadBuffer (config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0 , buf_depth_size, depth_frame->data , &eventFPS2, &eventReadDepth );
608
612
CHECK_CL_ERROR (err, " enqueueReadBuffer" );
609
- err = event0 .wait ();
613
+ err = eventReadIr .wait ();
610
614
CHECK_CL_ERROR (err, " wait" );
611
- err = event1 .wait ();
615
+ err = eventReadDepth .wait ();
612
616
CHECK_CL_ERROR (err, " wait" );
613
617
618
+ #if WITH_PROFILING
619
+ if (count == 0 )
620
+ {
621
+ timings.clear ();
622
+ timings.resize (7 , 0.0 );
623
+ }
624
+
625
+ timings[0 ] += eventWrite[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventWrite[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
626
+ timings[1 ] += eventPPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventPPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
627
+ timings[2 ] += eventFPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventFPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
628
+ timings[3 ] += eventPPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventPPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
629
+ timings[4 ] += eventFPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventFPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
630
+ timings[5 ] += eventReadIr.getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventReadIr.getProfilingInfo <CL_PROFILING_COMMAND_START>();
631
+ timings[6 ] += eventReadDepth.getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventReadDepth.getProfilingInfo <CL_PROFILING_COMMAND_START>();
632
+
633
+ if (++count == 100 )
634
+ {
635
+ double sum = timings[0 ] + timings[1 ] + timings[2 ] + timings[3 ] + timings[4 ] + timings[5 ] + timings[6 ];
636
+ LOG_INFO << " writing package: " << timings[0 ] / 100000000.0 << " ms." ;
637
+ LOG_INFO << " stage 1: " << timings[1 ] / 100000000.0 << " ms." ;
638
+ LOG_INFO << " filter 1: " << timings[2 ] / 100000000.0 << " ms." ;
639
+ LOG_INFO << " stage 2: " << timings[3 ] / 100000000.0 << " ms." ;
640
+ LOG_INFO << " filter 2: " << timings[4 ] / 100000000.0 << " ms." ;
641
+ LOG_INFO << " reading ir: " << timings[5 ] / 100000000.0 << " ms." ;
642
+ LOG_INFO << " reading depth: " << timings[6 ] / 100000000.0 << " ms." ;
643
+ LOG_INFO << " overall: " << sum / 100000000.0 << " ms." ;
644
+ count = 0 ;
645
+ }
646
+ #endif
647
+
614
648
return true ;
615
649
}
616
650
@@ -665,46 +699,32 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
665
699
return ;
666
700
}
667
701
668
- cl_float3 *p0_sin_table = new cl_float3[image_size];
669
- cl_float3 *p0_cos_table = new cl_float3[image_size];
702
+ cl_float3 *p0_table = new cl_float3[image_size];
670
703
671
704
for (int r = 0 ; r < 424 ; ++r)
672
705
{
673
- cl_float3 *itS = &p0_sin_table[r * 512 ];
674
- cl_float3 *itC = &p0_cos_table[r * 512 ];
706
+ cl_float3 *it = &p0_table[r * 512 ];
675
707
const uint16_t *it0 = &p0table->p0table0 [r * 512 ];
676
708
const uint16_t *it1 = &p0table->p0table1 [r * 512 ];
677
709
const uint16_t *it2 = &p0table->p0table2 [r * 512 ];
678
- for (int c = 0 ; c < 512 ; ++c, ++itS, ++itC , ++it0, ++it1, ++it2)
710
+ for (int c = 0 ; c < 512 ; ++c, ++it , ++it0, ++it1, ++it2)
679
711
{
680
- const float x = ((float )*it0) * 0.000031 * M_PI;
681
- const float y = ((float )*it1) * 0.000031 * M_PI;
682
- const float z = ((float )*it2) * 0.000031 * M_PI;
683
- itS->s [0 ] = std::sin (x);
684
- itS->s [1 ] = std::sin (y);
685
- itS->s [2 ] = std::sin (z);
686
- itS->s [3 ] = 0 .0f ;
687
- itC->s [0 ] = std::cos (-x);
688
- itC->s [1 ] = std::cos (-y);
689
- itC->s [2 ] = std::cos (-z);
690
- itC->s [3 ] = 0 .0f ;
712
+ it->s [0 ] = -((float )*it0) * 0.000031 * M_PI;
713
+ it->s [1 ] = -((float )*it1) * 0.000031 * M_PI;
714
+ it->s [2 ] = -((float )*it2) * 0.000031 * M_PI;
715
+ it->s [3 ] = 0 .0f ;
691
716
}
692
717
}
693
718
694
719
cl_int err = CL_SUCCESS;
695
- cl::Event event0, event1;
696
- err = queue.enqueueWriteBuffer (buf_p0_sin_table, CL_FALSE, 0 , buf_p0_table_size, p0_sin_table, NULL , &event0);
697
- LOG_CL_ERROR (err, " enqueueWriteBuffer" );
698
- err = queue.enqueueWriteBuffer (buf_p0_cos_table, CL_FALSE, 0 , buf_p0_table_size, p0_cos_table, NULL , &event1);
720
+ cl::Event event0;
721
+ err = queue.enqueueWriteBuffer (buf_p0_table, CL_FALSE, 0 , buf_p0_table_size, p0_table, NULL , &event0);
699
722
LOG_CL_ERROR (err, " enqueueWriteBuffer" );
700
723
701
724
err = event0.wait ();
702
725
LOG_CL_ERROR (err, " wait" );
703
- err = event1.wait ();
704
- LOG_CL_ERROR (err, " wait" );
705
726
706
- delete[] p0_sin_table;
707
- delete[] p0_cos_table;
727
+ delete[] p0_table;
708
728
}
709
729
710
730
void fill_xz_tables (const float *xtable, const float *ztable)
0 commit comments