|
| 1 | +#include <ccv.h> |
| 2 | +#include <ccv_internal.h> |
| 3 | +#include <nnc/ccv_nnc.h> |
| 4 | +#include <nnc/ccv_nnc_easy.h> |
| 5 | +#include <nnc/ccv_nnc_internal.h> |
| 6 | +#ifdef USE_OPENMP |
| 7 | +#include <omp.h> |
| 8 | +#endif |
| 9 | +#ifdef USE_DISPATCH |
| 10 | +#include <dispatch/dispatch.h> |
| 11 | +#endif |
| 12 | + |
| 13 | +typedef struct { |
| 14 | + float v[5]; |
| 15 | +} float5; |
| 16 | +#define less_than(a, b, aux) ((a).v[0] > (b).v[0]) |
| 17 | +#define swap_func(a, b, array, aux, t) do { \ |
| 18 | + (t) = (a); \ |
| 19 | + (a) = (b); \ |
| 20 | + (b) = (t); \ |
| 21 | + int _t = aux[&(a) - array]; \ |
| 22 | + aux[&(a) - array] = aux[&(b) - array]; \ |
| 23 | + aux[&(b) - array] = _t; \ |
| 24 | +} while (0) |
| 25 | +CCV_IMPLEMENT_QSORT_EX(_ccv_nnc_nms_sortby_f5_32f, float5, less_than, swap_func, int*) |
| 26 | +#undef less_than |
| 27 | +#undef swap_func |
| 28 | + |
| 29 | +static int _ccv_nnc_nms_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
| 30 | +{ |
| 31 | + assert(input_size == 1); |
| 32 | + const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
| 33 | + assert(output_size == 2); |
| 34 | + ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
| 35 | + ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[1]; |
| 36 | + const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
| 37 | + const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
| 38 | + const int c_nd = ccv_nnc_tensor_nd(c->info.dim); |
| 39 | + assert(a_nd == b_nd); |
| 40 | + int i; |
| 41 | + for (i = 0; i < a_nd; i++) |
| 42 | + { assert(a->info.dim[i] == b->info.dim[i]); } |
| 43 | + const int* ainc = CCV_IS_TENSOR_VIEW(a) ? a->inc : a->info.dim; |
| 44 | + const int* binc = CCV_IS_TENSOR_VIEW(b) ? b->inc : b->info.dim; |
| 45 | + const int* cinc = CCV_IS_TENSOR_VIEW(c) ? c->inc : c->info.dim; |
| 46 | + const int n = a_nd >= 3 ? a->info.dim[0] : 1; |
| 47 | + const int aninc = a_nd >= 3 ? ainc[1] * ainc[2] : 0; |
| 48 | + const int bninc = b_nd >= 3 ? binc[1] * binc[2] : 0; |
| 49 | + const int cninc = c_nd >= 2 ? cinc[1] : 0; |
| 50 | + const int m = a_nd >= 3 ? a->info.dim[1] : a->info.dim[0]; |
| 51 | + if (c_nd == 1) |
| 52 | + { assert(m == c->info.dim[0]); } |
| 53 | + else |
| 54 | + { assert(c_nd == 2 && n == c->info.dim[0] && m == c->info.dim[1]); } |
| 55 | + const int aminc = ainc[a_nd - 1]; |
| 56 | + const int bminc = binc[b_nd - 1]; |
| 57 | + const int d = a_nd <= 1 ? 1 : a->info.dim[a_nd - 1]; |
| 58 | + const float iou_threshold = cmd.info.nms.iou_threshold; |
| 59 | + if (d == 5 && aminc == 5 && aminc == bminc) // If it is 5, we can use our quick sort implementation. |
| 60 | + { |
| 61 | + parallel_for(i, n) |
| 62 | + { |
| 63 | + int x, y; |
| 64 | + const float* const ap = a->data.f32 + n * aninc; |
| 65 | + float* const bp = b->data.f32 + n * bninc; |
| 66 | + int* const cp = c->data.i32 + n * cninc; |
| 67 | + for (x = 0; x < m; x++) |
| 68 | + cp[x] = x; |
| 69 | + for (x = 0; x < m * d; x++) |
| 70 | + bp[x] = ap[x]; |
| 71 | + _ccv_nnc_nms_sortby_f5_32f((float5*)bp, m, cp); |
| 72 | + for (x = 0; x < m; x++) |
| 73 | + { |
| 74 | + float v = bp[x * 5]; |
| 75 | + if (v == -FLT_MAX) // Suppressed. |
| 76 | + continue; |
| 77 | + const float area1 = bp[x * 5 + 3] * bp[x * 5 + 4]; |
| 78 | + for (y = x + 1; y < m; y++) |
| 79 | + { |
| 80 | + const float u = bp[y * 5]; |
| 81 | + if (u == -FLT_MAX) // Suppressed. |
| 82 | + continue; |
| 83 | + const float area2 = bp[y * 5 + 3] * bp[y * 5 + 4]; |
| 84 | + const float xdiff = ccv_max(0, ccv_min(bp[x * 5 + 1] + bp[x * 5 + 3], bp[y * 5 + 1] + bp[y * 5 + 3]) - ccv_max(bp[x * 5 + 1], bp[y * 5 + 1])); |
| 85 | + const float ydiff = ccv_max(0, ccv_min(bp[x * 5 + 2] + bp[x * 5 + 4], bp[y * 5 + 2] + bp[y * 5 + 4]) - ccv_max(bp[x * 5 + 2], bp[y * 5 + 2])); |
| 86 | + const float intersection = xdiff * ydiff; |
| 87 | + const float iou = intersection / (area1 + area2 - intersection); |
| 88 | + if (iou >= iou_threshold) |
| 89 | + bp[y * 5] = -FLT_MAX; |
| 90 | + } |
| 91 | + } |
| 92 | + } parallel_endfor |
| 93 | + } else { |
| 94 | + // Otherwise, fall to use selection sort. |
| 95 | + parallel_for(i, n) |
| 96 | + { |
| 97 | + int x, y; |
| 98 | + const float* const ap = a->data.f32 + n * aninc; |
| 99 | + float* const bp = b->data.f32 + n * bninc; |
| 100 | + int* const cp = c->data.i32 + n * cninc; |
| 101 | + for (x = 0; x < m; x++) |
| 102 | + cp[x] = x; |
| 103 | + for (x = 0; x < m; x++) |
| 104 | + for (y = 0; y < d; y++) |
| 105 | + bp[x * bminc + y] = ap[x * aminc + y]; |
| 106 | + for (x = 0; x < m; x++) |
| 107 | + { |
| 108 | + float v = bp[x * bminc]; |
| 109 | + int k = x; |
| 110 | + for (y = x + 1; y < m; y++) |
| 111 | + { |
| 112 | + const float u = bp[y * bminc]; |
| 113 | + if (u > v) |
| 114 | + k = y, v = u; |
| 115 | + } |
| 116 | + for (y = 0; y < d; y++) |
| 117 | + { |
| 118 | + const float t = bp[k * bminc + y]; |
| 119 | + bp[k * bminc + y] = bp[x * bminc + y]; |
| 120 | + bp[x * bminc + y] = t; |
| 121 | + const int u = cp[k]; |
| 122 | + cp[k] = cp[x]; |
| 123 | + cp[x] = u; |
| 124 | + } |
| 125 | + } |
| 126 | + for (x = 0; x < m; x++) |
| 127 | + { |
| 128 | + float v = bp[x * bminc]; |
| 129 | + if (v == -FLT_MAX) // Suppressed. |
| 130 | + continue; |
| 131 | + const float area1 = bp[x * bminc + 3] * bp[x * bminc + 4]; |
| 132 | + for (y = x + 1; y < m; y++) |
| 133 | + { |
| 134 | + const float u = bp[y * bminc]; |
| 135 | + if (u == -FLT_MAX) // Suppressed. |
| 136 | + continue; |
| 137 | + const float area2 = bp[y * bminc + 3] * bp[y * bminc + 4]; |
| 138 | + const float xdiff = ccv_max(0, ccv_min(bp[x * bminc + 1] + bp[x * bminc + 3], bp[y * bminc + 1] + bp[y * bminc + 3]) - ccv_max(bp[x * bminc + 1], bp[y * bminc + 1])); |
| 139 | + const float ydiff = ccv_max(0, ccv_min(bp[x * bminc + 2] + bp[x * bminc + 4], bp[y * bminc + 2] + bp[y * bminc + 4]) - ccv_max(bp[x * bminc + 2], bp[y * bminc + 2])); |
| 140 | + const float intersection = xdiff * ydiff; |
| 141 | + const float iou = intersection / (area1 + area2 - intersection); |
| 142 | + if (iou >= iou_threshold) |
| 143 | + bp[y * bminc] = -FLT_MAX; |
| 144 | + } |
| 145 | + } |
| 146 | + } parallel_endfor |
| 147 | + } |
| 148 | + return CCV_NNC_EXEC_SUCCESS; |
| 149 | +} |
| 150 | + |
| 151 | +static int _ccv_nnc_nms_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
| 152 | +{ |
| 153 | + return CCV_NNC_EXEC_SUCCESS; |
| 154 | +} |
| 155 | + |
| 156 | +REGISTER_COMMAND_BACKEND(CCV_NNC_NMS_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
| 157 | +{ |
| 158 | + registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
| 159 | + registry->tensor_datatypes = CCV_32F | CCV_32S; |
| 160 | + registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
| 161 | + registry->algorithms = 1; |
| 162 | + registry->exec = _ccv_nnc_nms_forw; |
| 163 | +} |
| 164 | + |
| 165 | +REGISTER_COMMAND_BACKEND(CCV_NNC_NMS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
| 166 | +{ |
| 167 | + registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
| 168 | + registry->tensor_datatypes = CCV_32F | CCV_32S; |
| 169 | + registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
| 170 | + registry->algorithms = 1; |
| 171 | + registry->exec = _ccv_nnc_nms_back; |
| 172 | +} |
0 commit comments