Skip to content

Commit 838a540

Browse files
committed
Adding changes for calc weight and tuple/unrolled_loop
1 parent 22e4416 commit 838a540

File tree

12 files changed

+52
-371
lines changed

12 files changed

+52
-371
lines changed

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/BackwardSubstitution.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
// utility classes
88
#include "ParallelCopyArray.hpp"
9-
#include "UnrolledLoop.hpp"
9+
#include "unrolled_loop.hpp"
1010

1111
#include "mvdr_complex.hpp"
1212

@@ -135,7 +135,7 @@ event SubmitBackwardSubstitutionKernel(queue& q) {
135135
CalcType u_val, y_val, y_initial_val, y_current, y_new;
136136
short row[k_unroll_factor];
137137

138-
UnrolledLoop<k_unroll_factor>([&](auto j) {
138+
fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
139139
// calculate current location within the vector
140140
row[j] = j + (i * (short)k_unroll_factor);
141141

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Beamformer.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
// utility classes
88
#include "ParallelCopyArray.hpp"
9-
#include "Tuple.hpp"
10-
#include "UnrolledLoop.hpp"
9+
#include "tuple.hpp"
10+
#include "unrolled_loop.hpp"
1111

1212
#include "mvdr_complex.hpp"
1313

@@ -67,7 +67,7 @@ event SubmitBeamformerKernel(
6767
"k_unroll_factor must be evenly divisible by k_num_complex_per_xrx_read");
6868

6969
// data coming from the Xrx pipe
70-
using XrxPipeType = NTuple<ComplexType, k_num_complex_per_xrx_read>;
70+
using XrxPipeType = fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>;
7171

7272
// this type represents the number of samples to be processed in parallel
7373
using CalcType = ParallelCopyArray<ComplexType, k_unroll_factor>;
@@ -105,7 +105,7 @@ event SubmitBeamformerKernel(
105105
i++) {
106106
segment = XrxVectorsInPipe::read();
107107
short index = i / kReadsPerCalcType;
108-
UnrolledLoop<k_num_complex_per_xrx_read>([&](auto k) {
108+
fpga_tools::UnrolledLoop<k_num_complex_per_xrx_read>([&](auto k) {
109109
short subindex =
110110
(i % kReadsPerCalcType) * (short)k_num_complex_per_xrx_read +
111111
k;
@@ -125,19 +125,19 @@ event SubmitBeamformerKernel(
125125
for (unsigned char vector_num = 0;
126126
vector_num < (unsigned char)k_num_weight_vectors; vector_num++) {
127127
// zero the accumulators
128-
UnrolledLoop<k_unroll_factor>([&](auto i) { accum_vector[i] = 0; });
128+
fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto i) { accum_vector[i] = 0; });
129129

130130
// calculate the sum of products of the weight and xrx vectors
131131
// unroll by a factor of k_unroll_factor (so perform k_unroll_factor
132132
// operations in parallel)
133133
for (short i = 0; i < (kNumCalcTypePerVector); i++) {
134-
UnrolledLoop<k_unroll_factor>([&](auto j) {
134+
fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
135135
accum_vector[j] +=
136136
xrx_vector[i][j] * weight_vectors[vector_num][i][j].conj();
137137
});
138138
}
139139
ComplexType accum_vector_sum = 0;
140-
UnrolledLoop<k_unroll_factor>(
140+
fpga_tools::UnrolledLoop<k_unroll_factor>(
141141
[&](auto i) { accum_vector_sum += accum_vector[i]; });
142142

143143
result[vector_num] = accum_vector_sum;

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/CalcWeights.hpp

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <sycl/ext/intel/fpga_extensions.hpp>
66

77
#include "mvdr_complex.hpp"
8+
#include <complex>
89

910
using namespace sycl;
1011

@@ -53,32 +54,12 @@ event SubmitCalcWeightsKernel(queue& q) {
5354

5455
// calculate Ct * y
5556

56-
float re = 0;
57-
float im = 0;
57+
ComplexType ctranspose_times_y (0);
5858

5959
[[intel::initiation_interval(1)]] // NO-FORMAT: Attribute
6060
for (short i = 0; i < (short)k_num_elements; i++) {
61-
auto c = c_vector[i];
62-
auto y = y_vector[i];
63-
64-
// floating point numbers use the msb to indicate sign, so flipping
65-
// that bit is equivalent to multiplying by -1
66-
// This bit-manipulation allows the compiler to infer a hardened
67-
// accumulator, which allows an II of 1 for this loop
68-
union {
69-
float f;
70-
uint i;
71-
} c_imag_neg;
72-
c_imag_neg.f = c.imag();
73-
c_imag_neg.i ^= 0x80000000;
74-
75-
auto im_tmp = c.real() * y.imag() + c_imag_neg.f * y.real();
76-
auto re_tmp = c.real() * y.real() + c.imag() * y.imag();
77-
78-
re += re_tmp;
79-
im += im_tmp;
61+
ctranspose_times_y += c_vector[i].conj() * y_vector[i];
8062
}
81-
ComplexType ctranspose_times_y(re, im);
8263

8364
// calculate 1 / norm(Ctranspose * y)
8465
// Ct * y is a complex number, but its norm is a real number (float)

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ForwardSubstitution.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
// utility classes
88
#include "ParallelCopyArray.hpp"
9-
#include "UnrolledLoop.hpp"
9+
#include "unrolled_loop.hpp"
1010

1111
#include "mvdr_complex.hpp"
1212

@@ -154,7 +154,7 @@ event SubmitForwardSubstitutionKernel(queue& q) {
154154
for (short i = 0; i < kNumCalcTypePerVector; i++) {
155155
CalcType y_elements;
156156

157-
UnrolledLoop<k_unroll_factor>([&](auto j) {
157+
fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
158158
y_elements[j] = y_vectors[vector_num][i][j];
159159
y_vector_initial[i][j] = y_elements[j];
160160
if (i == 0 && j == 0) {
@@ -186,7 +186,7 @@ event SubmitForwardSubstitutionKernel(queue& q) {
186186
CalcType l_val, y_val, y_initial_val, y_current, y_new;
187187
short row[k_unroll_factor];
188188

189-
UnrolledLoop<k_unroll_factor>([&](auto j) {
189+
fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
190190
// calculate current location within the vector
191191
row[j] = j + (i * (short)k_unroll_factor);
192192

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/InputDemux.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
#include <cmath>
77

88
// utility classes
9-
#include "Tuple.hpp"
10-
#include "UnrolledLoop.hpp"
9+
#include "tuple.hpp"
10+
#include "unrolled_loop.hpp"
1111

1212
#include "mvdr_complex.hpp"
1313

@@ -53,7 +53,7 @@ event SubmitInputDemuxKernel(
5353
}
5454

5555
// Use an NTuple of complex numbers for reading/writing pipes
56-
using PipeType = NTuple<ComplexType, k_pipe_width>;
56+
using PipeType = fpga_tools::NTuple<ComplexType, k_pipe_width>;
5757

5858
auto e = q.submit([&](handler& h) {
5959
h.single_task<InputDemuxKernelName>([=] {
@@ -78,8 +78,8 @@ event SubmitInputDemuxKernel(
7878

7979
// create a 'pipeline' for the almost full signal
8080
constexpr int kAlmostFullPipeDepth = 2;
81-
NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
82-
UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
81+
fpga_tools::NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
82+
fpga_tools::UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
8383
almost_full_pipeline.template get<pipe_stage>() = false;
8484
});
8585

@@ -123,7 +123,7 @@ event SubmitInputDemuxKernel(
123123
// an 'almost' full signal, we don't need the result right away, we
124124
// can wait several loop iterations. This allows us to break
125125
// dependencies between loop iterations and improve FMAX.
126-
UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
126+
fpga_tools::UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
127127
almost_full_pipeline.template get<pipe_stage>() =
128128
almost_full_pipeline.template get<pipe_stage + 1>();
129129
});

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/MVDR.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,10 @@ template <
149149
// all default to 'null' pipes that go nowhere
150150
typename TrainingDataPipeOut =
151151
fpga_tools::PipeDuplicator<MVDRNullPipeID,
152-
NTuple<ComplexType, k_num_complex_per_xrx_read>>,
152+
fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>,
153153
typename XrxDataPipeOut =
154154
fpga_tools::PipeDuplicator<MVDRNullPipeID,
155-
NTuple<ComplexType, k_num_complex_per_xrx_read>>,
155+
fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>,
156156
typename SteeringVectorsPipeOut =
157157
fpga_tools::PipeDuplicator<MVDRNullPipeID, ComplexType>,
158158
typename ForwardSteeringVectorsPipeOut =
@@ -169,7 +169,7 @@ template <
169169
fpga_tools::PipeDuplicator<MVDRNullPipeID, ComplexType>,
170170
typename TransposedTrainingDataPipeOut =
171171
fpga_tools::PipeDuplicator<MVDRNullPipeID,
172-
NTuple<ComplexType, k_num_complex_per_xrx_read>>>
172+
fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>>
173173
MVDREventArray SubmitMVDRKernels(
174174
queue& q,
175175
short num_xrx_per_weights // Number of xrx vectors to process with
@@ -187,7 +187,7 @@ MVDREventArray SubmitMVDRKernels(
187187
"k_num_sensor_inputs * k_rmb_factor must fit in a short");
188188

189189
// Multiple pipes use this type, a group of complex wrapped in an NTuple
190-
using XrxPipeType = NTuple<ComplexType, k_num_complex_per_xrx_read>;
190+
using XrxPipeType = fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>;
191191

192192
// Training data pipe (after demux from input data)
193193
constexpr int kTrainingDataPipeMinDepth =
@@ -297,7 +297,7 @@ MVDREventArray SubmitMVDRKernels(
297297
// Q matrix pipe
298298
// Q matrix not used in MVDR design, so this is a 'null' pipe (a
299299
// PipeDuplicator with no output pipes connected)
300-
using QMatrixColumn = NTuple<ComplexType, k_num_sensor_inputs>;
300+
using QMatrixColumn = fpga_tools::NTuple<ComplexType, k_num_sensor_inputs>;
301301
using QMatrixPipe =
302302
fpga_tools::PipeDuplicator<QMatrixPipeID<k_instance_num>, QMatrixColumn>;
303303

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ParallelCopyArray.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef __PARALLEL_COPY_ARRAY_HPP__
22
#define __PARALLEL_COPY_ARRAY_HPP__
33

4-
#include "UnrolledLoop.hpp"
4+
#include "unrolled_loop.hpp"
55

66
// ParallelCopyArray
77
// Defines a struct with a single element data, which is an array of type T.
@@ -17,12 +17,12 @@ struct ParallelCopyArray {
1717

1818
// copy constructor - do a parallel copy
1919
ParallelCopyArray(const ParallelCopyArray& source) {
20-
UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
20+
fpga_tools::UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
2121
}
2222

2323
// assignment operator - do a parallel copy
2424
ParallelCopyArray& operator=(const ParallelCopyArray& source) {
25-
UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
25+
fpga_tools::UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
2626
return *this;
2727
}
2828

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/StreamingQRD.hpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
#ifndef __STREAMING_QRD_HPP__
2-
#define __STREAMING_QRD_HPP__
1+
#ifndef __STREAMING_QRD_HPP_MVDR__
2+
#define __STREAMING_QRD_HPP_MVDR__
33

44
#include <sycl/sycl.hpp>
55
#include <sycl/ext/intel/fpga_extensions.hpp>
66

77
// utility classes
8-
#include "Tuple.hpp"
9-
#include "UnrolledLoop.hpp"
8+
#include "tuple.hpp"
9+
#include "unrolled_loop.hpp"
1010

1111
#include "mvdr_complex.hpp"
1212

@@ -80,7 +80,7 @@ event SubmitStreamingQRDKernel(queue& q) {
8080
static_assert(k_a_num_rows % k_pipe_width == 0,
8181
"k_a_num_rows must be evenly divisible by k_pipe_width");
8282

83-
using PipeType = NTuple<ComplexType, k_pipe_width>;
83+
using PipeType = fpga_tools::NTuple<ComplexType, k_pipe_width>;
8484

8585
auto e = q.submit([&](handler& h) {
8686
h.single_task<StreamingQRDKernelName>([=] {
@@ -120,7 +120,7 @@ event SubmitStreamingQRDKernel(queue& q) {
120120
constexpr short kNumBanksNextPow2 = Pow2(CeilLog2(kNumBanks));
121121

122122
// define a type that contains an entire column
123-
using AColumn = NTuple<ComplexType, k_a_num_rows>;
123+
using AColumn = fpga_tools::NTuple<ComplexType, k_a_num_rows>;
124124

125125
// Three copies of the full matrix, so that each matrix has a single
126126
// load and a single store.
@@ -158,8 +158,8 @@ event SubmitStreamingQRDKernel(queue& q) {
158158
PipeType data_in = AMatrixInPipe::read();
159159
short col = i % (short)k_a_num_cols;
160160
short write_row_group = i / (short)k_a_num_cols;
161-
UnrolledLoop<k_a_num_rows / k_pipe_width>([&](auto row_group) {
162-
UnrolledLoop<k_pipe_width>([&](auto element) {
161+
fpga_tools::UnrolledLoop<k_a_num_rows / k_pipe_width>([&](auto row_group) {
162+
fpga_tools::UnrolledLoop<k_pipe_width>([&](auto element) {
163163
constexpr short row = row_group * k_pipe_width + element;
164164
if (write_row_group == row_group) {
165165
a_matrix_in[col].template get<row>() =
@@ -208,7 +208,7 @@ event SubmitStreamingQRDKernel(queue& q) {
208208
bool i_lt_0[kNumBanks];
209209
ComplexType sori[kNumBanks];
210210

211-
UnrolledLoop<kNumBanks>([&](auto k) {
211+
fpga_tools::UnrolledLoop<kNumBanks>([&](auto k) {
212212
j_eq_i[k] = ext::intel::fpga_reg(j == i);
213213
i_gt_0[k] = ext::intel::fpga_reg(i > 0);
214214
i_ge_0_j_ge_i[k] = ext::intel::fpga_reg(i >= 0 && j >= i);
@@ -220,7 +220,7 @@ event SubmitStreamingQRDKernel(queue& q) {
220220
// fetch data from a_matrix_in or a_matrix, based on value of i
221221
// Use of fpga_reg here is a workaround to prevent the compiler from
222222
// inferring some very complicated arbitrated local memory systems.
223-
UnrolledLoop<k_a_num_rows>([&](auto row) {
223+
fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
224224
// load vector_t from a_matrix_in
225225
vector_t.template get<row>() =
226226
ext::intel::fpga_reg(a_matrix_in[j_nonneg].template get<row>());
@@ -239,7 +239,7 @@ event SubmitStreamingQRDKernel(queue& q) {
239239

240240
// perform calculations on the current column of data, and store
241241
// the result back to a_matrix (and q_matrix).
242-
UnrolledLoop<k_a_num_rows>([&](auto row) {
242+
fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
243243
// calculate the new vector_t
244244
ComplexType sori_or_0 = i_lt_0[row / kNumElementsPerBank]
245245
? 0
@@ -266,7 +266,7 @@ event SubmitStreamingQRDKernel(queue& q) {
266266
});
267267

268268
ComplexType p_ij = 0;
269-
UnrolledLoop<k_a_num_rows>([&](auto row) {
269+
fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
270270
p_ij += vector_t.template get<row>() *
271271
vector_ti.template get<row>().conj();
272272
});
@@ -340,4 +340,4 @@ event SubmitStreamingQRDKernel(queue& q) {
340340
return e;
341341
}
342342

343-
#endif // ifndef __STREAMING_QRD_HPP__
343+
#endif // ifndef __STREAMING_QRD_HPP_MVDR__

DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Transpose.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#include <sycl/sycl.hpp>
55
#include <sycl/ext/intel/fpga_extensions.hpp>
66

7-
#include "Tuple.hpp"
8-
#include "UnrolledLoop.hpp"
7+
#include "tuple.hpp"
8+
#include "unrolled_loop.hpp"
99

1010
using namespace sycl;
1111

@@ -61,7 +61,7 @@ template <typename T, size_t k_num_cols_in, size_t k_pipe_width,
6161
typename MatrixInPipe, typename MatrixOutPipe>
6262
struct Transposer {
6363
void operator()() const {
64-
using PipeType = NTuple<T, k_pipe_width>;
64+
using PipeType = fpga_tools::NTuple<T, k_pipe_width>;
6565

6666
// This is a scratch pad memory that we will use to do the transpose.
6767
// We read the data in from a pipe (k_pipe_width elements at a time),
@@ -94,8 +94,8 @@ struct Transposer {
9494

9595
// create a 'pipeline' for the almost full signal
9696
constexpr int kAlmostFullPipeDepth = 2;
97-
NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
98-
UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
97+
fpga_tools::NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
98+
fpga_tools::UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
9999
almost_full_pipeline.template get<pipe_stage>() = false;
100100
});
101101

@@ -116,7 +116,7 @@ struct Transposer {
116116
// an 'almost' full signal, we don't need the result right away, we
117117
// can wait several loop iterations. This allows us to break
118118
// dependencies between loop iterations and improve FMAX.
119-
UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
119+
fpga_tools::UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
120120
almost_full_pipeline.template get<pipe_stage>() =
121121
almost_full_pipeline.template get<pipe_stage + 1>();
122122
});
@@ -130,7 +130,7 @@ struct Transposer {
130130

131131
// read the next data to send
132132
PipeType data_out;
133-
UnrolledLoop<k_pipe_width>([&](auto i) {
133+
fpga_tools::UnrolledLoop<k_pipe_width>([&](auto i) {
134134
data_out.template get<i>() = scratch[cur_tx_buffer][i*k_num_cols_in+cur_tx_col];
135135
});
136136

@@ -165,7 +165,7 @@ struct Transposer {
165165

166166
// if we have new data, store it in the buffer and update the status
167167
if (read_valid) {
168-
UnrolledLoop<k_pipe_width>([&](auto i) {
168+
fpga_tools::UnrolledLoop<k_pipe_width>([&](auto i) {
169169
scratch[cur_rx_buffer][cur_rx_count*(unsigned short)k_pipe_width + i] = data_in.template get<i>();
170170
});
171171

0 commit comments

Comments
 (0)