Simplify the CalcWeights accumulation to use ComplexType, and switch to the fpga_tools tuple.hpp/unrolled_loop.hpp headers

ericxu233 · ericxu233 · commit 838a5408b955 · 2022-10-03T11:41:27.000-07:00
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/BackwardSubstitution.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/BackwardSubstitution.hpp
@@ -6,7 +6,7 @@
 
 // utility classes
 #include "ParallelCopyArray.hpp"
-#include "UnrolledLoop.hpp"
+#include "unrolled_loop.hpp"
 
 #include "mvdr_complex.hpp"
 
@@ -135,7 +135,7 @@ event SubmitBackwardSubstitutionKernel(queue& q) {
               CalcType u_val, y_val, y_initial_val, y_current, y_new;
               short row[k_unroll_factor];
 
-              UnrolledLoop<k_unroll_factor>([&](auto j) {
+              fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
                 // calculate current location within the vector
                 row[j] = j + (i * (short)k_unroll_factor);
 
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Beamformer.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Beamformer.hpp
@@ -6,8 +6,8 @@
 
 // utility classes
 #include "ParallelCopyArray.hpp"
-#include "Tuple.hpp"
-#include "UnrolledLoop.hpp"
+#include "tuple.hpp"
+#include "unrolled_loop.hpp"
 
 #include "mvdr_complex.hpp"
 
@@ -67,7 +67,7 @@ event SubmitBeamformerKernel(
       "k_unroll_factor must be evenly divisible by k_num_complex_per_xrx_read");
 
   // data coming from the Xrx pipe
-  using XrxPipeType = NTuple<ComplexType, k_num_complex_per_xrx_read>;
+  using XrxPipeType = fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>;
 
   // this type represents the number of samples to be processed in parallel
   using CalcType = ParallelCopyArray<ComplexType, k_unroll_factor>;
@@ -105,7 +105,7 @@ event SubmitBeamformerKernel(
                i++) {
             segment = XrxVectorsInPipe::read();
             short index = i / kReadsPerCalcType;
-            UnrolledLoop<k_num_complex_per_xrx_read>([&](auto k) {
+            fpga_tools::UnrolledLoop<k_num_complex_per_xrx_read>([&](auto k) {
               short subindex =
                   (i % kReadsPerCalcType) * (short)k_num_complex_per_xrx_read +
                   k;
@@ -125,19 +125,19 @@ event SubmitBeamformerKernel(
           for (unsigned char vector_num = 0;
                vector_num < (unsigned char)k_num_weight_vectors; vector_num++) {
             // zero the accumulators
-            UnrolledLoop<k_unroll_factor>([&](auto i) { accum_vector[i] = 0; });
+            fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto i) { accum_vector[i] = 0; });
 
             // calculate the sum of products of the weight and xrx vectors
             // unroll by a factor of k_unroll_factor (so perform k_unroll_factor
             // operations in parallel)
             for (short i = 0; i < (kNumCalcTypePerVector); i++) {
-              UnrolledLoop<k_unroll_factor>([&](auto j) {
+              fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
                 accum_vector[j] +=
                     xrx_vector[i][j] * weight_vectors[vector_num][i][j].conj();
               });
             }
             ComplexType accum_vector_sum = 0;
-            UnrolledLoop<k_unroll_factor>(
+            fpga_tools::UnrolledLoop<k_unroll_factor>(
                 [&](auto i) { accum_vector_sum += accum_vector[i]; });
 
             result[vector_num] = accum_vector_sum;
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/CalcWeights.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/CalcWeights.hpp
@@ -5,6 +5,7 @@
 #include <sycl/ext/intel/fpga_extensions.hpp>
 
 #include "mvdr_complex.hpp"
+#include <complex>
 
 using namespace sycl;
 
@@ -53,32 +54,12 @@ event SubmitCalcWeightsKernel(queue& q) {
 
           // calculate Ct * y
 
-          float re = 0;
-          float im = 0;
+          ComplexType ctranspose_times_y (0);
           
           [[intel::initiation_interval(1)]]  // NO-FORMAT: Attribute
           for (short i = 0; i < (short)k_num_elements; i++) {
-            auto c = c_vector[i];
-            auto y = y_vector[i];
-
-            // floating point numbers use the msb to indicate sign, so flipping
-            // that bit is equivalent to multiplying by -1
-            // This bit-manipulation allows the compiler to infer a hardened
-            // accumulator, which allows an II of 1 for this loop
-            union {
-              float f;
-              uint i;
-            } c_imag_neg;
-            c_imag_neg.f = c.imag();
-            c_imag_neg.i ^= 0x80000000;
-
-            auto im_tmp = c.real() * y.imag() + c_imag_neg.f * y.real();
-            auto re_tmp = c.real() * y.real() + c.imag() * y.imag();
-
-            re += re_tmp;
-            im += im_tmp;
+            ctranspose_times_y += c_vector[i].conj() * y_vector[i];
           }
-          ComplexType ctranspose_times_y(re, im);
 
           // calculate 1 / norm(Ctranspose * y)
           // Ct * y is a complex number, but it's norm is a real number (float)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ForwardSubstitution.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ForwardSubstitution.hpp
@@ -6,7 +6,7 @@
 
 // utility classes
 #include "ParallelCopyArray.hpp"
-#include "UnrolledLoop.hpp"
+#include "unrolled_loop.hpp"
 
 #include "mvdr_complex.hpp"
 
@@ -154,7 +154,7 @@ event SubmitForwardSubstitutionKernel(queue& q) {
           for (short i = 0; i < kNumCalcTypePerVector; i++) {
             CalcType y_elements;
 
-            UnrolledLoop<k_unroll_factor>([&](auto j) {
+            fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
               y_elements[j] = y_vectors[vector_num][i][j];
               y_vector_initial[i][j] = y_elements[j];
               if (i == 0 && j == 0) {
@@ -186,7 +186,7 @@ event SubmitForwardSubstitutionKernel(queue& q) {
               CalcType l_val, y_val, y_initial_val, y_current, y_new;
               short row[k_unroll_factor];
 
-              UnrolledLoop<k_unroll_factor>([&](auto j) {
+              fpga_tools::UnrolledLoop<k_unroll_factor>([&](auto j) {
                 // calculate current location within the vector
                 row[j] = j + (i * (short)k_unroll_factor);
 
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/InputDemux.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/InputDemux.hpp
@@ -6,8 +6,8 @@
 #include <cmath>
 
 // utility classes
-#include "Tuple.hpp"
-#include "UnrolledLoop.hpp"
+#include "tuple.hpp"
+#include "unrolled_loop.hpp"
 
 #include "mvdr_complex.hpp"
 
@@ -53,7 +53,7 @@ event SubmitInputDemuxKernel(
   }
 
   // Use an NTuple of complex numbers for reading/writing pipes
-  using PipeType = NTuple<ComplexType, k_pipe_width>;
+  using PipeType = fpga_tools::NTuple<ComplexType, k_pipe_width>;
 
   auto e = q.submit([&](handler& h) {
     h.single_task<InputDemuxKernelName>([=] {
@@ -78,8 +78,8 @@ event SubmitInputDemuxKernel(
 
       // create a 'pipeline' for the almost full signal
       constexpr int kAlmostFullPipeDepth = 2;
-      NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
-      UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
+      fpga_tools::NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
+      fpga_tools::UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
         almost_full_pipeline.template get<pipe_stage>() = false;
       });
 
@@ -123,7 +123,7 @@ event SubmitInputDemuxKernel(
         // an 'almost' full signal, we don't need the result right away, we
         // can wait several loop iterations.  This allows us to break
         // dependencies between loop iterations and improve FMAX.
-        UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
+        fpga_tools::UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
           almost_full_pipeline.template get<pipe_stage>() =
               almost_full_pipeline.template get<pipe_stage + 1>();
         });
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/MVDR.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/MVDR.hpp
@@ -149,10 +149,10 @@ template <
     // all default to 'null' pipes that go nowhere
     typename TrainingDataPipeOut =
         fpga_tools::PipeDuplicator<MVDRNullPipeID,
-          NTuple<ComplexType, k_num_complex_per_xrx_read>>,
+          fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>,
     typename XrxDataPipeOut =
         fpga_tools::PipeDuplicator<MVDRNullPipeID,
-          NTuple<ComplexType, k_num_complex_per_xrx_read>>,
+          fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>,
     typename SteeringVectorsPipeOut =
         fpga_tools::PipeDuplicator<MVDRNullPipeID, ComplexType>,
     typename ForwardSteeringVectorsPipeOut =
@@ -169,7 +169,7 @@ template <
         fpga_tools::PipeDuplicator<MVDRNullPipeID, ComplexType>,
     typename TransposedTrainingDataPipeOut =
         fpga_tools::PipeDuplicator<MVDRNullPipeID,
-          NTuple<ComplexType, k_num_complex_per_xrx_read>>>
+          fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>>>
 MVDREventArray SubmitMVDRKernels(
     queue& q,
     short num_xrx_per_weights  // Number of xrx vectors to process with
@@ -187,7 +187,7 @@ MVDREventArray SubmitMVDRKernels(
                 "k_num_sensor_inputs * k_rmb_factor must fit in a short");
 
   // Multiple pipes use this type, a group of complex wrapped in an NTuple
-  using XrxPipeType = NTuple<ComplexType, k_num_complex_per_xrx_read>;
+  using XrxPipeType = fpga_tools::NTuple<ComplexType, k_num_complex_per_xrx_read>;
 
   // Training data pipe (after demux from input data)
   constexpr int kTrainingDataPipeMinDepth =
@@ -297,7 +297,7 @@ MVDREventArray SubmitMVDRKernels(
   // Q matrix pipe
   // Q matrix not used in MVDR design, so this is a 'null' pipe (a
   // PipeDuplicator with no output pipes connected)
-  using QMatrixColumn = NTuple<ComplexType, k_num_sensor_inputs>;
+  using QMatrixColumn = fpga_tools::NTuple<ComplexType, k_num_sensor_inputs>;
   using QMatrixPipe =
       fpga_tools::PipeDuplicator<QMatrixPipeID<k_instance_num>, QMatrixColumn>;
 
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ParallelCopyArray.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/ParallelCopyArray.hpp
@@ -1,7 +1,7 @@
 #ifndef __PARALLEL_COPY_ARRAY_HPP__
 #define __PARALLEL_COPY_ARRAY_HPP__
 
-#include "UnrolledLoop.hpp"
+#include "unrolled_loop.hpp"
 
 // ParallelCopyArray
 // Defines a struct with a single element data, which is an array of type T.
@@ -17,12 +17,12 @@ struct ParallelCopyArray {
 
   // copy constructor - do a parallel copy
   ParallelCopyArray(const ParallelCopyArray& source) {
-    UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
+    fpga_tools::UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
   }
 
   // assignment operator - do a parallel copy
   ParallelCopyArray& operator=(const ParallelCopyArray& source) {
-    UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
+    fpga_tools::UnrolledLoop<k_size>([&](auto k) { data[k] = source[k]; });
     return *this;
   }
 
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/StreamingQRD.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/StreamingQRD.hpp
@@ -1,12 +1,12 @@
-#ifndef __STREAMING_QRD_HPP__
-#define __STREAMING_QRD_HPP__
+#ifndef __STREAMING_QRD_HPP_MVDR__
+#define __STREAMING_QRD_HPP_MVDR__
 
 #include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
 
 // utility classes
-#include "Tuple.hpp"
-#include "UnrolledLoop.hpp"
+#include "tuple.hpp"
+#include "unrolled_loop.hpp"
 
 #include "mvdr_complex.hpp"
 
@@ -80,7 +80,7 @@ event SubmitStreamingQRDKernel(queue& q) {
   static_assert(k_a_num_rows % k_pipe_width == 0,
                 "k_a_num_rows must be evenly divisible by k_pipe_width");
 
-  using PipeType = NTuple<ComplexType, k_pipe_width>;
+  using PipeType = fpga_tools::NTuple<ComplexType, k_pipe_width>;
 
   auto e = q.submit([&](handler& h) {
     h.single_task<StreamingQRDKernelName>([=] {
@@ -120,7 +120,7 @@ event SubmitStreamingQRDKernel(queue& q) {
         constexpr short kNumBanksNextPow2 = Pow2(CeilLog2(kNumBanks));
 
         // define a type that contains an entire column
-        using AColumn = NTuple<ComplexType, k_a_num_rows>;
+        using AColumn = fpga_tools::NTuple<ComplexType, k_a_num_rows>;
 
         // Three copies of the full matrix, so that each matrix has a single
         // load and a single store.
@@ -158,8 +158,8 @@ event SubmitStreamingQRDKernel(queue& q) {
           PipeType data_in = AMatrixInPipe::read();
           short col = i % (short)k_a_num_cols;
           short write_row_group = i / (short)k_a_num_cols;
-          UnrolledLoop<k_a_num_rows / k_pipe_width>([&](auto row_group) {
-            UnrolledLoop<k_pipe_width>([&](auto element) {
+          fpga_tools::UnrolledLoop<k_a_num_rows / k_pipe_width>([&](auto row_group) {
+            fpga_tools::UnrolledLoop<k_pipe_width>([&](auto element) {
               constexpr short row = row_group * k_pipe_width + element;
               if (write_row_group == row_group) {
                 a_matrix_in[col].template get<row>() =
@@ -208,7 +208,7 @@ event SubmitStreamingQRDKernel(queue& q) {
           bool i_lt_0[kNumBanks];
           ComplexType sori[kNumBanks];
 
-          UnrolledLoop<kNumBanks>([&](auto k) {
+          fpga_tools::UnrolledLoop<kNumBanks>([&](auto k) {
             j_eq_i[k] = ext::intel::fpga_reg(j == i);
             i_gt_0[k] = ext::intel::fpga_reg(i > 0);
             i_ge_0_j_ge_i[k] = ext::intel::fpga_reg(i >= 0 && j >= i);
@@ -220,7 +220,7 @@ event SubmitStreamingQRDKernel(queue& q) {
           // fetch data from a_matrix_in or a_matrix, based on value of i
           // Use of fpga_reg here is a workaround to prevent the compiler from
           // inferring some very complicated arbitrated local memory systems.
-          UnrolledLoop<k_a_num_rows>([&](auto row) {
+          fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
             // load vector_t from a_matrix_in
             vector_t.template get<row>() =
                 ext::intel::fpga_reg(a_matrix_in[j_nonneg].template get<row>());
@@ -239,7 +239,7 @@ event SubmitStreamingQRDKernel(queue& q) {
 
           // perform calculations on the current column of data, and store
           // the result back to a_matrix (and q_matrix).
-          UnrolledLoop<k_a_num_rows>([&](auto row) {
+          fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
             // calculate the new vector_t
             ComplexType sori_or_0 = i_lt_0[row / kNumElementsPerBank]
                                         ? 0
@@ -266,7 +266,7 @@ event SubmitStreamingQRDKernel(queue& q) {
           });
 
           ComplexType p_ij = 0;
-          UnrolledLoop<k_a_num_rows>([&](auto row) {
+          fpga_tools::UnrolledLoop<k_a_num_rows>([&](auto row) {
             p_ij += vector_t.template get<row>() *
                     vector_ti.template get<row>().conj();
           });
@@ -340,4 +340,4 @@ event SubmitStreamingQRDKernel(queue& q) {
   return e;
 }
 
-#endif  // ifndef __STREAMING_QRD_HPP__
+#endif  // ifndef __STREAMING_QRD_HPP_MVDR__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Transpose.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Transpose.hpp
@@ -4,8 +4,8 @@
 #include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
 
-#include "Tuple.hpp"
-#include "UnrolledLoop.hpp"
+#include "tuple.hpp"
+#include "unrolled_loop.hpp"
 
 using namespace sycl;
 
@@ -61,7 +61,7 @@ template <typename T, size_t k_num_cols_in, size_t k_pipe_width,
           typename MatrixInPipe, typename MatrixOutPipe>
 struct Transposer {
   void operator()() const {
-    using PipeType = NTuple<T, k_pipe_width>;
+    using PipeType = fpga_tools::NTuple<T, k_pipe_width>;
 
     // This is a scratch pad memory that we will use to do the transpose.
     // We read the data in from a pipe (k_pipe_width elements at at time),
@@ -94,8 +94,8 @@ struct Transposer {
 
     // create a 'pipeline' for the almost full signal
     constexpr int kAlmostFullPipeDepth = 2;
-    NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
-    UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
+    fpga_tools::NTuple<bool, kAlmostFullPipeDepth> almost_full_pipeline;
+    fpga_tools::UnrolledLoop<kAlmostFullPipeDepth>([&](auto pipe_stage) {
       almost_full_pipeline.template get<pipe_stage>() = false;
     });
 
@@ -116,7 +116,7 @@ struct Transposer {
       // an 'almost' full signal, we don't need the result right away, we
       // can wait several loop iterations.  This allows us to break
       // dependencies between loop iterations and improve FMAX.
-      UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
+      fpga_tools::UnrolledLoop<kAlmostFullPipeDepth - 1>([&](auto pipe_stage) {
         almost_full_pipeline.template get<pipe_stage>() =
             almost_full_pipeline.template get<pipe_stage + 1>();
       });
@@ -130,7 +130,7 @@ struct Transposer {
 
       // read the next data to send
       PipeType data_out;
-      UnrolledLoop<k_pipe_width>([&](auto i) {
+      fpga_tools::UnrolledLoop<k_pipe_width>([&](auto i) {
         data_out.template get<i>() = scratch[cur_tx_buffer][i*k_num_cols_in+cur_tx_col];
       });
 
@@ -165,7 +165,7 @@ struct Transposer {
 
       // if we have new data, store it in the buffer and update the status
       if (read_valid) {
-        UnrolledLoop<k_pipe_width>([&](auto i) {
+        fpga_tools::UnrolledLoop<k_pipe_width>([&](auto i) {
           scratch[cur_rx_buffer][cur_rx_count*(unsigned short)k_pipe_width + i] = data_in.template get<i>();
         });
 
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Tuple.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/Tuple.hpp
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/UnrolledLoop.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/UnrolledLoop.hpp
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/mvdr_beamforming.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/mvdr_beamforming/src/mvdr_beamforming.cpp