diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..11f61b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/**/ide/**/*.*sdf +/**/ide/**/*.tss +/**/ide/**/*.suo +/**/ide/**/Debug +/**/ide/**/Release diff --git a/aws_demo/Code_Structure.png b/aws_demo/Code_Structure.png new file mode 100644 index 0000000..e83b55e Binary files /dev/null and b/aws_demo/Code_Structure.png differ diff --git a/aws_demo/README.md b/aws_demo/README.md new file mode 100644 index 0000000..2503114 --- /dev/null +++ b/aws_demo/README.md @@ -0,0 +1,130 @@ +AWS Demo +====================== +The AWS Demo is set of examples demonstrate how use xfOpenCV library in kernels build for Amazone F1 instance. Each example could be build to run on FPGA (only F1 instance with "*FPGA Developer AMI*" could be used) or emulated for debug purpose in HW or SW emulation mode (any instance with "*FPGA Developer AMI*"could be used). + +## EXAMPLES FILE HIERARCHY +Each example is organized into the following folders + + +| Folder Name | Contents | +| :----- | :------ +| <example name> | **Root folder of example.** Folder contains input image(s), headers with kernel configuration and declaration, makefile, source code of host application, kernel wrapper and kernel.| +|      hw |**Folder for FPGA flow.** | +|           afi |**Folder for AWS FPGA binary file generation.** After successful build folder will contain the kernel container binary (`.xclbin`) to generate AWS FPGA binary file for Amazon F1 instance and register AFI. During generation all intermediate files will be stored there. | +|           run |**Run folder of the example.** After successful build folder will contain host application executable. All result, intermediate and reference images generated by kernel and application will be stored there. 
Kernel container | +|      hw_emu |**Folder for HW emulation flow.** | +|           run |**Emulation folder of the example.** After successful build folder will contain host application executable and kernel container (`.xclbin`) for HW emulation. Emulation logs and data, result, intermediate and reference images generated by kernel emulation and application will be stored there. | +|      sw_emu |**Folder for SW emulation flow.** | +|           run |**Emulation folder of the example.** After successful build folder will contain host application executable and kernel container (`.xclbin`) for SW emulation. Emulation logs and data, result, intermediate and reference images generated by kernel emulation and application will be stored there. | + + +## HOW TO BUILD EXAMPLE +Place xfOpenCV library (`xfopencv` folder) together with Amazon's FPGA framework +``` +project_data + ├─ aws-fpga + └─ xfopencv +``` + +If you would like to have another folder structure you need to tune [`aws_demo/common_makefile`](common_makefile). For more information please see [make_description.md](make_description.md) + +### Prepare environment +Run the following code to prepare the environment for build. +``` +cd $AWS_FPGA_REPO_DIR +source sdaccel_setup.sh +source $XILINX_SDX/settings64.sh +``` + +### Build example for SW/HW emulation + +1. Go to root folder of example. +2. Build whole example (`all`), kernel part only (`krnl`) or host application only (`host`) for HW (`hw_emu`) or SW (`sw_emu`) emulation with the following command: +``` +make TARGET=hw_emu|sw_emu all|host|krnl +``` + +To erase all build data including host application executable and kernel binary files use the following command: +``` +make TARGET=hw_emu|sw_emu clean +``` + +### Build example for FPGA + +To build examples for FPGA F1 instance you will need access to [**AWS CLI**](https://aws.amazon.com/cli/) and [**S3**](https://aws.amazon.com/s3/). 
Please refer to **_[What Is the AWS Command Line Interface?](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html)_** and **_[Getting Started with Amazon S3](https://aws.amazon.com/s3/getting-started/)_**. +1. Go to root folder of example. +2. Build whole example (`all`), kernel part only (`krnl`) or host application only (`host`) for FPGA (`hw`) with the following command: +``` +make TARGET=hw all|host|krnl +``` +3. After the kernel build completes, go to ***afi*** folder to generate AWS FPGA binary file for Amazon F1 instance and register AFI +4. Set up **[AWS CLI](https://aws.amazon.com/cli/)** (see **_[What Is the AWS Command Line Interface?](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html)_**) +5. Run the script placed in ***afi*** folder to generate AWS FPGA binary file for Amazon F1 instance and register AFI +``` +source ./gen_afi.sh +``` +The script will create an S3 bucket for the FPGA image and launch image generation in a background process. When the script finishes, the FPGA image is not ready yet. +6. Wait until the FPGA image is generated. To check generation completion periodically run the following command: +``` +aws ec2 describe-fpga-images --fpga-image-ids <AFI_ID> +``` +You can get `<AFI_ID>` from the script message or from file **`*_afi_id.txt`**. During generation you will see the following message: +``` +... + "State": { + "Code": "pending" + }, +... +``` +The FPGA image is ready if the command prints `available`: +``` +... + "State": { + "Code": "available" + }, +... +``` + +7. 
Copy `.awsxclbin` into ***hw/run*** folder + +To erase all build data including host application executable but except content of ***afi*** folder use following command: +``` +make TARGET=hw clean +``` + +## HOW TO RUN EXAMPLE + +### Prepare environment +If you relaunch Amazon instance after build you need to repeat environment preparation step: +``` +cd $AWS_FPGA_REPO_DIR +source sdaccel_setup.sh +source $XILINX_SDX/settings64.sh +``` +### Run SW/HW emulation of example + +1. Go to emulation folder of example (**`hw_emu/run`** or **`sw_emu/run`**). +2. Set desired emulation option in `sdaccel.ini` file +3. Launch emulation with the following command: +``` +source run.sh +``` + +### Run example on FPGA + +1. Go to run folder of example (**`hw/run`**). +2. Launch shell +``` +sudo sh +``` +3. Launch application with the following command: +``` +source run.sh +``` + +## REVISION HISTORY + +Date | Readme Version | Release Notes +-------- |----------------|------------------------- +May 2018 | 1.0 | Initial version. 
+ diff --git a/aws_demo/common_makefile b/aws_demo/common_makefile new file mode 100644 index 0000000..b996b1e --- /dev/null +++ b/aws_demo/common_makefile @@ -0,0 +1,196 @@ +######################################## +# # +# Tools section # +# # +######################################## + +XILINX_SDX ?= /opt/Xilinx/SDx/2017.1.op +XILINX_HLS ?= $(XILINX_SDX)/Vivado_HLS + + +SDX_CXX ?= $(XILINX_SDX)/bin/xcpp +XOCC ?= $(XILINX_SDX)/bin/xocc + +RM = rm -f +RMDIR = rm -rf + +ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_1DDR)" + XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.1ddr/runtime/lib/x86_64 +else ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_4DDR)" + XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.4ddr/runtime/lib/x86_64 +else ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_4DDR_DEBUG)" + XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.4ddr_debug/runtime/lib/x86_64 +endif + +XFOPENCV ?= /home/centos/src/project_data/xfopencv + +TARGET ?= hw_emu + +######################################## +# # +# Host section # +# # +######################################## + +HOST_SDx_SRC ?= xcl2 + +SDx_LIB_DIR ?= $(SDACCEL_DIR)/examples/xilinx/libs/xcl2 + +CXXFLAGS += -DSDX_PLATFORM=$(AWS_PLATFORM) -D__USE_XOPEN2K8 +CXXFLAGS += -I$(XILINX_SDX)/runtime/include/1_2/ +CXXFLAGS += -I$(XILINX_SDX)/include/ +CXXFLAGS += -I$(XFOPENCV)/include/ +CXXFLAGS += -I$(SDx_LIB_DIR)/ +CXXFLAGS += -I$(XILINX_HLS)/include +CXXFLAGS += -O2 -Wall -c -fmessage-length=0 -std=c++14 + +#--- Specify OpenCV libraries ---# + +LDFLAGS += -L$(XILINX_SDX)/lnx64/tools/opencv +LDFLAGS += -lopencv_core +LDFLAGS += -lopencv_imgproc +LDFLAGS += -lopencv_highgui + +#--- Specify common libraries ---# + +LDFLAGS += -L$(XILINX_SDX)/lib/lnx64.o +LDFLAGS += -lstdc++ +LDFLAGS += -lpthread +LDFLAGS += -lrt + +#--- Specify AWS libraries ---# + +LDFLAGS += -L$(XILINX_SDX_RUNTIME) +LDFLAGS += -lxilinxopencl + +#--- Specify runtime libraries ---# + +LDFLAGS += -Wl,-rpath,$(XILINX_SDX)/lnx64/tools/opencv +LDFLAGS += -Wl,-rpath,$(XILINX_SDX)/lib/lnx64.o +LDFLAGS += 
-Wl,-rpath,$(XILINX_SDX_RUNTIME) + +#--- Specify objects ---# + +HOST_AWS_DIR = ./ +HOST_BLD_DIR = $(TARGET)/build/host +HOST_RUN_DIR = $(TARGET)/run + +HOST_AWS_OBJ += $(addsuffix .o, $(addprefix $(HOST_BLD_DIR)/, $(HOST_AWS_SRC)) ) +HOST_SDx_OBJ += $(addsuffix .o, $(addprefix $(HOST_BLD_DIR)/, $(HOST_SDx_SRC)) ) + +HOST_OBJ = $(HOST_AWS_OBJ) $(HOST_SDx_OBJ) + +HOST_EXE ?= $(HOST_RUN_DIR)/$(TEST_NAME) + +BUILD_SUBDIRS += $(HOST_BLD_DIR) + + +######################################## +# # +# Kernel section # +# # +######################################## + +XOCC_OPTS += --platform $(AWS_PLATFORM) +XOCC_OPTS += --save-temps +XOCC_OPTS += --report system + +XOCC_INCL += -I$(XFOPENCV)/include +XOCC_INCL += -I/opt/Xilinx/SDx/2017.4/include/ocv +# NOTE(review): the include path above hard-codes SDx 2017.4 while XILINX_SDX defaults to 2017.1.op - confirm this is intended +KERNEL_BLD_DIR = $(TARGET)/build/kernel + +ifeq "$(TARGET)" "hw" + KERNEL_RUN_DIR = $(TARGET)/afi +else + KERNEL_RUN_DIR = $(TARGET)/run + XOCC_OPTS += -g +endif + +BUILD_SUBDIRS += $(KERNEL_BLD_DIR) + +KERNEL_OBJ += $(addsuffix .xo , $(addprefix $(KERNEL_BLD_DIR)/, $(KERNEL)) ) +KERNEL_BIN += $(addsuffix .xclbin, $(addprefix $(KERNEL_RUN_DIR)/, $(KERNEL)) ) + + +######################################## +# # +# Build section # +# # +######################################## + +.PHONY: all + +all: host krnl + +host: $(HOST_EXE) + +krnl: $(KERNEL_BIN) + +clean: + $(RMDIR) $(BUILD_SUBDIRS) + $(RMDIR) .Xil + $(RMDIR) $(HOST_RUN_DIR)/TempConfig + $(RM) $(HOST_RUN_DIR)/*.jpg $(HOST_RUN_DIR)/*.png $(HOST_RUN_DIR)/*.log $(HOST_RUN_DIR)/*.csv $(HOST_RUN_DIR)/*.html + $(RM) $(KERNEL_BIN) + $(RM) $(HOST_EXE) + $(RM) $(HOST_RUN_DIR)/*.*xclbin + +.PHONY: host krnl clean + +#--- Kernel rules ---# + +$(KERNEL_OBJ): $(KERNEL_BLD_DIR)/%.xo : %_kernel_aws.cpp + @echo " " + @echo "================================================================" + @echo "Compilation of $< to $@" + @echo "================================================================" + @echo " " + @mkdir -p $(@D) + $(XOCC) -c -t $(TARGET) $(XOCC_OPTS) $(XOCC_INCL) -k $(*F) --max_memory_ports 
$(*F) -I$(" diff --git a/aws_demo/gaussianfilter/hw/run/run.sh b/aws_demo/gaussianfilter/hw/run/run.sh new file mode 100644 index 0000000..fb16389 --- /dev/null +++ b/aws_demo/gaussianfilter/hw/run/run.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +source /opt/Xilinx/SDx/2017.1.rte.4ddr/setup.sh + +./gaussian_filter_test ../../im0.jpg diff --git a/aws_demo/gaussianfilter/hw_emu/run/run.sh b/aws_demo/gaussianfilter/hw_emu/run/run.sh new file mode 100644 index 0000000..e8dad87 --- /dev/null +++ b/aws_demo/gaussianfilter/hw_emu/run/run.sh @@ -0,0 +1,5 @@ +emconfigutil -f $AWS_PLATFORM + +export XCL_EMULATION_MODE=hw_emu + +./gaussian_filter_test ../../im0.jpg diff --git a/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini b/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini new file mode 100644 index 0000000..63a1cac --- /dev/null +++ b/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini @@ -0,0 +1,5 @@ +[Debug] +timeline_trace=true +device_profile=true +app_debug=true +profile=true diff --git a/aws_demo/gaussianfilter/im0.jpg b/aws_demo/gaussianfilter/im0.jpg new file mode 100644 index 0000000..bce6d36 Binary files /dev/null and b/aws_demo/gaussianfilter/im0.jpg differ diff --git a/aws_demo/gaussianfilter/makefile b/aws_demo/gaussianfilter/makefile new file mode 100644 index 0000000..1a9f186 --- /dev/null +++ b/aws_demo/gaussianfilter/makefile @@ -0,0 +1,25 @@ +######################################## +# # +# Host section # +# # +######################################## + +TEST_NAME = gaussian_filter_test + +HOST_AWS_SRC = xf_gaussian_filter_accel_aws +HOST_AWS_SRC += xf_gaussian_filter_tb + +######################################## +# # +# Kernel section # +# # +######################################## + +KERNEL = xf_gaussian_filter + +######################################## + +include ../common_makefile + + + diff --git a/aws_demo/gaussianfilter/readme.md b/aws_demo/gaussianfilter/readme.md new file mode 100644 index 0000000..9d22445 --- /dev/null +++ b/aws_demo/gaussianfilter/readme.md @@ 
-0,0 +1,183 @@ +# Gaussian Filter # + +Example demonstrates using of **`xf::GaussianBlur()`** and **`xf::resize()`** functions of xfOpenCV library in pipeline. Example designed to process one image once. If you would like to process many images in loop you need to extract from kernel interface wrapper FPGA & kernel initialization and finalization operations and move them to host application before and after processing loop respectively. + +## Code structure ## + +![](./../Code_Structure.png) + +| Component | Source files | +| :- | :- | +| *Kernel Configuration* |**`xf_gaussian_filter_config.h`**
**`xf_config_params.h`**| +| *Host Application* |**`xf_gaussian_filter_tb.cpp`**| +| *Kernel Interface Wrapper* |**`xf_gaussian_filter_accel_aws.cpp`**| +| *Kernel Driver* |**`xcl2.cpp (in SDx library)`**| +| *Kernel* |**`xf_gaussian_filter_kernel_aws.cpp`**| + +## Kernel Configuration # + +Following constants in header files define kernel configuration + +| Constant | Possible values | Default Value | Description | +| :- | :- | :- | :- | +| **`FILTER_SIZE_3`**
**`FILTER_SIZE_5`**
**`FILTER_SIZE_7`**|**`0, 1`**| **`1`**
**`0`**
**`0`**| Select window size of the Gaussian filter. Exactly one of them must be defined as 1; the others must be defined as 0 | +| **`FILTER_WIDTH`** |-|-|The window size of the Gaussian filter. The value is set automatically depending on which **`FILTER_SIZE_n`** is set to 1.| +| **`SIGMA`** |-|-|Standard deviation of the Gaussian filter. The value is set automatically depending on which **`FILTER_SIZE_n`** is set to 1.| +| **`NPC1`** |**`XF_NPPC1`**
**`XF_NPPC8`**|**`XF_NPPC1`**|Select level of parallelism in kernel (number of pixels which kernel process per clock cycle).| +| **`XF_RESIZE_INTERPOLATION`** |**`XF_INTERPOLATION_NN`**
**`XF_INTERPOLATION_BILINEAR`**
**`XF_INTERPOLATION_AREA`**
|**`XF_INTERPOLATION_NN`**|Interpolation technique used by **`xf::resize()`** inside the kernel| +| **`CV_RESIZE_INTERPOLATION`** |**`cv::INTER_NEAREST`**
**`cv::INTER_LINEAR`**
**`cv::INTER_AREA`**
**`others are not suitable`**|**`cv::INTER_NEAREST`**|Interpolation technique used by **`cv::resize()`** in the testbench; it should correspond to **`XF_RESIZE_INTERPOLATION`**| +| **`XF_GAUSSIAN_BORDER`** |**`XF_BORDER_CONSTANT`**
**`XF_BORDER_REPLICATE`**|**`XF_BORDER_CONSTANT`**|The way in which borders will be processed| +| **`CV_GAUSSIAN_BORDER`** |**`cv::BORDER_CONSTANT`**
**`cv::BORDER_REPLICATE`**
**`others are not suitable`**|**`cv::BORDER_CONSTANT`**|The way in which borders will be processed| +| **`COLS_INP`** |**`multiple of 8`**|**`1920`**|Maximum width of input image| +| **`ROWS_INP`** |**`multiple of 8`**|**`1080`**|Maximum height of input image| +| **`SCALE`** |**`> 0 and !=1`**|**`0.5`**|Define scale factor of image after Gaussian Filter.
**Note: The **`xf::resize()`** doesn't support scale factor 1.** | +| **`COLS_OUT`** |**`multiple of 8`**|**`COLS_INP/2`**|Maximum width of output image. Please keep value to correspond to the scale factor (**`SCALE`**). Value should be **`>= ceil(COLS_INP * SCALE)`** and should be multiple of 8.| +| **`ROWS_OUT`** |**`> 0`**|**`ROWS_INP/2`**|Maximum height of output image. Please keep value to correspond to the scale factor (**`SCALE`**). Value should be **`>= ceil(ROWS_INP * SCALE)`**| + +## Host Application ## +The host application reads a test image from file, processes it with the regular OpenCV library on the host, performs the same processing with the FPGA kernel built from xfOpenCV library functions, and compares the results. + +The input image of the example, ***im0.jpg***, is placed in the root folder of the example. The first filter applied to the image is **`xf::GaussianBlur()`**, the next is **`xf::resize()`**. Both have a counterpart with the same name in the OpenCV library. The application calculates the difference between the result images; the images are considered equal if the difference for each pixel does not exceed 1. Result images will be stored in the run folder. + +The following images will be in run folder after execution: + +- ***xf_img_out.jpg*** - result of FPGA kernel processing +- ***cv_img_out.jpg*** - result of OpenCV processing +- ***error.png*** - contains difference of values for each pixel of result images + + +## Kernel Interface Wrapper ## + +With the xfOpenCV library it is convenient for the host application to use the xf::Mat or cv::Mat class and the image manipulation functions. Unfortunately the XOCC kernel compiler doesn't support classes/structures as kernel input/output parameters. To pass xf::Mat to a kernel a wrapper is needed. The kernel interface wrapper converts the interface that is convenient for the host application into the kernel interface available on the Amazon F1 instance. + +For this example the kernel interface wrapper also performs FPGA initialization, kernel downloading, initialization and finalization. 
+ + +| Parameter Name |Direction|Type | Description | +| :- | :- | :- | :- | +| **`img_inp`** |Input | **`xf::Mat &`** | Input image | +| **`img_out`** |Output | **`xf::Mat &`** | Output image | +| **`sigma`** | Input | **`float`** | Standard deviation of the Gaussian filter | + +To forward these parameters to the kernel, the wrapper creates 2 buffers in global memory for the image data. The wrapper decomposes the **`img_inp`** and **`img_out`** objects and passes their members separately. + + +## Kernel Driver ## + +The example uses a modification of the SDx xcl kernel driver v.2 for the Amazon F1 instance. The source code of this driver and its description can be found in the Amazon aws-fpga framework. + +## Kernel ## + +To apply the Gaussian filter and change the size of the processed image, the kernel pipelines functions from the xfOpenCV library as shown in the image below.
+ +![](./Gaussian_Filter_Diagram.png) + +The kernel has following parameters: + +| Parameter Name |Direction|Type | Description | +| :- | :- | :- | :- | +| **`img_inp`** |Input | **`XF_TNAME(XF_8UC1, NPC1) *`** | Pointer to input image buffer | +| **`img_out`** |Output | **`XF_TNAME(XF_8UC1, NPC1) *`** | Pointer to output image buffer| +| **`rows_inp`**| Input | **`int`** | Height of input image | +| **`cols_inp`**| Input | **`int`** | Width of input image | +| **`sigma`** | Input | **`float`** | Standard deviation of of Gaussian Filter | +| **`rows_out`**| Input | **`int`** | Height of output image | +| **`cols_out`**| Input | **`int`** | Width of output image | + +During synthesis for FPGA kernel's parameters should be mapped to HW interfaces supported on Amazon F1 instance. To map kernel parameters **`HLS INTERFACE`** pragma should be used. Supported following interfaces: **`m_axi`** and **`s_axilite`**. For **`m_axi`** offset can be set through **`s_axilite`** port only. + +Because functions from xfOpenCV library operate with **`xf::Mat`** class as image container kernel's parameters should be packed back to objects of this class. To do this you need following: + +- Declare **`xf::Mat`** variable
***Note: due to XOCC issues use default constructor only - do not try initialize class members with help of non-default constructors*** +- Assign image size to **`rows`** and **`cols`** members +- Copy image from input buffer to **`data`** member of **`xf::Mat`** or from **`data`** to output buffer + +```cpp +xf::Mat mi; + +mi.rows = rows_inp; +mi.cols = cols_inp; + +for(int i=0; i < rows_inp; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP + + for(int j=0; j < (cols_inp >> (XF_BITSHIFT(NPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_INP/pNPC1 + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(mi.data + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j) = *(img_inp + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j); + } + } +``` +**Note: `#pragma HLS` doesn't support constants defined through **`#define`** directive - use `const int`. In the code above `pROWS_INP`, `pCOLS_INP` and `pNPC1` are `const int` variables which get values from constants defined in xf_gaussian_filter_config.h with help of #define directive** + +```cpp + const int pROWS_INP = ROWS_INP; + const int pCOLS_INP = COLS_INP; + const int pNPC1 = NPC1; +``` + +Simple declaration of **`xf::Mat`** object create buffer to store whole image with maximum defined size. This buffer use FPGA internal memory blocks and even big FPGA devices could not have enough resources. You should use **`#pragma HLS stream`** to ask HLS convert big RAM buffer to small FIFO buffer + +```cpp + xf::Mat mi; + #pragma HLS stream variable=mi.data depth=pCOLS_INP/pNPC1 +``` + +Please note that **`#pragma HLS stream`** could be used inside dataflow block, therefore kernel body should be declared as dataflow. This also permit pipeline functions from xfOpenCV library. + +```cpp +void kernel(...) +{ + #pragma HLS INTERFACE ... + #pragma HLS INTERFACE ... + + #pragma HLS dataflow + ... 
+} +``` + +## Known Issues + +- #### Kernel can't accept class/structure as parameters +**Solution**: use simple types, pass class/structure members as separate parameters of simple types and compose class/structure object back inside kernel. + +- #### Using non-default constructors can cause kernel suspension on FPGA and HW emulation +**Solution**: use default constructor for object declaration and next assign desired values to the members separately. + +```cpp +xf::Mat mi; + +mi.rows = rows_inp; +mi.cols = cols_inp; +``` + +- #### **`#pragma HLS`** doesn't support constants defined through **`#define`** directive. +**Solution**: use **`const int`** instead + + +```cpp +#define ROWS_INP 1080 + +void kernel(...) +{ + const int pROWS_INP = ROWS_INP; + + for(int i=0; i < rows_inp; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP + ... + } + ... +} +``` + + +## Revision History + +Date | Readme Version | Release Notes +-------- |----------------|------------------------- +May 2018 | 1.0 | Initial version. 
diff --git a/aws_demo/gaussianfilter/sw_emu/run/run.sh b/aws_demo/gaussianfilter/sw_emu/run/run.sh new file mode 100644 index 0000000..35d1bd4 --- /dev/null +++ b/aws_demo/gaussianfilter/sw_emu/run/run.sh @@ -0,0 +1,5 @@ +emconfigutil -f $AWS_PLATFORM + +export XCL_EMULATION_MODE=sw_emu + +./gaussian_filter_test ../../im0.jpg diff --git a/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini b/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini new file mode 100644 index 0000000..63a1cac --- /dev/null +++ b/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini @@ -0,0 +1,5 @@ +[Debug] +timeline_trace=true +device_profile=true +app_debug=true +profile=true diff --git a/aws_demo/gaussianfilter/xf_config_params.h b/aws_demo/gaussianfilter/xf_config_params.h new file mode 100644 index 0000000..dfd727c --- /dev/null +++ b/aws_demo/gaussianfilter/xf_config_params.h @@ -0,0 +1,3 @@ +#define FILTER_SIZE_3 1 +#define FILTER_SIZE_5 0 +#define FILTER_SIZE_7 0 diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp new file mode 100644 index 0000000..9e427fc --- /dev/null +++ b/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp @@ -0,0 +1,57 @@ +#include +#include +#include + +#include "xcl2.hpp" + +#include "xf_gaussian_filter_config.h" + +#define CL_MIGRATE_MEM_OBJECT_KERNEL 0 //OpenCL defines a flag only for migration to the host (CL_MIGRATE_MEM_OBJECT_HOST); this "counterpart" constant (no flag set) names the host-to-device direction to make the program more readable +/* Host-side wrapper around the xf_gaussian_filter kernel: takes the first device returned by xcl::get_xil_devices(), loads the kernel binary, migrates img_inp to the device, launches the kernel and migrates img_out back. sigma is forwarded to the kernel unchanged. */ +void gaussian_filter_accel(xf::Mat &img_inp, xf::Mat &img_out, float sigma) +{ + std::vector devices = xcl::get_xil_devices(); + + cl::Device device = devices[0]; + + cl::Context context(device); + + cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE); + std::string device_name = device.getInfo(); /* NOTE(review): device_name is not used later in this function */ +/* Emulation runs load the .xclbin container, hardware runs load the .awsxclbin one */ + std::string binaryFile = (xcl::is_emulation() || xcl::is_hw_emulation ()) ? 
"xf_gaussian_filter.xclbin" : "xf_gaussian_filter.awsxclbin"; + + std::cout << "======== " << binaryFile << " ========" << std::endl; + + cl::Program::Binaries bins = xcl::import_binary_file(binaryFile); + devices.resize(1); /* keep only devices[0], the device selected above */ + cl::Program program(context, devices, bins); + cl::Kernel kernel(program,"xf_gaussian_filter"); + + //----------- Allocate Buffer in Global Memory -----------// +/* Both buffers use CL_MEM_USE_HOST_PTR on the xf::Mat data pointers; the size passed is rows * cols */ + cl::Buffer buffer_inp(context,CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY , img_inp.rows * img_inp.cols, img_inp.data); + cl::Buffer buffer_out(context,CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, img_out.rows * img_out.cols, img_out.data); + + std::vector writeBufVec; + writeBufVec.push_back(buffer_inp); + + //----------- Migrate input data to device global memory -----------// + + q.enqueueMigrateMemObjects(writeBufVec, CL_MIGRATE_MEM_OBJECT_KERNEL); + + auto krnl = cl::KernelFunctor(kernel); + + //----------- Launch the Kernel -----------// +/* Global and local ranges are both (1,1,1); the arguments follow the order img_inp, img_out, rows_inp, cols_inp, sigma, rows_out, cols_out */ + krnl(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)), buffer_inp, buffer_out, img_inp.rows, img_inp.cols, sigma, img_out.rows, img_out.cols); + + //----------- Copy Result from Device Global Memory to Host Local Memory -----------// + + std::vector readBufVec; + readBufVec.push_back(buffer_out); + + q.enqueueMigrateMemObjects(readBufVec,CL_MIGRATE_MEM_OBJECT_HOST); +/* Wait for the commands queued on q before returning */ + q.finish(); +} diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_config.h b/aws_demo/gaussianfilter/xf_gaussian_filter_config.h new file mode 100644 index 0000000..0a35c2b --- /dev/null +++ b/aws_demo/gaussianfilter/xf_gaussian_filter_config.h @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2016, Xilinx, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+ +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#ifndef _XF_GAUSSIAN_FILTER_CONFIG_H_ +//{ + #define _XF_GAUSSIAN_FILTER_CONFIG_H_ + + #include "hls_stream.h" + #include "common/xf_common.h" + #include "common/xf_utility.h" + #include "imgproc/xf_gaussian_filter.hpp" + #include "xf_config_params.h" + + typedef unsigned short int uint16_t; + + #define SCALE ( 0.5f ) +/* ROWS_OUT / COLS_OUT below are written as INP / 2, which corresponds to SCALE = 0.5; keep them consistent when SCALE changes */ + #define ROWS_INP ( 1080 ) + #define COLS_INP ( 1920 ) + + #define ROWS_OUT ( ROWS_INP / 2 ) + #define COLS_OUT ( COLS_INP / 2 ) + + //----------------- Filters parameters -----------------// + + #define XF_RESIZE_INTERPOLATION XF_INTERPOLATION_NN // Interpolation type for xf::resize() inside kernel + #define CV_RESIZE_INTERPOLATION cv::INTER_NEAREST // Interpolation type for cv::resize() called from testbench + + #define XF_GAUSSIAN_BORDER XF_BORDER_CONSTANT // Border type of xfopencv Gaussian filter inside kernel + #define CV_GAUSSIAN_BORDER cv::BORDER_CONSTANT // Border type of opencv Gaussian filter called from testbench + + #if FILTER_SIZE_3 // Set Gaussian filter parameters depending on constant defined in xf_config_params.h + //{ + #define FILTER_WIDTH ( 3 ) + #define SIGMA ( 0.5f) + //} + #elif FILTER_SIZE_5 + //{ + #define FILTER_WIDTH ( 5 ) + #define SIGMA ( 0.8333f ) + //} + #elif FILTER_SIZE_7 + //{ + #define FILTER_WIDTH ( 7 ) + #define SIGMA ( 1.16666f ) + //} + #endif +/* NOTE: there is no #else branch - if none of FILTER_SIZE_3/5/7 is non-zero, FILTER_WIDTH and SIGMA stay undefined */ + #define NPC1 XF_NPPC1 +/* Host-side kernel wrapper; defined in xf_gaussian_filter_accel_aws.cpp */ + void gaussian_filter_accel(xf::Mat &img_inp, xf::Mat &img_out, float sigma); +//} +#endif //_XF_GAUSSIAN_FILTER_CONFIG_H_ diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp new file mode 100644 index 0000000..50207c6 --- /dev/null +++ b/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp @@ -0,0 +1,96 @@ +//Includes +#include +#include +#include + +#include "hls_stream.h" + +#include "common/xf_common.h" + +#include "xf_gaussian_filter_config.h" + +#include 
"imgproc/xf_gaussian_filter.hpp" +#include "imgproc/xf_resize.hpp" + +extern "C" void xf_gaussian_filter(XF_TNAME(XF_8UC1, NPC1) *img_inp, XF_TNAME(XF_8UC1, NPC1) *img_out, int rows_inp, int cols_inp, float sigma, int rows_out, int cols_out); +/* Kernel entry point: copies img_inp into mi, runs xf::GaussianBlur (mi -> mf) and xf::resize (mf -> mo), then copies mo to img_out. rows_inp/cols_inp and rows_out/cols_out are the input and output image sizes; sigma is passed to xf::GaussianBlur. */ +void xf_gaussian_filter(XF_TNAME(XF_8UC1, NPC1) *img_inp, XF_TNAME(XF_8UC1, NPC1) *img_out, int rows_inp, int cols_inp, float sigma, int rows_out, int cols_out) +{ + #pragma HLS INTERFACE m_axi port=img_inp offset=slave bundle=gmem + #pragma HLS INTERFACE m_axi port=img_out offset=slave bundle=gmem + + #pragma HLS INTERFACE s_axilite port=img_inp bundle=control + #pragma HLS INTERFACE s_axilite port=img_out bundle=control + + #pragma HLS INTERFACE s_axilite port=rows_inp bundle=control + #pragma HLS INTERFACE s_axilite port=cols_inp bundle=control + #pragma HLS INTERFACE s_axilite port=sigma bundle=control + + #pragma HLS INTERFACE s_axilite port=rows_out bundle=control + #pragma HLS INTERFACE s_axilite port=cols_out bundle=control + + #pragma HLS INTERFACE s_axilite port=return bundle=control + + #pragma HLS dataflow +/* const int copies of the configuration macros; these are the names used in the HLS pragmas below */ + const int pROWS_INP = ROWS_INP; + const int pCOLS_INP = COLS_INP; + + const int pROWS_OUT = ROWS_OUT; + const int pCOLS_OUT = COLS_OUT; + + const int pNPC1 = NPC1; + + xf::Mat mi; + xf::Mat mf; + + #pragma HLS stream variable=mi.data depth=pCOLS_INP/pNPC1 + #pragma HLS stream variable=mf.data depth=pCOLS_INP/pNPC1 + + xf::Mat mo; + + #pragma HLS stream variable=mo.data depth=pCOLS_OUT/pNPC1 +/* Objects are default-constructed above; the image sizes are assigned to the members here */ + mi.rows = rows_inp; mi.cols = cols_inp; + mf.rows = rows_inp; mf.cols = cols_inp; + + mo.rows = rows_out; mo.cols = cols_out; + + /* Stage 1: copy the input image from img_inp into mi.data, one XF_TNAME(XF_8UC1, NPC1) element per inner iteration */ + + for(int i=0; i < rows_inp; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP + + for(int j=0; j < (cols_inp >> (XF_BITSHIFT(NPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_INP/pNPC1 + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(mi.data + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j) = *(img_inp + 
i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j); + } + } +/* Stage 2: Gaussian blur of mi into mf */ + xf::GaussianBlur(mi, mf, sigma); +/* Stage 3: resize mf into mo; the output size was assigned to mo.rows / mo.cols above */ + + xf::resize (mf, mo); +/* Stage 4: copy the result from mo.data to the img_out buffer */ + for(int i=0; i < rows_out; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_OUT + + for(int j=0; j < (cols_out >> (XF_BITSHIFT(NPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_OUT/pNPC1 + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(img_out + i*(cols_out >> (XF_BITSHIFT(NPC1))) +j) = *(mo.data + i*(cols_out >> (XF_BITSHIFT(NPC1))) +j) ; + } + } + +} + + diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp new file mode 100644 index 0000000..05307d6 --- /dev/null +++ b/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp @@ -0,0 +1,138 @@ +/*************************************************************************** + Copyright (c) 2016, Xilinx, Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ***************************************************************************/ + +#include "opencv2/opencv.hpp" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" + +#include "common/xf_sw_utils.h" + +#include "xf_gaussian_filter_config.h" + +using namespace std; + +int main(int argc, char **argv) +{ + if (argc != 2) + { + printf("Usage: \n"); + return -1; + } + + cv::Mat cv_img_inp, cv_img_out, cv_img_ref; + cv::Mat diff; + + int rows_out, cols_out; + + cv_img_inp = cv::imread(argv[1], 0); // reading in the color image + + if (!cv_img_inp.data) + { + printf("Failed to load the image ... 
!!!"); + return -1; + } + + rows_out = cv_img_inp.rows * SCALE; + cols_out = cv_img_inp.cols * SCALE; + + cv_img_ref.create(cv_img_inp.rows, cv_img_inp.cols, cv_img_inp.depth()); // create memory for OCV output image + + cv_img_out.create(rows_out, cols_out, cv_img_inp.depth()); // create memory for OCV output image + + float sigma = SIGMA; + + // OpenCV Gaussian filter function + cv::GaussianBlur(cv_img_inp, cv_img_ref, cvSize(FILTER_WIDTH, FILTER_WIDTH), SIGMA, SIGMA, CV_GAUSSIAN_BORDER); + + cv::resize(cv_img_ref, cv_img_out, cvSize(cv_img_out.cols, cv_img_out.rows), 0, 0, CV_RESIZE_INTERPOLATION ); + + imwrite("cv_img_out.jpg", cv_img_out); + + + diff.create(cv_img_out.rows, cv_img_out.cols, cv_img_out.depth()); // create memory for diff image + + + //=====================================================================// + + + xf::Mat xf_img_inp(cv_img_inp.rows,cv_img_inp.cols); + xf::Mat xf_img_out(cv_img_out.rows,cv_img_out.cols); + + xf_img_inp = xf::imread(argv[1], 0); + + gaussian_filter_accel(xf_img_inp, xf_img_out, sigma); + + + // Write output image + xf::imwrite("xf_img_out.jpg",xf_img_out); + + + xf::absDiff(cv_img_out, xf_img_out, diff); // Compute absolute difference image + + imwrite("error.png", diff); // Save the difference image for debugging purpose + + // Find minimum and maximum differences. 
+ + #define THRESHOLD 1 + + double minval = 256, maxval = 0; + int cnt = 0; + + for( int i = 0; i < diff.rows; i++ ) + { + for( int j = 0; j < diff.cols; j++ ) + { + uchar v = diff.at(i, j); + + if( v > THRESHOLD ) + cnt++; + + if (minval > v) minval = v; + if (maxval < v) maxval = v; + } + } + + float err_per = 100.0 * (float) cnt / (cv_img_inp.rows * cv_img_inp.cols); + + printf( "\nMinimum error in intensity = %f\n", minval); + printf( "Maximum error in intensity = %f\n", maxval); + + printf( "\nPercentage of pixels above error threshold = %f\n", err_per); + + if(err_per > 1) + { + printf("\nTest Failed\n"); + return -1; + } + + printf("\nTest Pass\n"); + + return 0; +} diff --git a/aws_demo/ide/vs/Gaussian_Filter.vcxproj b/aws_demo/ide/vs/Gaussian_Filter.vcxproj new file mode 100644 index 0000000..274464e --- /dev/null +++ b/aws_demo/ide/vs/Gaussian_Filter.vcxproj @@ -0,0 +1,82 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + + + + + + + + + + + + + + {4F175088-E060-4DD3-B199-92A67421ACE2} + Gaussian_Filter + + + + Application + true + v110 + MultiByte + + + Application + false + v110 + true + MultiByte + + + + + + + + + + + + + + + Level3 + Disabled + ..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories) + + + true + + + + + Level3 + MaxSpeed + true + true + ..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories) + + + true + true + true + + + + + + \ No newline at end of file diff --git a/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters b/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters new file mode 100644 index 0000000..bc11928 --- /dev/null +++ b/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters @@ -0,0 +1,25 @@ + + + + + {b5f48f04-d9bc-4b50-b9a6-c222b80c54d0} + + + + + 
+ + + + + + + + + h + + + h + + + \ No newline at end of file diff --git a/aws_demo/ide/vs/Stereo_Pipeline.vcxproj b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj new file mode 100644 index 0000000..1cc6c15 --- /dev/null +++ b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj @@ -0,0 +1,86 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + {BE04D816-584B-4836-9C40-FBA3C41593F2} + Stereo_Pipeline + + + + Application + true + v110 + MultiByte + + + Application + false + v110 + true + MultiByte + + + + + + + + + + + + + + + Level3 + Disabled + ..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories) + + + true + + + + + Level3 + MaxSpeed + true + true + ..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories) + + + true + true + true + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters new file mode 100644 index 0000000..f7ce491 --- /dev/null +++ b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters @@ -0,0 +1,42 @@ + + + + + + + + + + + + + {c22c4267-6cf2-4761-b9f9-50716799aa60} + + + {4f3f7a8a-73c4-455b-a5de-39d8d14135a4} + + + + + xfopencv + + + xfopencv + + + xfopencv + + + h + + + h + + + h + + + h + + + \ No newline at end of file diff --git a/aws_demo/ide/vs/aws_demo.sln b/aws_demo/ide/vs/aws_demo.sln new file mode 100644 index 0000000..1b5179c --- /dev/null +++ b/aws_demo/ide/vs/aws_demo.sln @@ -0,0 +1,32 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "examples", "examples.vcxproj", "{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}" +EndProject 
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Gaussian_Filter", "Gaussian_Filter.vcxproj", "{4F175088-E060-4DD3-B199-92A67421ACE2}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Stereo_Pipeline", "Stereo_Pipeline.vcxproj", "{BE04D816-584B-4836-9C40-FBA3C41593F2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Debug|Win32.ActiveCfg = Debug|Win32 + {4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Debug|Win32.Build.0 = Debug|Win32 + {4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Release|Win32.ActiveCfg = Release|Win32 + {4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Release|Win32.Build.0 = Release|Win32 + {4F175088-E060-4DD3-B199-92A67421ACE2}.Debug|Win32.ActiveCfg = Debug|Win32 + {4F175088-E060-4DD3-B199-92A67421ACE2}.Debug|Win32.Build.0 = Debug|Win32 + {4F175088-E060-4DD3-B199-92A67421ACE2}.Release|Win32.ActiveCfg = Release|Win32 + {4F175088-E060-4DD3-B199-92A67421ACE2}.Release|Win32.Build.0 = Release|Win32 + {BE04D816-584B-4836-9C40-FBA3C41593F2}.Debug|Win32.ActiveCfg = Debug|Win32 + {BE04D816-584B-4836-9C40-FBA3C41593F2}.Debug|Win32.Build.0 = Debug|Win32 + {BE04D816-584B-4836-9C40-FBA3C41593F2}.Release|Win32.ActiveCfg = Release|Win32 + {BE04D816-584B-4836-9C40-FBA3C41593F2}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/aws_demo/ide/vs/examples.vcxproj b/aws_demo/ide/vs/examples.vcxproj new file mode 100644 index 0000000..922b667 --- /dev/null +++ b/aws_demo/ide/vs/examples.vcxproj @@ -0,0 +1,89 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + {4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B} + examples + examples + + + + Application + true + v110 + MultiByte + + + Application + false + v110 + true + MultiByte + + + + + + + 
+ + + + + + false + + + + Level3 + Disabled + + + true + + + + + Level3 + MaxSpeed + true + true + + + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/aws_demo/ide/vs/examples.vcxproj.filters b/aws_demo/ide/vs/examples.vcxproj.filters new file mode 100644 index 0000000..877d59f --- /dev/null +++ b/aws_demo/ide/vs/examples.vcxproj.filters @@ -0,0 +1,56 @@ + + + + + accumulate + + + accumulate + + + accumulate + + + stereopipeline + + + stereopipeline + + + stereopipeline + + + stereopipeline + + + + + accumulate + + + accumulate + + + stereopipeline + + + stereopipeline + + + + + {fdb0de10-a233-42a7-908e-33cb0564f49c} + + + {8cbe6705-9bb3-4895-918d-c42dcf0d36f6} + + + + + stereopipeline + + + stereopipeline + + + \ No newline at end of file diff --git a/aws_demo/make_description.md b/aws_demo/make_description.md new file mode 100644 index 0000000..90a7c88 --- /dev/null +++ b/aws_demo/make_description.md @@ -0,0 +1,71 @@ +# Makefiles description # + +Examples for Amazon F1 instance use example specific and common makefile. Example specific makefile is placed at root folder of example and include common makefile ([`aws_demo/common_makefile`](common_makefile)). + +## Example specific makefile ## +Example specific makefile contains following variables to list source files for host application and kernel. + +### Variables for host part ### + +| Variable Name |Necessity | Purpose | +| :- | :- | :- | +| **`TEST_NAME`** |Mandatory| Name of the host executable which will be created as successful build result in **`run`** folder | +| **`HOST_AWS_SRC`** |Mandatory| List of host source files placed in root folder of example | +| **`HOST_SDx_SRC`**
**`SDx_LIB_DIR`** | Optional | **`HOST_SDx_SRC`** contains the list of SDx kernel driver source files that provide the interaction between the host and the FPGA kernel on an Amazon F1 instance, and **`SDx_LIB_DIR`** contains the path to these sources. Originally all examples use the xcl driver v.2. Default values are assigned in [`common_makefile`](common_makefile). If you would like to use another driver, you need to do the following:
1) Modify the example source code to use the desired driver;
2) Assign the list of appropriate library source files to **`HOST_SDx_SRC`**;
3) Set the path to the library in the **`SDx_LIB_DIR`** variable.
Settings of these variables in example specific makefile override default values of [`common_makefile`](common_makefile) | + +### Variables for kernel part ### + +| Variable Name |Necessity | Purpose | +| :- | :- | :- | +| **`KERNEL`** |Mandatory| Name of the kernel should be same as kernel source file name (without extension) | + + +## Common makefile ## +Common makefile contains following variables and makefile's targets for host application and kernel. + +### Variables for host part ### + +| Variable Name |Default value | Description | +| :- | :- | :- | +| **`XILINX_SDX`** |**`/opt/Xilinx/SDx/2017.1.op`**| Path to Xilinx SDx toolset on Amazon F1 instance | +| **`XILINX_HLS`** |**`$(XILINX_SDX)/Vivado_HLS`** | Path to Xilinx Vivado HLS | +| **`SDX_CXX`** |**`$(XILINX_SDX)/bin/xcpp`** | Alias for Xilinx SDx compiler | +| **`XOCC`** |**`$(XILINX_SDX)/bin/xocc`** | ALias for Xilinx XOCC compiler | +| **`XILINX_SDX_RUNTIME`**| - | Set automatically to run-time library of selected platform (value of **`$(AWS_PLATFORM)`**).| +| **`XFOPENCV`** |**`/home/centos/src/project_data/xfopencv`** | Location of xfOpenCV library.
***Note: If you place xfOpenCV library in other location than recommended (default) please update this variable!*** | +| **`TARGET`** |**`hw_emu`** | The target flow. This variable should be override by desired target flow (**`hw/sw_emu/hw_emu`**) in make command line | +| **`HOST_SDx_SRC`** |**`xcl2`** | List of SDx kernel driver source files which provide interaction between host and FPGA kernel on Amazon F1 instance. Originally all examples use xcl driver v.2.| +| **`SDx_LIB_DIR`** |**`$(SDACCEL_DIR)/examples/xilinx/libs/xcl2`** | Path to SDx kernel driver source files | +| **`CXXFLAGS`** |- | Contains SDx compiler options. Please see default value in [`common_makefile`](common_makefile) | +| **`LDFLAGS`** |- | Contains SDx linker options. Please see default value in [`common_makefile`](common_makefile)
***Note: Host application needs specific version of run-time shared libraries. Important to explicitly specify for linker needed libraries with help of `-rpath` option. Take it in mind in case of [`common_makefile`](common_makefile) modification *** | +| **`HOST_AWS_DIR`** |**`./`** | Root folder of example | +| **`HOST_BLD_DIR`** |**`$(TARGET)/build/host`** | Build folder for host application build artifacts| +| **`HOST_RUN_DIR`** |**`$(TARGET)/run`** | Run folder of host application | +| **`HOST_EXE`** |**`$(HOST_RUN_DIR)/$(TEST_NAME)`** | Host application executable name with path | + +### Variables for kernel part ### + +| Variable Name |Default value | Description | +| :- | :- | :- | +| **`XOCC_OPTS`** |- | Contains XOCC options. Please see default value in [`common_makefile`](common_makefile) | +| **`XOCC_INCL`** |- | Contains paths to search header files. Please see default value in [`common_makefile`](common_makefile) | +| **`KERNEL_BLD_DIR`** |**`$(TARGET)/build/kernel`** | Build folder for kernel build artifacts| +| **`KERNEL_RUN_DIR`** |- | Folder to store kernel binary (`.xclbin`). Default value depends on target flow. Please see default value in [`common_makefile`](common_makefile)| + + +### Makefile targets ### + +| Target label | Description | +| :- | :- | +| **`all`** | Build host application and kernel for target flow specified by **`$(TARGET)`** variable | +| **`host`** | Build host application only for target flow specified by **`$(TARGET)`** variable | +| **`krnl`** | Build kernel only for target flow specified by **`$(TARGET)`** variable | +| **`clean`** | Clean build artifacts of target flow specified by **`$(TARGET)`** variable.
**_Note: afi folder of FPGA flow ($(TARGET) == hw) kept untouched. You should clean it manually if needed_** | + + + +## REVISION HISTORY + +Date | Readme Version | Release Notes +-------- |----------------|------------------------- +May 2018 | 1.0 | Initial version. diff --git a/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png b/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png new file mode 100644 index 0000000..94c4f14 Binary files /dev/null and b/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png differ diff --git a/aws_demo/stereopipeline/cameraParameters.h b/aws_demo/stereopipeline/cameraParameters.h new file mode 100644 index 0000000..491ace5 --- /dev/null +++ b/aws_demo/stereopipeline/cameraParameters.h @@ -0,0 +1,173 @@ +typedef float param_T; + +//#define OLD720PPARAMS +//#define JACKSCAMERA_FHD +//#define JACKSCAMERA_HACKF_720P +//#define DANSCAMERA_FHD +//#define DANSCAMERA_720P +#define DANSCAMERA_HACKF_720P + +#ifdef OLD720PPARAMS +param_T cameraMA_l[9] = { 1000, 0.0, 950, 0.0, 1000, 950, 0, 0, 1 }; +param_T irA_l[9] = { 0.001,0, -0.95, 0, 0.001,-0.95, 0,0,1 }; +param_T distC_l[5] = { 0,0,0,0,0 }; +param_T cameraMA_r[9] = { 1000, 0.0, 950, 0.0, 1000, 950, 0, 0, 1 }; +param_T irA_r[9] = { 0.001,0, -0.95, 0, 0.001,-0.95, 0,0,1 }; +param_T distC_r[5] = { 0,0,0,0,0 }; +#endif + + +#ifdef JACKSCAMERA_HACKF_720P +param_T cameraMA_l[9] = +{ 933.6330000000, 0.0000000000, 695.1210000000, + 0.0000000000, 933.6330000000, 357.9060000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T cameraMA_r[9] = +{ 933.0330000000, 0.0000000000, 713.4870000000, + 0.0000000000, 933.0330000000, 371.9880000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T distC_l[5] = +{ -0.1742480000, 0.0257726000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T distC_r[5] = +{ -0.1761240000, 0.0290219000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T irA_l[9] = +{ 0.0012029958, 0.0000006953, -0.8686828369, + -0.0000006919, 0.0012030057, -0.4384867217, + 
0.0000049680, -0.0000008223, 0.9967224703}; + +param_T irA_r[9] = +{ 0.0012030062, 0.0000000000, -0.8643069377, + -0.0000000000, 0.0012030059, -0.4403546203, + 0.0000000000, 0.0000008238, 0.9996986971}; +#endif + + +#ifdef JACKSCAMERA_FHD +// ZED 2x 1080p Stereo Camera +// from SN2484.conf [LEFT_CAM_FHD] fx,cx,fy,cy +// Camera Mat Left: +param_T cameraMA_l[9] = +{ 1400.4500000000, 0.0000000000, 1073.2400000000, + 0.0000000000, 1400.4500000000, 538.8110000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +//inv(Rotation Mat * new Camera Mat) Left: +param_T irA_l[9] = +{ 0.0008019968, 0.0000004635, -0.9044744614, + -0.0000004613, 0.0008020035, -0.4432839221, + 0.0000033120, -0.0000005482, 0.9965779658}; + +// from SN2484.conf [LEFT_CAM_FHD] k1,k2 +param_T distC_l[5] = +{ -0.174248, 0.0257726, 0, 0, 0 }; + +// from SN2484.conf [RIGHT_CAM_FHD] fx,cx,fy,cy +// Camera Mat Right: +param_T cameraMA_r[9] = +{ 1399.5500000000, 0.0000000000, 1109.9700000000, + 0.0000000000, 1399.5500000000, 566.9750000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +// inv(Rotation Mat * new Camera Mat) Right: +param_T irA_r[9] = +{ 0.0008020038, 0.0000000000, -0.9000960887, + -0.0000000000, 0.0008020036, -0.4451724061, + 0.0000000000, 0.0000005492, 0.9996953980}; + +// from SN2484.conf [RIGHT_CAM_FHD] k1,k2 +param_T distC_r[5] = +{ -0.176124, 0.0290219, 0, 0, 0 }; + +#endif + +#ifdef DANSCAMERA_720P +param_T cameraMA_l[9] = +{ 699.8780000000, 0.0000000000, 663.4510000000, + 0.0000000000, 699.8780000000, 377.0150000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T cameraMA_r[9] = +{ 700.0990000000, 0.0000000000, 678.2970000000, + 0.0000000000, 700.0990000000, 359.6230000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T distC_l[5] = +{ -0.1693980000, 0.0227329000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T distC_r[5] = +{ -0.1705810000, 0.0249444000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T irA_l[9] = +{ 0.0024720519, -0.0000000039, 
-1.7464382128, + 0.0000000015, 0.0024721905, -0.9765267985, + 0.0000261810, 0.0000002197, 0.9814731201}; + +param_T irA_r[9] = +{ 0.0024721905, 0.0000000000, -1.7359468834, + -0.0000000000, 0.0024721905, -0.9763479760, + -0.0000000000, -0.0000002197, 1.0000867727}; +#endif + +#ifdef DANSCAMERA_HACKF_720P +param_T cameraMA_l[9] = +{ 933.1730000000, 0.0000000000, 663.4510000000, + 0.0000000000, 933.1730000000, 377.0150000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T cameraMA_r[9] = +{ 933.4670000000, 0.0000000000, 678.2970000000, + 0.0000000000, 933.4670000000, 359.6230000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T distC_l[5] = +{ -0.1693980000, 0.0227329000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T distC_r[5] = +{ -0.1705810000, 0.0249444000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T irA_l[9] = +{ 0.0011976323, -0.0000000019, -0.8153011732, + 0.0000000007, 0.0011976994, -0.4422348617, + 0.0000126839, 0.0000001064, 0.9913820905}; + +param_T irA_r[9] = +{ 0.0011976994, 0.0000000000, -0.8047567905, + -0.0000000000, 0.0011976994, -0.4420566166, + -0.0000000000, -0.0000001064, 1.0000392898}; +#endif + + +#ifdef DANSCAMERA_FHD +param_T cameraMA_l[9] = +{ 1399.7600000000, 0.0000000000, 1009.9000000000, + 0.0000000000, 1399.7600000000, 577.0300000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T cameraMA_r[9] = +{ 1400.2000000000, 0.0000000000, 1039.5900000000, + 0.0000000000, 1400.2000000000, 542.2460000000, + 0.0000000000, 0.0000000000, 1.0000000000}; + +param_T distC_l[5] = +{ -0.1693980000, 0.0227329000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T distC_r[5] = +{ -0.1705810000, 0.0249444000, 0.0000000000, 0.0000000000, 0.0000000000}; + +param_T irA_l[9] = +{ 0.0007984219, -0.0000000012, -0.8338509656, + 0.0000000005, 0.0007984666, -0.4484861710, + 0.0000084559, 0.0000000710, 0.9911850779}; + +param_T irA_r[9] = +{ 0.0007984666, 0.0000000000, -0.8233076329, + -0.0000000000, 0.0007984666, 
-0.4483079145, + -0.0000000000, -0.0000000710, 1.0000398454}; +#endif + + + diff --git a/aws_demo/stereopipeline/hw/afi/gen_afi.sh b/aws_demo/stereopipeline/hw/afi/gen_afi.sh new file mode 100644 index 0000000..d3ec1e7 --- /dev/null +++ b/aws_demo/stereopipeline/hw/afi/gen_afi.sh @@ -0,0 +1,31 @@ +#!/bin/bash +echo aws s3 rm --recursive s3://xfsp +aws s3 rm --recursive s3://xfsp + +echo aws s3 rb s3://xfsp +aws s3 rb s3://xfsp + + +echo aws s3 mb s3://xfsp +aws s3 mb s3://xfsp + +aws s3 mb s3://xfsp/dcp +touch FILES_GO_HERE.txt +aws s3 cp FILES_GO_HERE.txt s3://xfsp/dcp/ + + +aws s3 mb s3://xfsp/log +touch LOGS_FILES_GO_HERE.txt +aws s3 cp LOGS_FILES_GO_HERE.txt s3://xfsp/log/ + +aws s3 ls --recursive s3://xfsp + +rm -f FILES_GO_HERE.txt +rm -f LOGS_FILES_GO_HERE.txt + +$SDACCEL_DIR/tools/create_sdaccel_afi.sh -xclbin=xf_stereo_pipeline.xclbin -s3_bucket=xfsp -s3_dcp_key=dcp -s3_logs_key=log + +cat *afi_id* + +echo "use following command to check afi ready" +echo "aws ec2 describe-fpga-images --fpga-image-id " diff --git a/aws_demo/stereopipeline/hw/run/run.sh b/aws_demo/stereopipeline/hw/run/run.sh new file mode 100644 index 0000000..08c6dc4 --- /dev/null +++ b/aws_demo/stereopipeline/hw/run/run.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +source /opt/Xilinx/SDx/2017.1.rte.4ddr/setup.sh + +./stereo_pipeline_test ../../left.png ../../right.png diff --git a/aws_demo/stereopipeline/hw_emu/run/run.sh b/aws_demo/stereopipeline/hw_emu/run/run.sh new file mode 100644 index 0000000..277ae3d --- /dev/null +++ b/aws_demo/stereopipeline/hw_emu/run/run.sh @@ -0,0 +1,5 @@ +emconfigutil -f $AWS_PLATFORM + +export XCL_EMULATION_MODE=hw_emu + +./stereo_pipeline_test ../../left.png ../../right.png diff --git a/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini b/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini new file mode 100644 index 0000000..63a1cac --- /dev/null +++ b/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini @@ -0,0 +1,5 @@ +[Debug] +timeline_trace=true +device_profile=true 
+app_debug=true +profile=true diff --git a/aws_demo/stereopipeline/left.png b/aws_demo/stereopipeline/left.png new file mode 100644 index 0000000..2819082 Binary files /dev/null and b/aws_demo/stereopipeline/left.png differ diff --git a/aws_demo/stereopipeline/makefile b/aws_demo/stereopipeline/makefile new file mode 100644 index 0000000..1eabcfc --- /dev/null +++ b/aws_demo/stereopipeline/makefile @@ -0,0 +1,22 @@ +######################################## +# # +# Host section # +# # +######################################## + +TEST_NAME = stereo_pipeline_test + +HOST_AWS_SRC += xf_stereo_pipeline_accel_aws +HOST_AWS_SRC += xf_stereo_pipeline_tb + +######################################## +# # +# Kernel section # +# # +######################################## + +KERNEL = xf_stereo_pipeline + +######################################## + +include ../common_makefile diff --git a/aws_demo/stereopipeline/readme.md b/aws_demo/stereopipeline/readme.md new file mode 100644 index 0000000..42ace91 --- /dev/null +++ b/aws_demo/stereopipeline/readme.md @@ -0,0 +1,198 @@ +# Stereo Pipeline # + +Disparity map generation is one of the first steps in creating a three dimensional map of the environment. The xfOpenCV library has components to build an image processing pipeline to compute a disparity map given the camera parameters and inputs from a stereo camera setup. + +Example demonstrates using of **`xf::InitUndistortRectifyMapInverse()`**, **`xf::remap()`** and **`xf::StereoBM()`** functions of xfOpenCV library in pipeline to compute disparity map. Example designed to process one image set (image from left and right cameras) once. If you would like to process many sets of images in loop you need to extract from kernel interface wrapper FPGA & kernel initialization and finalization operations and move them to host application before and after processing loop respectively. 
+ +## Code structure ## + +![](./../Code_Structure.png) + +| Component | Source files | +| :- | :- | +| *Kernel Configuration* |**`xf_stereo_pipeline_config.h`**
**`xf_config_params.h`**
**`cameraParameters.h`**| +| *Host Application* |**`xf_stereo_pipeline_tb.cpp`**| +| *Kernel Interface Wrapper* |**`xf_stereo_pipeline_accel_aws.cpp`**| +| *Kernel Driver* |**`xcl2.cpp (in SDx library)`**| +| *Kernel* |**`xf_stereo_pipeline_kernel_aws.cpp`**| + +## Kernel Configuration # + +Following constants in header files define kernel configuration + +| Constant | Possible values | Default Value | Description | +| :- | :- | :- | :- | +| **`XF_WIDTH`** |**`> 0`**|**`1280`**|Maximum width of input image| +| **`XF_HEIGHT`** |**`> 0`**|**`720`**|Maximum height of input image| +| **`XF_CAMERA_MATRIX_SIZE`**|**`9`**|**`9`**|Number of element in camera coordinate system matrix (9 == 3 x 3 matrix)| +| **`XF_DIST_COEFF_SIZE`** |**`4, 5, 8`**|**`5`**|Size of array with distortion coefficients| +| **`NO_OF_DISPARITIES`** |**`0 < value < XF_WIDTH &&`**
**`value >= PARALLEL_UNITS &&`**
**`value % PARALLEL_UNITS == 0`**| **`48`**| Number of disparities | +| **`PARALLEL_UNITS`** |**` > 0`**|**`16`**|Number of disparities to be computed in parallel| +| **`SAD_WINDOW_SIZE`** |**`value % 2 == 1 &&`**
**`< minimum of image height and width &&`**
**`< 21`**|15|Size of the window used for disparity computation| +| **`XF_REMAP_BUFSIZE`** |**`> 0`**|**`128`** |Number of input image rows to be buffered inside **`xf::remap()`**| + +## Host Application ## +Host application reads two test images from file (images of left and right cameras) and forward them with cameras matrices, distortion coefficients and transformation matrices predefined in **`cameraParameters.h`** to the kernel for disparity map computation. + +Input images of example **_left.png_** and **_right.png_** placed in root folder of example. Disparity map calculated in kernel writing as image to **_hls_output.png_**. + + +## Kernel Interface Wrapper ## + +In conjunction with xfOpenCV library on host application is convenient to use xf::Mat or cv::Mat class and image manipulation functions. Unfortunately the XOCC kernel compiler doesn't support classes/structures as kernel input/output parameters. To pass xf::Mat to a kernel a wrapper is needed. The kernel interface wrapper convert interface convenient to host application to kernel interface available in Amazon F1 instance. + +For this example kernel interface wrapper also perform FPGA initialization, kernel downloading, initialization and finalization. 
+ + +| Parameter Name |Direction|Type | Description | +| :- | :- | :- | :- | +| **`xf_img_l`** |Input | **`xf::Mat**`XF_WIDTH, XF_NPPC1> &`** | Input image from left camera | +| **`xf_img_r`** |Input | **`xf::Mat**`XF_WIDTH, XF_NPPC1> &`** | Input image from right camera | +| **`xf_img_d`** |Output| **`xf::Mat**` XF_WIDTH, XF_NPPC1> &`**| Output disparity map | +| **`bm_state`** |Input | **`xf::xFSBMState**`NO_OF_DISPARITIES, PARALLEL_UNITS> &`** | Set of various parameters regarding the stereo block matching algorithm | +| **`cameraMA_l_fix`**|Input | **`ap_fixed<32,12>`**| Left camera parameters matrix | +| **`cameraMA_r_fix`**|Input | **`ap_fixed<32,12>`**| Right camera parameters matrix | +| **`distC_l_fix`** |Input | **`ap_fixed<32,12>`**| Left image distortion coefficients | +| **`distC_r_fix`** |Input | **`ap_fixed<32,12>`**| Right image distortion coefficients | +| **`irA_l_fix`** |Input | **`ap_fixed<32,12>`**| Left image transformation matrix | +| **`irA_r_fix`** |Input | **`ap_fixed<32,12>`**| Right image transformation matrix | +| **`cm_size`** |Input | **`int`** | Size of camera parameters matrix | +| **`dc_size`** |Input | **`int`** | Size of distortion coefficients array | + +To forward these parameters to kernel wrapper create 10 buffers in global memory for images data, disparity map and transformation parameters matrices & arrays. Wrapper decompose **`xf_img_l`**, **`xf_img_r`**, **`xf_img_d`** and **`bm_state`** classes and pass member separately. Not all members of **`bm_state`** wrapper transfers to kernel. Most members of **`bm_state`** is predefined by template or calculated based on others members. Therefore kernel could fully restore values of host side **`bm_state`** based on template and restricted set of values. 
Wrapper forward as regular parameters of type **`int`** following members of **`bm_state`**: **`preFilterType`**, **`preFilterCap`**, **`minDisparity`**, **`textureThreshold`**, **`uniquenessRatio`** + + +## Kernel Driver ### + +Example use modification of SDx xcl kernel driver v.2 for Amazon F1 instance. Source code of this driver and description could be found in Amazon aws-fpga framework. + +## Kernel ## + +To calculate disparity map the kernel pipeline functions from xfOpenCV as shown on the image below.
+ +![](./Stereo_Pipeline_Diagram.png) + +The kernel has following parameters: + +| Parameter Name |Direction|Type | Description | +| :- | :- | :- | :- | +| **`img_l`** |Input | **`XF_TNAME(XF_8UC1, XF_NPPC1) *`** | Pointer to input image buffer from left camera | +| **`img_r`** |Input | **`XF_TNAME(XF_8UC1, XF_NPPC1) *`** | Pointer to input image buffer from right camera | +| **`cameraMA_l_fix`**|Input | **`ap_fixed<32,12> *`** | Pointer to buffer with left camera parameters matrix | +| **`cameraMA_r_fix`**|Input | **`ap_fixed<32,12> *`** | Pointer to buffer with right camera parameters matrix | +| **`distC_l_fix`** |Input | **`ap_fixed<32,12> *`** | Pointer to buffer with left image distortion coefficients | +| **`distC_r_fix`** |Input | **`ap_fixed<32,12> *`** | Pointer to buffer with right image distortion coefficients | +| **`irA_l_fix`** |Input | **`ap_fixed<32,12> *`** | Pointer to buffer with left image transformation matrix | +| **`irA_r_fix`** |Input | **`ap_fixed<32,12> *`** | Pointer to buffer with right image transformation matrix | +| **`img_d`** |Output| **`XF_TNAME(XF_16UC1, XF_NPPC1) *`**| Pointer to buffer for output disparity map | +| **`preFilterType`**
**`preFilterCap`**
**`minDisparity`**
**`textureThreshold`**
**`uniquenessRatio`**
| Input | **`int`**|Restricted set members of **`xf::xFSBMState<...>`** structure which have arbitrary values. This set is enough to restore values of all member of **`xf::xFSBMState<...>`** same as on host side| +| **`cm_size`** |Input | **`int`** | Size of camera parameters matrix | +| **`dc_size`** |Input | **`int`** | Size of distortion coefficients array| +| **`rows`** |Input | **`int`** | Height of images and disparity map| +| **`cols`** |Input | **`int`** | Width of images and disparity map | + + + +During synthesis for FPGA kernel's parameters should be mapped to HW interfaces supported on Amazon F1 instance. To map kernel parameters **`HLS INTERFACE`** pragma should be used. Supported following interfaces: **`m_axi`** and **`s_axilite`**. For **`m_axi`** offset can be set through **`s_axilite`** port only. + +Because functions from xfOpenCV library operate with **`xf::Mat`** class as image container kernel's parameters should be packed back to objects of this class. To do this you need following: + +- Declare **`xf::Mat`** variable
***Note: due to XOCC issues use default constructor only - do not try initialize class members with help of non-default constructors*** +- Assign image size to **`rows`** and **`cols`** members +- Copy image from input buffer to **`data`** member of **`xf::Mat`** or from **`data`** to output buffer + +```cpp +xf::Mat xf_img_l; +xf::Mat xf_img_r; + +xf_img_l.rows = rows; xf_img_l.cols = cols; +xf_img_r.rows = rows; xf_img_r.cols = cols; + +for(int i=0; i < rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS + + for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(xf_img_l.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_l + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j); + *(xf_img_r.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_r + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j); + } + } +``` +**Note: `#pragma HLS` doesn't support constants defined through **`#define`** directive - use `const int`. In the code above `pROWS`, `pCOLS` and `pNPC` are `const int` variables which get values from constants defined in xf_stereo_pipeline_config.h with help of #define directive** + +```cpp +const int pROWS = XF_HEIGHT; +const int pCOLS = XF_WIDTH; +const int pNPC = XF_NPPC1; +``` + +Simple declaration of **`xf::Mat`** object create buffer to store whole image with maximum defined size. This buffer use FPGA internal memory blocks and even big FPGA devices could not have enough resources. You should use **`#pragma HLS stream`** to ask HLS convert big RAM buffer to small FIFO buffer + +```cpp +xf::Mat xf_img_l; +xf::Mat xf_img_r; + +#pragma HLS stream variable=xf_img_l.data depth=pCOLS/pNPC +#pragma HLS stream variable=xf_img_r.data depth=pCOLS/pNPC +``` + +Please note that **`#pragma HLS stream`** could be used inside dataflow block, therefore kernel body should be declared as dataflow. This also permit pipeline functions from xfOpenCV library. 
+ +```cpp +void kernel(...) +{ + #pragma HLS INTERFACE ... + #pragma HLS INTERFACE ... + + #pragma HLS dataflow + ... +} +``` + +## Known Issues + +- #### Kernel can't accept class/structure as parameters +**Solution**: use simple types, pass class/structure members as separate parameters of simple types and compose class/structure object back inside kernel. + +- #### Using non-default constructors can cause kernel suspension on FPGA and HW emulation +**Solution**: use default constructor for object declaration and next assign desired values to the members separately. + +```cpp +xf::Mat xf_img_l; +xf::Mat xf_img_r; + +xf_img_l.rows = rows; xf_img_l.cols = cols; +xf_img_r.rows = rows; xf_img_r.cols = cols; +``` + +- #### **`#pragma HLS`** doesn't support constants defined through **`#define`** directive. +**Solution**: use **`const int`** instead + + +```cpp +#define XF_HEIGHT 720 + +void kernel(...) +{ + const int pROWS = XF_HEIGHT; + + for(int i=0; i < rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS + ... + } + ... +} +``` + + +## Revision History + +Date | Readme Version | Release Notes +-------- |----------------|------------------------- +May 2018 | 1.0 | Initial version. 
diff --git a/aws_demo/stereopipeline/right.png b/aws_demo/stereopipeline/right.png new file mode 100644 index 0000000..4e31067 Binary files /dev/null and b/aws_demo/stereopipeline/right.png differ diff --git a/aws_demo/stereopipeline/sw_emu/run/run.sh b/aws_demo/stereopipeline/sw_emu/run/run.sh new file mode 100644 index 0000000..d75a02e --- /dev/null +++ b/aws_demo/stereopipeline/sw_emu/run/run.sh @@ -0,0 +1,5 @@ +emconfigutil -f $AWS_PLATFORM + +export XCL_EMULATION_MODE=sw_emu + +./stereo_pipeline_test ../../left.png ../../right.png \ No newline at end of file diff --git a/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini b/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini new file mode 100644 index 0000000..63a1cac --- /dev/null +++ b/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini @@ -0,0 +1,5 @@ +[Debug] +timeline_trace=true +device_profile=true +app_debug=true +profile=true diff --git a/aws_demo/stereopipeline/xf_config_params.h b/aws_demo/stereopipeline/xf_config_params.h new file mode 100644 index 0000000..3f56cdc --- /dev/null +++ b/aws_demo/stereopipeline/xf_config_params.h @@ -0,0 +1,11 @@ +/* NO_OF_DISPARITIES must be greater than '0' and less than the image width */ +#define NO_OF_DISPARITIES 48 + +/* NO_OF_DISPARITIES must not be lesser than PARALLEL_UNITS and NO_OF_DISPARITIES/PARALLEL_UNITS must be a non-fractional number */ +#define PARALLEL_UNITS 16 + +/* SAD window size must be an odd number and it must be less than minimum of image height and width and less than the tested size '21' */ +#define SAD_WINDOW_SIZE 15 + +// Configure this based on the number of rows needed for Remap function +#define XF_REMAP_BUFSIZE 128 diff --git a/aws_demo/stereopipeline/xf_headers.h b/aws_demo/stereopipeline/xf_headers.h new file mode 100644 index 0000000..793f8da --- /dev/null +++ b/aws_demo/stereopipeline/xf_headers.h @@ -0,0 +1,56 @@ +/*************************************************************************** + Copyright (c) 2016, Xilinx, Inc. 
+ All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CXFSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + ***************************************************************************/ +#ifndef _XF_HEADERS_H_ +#define _XF_HEADERS_H_ + +#include +#include +#include + +#undef __ARM_NEON__ +#undef __ARM_NEON +#include "opencv/cv.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "opencv2/video/tracking.hpp" +#define __ARM_NEON__ +#define __ARM_NEON + + +#if __SDSCC__ +#include "sds_lib.h" +#define TIME_STAMP_INIT unsigned int clock_start, clock_end; clock_start = sds_clock_counter(); +#define TIME_STAMP { clock_end = sds_clock_counter(); printf("elapsed time %lu \n", clock_end-clock_start); clock_start = sds_clock_counter(); } +#endif + +#include "common/xf_sw_utils.h" + +#endif//_XF_HEADERS_H_ + diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp new file mode 100644 index 0000000..1bae1c1 --- /dev/null +++ b/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp @@ -0,0 +1,110 @@ +#include +#include +#include + +#include "xcl2.hpp" + +#include "xf_stereo_pipeline_config.h" + +typedef xf::xFSBMState xf_BMState; + +#define CL_MIGRATE_MEM_OBJECT_KERNEL 0 //OpenCL define constant to indicate memory object migration to host only, to make program more readable define "counterpart" constant + +void stereo_pipeline_accel + ( + // Left | Right + xf::Mat &xf_img_l, xf::Mat &xf_img_r, + + xf::Mat &xf_img_d, + + xf::xFSBMState &bm_state, + + ap_fixed<32,12> *cameraMA_l_fix , ap_fixed<32,12> *cameraMA_r_fix, + ap_fixed<32,12> *distC_l_fix , ap_fixed<32,12> *distC_r_fix , + ap_fixed<32,12> *irA_l_fix , ap_fixed<32,12> *irA_r_fix , + + int cm_size, + int dc_size + ) +{ + std::vector devices = xcl::get_xil_devices(); + + cl::Device device = devices[0]; + + cl::Context context(device); + + cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE); + std::string device_name = device.getInfo(); + + std::string binaryFile = (xcl::is_emulation() || 
xcl::is_hw_emulation ()) ? "xf_stereo_pipeline.xclbin" : "xf_stereo_pipeline.awsxclbin"; + + std::cout << "======== " << binaryFile << " ========" << std::endl; + + cl::Program::Binaries bins = xcl::import_binary_file(binaryFile); + devices.resize(1); + cl::Program program(context, devices, bins); + cl::Kernel kernel(program,"xf_stereo_pipeline"); + + //----------- Allocate Buffer in Global Memory -----------// + + int rows = xf_img_l.rows; + int cols = xf_img_l.cols; + + int pixel_qnt = rows * cols; + + cl::Buffer buffer_l (context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY ), pixel_qnt * 1, (void*)xf_img_l.data ); cl::Buffer buffer_r (context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY), pixel_qnt * 1, (void*)xf_img_r.data); + + cl::Buffer buffer_cm_l(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY ), cm_size * 4, (void*)cameraMA_l_fix); cl::Buffer buffer_cm_r(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY), cm_size * 4, (void*)cameraMA_r_fix); + cl::Buffer buffer_dc_l(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY ), dc_size * 4, (void*)distC_l_fix ); cl::Buffer buffer_dc_r(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY), dc_size * 4, (void*)distC_r_fix ); + cl::Buffer buffer_ir_l(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY ), cm_size * 4, (void*)irA_l_fix ); cl::Buffer buffer_ir_r(context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY), cm_size * 4, (void*)irA_r_fix ); + + cl::Buffer buffer_d (context, cl_mem_flags(CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY), pixel_qnt * 2, xf_img_d.data); + + std::vector kernel_wr_buf; + + kernel_wr_buf.push_back(buffer_l ); kernel_wr_buf.push_back(buffer_r ); + kernel_wr_buf.push_back(buffer_cm_l); kernel_wr_buf.push_back(buffer_cm_r); + kernel_wr_buf.push_back(buffer_dc_l); kernel_wr_buf.push_back(buffer_dc_r); + kernel_wr_buf.push_back(buffer_ir_l); kernel_wr_buf.push_back(buffer_ir_r); + + //----------- Migrate input data to 
device global memory -----------// + + q.enqueueMigrateMemObjects(kernel_wr_buf, CL_MIGRATE_MEM_OBJECT_KERNEL); + + // The kernel parameters should be rearranged: input buffers, output buffers, variables + // + // img_l img_r cm_l cm_r dc_l dc_r ir_l ir_r img_s + auto krnl = cl::KernelFunctor(kernel); + + //----------- Launch the Kernel -----------// + + krnl(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)), buffer_l, buffer_r, buffer_cm_l, buffer_cm_r, buffer_dc_l, buffer_dc_r, buffer_ir_l, buffer_ir_r, buffer_d, bm_state.preFilterType, + bm_state.preFilterCap, + bm_state.minDisparity, + bm_state.textureThreshold, + bm_state.uniquenessRatio, + + cm_size, + dc_size, + rows, + cols); + + //----------- Copy Result from Device Global Memory to Host Local Memory -----------// + + std::vector kernel_rd_buf; + kernel_rd_buf.push_back(buffer_d); + + q.enqueueMigrateMemObjects(kernel_rd_buf, CL_MIGRATE_MEM_OBJECT_HOST); + + q.finish(); +} diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_config.h b/aws_demo/stereopipeline/xf_stereo_pipeline_config.h new file mode 100644 index 0000000..ec9eefc --- /dev/null +++ b/aws_demo/stereopipeline/xf_stereo_pipeline_config.h @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2016, Xilinx, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ***************************************************************************/ + +#ifndef _XF_STEREO_PIPELINE_CONFIG_H_ +#define _XF_STEREO_PIPELINE_CONFIG_H_ + +#include "hls_stream.h" + +#include "common/xf_common.h" +#include "common/xf_utility.h" + +#include "xf_config_params.h" + + +/* config width and height */ +#define XF_HEIGHT 720 +#define XF_WIDTH 1280 + +#define XF_CAMERA_MATRIX_SIZE 9 +#define XF_DIST_COEFF_SIZE 5 + + +void stereo_pipeline_accel + ( + // Left | Right + xf::Mat &xf_img_l, xf::Mat &xf_img_r, + + xf::Mat &xf_img_s, + + xf::xFSBMState &bm_state, + + ap_fixed<32,12> *cameraMA_l_fix , ap_fixed<32,12> *cameraMA_r_fix, + ap_fixed<32,12> *distC_l_fix , ap_fixed<32,12> *distC_r_fix , + ap_fixed<32,12> *irA_l_fix , ap_fixed<32,12> *irA_r_fix , + + int cm_size, + int dc_size + ); + + +#endif // _XF_STEREO_PIPELINE_CONFIG_H_ + diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp new file mode 100644 index 0000000..2080d2d --- 
/dev/null +++ b/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp @@ -0,0 +1,216 @@ +//Includes +#include +#include +#include + +#include "xf_stereo_pipeline_config.h" + +#include "imgproc/xf_stereo_pipeline.hpp" +#include "imgproc/xf_remap.hpp" +#include "imgproc/xf_stereoBM.hpp" + + +extern "C" + { + void xf_stereo_pipeline + ( + // Left | Right + XF_TNAME(XF_8UC1, XF_NPPC1) *img_l, XF_TNAME(XF_8UC1, XF_NPPC1) *img_r, + + ap_fixed<32,12> *cameraMA_l_fix , ap_fixed<32,12> *cameraMA_r_fix, + ap_fixed<32,12> *distC_l_fix , ap_fixed<32,12> *distC_r_fix , + ap_fixed<32,12> *irA_l_fix , ap_fixed<32,12> *irA_r_fix , + + XF_TNAME(XF_16UC1, XF_NPPC1) *img_d , + + int preFilterType, + int preFilterCap, + int minDisparity, + int textureThreshold, + int uniquenessRatio, + + int cm_size, + int dc_size, + + int rows, + int cols + ); + } + +void xf_stereo_pipeline + ( + // Left | Right + XF_TNAME(XF_8UC1, XF_NPPC1) *img_l, XF_TNAME(XF_8UC1 , XF_NPPC1) *img_r, + + ap_fixed<32,12> *cameraMA_l_fix , ap_fixed<32,12> *cameraMA_r_fix, + ap_fixed<32,12> *distC_l_fix , ap_fixed<32,12> *distC_r_fix , + ap_fixed<32,12> *irA_l_fix , ap_fixed<32,12> *irA_r_fix , + + XF_TNAME(XF_16UC1, XF_NPPC1) *img_d, + + int preFilterType, + int preFilterCap, + int minDisparity, + int textureThreshold, + int uniquenessRatio, + + int cm_size, + int dc_size, + + int rows, + int cols + ) +{ + #pragma HLS INTERFACE m_axi port=img_l offset=slave bundle=gmem_i_l + #pragma HLS INTERFACE m_axi port=img_r offset=slave bundle=gmem_i_r + + #pragma HLS INTERFACE m_axi port=cameraMA_l_fix offset=slave bundle=gmem_l + #pragma HLS INTERFACE m_axi port=cameraMA_r_fix offset=slave bundle=gmem_r + + #pragma HLS INTERFACE m_axi port=distC_l_fix offset=slave bundle=gmem_l + #pragma HLS INTERFACE m_axi port=distC_r_fix offset=slave bundle=gmem_r + + #pragma HLS INTERFACE m_axi port=irA_l_fix offset=slave bundle=gmem_l + #pragma HLS INTERFACE m_axi port=irA_r_fix offset=slave bundle=gmem_r + + #pragma HLS INTERFACE 
m_axi port=img_d offset=slave bundle=gmem_s + + + #pragma HLS INTERFACE s_axilite port=img_l bundle=control + #pragma HLS INTERFACE s_axilite port=img_r bundle=control + + #pragma HLS INTERFACE s_axilite port=cameraMA_l_fix bundle=control + #pragma HLS INTERFACE s_axilite port=cameraMA_r_fix bundle=control + + #pragma HLS INTERFACE s_axilite port=distC_l_fix bundle=control + #pragma HLS INTERFACE s_axilite port=distC_r_fix bundle=control + + #pragma HLS INTERFACE s_axilite port=irA_l_fix bundle=control + #pragma HLS INTERFACE s_axilite port=irA_r_fix bundle=control + + + #pragma HLS INTERFACE s_axilite port=img_d bundle=control + + + #pragma HLS INTERFACE s_axilite port=preFilterType bundle=control + #pragma HLS INTERFACE s_axilite port=preFilterCap bundle=control + #pragma HLS INTERFACE s_axilite port=minDisparity bundle=control + #pragma HLS INTERFACE s_axilite port=textureThreshold bundle=control + #pragma HLS INTERFACE s_axilite port=uniquenessRatio bundle=control + + #pragma HLS INTERFACE s_axilite port=cm_size bundle=control + #pragma HLS INTERFACE s_axilite port=dc_size bundle=control + + #pragma HLS INTERFACE s_axilite port=rows bundle=control + #pragma HLS INTERFACE s_axilite port=cols bundle=control + + #pragma HLS INTERFACE s_axilite port=return bundle=control + + + #pragma HLS INLINE OFF + #pragma HLS dataflow + + + const int pROWS = XF_HEIGHT; + const int pCOLS = XF_WIDTH ; + + const int pNPC = XF_NPPC1; + + xf::Mat xf_img_l; // don't use non default constructor xf::Mat<...> xf_img_l(rows, cols) - kernel will suspend on hw emulation and FPGA + xf::Mat xf_img_r; + + #pragma HLS stream variable=xf_img_l.data depth=pCOLS/pNPC + #pragma HLS stream variable=xf_img_r.data depth=pCOLS/pNPC + + + xf::Mat xf_img_d; + + #pragma HLS stream variable=xf_img_d.data depth=pCOLS/pNPC + + xf::Mat xf_map_x_l; + xf::Mat xf_map_y_l; + + #pragma HLS stream variable=xf_map_x_l.data depth=pCOLS/pNPC + #pragma HLS stream variable=xf_map_y_l.data depth=pCOLS/pNPC + + xf::Mat 
xf_map_x_r; + xf::Mat xf_map_y_r; + + #pragma HLS stream variable=xf_map_x_r.data depth=pCOLS/pNPC + #pragma HLS stream variable=xf_map_y_r.data depth=pCOLS/pNPC + + xf::Mat xf_remapped_l; + xf::Mat xf_remapped_r; + + #pragma HLS stream variable=xf_remapped_l.data depth=pCOLS/pNPC + #pragma HLS stream variable=xf_remapped_r.data depth=pCOLS/pNPC + + xf::xFSBMState bm_state; + + xf_img_l.rows = rows; xf_img_l.cols = cols; + xf_img_r.rows = rows; xf_img_r.cols = cols; + xf_img_d.rows = rows; xf_img_d.cols = cols; + + xf_map_x_l.rows = rows; xf_map_x_l.cols = cols; + xf_map_y_l.rows = rows; xf_map_y_l.cols = cols; + xf_map_x_r.rows = rows; xf_map_x_r.cols = cols; + xf_map_y_r.rows = rows; xf_map_y_r.cols = cols; + + xf_remapped_l.rows = rows; xf_remapped_l.cols = cols; + xf_remapped_r.rows = rows; xf_remapped_r.cols = cols; + + bm_state.preFilterType = preFilterType ; + bm_state.preFilterCap = preFilterCap ; + bm_state.minDisparity = minDisparity ; + bm_state.textureThreshold = textureThreshold; + bm_state.uniquenessRatio = uniquenessRatio ; + + for(int i=0; i < rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS + + for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(xf_img_l.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_l + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j); + *(xf_img_r.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_r + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j); + } + } + + + xf::InitUndistortRectifyMapInverse < XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1 > (cameraMA_l_fix, distC_l_fix, irA_l_fix, xf_map_x_l, xf_map_y_l, cm_size, dc_size); + + xf::remap ( xf_img_l, xf_remapped_l, xf_map_x_l, xf_map_y_l ); + + + + xf::InitUndistortRectifyMapInverse < XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1 > (cameraMA_r_fix, distC_r_fix, irA_r_fix, 
xf_map_x_r, xf_map_y_r, cm_size, dc_size); + + xf::remap ( xf_img_r, xf_remapped_r, xf_map_x_r, xf_map_y_r); + + + + + xf::StereoBM ( xf_remapped_l, xf_remapped_r, xf_img_d, bm_state); + + + + + for(int i=0; i < rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS + + for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(img_d + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(xf_img_d.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j); + } + } + +} diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp new file mode 100644 index 0000000..273e67b --- /dev/null +++ b/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2016, Xilinx, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ***************************************************************************/ + +#include +#include +#include + +#include "opencv/cv.h" +#include "opencv2/imgproc/imgproc.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "opencv2/video/tracking.hpp" + +#include "common/xf_sw_utils.h" + +#include "xf_stereo_pipeline_config.h" + +#include "cameraParameters.h" + +using namespace std; + +int main(int argc, char** argv) +{ + cv::setUseOptimized(false); + + if(argc != 3) + { + fprintf(stderr,"Invalid Number of Arguments!\nUsage: \n"); + return -1; + } + + + cv::Mat cv_img_l, cv_img_r; + + cv_img_l = cv::imread(argv[1], 0); + cv_img_r = cv::imread(argv[2], 0); + + ////////////////// HLS TOP Function Call //////////////////////// + + xf::Mat xf_img_l(cv_img_l.rows, cv_img_l.cols); + xf::Mat xf_img_r(cv_img_r.rows, cv_img_r.cols); + + int rows = cv_img_l.rows; + int cols = cv_img_l.cols; + + xf::Mat xf_img_d(rows,cols); + + // camera parameters for rectification + + ap_fixed<32,12> *cameraMA_l_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>)); + ap_fixed<32,12> *cameraMA_r_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>)); + ap_fixed<32,12> *irA_l_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>)); + ap_fixed<32,12> *irA_r_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>)); + ap_fixed<32,12> *distC_l_fix = 
(ap_fixed<32,12>*)malloc(XF_DIST_COEFF_SIZE *sizeof(ap_fixed<32,12>)); + ap_fixed<32,12> *distC_r_fix = (ap_fixed<32,12>*)malloc(XF_DIST_COEFF_SIZE *sizeof(ap_fixed<32,12>)); + + + xf_img_l = xf::imread(argv[1], 0); + xf_img_r = xf::imread(argv[2], 0); + + xf::xFSBMState bm_state; + + bm_state.preFilterCap = 31; + bm_state.uniquenessRatio = 15; + bm_state.textureThreshold = 20; + bm_state.minDisparity = 0; + + // copy camera params + for(int i=0; i)cameraMA_l[i]; + cameraMA_r_fix[i] = (ap_fixed<32,12>)cameraMA_r[i]; + irA_l_fix [i] = (ap_fixed<32,12>)irA_l [i]; + irA_r_fix [i] = (ap_fixed<32,12>)irA_r [i]; + } + + // copy distortion coefficients + for(int i=0; i)distC_l[i]; + distC_r_fix[i] = (ap_fixed<32,12>)distC_r[i]; + } + + printf("starting the kernel...\n"); + + + stereo_pipeline_accel(xf_img_l, xf_img_r, xf_img_d, bm_state, cameraMA_l_fix, cameraMA_r_fix, distC_l_fix, distC_r_fix, irA_l_fix, irA_r_fix, 9, 5); + + + cv::Mat out_disp_16(rows,cols,CV_16UC1); + cv::Mat out_disp_08(rows,cols,CV_8UC1 ); + + out_disp_16.data = xf_img_d.copyFrom(); + + out_disp_16.convertTo(out_disp_08, CV_8U, (256.0/NO_OF_DISPARITIES)/(16.)); + + imwrite("hls_output.png",out_disp_08); + + printf ("run complete !\n\n"); + + return 0; +} + diff --git a/examples/lkdensepyrof/xf_config_params.h b/examples/lkdensepyrof/xf_config_params.h index 05cd576..a1d0193 100644 --- a/examples/lkdensepyrof/xf_config_params.h +++ b/examples/lkdensepyrof/xf_config_params.h @@ -10,4 +10,6 @@ #define HEIGHT 1080 #define WIDTH 1920 -#define NUM_LINES_FINDIT 50 \ No newline at end of file +#define NUM_LINES_FINDIT 50 + +#define XF_USE_URAM false diff --git a/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp b/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp index 1cfb0d7..79653c8 100644 --- a/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp +++ b/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp @@ -35,10 +35,10 @@ void 
pyr_dense_optical_flow_pyr_down_accel(xf::Mat(mat_imagepyr1[pyr_comp], mat_imagepyr1[pyr_comp+1]); + xf::pyrDown(mat_imagepyr1[pyr_comp], mat_imagepyr1[pyr_comp+1]); #pragma SDS async(2) #pragma SDS resource(2) - xf::pyrDown(mat_imagepyr2[pyr_comp], mat_imagepyr2[pyr_comp+1]); + xf::pyrDown(mat_imagepyr2[pyr_comp], mat_imagepyr2[pyr_comp+1]); #pragma SDS wait(1) #pragma SDS wait(2) } @@ -46,6 +46,6 @@ void pyr_dense_optical_flow_pyr_down_accel(xf::Mat & _current_img, xf::Mat & _next_image, xf::Mat & _streamFlowin, xf::Mat & _streamFlowout, const int level, const unsigned char scale_up_flag, float scale_in, ap_uint<1> init_flag) { - xf::densePyrOpticalFlow(_current_img, _next_image, _streamFlowin, _streamFlowout, level, scale_up_flag, scale_in, init_flag); + xf::densePyrOpticalFlow(_current_img, _next_image, _streamFlowin, _streamFlowout, level, scale_up_flag, scale_in, init_flag); } diff --git a/examples/lknpyroflow/xf_config_params.h b/examples/lknpyroflow/xf_config_params.h index 6e91f55..c9b25c8 100644 --- a/examples/lknpyroflow/xf_config_params.h +++ b/examples/lknpyroflow/xf_config_params.h @@ -1,4 +1,5 @@ #define MAX_HEIGHT 2160 #define MAX_WIDTH 3840 #define WORD_SZ 1 -#define KMED 25 \ No newline at end of file +#define KMED 25 +#define XF_USE_URAM false diff --git a/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp b/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp index cbb1ba8..7980c37 100644 --- a/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp +++ b/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp @@ -31,6 +31,6 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
void dense_non_pyr_of_accel(xf::Mat &buf0, xf::Mat &buf1, xf::Mat &flowx, xf::Mat &flowy) { - xf::DenseNonPyrLKOpticalFlow(buf0, buf1, flowx, flowy); + xf::DenseNonPyrLKOpticalFlow(buf0, buf1, flowx, flowy); } diff --git a/examples/remap/xf_config_params.h b/examples/remap/xf_config_params.h index 801d559..da08235 100644 --- a/examples/remap/xf_config_params.h +++ b/examples/remap/xf_config_params.h @@ -4,3 +4,4 @@ // The type of interpolation, define "XF_REMAP_INTERPOLATION" as either "XF_INTERPOLATION_NN" or "XF_INTERPOLATION_BILINEAR" #define XF_REMAP_INTERPOLATION XF_INTERPOLATION_BILINEAR +#define XF_USE_URAM false diff --git a/examples/remap/xf_remap_accel.cpp b/examples/remap/xf_remap_accel.cpp index 37b4d72..b6bb4ee 100644 --- a/examples/remap/xf_remap_accel.cpp +++ b/examples/remap/xf_remap_accel.cpp @@ -32,6 +32,6 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void remap_accel(xf::Mat &inMat, xf::Mat &remappedMat, xf::Mat &mapxMat, xf::Mat &mapyMat) { - xf::remap(inMat,remappedMat,mapxMat,mapyMat); + xf::remap(inMat,remappedMat,mapxMat,mapyMat); } diff --git a/examples/stereolbm/xf_stereoBM_tb.cpp b/examples/stereolbm/xf_stereoBM_tb.cpp index 4a98676..b906da7 100644 --- a/examples/stereolbm/xf_stereoBM_tb.cpp +++ b/examples/stereolbm/xf_stereoBM_tb.cpp @@ -137,9 +137,9 @@ int main(int argc, char** argv) int cnt=0, total = 0; - for(int i=(SAD_WINDOW_SIZE>>1)+20; i>1)+20); i++) + for(int i=SAD_WINDOW_SIZE; i>1)+20; j>1)+20); j++) + for(int j=SAD_WINDOW_SIZE; j (i,j))-(out_disp_img.data[i*out_disp_img.cols +j]); diff --git a/examples/stereopipeline/xf_stereo_pipeline_accel.cpp b/examples/stereopipeline/xf_stereo_pipeline_accel.cpp index 4436890..ce0f3c7 100644 --- a/examples/stereopipeline/xf_stereo_pipeline_accel.cpp +++ b/examples/stereopipeline/xf_stereo_pipeline_accel.cpp @@ -29,18 +29,36 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
***************************************************************************/ #include "xf_stereo_pipeline_config.h" -void stereopipeline_accel(xf::Mat &leftMat, xf::Mat &rightMat, xf::Mat &dispMat, - xf::Mat &mapxLMat, xf::Mat &mapyLMat, xf::Mat &mapxRMat, - xf::Mat &mapyRMat, xf::Mat &leftRemappedMat, xf::Mat &rightRemappedMat, - xf::xFSBMState &bm_state, ap_fixed<32,12> *cameraMA_l_fix, ap_fixed<32,12> *cameraMA_r_fix, ap_fixed<32,12> *distC_l_fix, ap_fixed<32,12> *distC_r_fix, - ap_fixed<32,12> *irA_l_fix, ap_fixed<32,12> *irA_r_fix, int _cm_size, int _dc_size) +void stereopipeline_accel + ( + xf::Mat &leftMat, xf::Mat &rightMat, + + xf::Mat &dispMat, + + xf::Mat &mapxLMat, xf::Mat &mapyLMat, + xf::Mat &mapxRMat, xf::Mat &mapyRMat, + + xf::Mat &leftRemappedMat, xf::Mat &rightRemappedMat, + + xf::xFSBMState &bm_state, + + ap_fixed<32,12> *cameraMA_l_fix, ap_fixed<32,12> *cameraMA_r_fix, + ap_fixed<32,12> *distC_l_fix , ap_fixed<32,12> *distC_r_fix , + ap_fixed<32,12> *irA_l_fix , ap_fixed<32,12> *irA_r_fix , + + int _cm_size, int _dc_size + ) { - xf::InitUndistortRectifyMapInverse(cameraMA_l_fix,distC_l_fix,irA_l_fix,mapxLMat,mapyLMat,_cm_size,_dc_size); - xf::remap(leftMat,leftRemappedMat,mapxLMat,mapyLMat); + xf::InitUndistortRectifyMapInverse(cameraMA_l_fix, distC_l_fix, irA_l_fix, mapxLMat, mapyLMat, _cm_size, _dc_size); + + xf::remap(leftMat, leftRemappedMat, mapxLMat, mapyLMat); - xf::InitUndistortRectifyMapInverse(cameraMA_r_fix,distC_r_fix,irA_r_fix,mapxRMat,mapyRMat,_cm_size,_dc_size); - xf::remap(leftMat,leftRemappedMat,mapxLMat,mapyLMat); + + + + xf::InitUndistortRectifyMapInverse(cameraMA_r_fix, distC_r_fix, irA_r_fix, mapxRMat, mapyRMat, _cm_size, _dc_size); + xf::remap(rightMat, rightRemappedMat, mapxRMat, mapyRMat); /* right view: remap rightMat through the right-camera maps so StereoBM receives a populated rightRemappedMat */ - xf::StereoBM(leftRemappedMat, rightRemappedMat, dispMat, bm_state); + xf::StereoBM(leftRemappedMat, rightRemappedMat, dispMat, bm_state); } diff --git a/examples/warptransform/xf_config_params.h 
b/examples/warptransform/xf_config_params.h index e903c30..8d2ee85 100644 --- a/examples/warptransform/xf_config_params.h +++ b/examples/warptransform/xf_config_params.h @@ -16,4 +16,7 @@ #define INTERPOLATION 1 //transform type 0-AFFINE 1-PERSPECTIVE -#define TRANSFORM_TYPE 0 \ No newline at end of file +#define TRANSFORM_TYPE 0 + +//usage of URAMs for buffers implementation +#define XF_USE_URAM false diff --git a/examples/warptransform/xf_warp_transform_accel.cpp b/examples/warptransform/xf_warp_transform_accel.cpp index a9d0b65..1295a80 100644 --- a/examples/warptransform/xf_warp_transform_accel.cpp +++ b/examples/warptransform/xf_warp_transform_accel.cpp @@ -31,5 +31,5 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void warp_transform_accel(xf::Mat &_src, xf::Mat &_dst, float *R) { - xf::warpTransform(_src, _dst, R); + xf::warpTransform(_src, _dst, R); } diff --git a/include/imgproc/xf_dense_npyr_optical_flow.hpp b/include/imgproc/xf_dense_npyr_optical_flow.hpp index b79a327..791c78d 100644 --- a/include/imgproc/xf_dense_npyr_optical_flow.hpp +++ b/include/imgproc/xf_dense_npyr_optical_flow.hpp @@ -170,7 +170,7 @@ namespace xf{ // TODO: // 1. Dont need the entire column for img1Win and img2Win. Need only the kernel // 2. Full line buffer is not needed - template + template static void computeSums16 (hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img1Col [(WINDOW_SIZE+1)], hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img2Col [(WINDOW_SIZE+1)], hls::stream & ixix_out0, @@ -208,34 +208,55 @@ namespace xf{ // For II=1 pipelining, need two read and 1 write ports. Simulating it with // two arrays that have their write ports tied together. // TODO need only MAX_WODTH/2. 
Have to adjust zIdx and nIdx as well - static int csIxixO [COLS], csIxiyO [COLS], csIyiyO [COLS], csDixO [COLS], csDiyO [COLS]; - static int csIxixE [COLS], csIxiyE [COLS], csIyiyE [COLS], csDixE [COLS], csDiyE [COLS]; - - static int cbIxixO [COLS], cbIxiyO [COLS], cbIyiyO [COLS], cbDixO [COLS], cbDiyO [COLS]; - static int cbIxixE [COLS], cbIxiyE [COLS], cbIyiyE [COLS], cbDixE [COLS], cbDiyE [COLS]; - - int zIdx= - (WINDOW_SIZE-2); // odd - int zIdx1 = zIdx + 1; // even - - int nIdx = zIdx + WINDOW_SIZE-2; // even (0) - int nIdx1 = nIdx + 1; // odd - + static int csIxixO [COLS/2], csIxiyO [COLS/2], csIyiyO [COLS/2], csDixO [COLS/2], csDiyO [COLS/2]; + static int csIxixE [COLS/2], csIxiyE [COLS/2], csIyiyE [COLS/2], csDixE [COLS/2], csDiyE [COLS/2]; + + static int cbIxixO [COLS/2], cbIxiyO [COLS/2], cbIyiyO [COLS/2], cbDixO [COLS/2], cbDiyO [COLS/2]; + static int cbIxixE [COLS/2], cbIxiyE [COLS/2], cbIyiyE [COLS/2], cbDixE [COLS/2], cbDiyE [COLS/2]; + + int zIdx= - (WINDOW_SIZE/2-1); + int nIdx = zIdx + WINDOW_SIZE/2-1; + + #pragma HLS ARRAY_MAP variable=csIxixO instance=csO vertical + #pragma HLS ARRAY_MAP variable=csIxiyO instance=csO vertical + #pragma HLS ARRAY_MAP variable=csIyiyO instance=csO vertical + #pragma HLS ARRAY_MAP variable=csDixO instance=csO vertical + #pragma HLS ARRAY_MAP variable=csDiyO instance=csO vertical + + #pragma HLS ARRAY_MAP variable=csIxixE instance=csE vertical + #pragma HLS ARRAY_MAP variable=csIxiyE instance=csE vertical + #pragma HLS ARRAY_MAP variable=csIyiyE instance=csE vertical + #pragma HLS ARRAY_MAP variable=csDixE instance=csE vertical + #pragma HLS ARRAY_MAP variable=csDiyE instance=csE vertical + + #pragma HLS ARRAY_MAP variable=cbIxixO instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIxiyO instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIyiyO instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDixO instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDiyO instance=cb vertical + #pragma HLS ARRAY_MAP 
variable=cbIxixE instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIxiyE instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIyiyE instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDixE instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDiyE instance=cb vertical + + if (USE_URAM) { + #pragma HLS RESOURCE variable=csIxixO core=XPM_MEMORY uram + #pragma HLS RESOURCE variable=csIxixE core=XPM_MEMORY uram + #pragma HLS RESOURCE variable=cbIxixO core=XPM_MEMORY uram + } + else { #pragma HLS RESOURCE variable=csIxixO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIxiyO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIyiyO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDixO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDiyO core=RAM_2P_BRAM + #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM + #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM + } + #pragma HLS DEPENDENCE variable=csIxixO inter WAR false #pragma HLS DEPENDENCE variable=csIxiyO inter WAR false #pragma HLS DEPENDENCE variable=csIyiyO inter WAR false #pragma HLS DEPENDENCE variable=csDixO inter WAR false #pragma HLS DEPENDENCE variable=csDiyO inter WAR false - #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIxiyE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIyiyE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDixE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDiyE core=RAM_2P_BRAM #pragma HLS DEPENDENCE variable=csIxixE inter WAR false #pragma HLS DEPENDENCE variable=csIxiyE inter WAR false #pragma HLS DEPENDENCE variable=csIyiyE inter WAR false @@ -243,28 +264,12 @@ namespace xf{ #pragma HLS DEPENDENCE variable=csDiyE inter WAR false - #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIxiyO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIyiyO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbDixO core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbDiyO 
core=RAM_2P_BRAM #pragma HLS DEPENDENCE variable=cbIxixO inter WAR false #pragma HLS DEPENDENCE variable=cbIxiyO inter WAR false #pragma HLS DEPENDENCE variable=cbIyiyO inter WAR false #pragma HLS DEPENDENCE variable=cbDixO inter WAR false #pragma HLS DEPENDENCE variable=cbDiyO inter WAR false -#if PLATFORM_ZCU104 - #pragma HLS RESOURCE variable=cbIxixE core=XPM_MEMORY uram - #pragma HLS RESOURCE variable=cbIxiyE core=XPM_MEMORY uram - #pragma HLS RESOURCE variable=cbIyiyE core=XPM_MEMORY uram -#else - #pragma HLS RESOURCE variable=cbIxixE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIxiyE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIyiyE core=RAM_2P_BRAM -#endif - #pragma HLS RESOURCE variable=cbDixE core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbDiyE core=RAM_2P_BRAM #pragma HLS DEPENDENCE variable=cbIxixE inter WAR false #pragma HLS DEPENDENCE variable=cbIxiyE inter WAR false #pragma HLS DEPENDENCE variable=cbIyiyE inter WAR false @@ -284,18 +289,18 @@ namespace xf{ int csIxixL1 = 0, csIxiyL1 = 0, csIyiyL1 = 0, csDixL1 = 0, csDiyL1 = 0; if (zIdx >= 0) { - csIxixL0 = csIxixO [zIdx]; - csIxiyL0 = csIxiyO [zIdx]; - csIyiyL0 = csIyiyO [zIdx]; - csDixL0 = csDixO [zIdx]; - csDiyL0 = csDiyO [zIdx]; - } - if (zIdx1 >= 0) { - csIxixL1 = csIxixE [zIdx1]; - csIxiyL1 = csIxiyE [zIdx1]; - csIyiyL1 = csIyiyE [zIdx1]; - csDixL1 = csDixE [zIdx1]; - csDiyL1 = csDiyE [zIdx1]; + int const zIdxPrev = zIdx==0 ? 
cols/2-1 : zIdx-1; + csIxixL0 = csIxixO [zIdxPrev]; + csIxiyL0 = csIxiyO [zIdxPrev]; + csIyiyL0 = csIyiyO [zIdxPrev]; + csDixL0 = csDixO [zIdxPrev]; + csDiyL0 = csDiyO [zIdxPrev]; + + csIxixL1 = csIxixE [zIdx]; + csIxiyL1 = csIxiyE [zIdx]; + csIyiyL1 = csIyiyE [zIdx]; + csDixL1 = csDixE [zIdx]; + csDiyL1 = csDiyE [zIdx]; } for (int wr=0; wr<(WINDOW_SIZE+1); ++wr) { @@ -344,11 +349,11 @@ namespace xf{ csDixR0 = cbDixE [nIdx] + delBotR0 * cIxBotR0 - delTopR0 * cIxTopR0; csDiyR0 = cbDiyE [nIdx] + delBotR0 * cIyBotR0 - delTopR0 * cIyTopR0; - csIxixR1 = cbIxixO [nIdx1] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1; - csIxiyR1 = cbIxiyO [nIdx1] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1; - csIyiyR1 = cbIyiyO [nIdx1] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1; - csDixR1 = cbDixO [nIdx1] + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1; - csDiyR1 = cbDiyO [nIdx1] + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1; + csIxixR1 = cbIxixO [nIdx] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1; + csIxiyR1 = cbIxiyO [nIdx] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1; + csIyiyR1 = cbIyiyO [nIdx] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1; + csDixR1 = cbDixO [nIdx] + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1; + csDiyR1 = cbDiyO [nIdx] + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1; int tmpixix0 = (csIxixR0 - csIxixL0); int tmpixix1 = (csIxixR0 - csIxixL0) + (csIxixR1 - csIxixL1); @@ -415,29 +420,22 @@ namespace xf{ csDixE [nIdx] = csDixR0; csDiyE [nIdx] = csDiyR0; - cbIxixO [nIdx1] = csIxixR1; - cbIxiyO [nIdx1] = csIxiyR1; - cbIyiyO [nIdx1] = csIyiyR1; - cbDixO [nIdx1] = csDixR1; - cbDiyO [nIdx1] = csDiyR1; - - csIxixO [nIdx1] = csIxixR1; - csIxiyO [nIdx1] = csIxiyR1; - csIyiyO [nIdx1] = csIyiyR1; - csDixO [nIdx1] = csDixR1; - csDiyO [nIdx1] = csDiyR1; - - // zIdx is always odd, zIdx1 is even - // nIdx is always even, nIdx1 is odd - zIdx += 2; - if (zIdx >= cols) zIdx = 1; - zIdx1 += 2; - if (zIdx1 == cols) zIdx1 = 0; - - nIdx += 2; - if (nIdx == cols) nIdx = 0; - nIdx1 += 2; - if (nIdx1 >= cols) 
nIdx1 = 1; + cbIxixO [nIdx] = csIxixR1; + cbIxiyO [nIdx] = csIxiyR1; + cbIyiyO [nIdx] = csIyiyR1; + cbDixO [nIdx] = csDixR1; + cbDiyO [nIdx] = csDiyR1; + + csIxixO [nIdx] = csIxixR1; + csIxiyO [nIdx] = csIxiyR1; + csIyiyO [nIdx] = csIyiyR1; + csDixO [nIdx] = csDixR1; + csDiyO [nIdx] = csDiyR1; + + zIdx ++; + if (zIdx == cols/2) zIdx = 0; + nIdx ++; + if (nIdx == cols/2) nIdx = 0; } } @@ -446,12 +444,12 @@ namespace xf{ // TODO zero in the line buffer instead, for r < WINDOW_SIZE for (int r = 0; r < (WINDOW_SIZE+1); r++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 - #pragma HLS PIPELINE + #pragma HLS UNROLL img1Win [r] = 0; img1Win [r+(WINDOW_SIZE+1)] = 0; img2Win [r] = 0; img1Col0 [r] =0; img2Col0 [r] =0; img1Col1 [r] =0; img2Col1 [r] =0; } - for (int r=0; r < cols; ++r) { + for (int r=0; r < cols/2; ++r) { #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE csIxixO [r] = 0; csIxiyO [r] = 0; csIyiyO [r] = 0; csDixO [r] = 0; csDiyO [r] = 0; @@ -534,15 +532,27 @@ namespace xf{ // line buffer for both input images. 
Can be split to a fn that models a single // linebuffer - template + template static void lbWrapper16 (hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > >& f0Stream, hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > >& f1Stream, hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img1Col[(WINDOW_SIZE+1)], hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img2Col[(WINDOW_SIZE+1)], int rows, int cols, int size) { - static mywide_t< XF_NPIXPERCYCLE(NPC) > lb1 [(WINDOW_SIZE+1)][COLS/2], lb2 [(WINDOW_SIZE+1)][COLS/2]; - #pragma HLS ARRAY_PARTITION variable=lb1 complete dim=1 - #pragma HLS ARRAY_PARTITION variable=lb2 complete dim=1 + static pix_t lb1 [(WINDOW_SIZE+1)][COLS/XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)], + lb2 [(WINDOW_SIZE+1)][COLS/XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)]; + + #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical + #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical + + #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1 + #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1 + #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=3 + #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=3 + + if (USE_URAM) { + #pragma HLS RESOURCE variable=lb1 core=XPM_MEMORY uram + #pragma HLS RESOURCE variable=lb2 core=XPM_MEMORY uram + } for (int r = 0; r < rows; r++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS @@ -552,43 +562,53 @@ namespace xf{ #pragma HLS pipeline // shift up both linebuffers at col=c for (int i = 0; i < ((WINDOW_SIZE+1) - 1); i++) { - lb1 [i][c] = lb1 [i + 1][c]; - img1Col [i]. write (lb1 [i][c]); - - lb2 [i][c] = lb2 [i+1][c]; - img2Col [i]. write (lb2 [i][c]); + mywide_t< XF_NPIXPERCYCLE(NPC) > lb; + + for (int k = 0; k pix0 = f0Stream. read (); - lb1 [(WINDOW_SIZE+1) - 1][c] = pix0; img1Col [(WINDOW_SIZE+1) - 1]. write (pix0); mywide_t< XF_NPIXPERCYCLE(NPC) > pix1 = f1Stream. read (); - lb2 [(WINDOW_SIZE+1) -1][c] = pix1; img2Col [(WINDOW_SIZE+1) - 1]. write (pix1); + + for (int k = 0; k tmpClr; - tmpClr. 
data [0] = 0; - tmpClr. data [1] = 0; - for (int r = 0; r < (WINDOW_SIZE+1); r++) { - #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 - for (int c = 0; c < cols/2; c++) { + for (int c = 0; c < cols/2; c++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2 #pragma HLS PIPELINE - lb1 [r][c] = tmpClr; - lb2 [r][c] = tmpClr; + for (int r = 0; r < (WINDOW_SIZE+1); r++) { + #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 + for (int k = 0; k + template static void flowWrap16 (ap_uint<16> *frame0, ap_uint<16> *frame1, ap_uint<64> *flowx, ap_uint<64> *flowy, int rows, int cols, int size) { //#pragma HLS data_pack variable=frame0 @@ -642,8 +662,8 @@ namespace xf{ readMatRows16 (frame0, f0Stream, rows, cols, size); readMatRows16 (frame1, f1Stream, rows, cols, size); - lbWrapper16 (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); - computeSums16 (img1Col, img2Col, + lbWrapper16 (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); + computeSums16 (img1Col, img2Col, ixix0, ixiy0, iyiy0, dix0, diy0, ixix1, ixiy1, iyiy1, dix1, diy1, rows, cols, size); @@ -666,12 +686,12 @@ namespace xf{ // ulonglong = 64 bits, 32 bits per color pixel (rgba), so two color pix //void fpga_optflow (unsigned short *frame0, unsigned short *frame1, unsigned long long *framef) //void fpga_optflow (unsigned short frame0[NUM_WORDS], unsigned short frame1[NUM_WORDS], unsigned long long framef[NUM_WORDS]) - template + template static void fpga_optflow16 (ap_uint<16> *frame0, ap_uint<16> *frame1, ap_uint<64> *flowx, ap_uint<64> *flowy, int rows, int cols, int size) { #pragma HLS inline off - flowWrap16 (frame0, frame1, flowx, flowy, rows, cols, size); + flowWrap16 (frame0, frame1, flowx, flowy, rows, cols, size); return; @@ -713,7 +733,7 @@ namespace xf{ // TODO: // 1. Dont need the entire column for img1Win and img2Win. Need only the kernel // 2. 
Full line buffer is not needed - template + template static void computeSums (hls::stream img1Col [(WINDOW_SIZE+1)], hls::stream img2Col [(WINDOW_SIZE+1)], hls::stream & ixix_out, @@ -742,21 +762,38 @@ namespace xf{ int zIdx= - (WINDOW_SIZE-2); int nIdx = zIdx + WINDOW_SIZE-2; + #pragma HLS ARRAY_MAP variable=csIxix instance=cs vertical + #pragma HLS ARRAY_MAP variable=csIxiy instance=cs vertical + #pragma HLS ARRAY_MAP variable=csIyiy instance=cs vertical + #pragma HLS ARRAY_MAP variable=csDix instance=cs vertical + #pragma HLS ARRAY_MAP variable=csDiy instance=cs vertical + + if (USE_URAM) { + #pragma HLS RESOURCE variable=csIxix core=XPM_MEMORY uram + } + else { #pragma HLS RESOURCE variable=csIxix core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIxiy core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csIyiy core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDix core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=csDiy core=RAM_2P_BRAM + } + #pragma HLS DEPENDENCE variable=csIxix inter WAR false #pragma HLS DEPENDENCE variable=csIxiy inter WAR false #pragma HLS DEPENDENCE variable=csIyiy inter WAR false #pragma HLS DEPENDENCE variable=csDix inter WAR false #pragma HLS DEPENDENCE variable=csDiy inter WAR false + + #pragma HLS ARRAY_MAP variable=cbIxix instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIxiy instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbIyiy instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDix instance=cb vertical + #pragma HLS ARRAY_MAP variable=cbDiy instance=cb vertical + + if (USE_URAM) { + #pragma HLS RESOURCE variable=cbIxix core=XPM_MEMORY uram + } + else { #pragma HLS RESOURCE variable=cbIxix core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIxiy core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbIyiy core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbDix core=RAM_2P_BRAM - #pragma HLS RESOURCE variable=cbDiy core=RAM_2P_BRAM + } + #pragma HLS DEPENDENCE variable=cbIxix inter WAR false #pragma HLS DEPENDENCE 
variable=cbIxiy inter WAR false #pragma HLS DEPENDENCE variable=cbIyiy inter WAR false @@ -860,7 +897,7 @@ namespace xf{ // TODO zero in the line buffer instead, for r < WINDOW_SIZE for (int r = 0; r < (WINDOW_SIZE+1); r++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 - #pragma HLS PIPELINE + #pragma HLS UNROLL img1Win [r] = 0; img1Win [r+(WINDOW_SIZE+1)] = 0; img2Win [r] = 0; img1Col_ [r] =0; img2Col_ [r] =0; } @@ -957,15 +994,21 @@ namespace xf{ // line buffer for both input images. Can be split to a fn that models a single // linebuffer - template + template static void lbWrapper (hls::stream & f0Stream, hls::stream & f1Stream, hls::stream img1Col[(WINDOW_SIZE+1)], hls::stream img2Col[(WINDOW_SIZE+1)], int rows, int cols, int size) { static pix_t lb1 [(WINDOW_SIZE+1)][COLS], lb2 [(WINDOW_SIZE+1)][COLS]; - #pragma HLS ARRAY_PARTITION variable=lb1 complete dim=1 - #pragma HLS ARRAY_PARTITION variable=lb2 complete dim=1 + #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical + #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical + #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1 + #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1 + if (USE_URAM) { + #pragma HLS RESOURCE variable=lb1 core=XPM_MEMORY uram + #pragma HLS RESOURCE variable=lb2 core=XPM_MEMORY uram + } for (int r = 0; r < rows; r++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS @@ -996,11 +1039,11 @@ namespace xf{ // cleanup - for (int r = 0; r < (WINDOW_SIZE+1); r++) { - #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS - for (int c = 0; c < COLS; c++) { + for (int c = 0; c < cols; c++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE + for (int r = 0; r < (WINDOW_SIZE+1); r++) { + #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 lb1 [r][c] = 0; lb2 [r][c] = 0; } @@ -1008,7 +1051,7 @@ namespace xf{ } // top level wrapper to avoid dataflow problems - template + template static void flowWrap (ap_uint<8> *frame0, ap_uint<8> *frame1, float *flowx, float *flowy, int rows, 
int cols, int size) { #pragma HLS inline off @@ -1046,9 +1089,9 @@ namespace xf{ readMatRows (frame0, f0Stream, rows, cols, size); readMatRows (frame1, f1Stream, rows, cols, size); - lbWrapper (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); + lbWrapper (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); - computeSums (img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size); + computeSums (img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size); computeFlow (ixix, ixiy, iyiy, dix, diy, fx, fy, rows, cols, size); @@ -1062,12 +1105,12 @@ namespace xf{ // frame0 - First input frame (grayscale 1 byte per pixel) // frame1 - Second input frame (grayscale 1 byte per pixel) // framef - Output frame with flows visualized. 3 bytes per pixel + 1 byte padding - template + template static void fpga_optflow8 (ap_uint<8> *frame0, ap_uint<8> *frame1, float *flowx, float *flowy, int rows, int cols, int size) { #pragma HLS inline off - flowWrap(frame0, frame1, flowx, flowy, rows, cols, size); + flowWrap(frame0, frame1, flowx, flowy, rows, cols, size); return; @@ -1087,16 +1130,16 @@ namespace xf{ #pragma SDS data copy("frame1.data"[0:"frame1.size"]) #pragma SDS data copy("flowx.data"[0:"flowx.size"]) #pragma SDS data copy("flowy.data"[0:"flowy.size"]) -template +template void DenseNonPyrLKOpticalFlow (xf::Mat & frame0, xf::Mat & frame1, xf::Mat & flowx, xf::Mat & flowy) { if(NPC==XF_NPPC1) { - fpga_optflow8 ( (ap_uint<8> *) frame0.data, (ap_uint<8> *)frame1.data, (float *)flowx.data, (float *)flowy.data, frame0.rows, frame0.cols, frame0.size); + fpga_optflow8 ( (ap_uint<8> *) frame0.data, (ap_uint<8> *)frame1.data, (float *)flowx.data, (float *)flowy.data, frame0.rows, frame0.cols, frame0.size); } else { - fpga_optflow16 ( (ap_uint<16> *) frame0.data, (ap_uint<16> *) frame1.data, (ap_uint<64> *)flowx.data, (ap_uint<64> *)flowy.data, frame0.rows, frame0.cols, frame0.size); + fpga_optflow16 ( (ap_uint<16> *) frame0.data, (ap_uint<16> *) frame1.data, 
(ap_uint<64> *)flowx.data, (ap_uint<64> *)flowy.data, frame0.rows, frame0.cols, frame0.size); } } } diff --git a/include/imgproc/xf_gaussian_filter.hpp b/include/imgproc/xf_gaussian_filter.hpp index 8dc51d9..242ce0c 100644 --- a/include/imgproc/xf_gaussian_filter.hpp +++ b/include/imgproc/xf_gaussian_filter.hpp @@ -1138,45 +1138,49 @@ void xFGaussianFilter(hls::stream< XF_SNAME(WORDWIDTH)> &_src, hls::stream< XF_S #pragma SDS data access_pattern("_dst.data":SEQUENTIAL) #pragma SDS data copy("_dst.data"[0:"_dst.size"]) -template -void GaussianBlur(xf::Mat & _src, xf::Mat & _dst, float sigma) +template +void GaussianBlur(xf::Mat &_src, xf::Mat &_dst, float sigma) { -#pragma HLS inline off + #pragma HLS inline off -#pragma HLS dataflow + #pragma HLS dataflow - hls::streamsrc; - hls::stream< XF_TNAME(SRC_T,NPC)> dst; + hls::stream src; + hls::stream dst; - /********************************************************/ + /********************************************************/ - Read_yuyv_Loop: - for(int i=0; i<_src.rows;i++) - { - #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS - for(int j=0; j<(_src.cols)>>(XF_BITSHIFT(NPC));j++) - { - #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC - #pragma HLS PIPELINE - #pragma HLS loop_flatten off - src.write( *(_src.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) ); - } - } + Read_yuyv_Loop: + for(int i=0; i < _src.rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS - xFGaussianFilter< ROWS, COLS, XF_DEPTH(SRC_T,NPC),NPC,XF_WORDWIDTH(SRC_T,NPC)>(src, dst, FILTER_SIZE, BORDER_TYPE, _src.rows,_src.cols,sigma); + for(int j=0; j < (_src.cols)>>(XF_BITSHIFT(NPC)); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC + #pragma HLS PIPELINE + #pragma HLS loop_flatten off - for(int i=0; i<_dst.rows;i++) - { - #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS - for(int j=0; j<(_dst.cols)>>(XF_BITSHIFT(NPC));j++) - { - #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC - #pragma HLS PIPELINE - #pragma HLS loop_flatten off - *(_dst.data + 
i*(_dst.cols>>(XF_BITSHIFT(NPC))) +j) = dst.read(); + src.write( *(_src.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) ); + } + } - } - } + xFGaussianFilter< ROWS, COLS, XF_DEPTH(SRC_T,NPC),NPC,XF_WORDWIDTH(SRC_T,NPC)>(src, dst, FILTER_SIZE, BORDER_TYPE, _src.rows, _src.cols, sigma); + + for(int i=0; i < _src.rows; i++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS + + for(int j=0; j < (_src.cols)>>(XF_BITSHIFT(NPC)); j++) + { + #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC + #pragma HLS PIPELINE + #pragma HLS loop_flatten off + + *(_dst.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) = dst.read(); + + } + } } } #endif //_XF_GAUSSIAN_HPP_ diff --git a/include/imgproc/xf_pyr_dense_optical_flow.hpp b/include/imgproc/xf_pyr_dense_optical_flow.hpp index d7ce4ff..92079c4 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow.hpp @@ -232,7 +232,7 @@ void find_flow(hls::stream< ap_fixed > &strmSigmaIx2, hls } // end find_flow() -template +template void xFLKOpticalFlowDenseKernel(unsigned char *currImg, unsigned char *nextImg, unsigned int *strmFlowin, unsigned int *strmFlow, const unsigned int rows, const unsigned int cols, const unsigned int prev_rows, const unsigned int prev_cols, const int level, const bool scale_up_flag, float scale_in, ap_uint<1> init_flag) { const int WINDOW_SIZE = WINDOW_SIZE_FL; @@ -290,22 +290,20 @@ const int ITCMP_INT = FLOW_INT+12; split_stream_int_fixed(strmFlowin, strmFlowU_split, strmFlowV_split, prev_rows, prev_cols, level); //scaling up U and V streams whenever scaleup is enabled - scale_up( strmFlowU_split, strmFlowU_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in); - scale_up( strmFlowV_split, strmFlowV_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in); + scale_up( strmFlowU_split, strmFlowU_scaled, strmFlowV_split, strmFlowV_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in); //Finding the Temporal and space gradients for the 
input set of images - findGradients(currImg, nextImg, strmIt_float, strmIx, strmIy, rows, cols, strmFlowU_scaled, strmFlowV_scaled, strmFlowU_in1, strmFlowV_in1, level); + findGradients(currImg, nextImg, strmIt_float, strmIx, strmIy, rows, cols, strmFlowU_scaled, strmFlowV_scaled, strmFlowU_in1, strmFlowV_in1, level); //finding the hessian matrix - find_G_and_b_matrix(strmIx, strmIy, strmIt_float, sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, rows, cols, level); + find_G_and_b_matrix(strmIx, strmIy, strmIt_float, sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, rows, cols, level); //computing the the optical flow find_flow(sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, strmFlowU_in1, strmFlowV_in1, strmFlowU_fil, strmFlowV_fil, flagU, flagV, rows, cols,level,scale_up_flag,init_flag); //filtering the flow vectors using median blur - auMedianBlur (strmFlowU_fil, strmFlowU_fil_out, flagU, WINDOW_SIZE,1,rows,cols); - auMedianBlur (strmFlowV_fil, strmFlowV_fil_out, flagV, WINDOW_SIZE,1,rows,cols); + auMedianBlur (strmFlowU_fil, strmFlowU_fil_out, flagU, strmFlowV_fil, strmFlowV_fil_out, flagV, WINDOW_SIZE,1,rows,cols); //stitching the U and V flow streams to a single flow stream stitch_stream_fixed_int(strmFlowU_fil_out, strmFlowV_fil_out, strmFlow, rows, cols, level); diff --git a/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp b/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp index 4fcac07..d80d597 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp @@ -71,6 +71,8 @@ ap_fixed findIntensity(unsigned char lineBuffer[NUM_LINES+1][MA // Find which location in linebuffers to access int lx0 = tmp_locj; + // AK,ZoTech: here out of bound of current level picture access may happen, thus workaround for bound padding suggested: + // int lx1 = lx0 + ((lx0<(cols-1)) ? 
1:0); int lx1 = lx0 + 1; ap_fixed fracx = ap_fixed(tmp_locj - lx0); @@ -98,7 +100,7 @@ ap_fixed findIntensity(unsigned char lineBuffer[NUM_LINES+1][MA } // end findIntensity() -template +template void findGradients(unsigned char *currImg3, unsigned char *nextImg, hls::stream< ap_fixed > &strmIt, hls::stream< ap_int<9> > &strmIx, hls::stream< ap_int<9> > &strmIy, unsigned int rows, unsigned int cols, hls::stream< ap_fixed > &strmFlowUin, hls::stream< ap_fixed > &strmFlowVin, hls::stream< ap_fixed > &strmFlowU_in1, hls::stream< ap_fixed > &strmFlowV_in1, int level) { @@ -132,11 +134,17 @@ sprintf(name,"gy_hw%d.txt",level); unsigned int read_curimg = 0; unsigned int read_nxtimg = 0; + //AK,ZoTech: this buffer needs initialization as workaround to exclude "X" values in co-sim. unsigned char lineBuffer[NUM_LINES+1][MAXWIDTH]; -#pragma HLS array_partition variable=lineBuffer complete dim=1 +#pragma HLS array_reshape variable=lineBuffer complete dim=1 unsigned char curr_img_buf[2][MAXWIDTH]; -#pragma HLS array_partition variable=curr_img_buf complete dim=1 +#pragma HLS array_reshape variable=curr_img_buf complete dim=1 + +if (USE_URAM) { +#pragma HLS RESOURCE variable=lineBuffer core=XPM_MEMORY uram +#pragma HLS RESOURCE variable=curr_img_buf core=XPM_MEMORY uram +} unsigned char effBufferedLines = std::min(NUM_LINES,(1<<(NUM_PYR_LEVELS - 1 - level))*(WINSIZE-1) + 1); /**** Change this appropriately in original function***/ ap_uint<8> totalLinesInBuffer = effBufferedLines + 1; diff --git a/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp b/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp index e0a4a7b..1ac33d7 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp @@ -113,43 +113,15 @@ void auMedianProc( } template -void ProcessMedian3x3(hls::stream< ap_fixed > & _src_mat, +void ProcessMedian3x3( hls::stream< ap_fixed > & _out_mat, hls::stream< bool > &flag, - ap_fixed 
buf[WIN_SZ][(COLS >> NPC)], ap_fixed src_buf[WIN_SZ][1+(WIN_SZ-1)], + ap_fixed src_buf[WIN_SZ][1+(WIN_SZ-1)], ap_fixed buf_cop[WIN_SZ], ap_fixed OutputValues[1], - ap_fixed &P0, uint16_t img_width, uint16_t img_height, uint16_t &shift_x, ap_uint<13> row_ind[WIN_SZ], ap_uint<13> row, ap_uint<8> win_size) + ap_fixed &P0, uint16_t img_width, uint16_t img_height, uint16_t &shift_x, ap_uint<13> row_ind[WIN_SZ], ap_uint<13> row, ap_uint<16> col, ap_uint<8> win_size) { #pragma HLS INLINE - ap_fixed buf_cop[WIN_SZ]; -#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1 - uint16_t npc = 1; - Col_Loop: - for(ap_uint<16> col = 0; col < img_width+(WIN_SZ>>1); col++) - { -#pragma HLS LOOP_TRIPCOUNT min=1 max=TC -#pragma HLS pipeline -#pragma HLS LOOP_FLATTEN OFF - - if(row < img_height && col < img_width) - buf[row_ind[win_size-1]][col] = _src_mat.read(); // Read data - else - buf[row_ind[win_size-1]][col] = 0; - - for(int copy_buf_var=0;copy_buf_var(img_height-1)) && (copy_buf_var>(win_size-1-(row-(img_height-1))))) - { - buf_cop[copy_buf_var] = buf[(row_ind[win_size-1-(row-(img_height-1))])][col]; - } - else - { - buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col]; - } - } // if(NPC == AU_NPPC8) // { @@ -167,7 +139,9 @@ void ProcessMedian3x3(hls::stream< ap_fixed > & _src_mat, #pragma HLS UNROLL if(col(img_height-1)) && (extract_px>(win_size-1-(row-(img_height-1))))) + src_buf[extract_px][win_size-1] = buf_cop[(row_ind[win_size-1-(row-(img_height-1))])]; + else src_buf[extract_px][win_size-1] = buf_cop[(row_ind[extract_px])]; } else { @@ -216,14 +190,16 @@ void ProcessMedian3x3(hls::stream< ap_fixed > & _src_mat, } } } - } // Col_Loop } -template -void auMedian3x3(hls::stream< ap_fixed > &_src_mat, - hls::stream< ap_fixed > &_out_mat, hls::stream< bool > &flag, ap_uint<8> win_size, +template +void auMedian3x3(hls::stream< ap_fixed > &_src_mat0, + hls::stream< ap_fixed > &_out_mat0, hls::stream< bool > &flag0, + hls::stream< ap_fixed > &_src_mat1, + hls::stream< 
ap_fixed > &_out_mat1, hls::stream< bool > &flag1, + ap_uint<8> win_size, uint16_t img_height, uint16_t img_width) { ap_uint<13> row_ind[WIN_SZ]; @@ -234,19 +210,32 @@ void auMedian3x3(hls::stream< ap_fixed > &_src_mat, ap_uint<16> row, col; - ap_fixed OutputValues[1]; + ap_fixed OutputValues[2][1]; #pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1 +#pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=2 + ap_fixed buf_cop[2][WIN_SZ]; +#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1 +#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=2 + - ap_fixed src_buf[WIN_SZ][1+(WIN_SZ-1)]; + ap_fixed src_buf[2][WIN_SZ][1+(WIN_SZ-1)]; #pragma HLS ARRAY_PARTITION variable=src_buf complete dim=1 #pragma HLS ARRAY_PARTITION variable=src_buf complete dim=2 +#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=3 // src_buf1 et al merged ap_fixed P0; - ap_fixed buf[WIN_SZ][(COLS >> NPC)]; -#pragma HLS ARRAY_PARTITION variable=buf complete dim=1 + ap_fixed buf[2][WIN_SZ][(COLS >> NPC)]; +#pragma HLS ARRAY_RESHAPE variable=buf complete dim=1 +#pragma HLS ARRAY_RESHAPE variable=buf complete dim=2 + +if (USE_URAM) { +#pragma HLS RESOURCE variable=buf core=XPM_MEMORY uram +} +else { #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM +} //initializing row index @@ -265,7 +254,8 @@ void auMedian3x3(hls::stream< ap_fixed > &_src_mat, #pragma HLS LOOP_TRIPCOUNT min=TC max=TC #pragma HLS pipeline #pragma HLS LOOP_FLATTEN OFF - buf[init_buf][col] = _src_mat.read(); + buf[0][init_buf][col] = _src_mat0.read(); + buf[1][init_buf][col] = _src_mat1.read(); } } @@ -277,7 +267,8 @@ void auMedian3x3(hls::stream< ap_fixed > &_src_mat, { #pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ #pragma HLS UNROLL - buf[init_buf][col] = buf[row_ind[win_size>>1]][col]; + buf[0][init_buf][col] = buf[0][row_ind[win_size>>1]][col]; + buf[1][init_buf][col] = buf[1][row_ind[win_size>>1]][col]; } } @@ -296,7 +287,36 @@ void auMedian3x3(hls::stream< ap_fixed > &_src_mat, 
// } // } P0 = 0; - ProcessMedian3x3(_src_mat, _out_mat, flag, buf, src_buf,OutputValues, P0, img_width, img_height, shift_x, row_ind, row,win_size); + Col_Loop: + for(ap_uint<16> col = 0; col < img_width+(WIN_SZ>>1); col++) + { +#pragma HLS LOOP_TRIPCOUNT min=1 max=TC +#pragma HLS pipeline +#pragma HLS LOOP_FLATTEN OFF + + for(int copy_buf_var=0;copy_buf_var(_out_mat0, flag0, src_buf[0], buf_cop[0], OutputValues[0], P0, img_width, img_height, shift_x, row_ind, row,col,win_size); + ProcessMedian3x3(_out_mat1, flag1, src_buf[1], buf_cop[1], OutputValues[1], P0, img_width, img_height, shift_x, row_ind, row,col,win_size); + } // Col_Loop //update indices ap_uint<13> zero_ind = row_ind[0]; @@ -311,10 +331,13 @@ void auMedian3x3(hls::stream< ap_fixed > &_src_mat, } // Row_Loop } -template +template void auMedianBlur( - hls::stream< ap_fixed > &_src, - hls::stream< ap_fixed > &_dst, hls::stream< bool > &flag, ap_uint<8> win_size, + hls::stream< ap_fixed > &_src0, + hls::stream< ap_fixed > &_dst0, hls::stream< bool > &flag0, + hls::stream< ap_fixed > &_src1, + hls::stream< ap_fixed > &_dst1, hls::stream< bool > &flag1, + ap_uint<8> win_size, int _border_type,uint16_t imgheight,uint16_t imgwidth) { #pragma HLS inline off @@ -329,7 +352,7 @@ void auMedianBlur( imgwidth = imgwidth >> NPC; - auMedian3x3< ROWS, COLS, DEPTH, NPC, WORDWIDTH, (COLS>>NPC)+(WIN_SZ>>1), WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT>(_src, _dst,flag,WIN_SZ,imgheight,imgwidth); + auMedian3x3< ROWS, COLS, DEPTH, NPC, WORDWIDTH, (COLS>>NPC)+(WIN_SZ>>1), WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT, USE_URAM>(_src0, _dst0,flag0,_src1, _dst1,flag1,WIN_SZ,imgheight,imgwidth); } diff --git a/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp b/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp index 2500114..cdc9a7c 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp @@ -29,29 +29,38 @@ 
***************************************************************************/ #ifndef __XF_PYR_DENSE_OPTICAL_FLOW_OFLOW_PROCESS__ #define __XF_PYR_DENSE_OPTICAL_FLOW_OFLOW_PROCESS__ -template +template void find_G_and_b_matrix(hls::stream< ap_int<9> > &strmIx, hls::stream< ap_int<9> > &strmIy, hls::stream< ap_fixed > &strmIt, hls::stream< ap_fixed > &sigmaIx2, hls::stream< ap_fixed > &sigmaIy2, hls::stream< ap_fixed > &sigmaIxIy, hls::stream< ap_fixed > &sigmaIxIt, hls::stream< ap_fixed > &sigmaIyIt, unsigned int rows, unsigned int cols, int level) { #pragma HLS inline off // bufLines is used to buffer Ix, Iy, It in that order ap_int<9> bufLines_ix[WINSIZE][MAXWIDTH+(WINSIZE>>1)]; -#pragma HLS array_partition variable=bufLines_ix complete dim=1 +#pragma HLS array_reshape variable=bufLines_ix complete dim=1 ap_int<9> bufLines_iy[WINSIZE][MAXWIDTH+(WINSIZE>>1)]; -#pragma HLS array_partition variable=bufLines_iy complete dim=1 +#pragma HLS array_reshape variable=bufLines_iy complete dim=1 ap_fixed bufLines_it[WINSIZE][MAXWIDTH+(WINSIZE>>1)]; -#pragma HLS array_partition variable=bufLines_it complete dim=1 +#pragma HLS array_reshape variable=bufLines_it complete dim=1 ap_fixed colsum_IxIx[MAXWIDTH+(WINSIZE>>1)]; ap_fixed colsum_IxIy[MAXWIDTH+(WINSIZE>>1)]; ap_fixed colsum_IyIy[MAXWIDTH+(WINSIZE>>1)]; ap_fixed colsum_IxIt[MAXWIDTH+(WINSIZE>>1)]; ap_fixed colsum_IyIt[MAXWIDTH+(WINSIZE>>1)]; -#pragma HLS RESOURCE variable=colsum_IxIx core=RAM_T2P_BRAM -#pragma HLS RESOURCE variable=colsum_IxIy core=RAM_T2P_BRAM -#pragma HLS RESOURCE variable=colsum_IyIy core=RAM_T2P_BRAM -#pragma HLS RESOURCE variable=colsum_IxIt core=RAM_T2P_BRAM -#pragma HLS RESOURCE variable=colsum_IyIt core=RAM_T2P_BRAM + +#pragma HLS ARRAY_MAP variable=bufLines_ix instance=buffers vertical +#pragma HLS ARRAY_MAP variable=bufLines_iy instance=buffers vertical +#pragma HLS ARRAY_MAP variable=bufLines_it instance=buffers vertical + +#pragma HLS ARRAY_MAP variable=colsum_IxIx instance=buffers vertical 
+#pragma HLS ARRAY_MAP variable=colsum_IxIy instance=buffers vertical +#pragma HLS ARRAY_MAP variable=colsum_IyIy instance=buffers vertical +#pragma HLS ARRAY_MAP variable=colsum_IxIt instance=buffers vertical +#pragma HLS ARRAY_MAP variable=colsum_IyIt instance=buffers vertical + +if (USE_URAM) { +#pragma HLS RESOURCE variable=bufLines_ix core=XPM_MEMORY uram +} ap_fixed colsum_prevWIN_IxIx[WINSIZE]; ap_fixed colsum_prevWIN_IxIy[WINSIZE]; diff --git a/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp b/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp index 42ec06e..9b11634 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp @@ -31,7 +31,9 @@ #define __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__ template -void load_data (hls::stream< ap_fixed > &inStrm, ap_fixed buf[MAXWIDTH], int rows, int cols, bool &flagLoaded, int i, ap_ufixed scaleI, ap_fixed &fracI, int &prevIceil) { +void load_data (hls::stream< ap_fixed > &inStrm0, + hls::stream< ap_fixed > &inStrm1, + ap_fixed buf[2][MAXWIDTH], int rows, int cols, bool &flagLoaded, int i, ap_ufixed scaleI, ap_fixed &fracI, int &prevIceil) { #pragma HLS inline off ap_fixed iSmall = i * scaleI; int iSmallFloor = (int) iSmall; @@ -42,7 +44,8 @@ void load_data (hls::stream< ap_fixed > &inStrm, ap_fixed compute_result(ap_fixed fra } // end compute_result() template -void process(ap_fixed buf[MAXWIDTH], ap_fixed buffer[2][MAXWIDTH], unsigned short int outRows, unsigned short int outCols, hls::stream< ap_fixed >& outStrm, bool flagLoaded, int row, ap_ufixed scaleI, ap_ufixed scaleJ, ap_fixed fracI, int mul) { -#pragma HLS array_partition variable=buffer dim=1 complete +void process(ap_fixed buf[2][MAXWIDTH], ap_fixed buffer[2][2][MAXWIDTH], unsigned short int outRows, unsigned short int outCols, + hls::stream< ap_fixed >& outStrm0, + hls::stream< ap_fixed >& outStrm1, + bool flagLoaded, int row, ap_ufixed scaleI, ap_ufixed scaleJ, ap_fixed fracI, int mul) { #pragma 
HLS inline off int bufCount = 0; ap_fixed regLoad; int prevJceil = -1; - ap_fixed i0=0, i1=0, i2=0, i3=0; + ap_fixed i0[2]={0,0}; + ap_fixed i1[2]={0,0}; + ap_fixed i2[2]={0,0}; + ap_fixed i3[2]={0,0}; L3:for (ap_uint<16> j=0; j buf[MAXWIDTH], ap_fixed reg = buf[bufCount]; - buffer[1][bufCount] = reg; - i3 = reg; + for (int k=0; k<2; k++) { + ap_fixed reg = buf[k][bufCount]; + buffer[k][1][bufCount] = reg; + i3[k] = reg; + } fracI = 1; fracJ = 1; bufCount++; prevJceil = 0; } else if (j reg = buf[bufCount]; - buffer[1][bufCount] = reg; - i3 = reg; + for (int k=0; k<2; k++) { + i2[k] = i3[k]; + ap_fixed reg = buf[k][bufCount]; + buffer[k][1][bufCount] = reg; + i3[k] = reg; + } bufCount++; prevJceil = jSmallFloor + 1; } } else { - i3 = buffer[1][bufCount-1]; + i3[0] = buffer[0][1][bufCount-1]; + i3[1] = buffer[1][1][bufCount-1]; fracI = 1; fracJ = 1; } } else if (row < outRows-1) { if (j==0) { - i0 = 0; i2 = 0; + i0[0] = 0; i2[0] = 0; + i0[1] = 0; i2[1] = 0; fracJ = 1; if (flagLoaded) { - ap_fixed reg = buf[bufCount]; - ap_fixed tmp = buffer[1][bufCount]; - buffer[0][bufCount] = tmp; - i1 = tmp; - buffer[1][bufCount] = reg; - i3 = reg; + for (int k=0; k<2; k++) { + ap_fixed reg = buf[k][bufCount]; + ap_fixed tmp = buffer[k][1][bufCount]; + buffer[k][0][bufCount] = tmp; + i1[k] = tmp; + buffer[k][1][bufCount] = reg; + i3[k] = reg; + } bufCount++; } else { - i1 = buffer[0][bufCount]; - i3 = buffer[1][bufCount]; + for (int k=0; k<2; k++) { + i1[k] = buffer[k][0][bufCount]; + i3[k] = buffer[k][1][bufCount]; + } bufCount++; } prevJceil = 0; } else if (j < outCols) { if (prevJceil == jSmallFloor) { - i0 = i1; i2 = i3; + i0[0] = i1[0]; i2[0] = i3[0]; + i0[1] = i1[1]; i2[1] = i3[1]; if (flagLoaded) { - ap_fixed reg = buf[bufCount]; - ap_fixed tmp = buffer[1][bufCount]; - buffer[0][bufCount] = tmp; - i1 = tmp; - buffer[1][bufCount] = reg; - i3 = reg; + for (int k=0; k<2; k++) { + ap_fixed reg = buf[k][bufCount]; + ap_fixed tmp = buffer[k][1][bufCount]; + buffer[k][0][bufCount] 
= tmp; + i1[k] = tmp; + buffer[k][1][bufCount] = reg; + i3[k] = reg; + } bufCount++; } else { - i1 = buffer[0][bufCount]; - i3 = buffer[1][bufCount]; + for (int k=0; k<2; k++) { + i1[k] = buffer[k][0][bufCount]; + i3[k] = buffer[k][1][bufCount]; + } bufCount++; } prevJceil = jSmallFloor + 1; @@ -160,40 +183,55 @@ void process(ap_fixed buf[MAXWIDTH], ap_fixed reg = buffer[1][bufCount]; - i3 = reg; + for (int k=0; k<2; k++) { + i2[k] = i3[k]; + ap_fixed reg = buffer[k][1][bufCount]; + i3[k] = reg; + } bufCount++; prevJceil = jSmallFloor + 1; } fracI = 1; } else { - i3 = buffer[1][bufCount-1]; + i3[0] = buffer[0][1][bufCount-1]; + i3[1] = buffer[1][1][bufCount-1]; fracI = 1; fracJ = 1; } } // end else - ap_fixed resIf = compute_result (fracI, fracJ, i0, i1, i2, i3); - outStrm.write(resIf<<1); + ap_fixed resIf0 = compute_result (fracI, fracJ, i0[0], i1[0], i2[0], i3[0]); + outStrm0.write(resIf0<<1); + ap_fixed resIf1 = compute_result (fracI, fracJ, i0[1], i1[1], i2[1], i3[1]); + outStrm1.write(resIf1<<1); } // end L3 } // end process() -template -void scale_up( hls::stream< ap_fixed > &inStrm, hls::stream< ap_fixed > &outStrm, +template +void scale_up( hls::stream< ap_fixed > &inStrm0, hls::stream< ap_fixed > &outStrm0, + hls::stream< ap_fixed > &inStrm1, hls::stream< ap_fixed > &outStrm1, unsigned short int inRows, unsigned short int inCols, unsigned short int outRows, unsigned short int outCols, int mul, const bool scale_up_flag, float scale_comp) { #pragma HLS inline off - ap_fixed buffer[2][MAXWIDTH]; -#pragma HLS array_partition variable=buffer dim=1 complete - ap_fixed buf0[MAXWIDTH], buf1[MAXWIDTH]; + ap_fixed buffer[2][2][MAXWIDTH]; +#pragma HLS array_reshape variable=buffer dim=1 complete +#pragma HLS array_reshape variable=buffer dim=2 complete + ap_fixed buf0[2][MAXWIDTH], buf1[2][MAXWIDTH]; +#pragma HLS array_reshape variable=buf0 dim=1 complete +#pragma HLS array_reshape variable=buf1 dim=1 complete +if (USE_URAM) { +#pragma HLS RESOURCE variable=buffer 
core=XPM_MEMORY uram +#pragma HLS RESOURCE variable=buf0 core=XPM_MEMORY uram +#pragma HLS RESOURCE variable=buf1 core=XPM_MEMORY uram +} ap_ufixed scaleI = (ap_ufixed)scale_comp; ap_ufixed scaleJ = (ap_ufixed)scale_comp; @@ -213,32 +251,33 @@ void scale_up( hls::stream< ap_fixed > &inStrm, hls::stream #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH #pragma HLS pipeline II=1 #pragma HLS LOOP_FLATTEN OFF - outStrm.write((ap_fixed)inStrm.read()); + outStrm0.write((ap_fixed)inStrm0.read()); + outStrm1.write((ap_fixed)inStrm1.read()); } } } else{ int prevIceil = -1; - load_data(inStrm, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil); + load_data(inStrm0, inStrm1, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil); L2:for (ap_uint<16> i=0; i(inStrm, buf1, inRows, inCols, flagLoaded1, i+1, scaleI, fracI1, prevIceil); - process(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, i, scaleI, scaleJ, fracI0, mul); + load_data(inStrm0, inStrm1, buf1, inRows, inCols, flagLoaded1, i+1, scaleI, fracI1, prevIceil); + process(buf0, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded0, i, scaleI, scaleJ, fracI0, mul); flag = 1; } else { - load_data(inStrm, buf0, inRows, inCols, flagLoaded0, i+1, scaleI, fracI0, prevIceil); - process(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, i, scaleI, scaleJ, fracI1, mul); + load_data(inStrm0, inStrm1, buf0, inRows, inCols, flagLoaded0, i+1, scaleI, fracI0, prevIceil); + process(buf1, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded1, i, scaleI, scaleJ, fracI1, mul); flag = 0; } } // end L2 if (flag ==0) { - process(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, outRows-1, scaleI, scaleJ, fracI0, mul); + process(buf0, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded0, outRows-1, scaleI, scaleJ, fracI0, mul); } else { - process(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, outRows-1, scaleI, scaleJ, fracI1, mul); + process(buf1, buffer, outRows, outCols, outStrm0, 
outStrm1, flagLoaded1, outRows-1, scaleI, scaleJ, fracI1, mul); } } diff --git a/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp b/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp index af41219..655aee1 100644 --- a/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp +++ b/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp @@ -51,11 +51,11 @@ namespace xf{ #pragma SDS data data_mover("_next_image.data":AXIDMA_SIMPLE) #pragma SDS data data_mover("_streamFlowin.data":AXIDMA_SIMPLE) #pragma SDS data data_mover("_streamFlowout.data":AXIDMA_SIMPLE) -template +template void densePyrOpticalFlow(xf::Mat & _current_img, xf::Mat & _next_image, xf::Mat & _streamFlowin, xf::Mat & _streamFlowout, const int level, const unsigned char scale_up_flag, float scale_in, ap_uint<1> init_flag) { #pragma HLS INLINE OFF - xFLKOpticalFlowDenseKernel((unsigned char *)_current_img.data, (unsigned char *)_next_image.data, (unsigned int *)_streamFlowin.data, (unsigned int *)_streamFlowout.data, _current_img.rows, _current_img.cols, _streamFlowin.rows, _streamFlowin.cols, level, scale_up_flag, scale_in, init_flag); + xFLKOpticalFlowDenseKernel((unsigned char *)_current_img.data, (unsigned char *)_next_image.data, (unsigned int *)_streamFlowin.data, (unsigned int *)_streamFlowout.data, _current_img.rows, _current_img.cols, _streamFlowin.rows, _streamFlowin.cols, level, scale_up_flag, scale_in, init_flag); } } #endif diff --git a/include/imgproc/xf_pyr_down.hpp b/include/imgproc/xf_pyr_down.hpp index 88df8e8..f1c9ef4 100644 --- a/include/imgproc/xf_pyr_down.hpp +++ b/include/imgproc/xf_pyr_down.hpp @@ -37,7 +37,7 @@ namespace xf{ -template +template void xFpyrDownKernel(XF_TNAME(TYPE,NPC) *in_image, XF_TNAME(TYPE,NPC) *out_image, unsigned short in_rows, unsigned short in_cols) { #pragma HLS DATAFLOW @@ -55,7 +55,7 @@ void xFpyrDownKernel(XF_TNAME(TYPE,NPC) *in_image, XF_TNAME(TYPE,NPC) *out_image read_pointer++; } } - xFPyrDownGaussianBlur(_filter_in, _filter_out, 5, 
XF_BORDER_CONSTANT,in_rows,in_cols); + xFPyrDownGaussianBlur(_filter_in, _filter_out, 5, XF_BORDER_CONSTANT,in_rows,in_cols); unsigned int write_ptr = 0; for(int i=0;i +template void pyrDown (xf::Mat & _src, xf::Mat & _dst) { #pragma HLS INLINE OFF unsigned short input_height = _src.rows; unsigned short input_width = _src.cols; - xFpyrDownKernel(_src.data, _dst.data, input_height, input_width); + xFpyrDownKernel(_src.data, _dst.data, input_height, input_width); return; } } diff --git a/include/imgproc/xf_pyr_down_gaussian_blur.hpp b/include/imgproc/xf_pyr_down_gaussian_blur.hpp index c87db8e..6c61198 100644 --- a/include/imgproc/xf_pyr_down_gaussian_blur.hpp +++ b/include/imgproc/xf_pyr_down_gaussian_blur.hpp @@ -99,29 +99,31 @@ void xFPyrDownprocessgaussian(hls::stream< XF_TNAME(DEPTH,NPC) > & _src_mat, #pragma HLS LOOP_FLATTEN OFF #pragma HLS LOOP_TRIPCOUNT min=1 max=TC #pragma HLS pipeline - if(row < img_height && col < img_width) - buf[row_ind[win_size-1]][col] = _src_mat.read(); // Read data - for(int copy_buf_var=0;copy_buf_var(img_height-1)) && (copy_buf_var>(win_size-1-(row-(img_height-1))))) - { - buf_cop[copy_buf_var] = buf[(row_ind[win_size-1-(row-(img_height-1))])][col]; - } - else - { - buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col]; - } + buf_cop[copy_buf_var] = buf[copy_buf_var][col]; } + + if(row < img_height && col < img_width) + buf [row_ind[win_size-1]][col] = + buf_cop[row_ind[win_size-1]] = _src_mat.read(); // Read data + for(int extract_px=0;extract_px(img_height-1)) && (extract_px>(win_size-1-(row-(img_height-1))))) + { + src_buf[extract_px][win_size-1] = buf_cop[(row_ind[win_size-1-(row-(img_height-1))])]; + } + else + { + src_buf[extract_px][win_size-1] = buf_cop[(row_ind[extract_px])]; + } } else { @@ -158,7 +160,7 @@ void xFPyrDownprocessgaussian(hls::stream< XF_TNAME(DEPTH,NPC) > & _src_mat, -template +template void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat, hls::stream< XF_TNAME(DEPTH,NPC) > 
&_out_mat, ap_uint<8> win_size, uint16_t img_height, uint16_t img_width) @@ -181,8 +183,12 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat, XF_TNAME(DEPTH,NPC) P0; XF_TNAME(DEPTH,NPC) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))]; -#pragma HLS ARRAY_PARTITION variable=buf complete dim=1 +#pragma HLS ARRAY_RESHAPE variable=buf complete dim=1 +if (USE_URAM) { +#pragma HLS RESOURCE variable=buf core=XPM_MEMORY uram +} else { #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM +} //initializing row index @@ -209,11 +215,13 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat, for(col = 0; col < img_width; col++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=TC + #pragma HLS pipeline + XF_TNAME(DEPTH,NPC) const bufTemp = buf[row_ind[win_size>>1]][col]; for(int init_buf=0; init_buf < WIN_SZ>>1;init_buf++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ #pragma HLS UNROLL - buf[init_buf][col] = buf[row_ind[win_size>>1]][col]; + buf[init_buf][col] = bufTemp; } } @@ -237,7 +245,7 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat, } // Row_Loop } -template +template void xFPyrDownGaussianBlur( hls::stream< XF_TNAME(DEPTH,NPC) > &_src, hls::stream< XF_TNAME(DEPTH,NPC) > &_dst, ap_uint<8> win_size, @@ -249,7 +257,7 @@ void xFPyrDownGaussianBlur( imgwidth = imgwidth >> XF_BITSHIFT(NPC); - xf_pyrdown_gaussian_nxn>XF_BITSHIFT(NPC))+(WIN_SZ>>1),WIN_SZ, WIN_SZ_SQ>(_src, _dst,WIN_SZ,imgheight,imgwidth); + xf_pyrdown_gaussian_nxn>XF_BITSHIFT(NPC))+(WIN_SZ>>1),WIN_SZ, WIN_SZ_SQ, USE_URAM>(_src, _dst,WIN_SZ,imgheight,imgwidth); } diff --git a/include/imgproc/xf_remap.hpp b/include/imgproc/xf_remap.hpp index 459e7e1..4ea675a 100644 --- a/include/imgproc/xf_remap.hpp +++ b/include/imgproc/xf_remap.hpp @@ -44,7 +44,7 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace xf{ -template +template void xFRemapNNI( hls::stream< SRC_T > &src, hls::stream< DST_T > &dst, @@ -57,6 +57,24 @@ void xFRemapNNI( #pragma HLS ARRAY_PARTITION variable=buf complete dim=1 SRC_T s; + + ap_uint<64> bufUram[WIN_ROW][(COLS+7)/8]; +#pragma HLS RESOURCE variable=bufUram core=XPM_MEMORY uram + //additional separation of URAM buffer to single URAMs to exclude their built-in cascading and thus limited timing + //due to inability of VHLS to schedule built-in cascade register (OREG_CAS) + enum { + BUF_DEPTH = WIN_ROW * ((COLS+7)/8), + URAM_DEPTH = 4096, + BUF_URAMS = (BUF_DEPTH + URAM_DEPTH-1) / URAM_DEPTH, + PART_FACTOR = BUF_URAMS != 2 ? BUF_URAMS : 1 // excluding factor=2 as it leads to II degradation, so built-in cascading is left for the case of just 2 URAMs + }; + if (USE_URAM) { + assert(PART_FACTOR <= ((COLS+7)/8)); + #pragma HLS array_partition variable=bufUram dim=2 factor=PART_FACTOR block + } + SRC_T sx8[8]; +#pragma HLS ARRAY_PARTITION variable=sx8 complete dim=1 + DST_T d; MAP_T mx_fl; MAP_T my_fl; @@ -75,14 +93,23 @@ void xFRemapNNI( loop_width: for( int j=0; j< cols; j++) { #pragma HLS PIPELINE II=1 -#pragma HLS dependence array inter false +#pragma HLS dependence variable=buf inter false +#pragma HLS dependence variable=bufUram inter false +#pragma HLS dependence variable=r inter false #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS if(i> s; + + if (USE_URAM) { + sx8[j%8] = s; + for (int k=0; k<8; k++) bufUram[i % WIN_ROW][j/8](k*8+7,k*8) = sx8[k]; + } } - buf[i % WIN_ROW][j] = s; + + if (!USE_URAM) + buf[i % WIN_ROW][j] = s; r[i % WIN_ROW] = i; if(i>=ishift) @@ -94,6 +121,12 @@ void xFRemapNNI( bool in_range = (y>=0 && y=0 && x +template void xFRemapLI( hls::stream< SRC_T > &src, hls::stream< DST_T > &dst, @@ -116,10 +149,33 @@ void xFRemapLI( ) { // Add one to always get zero for boundary interpolation. Maybe need initialization here?
+ //AK,ZoTech: this buffer needs initialization as workaround for correct boundary filtering, otherwise X are generated in co-sim. DST_T buf[WIN_ROW/2+1][2][COLS/2+1][2]; #pragma HLS array_partition complete variable=buf dim=2 #pragma HLS array_partition complete variable=buf dim=4 SRC_T s; + + //URAM storage granularity is 3x3-pel block in 2x2-pel picture grid, it fits to one URAM word + ap_uint<72> bufUram[(WIN_ROW+1)/2][(COLS+1)/2]; +#pragma HLS RESOURCE variable=bufUram core=XPM_MEMORY uram + //additional separation of URAM buffer to single URAMs to exclude their built-in cascading and thus limited timing + //due to inability of VHLS to schedule built-in cascade register (OREG_CAS) + enum { + BUF_DEPTH = ((WIN_ROW+1)/2) * ((COLS+1)/2), + URAM_DEPTH = 4096, + BUF_URAMS = (BUF_DEPTH + URAM_DEPTH-1) / URAM_DEPTH, + PART_FACTOR = BUF_URAMS != 2 ? BUF_URAMS : 1 // excluding factor=2 as it leads to II degradation, so built-in cascading is left for the case of just 2 URAMs + }; + if (USE_URAM) { + assert(PART_FACTOR <= ((COLS+1)/2)); + #pragma HLS array_partition variable=bufUram dim=2 factor=PART_FACTOR block + } + SRC_T lineBuf[COLS]; //additional caching as VHLS doesn't support URAM Byte Enables + SRC_T s3x3[2][9]; //URAM-wide word is doubled to resolve pipelining read/write dependency +#pragma HLS ARRAY_PARTITION complete variable=s3x3 dim=0 + SRC_T s3x3_2[9]; + SRC_T s0,s3; + MAP_T mx; MAP_T my; @@ -135,25 +191,73 @@ void xFRemapLI( #pragma HLS LOOP_FLATTEN OFF #pragma HLS LOOP_TRIPCOUNT min=1 max=row_tripcount - loop_width: for( int j=0; j< cols; j++) + loop_width: for( int j=0; j< cols+3; j++) { #pragma HLS PIPELINE II=1 -#pragma HLS dependence array inter false +#pragma HLS dependence variable=buf inter false +#pragma HLS dependence variable=bufUram inter false +#pragma HLS dependence variable=s3x3 inter false RAW +#pragma HLS dependence variable=r1 inter false +#pragma HLS dependence variable=r2 inter false #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS if(i> s; }
+ + if (USE_URAM && i0 && (j/2)>1) for (int k=0; k<9; k++) bufUram[(i/2-1)%(WIN_ROW/2)][j/2-2](k*8+7,k*8) = s3x3[!!(j&2)][k]; + } else if (j0) { + for (int k=0; k<6; k++) s3x3[!!(j&2)][k] = bufUram[(i/2-1)%(WIN_ROW/2)][j/2](k*8+7,k*8); + s3x3[!!(j&2)][6] = s0; + s3x3[!!(j&2)][7] = s; + s3x3[!!(j&2)][8] = 0; + } + } + } else if (j0) for (int k=0; k<9; k++) bufUram[(i/2)%(WIN_ROW/2)][j/2-1](k*8+7,k*8) = s3x3_2[k]; + } else { // odd col + s3x3_2[0] = s0; + s3x3_2[1] = lineBuf[j]; + s3x3_2[3] = s3; + s3x3_2[4] = s; + + // this clearing is needed only for case of bottom zero padding (currently last(bottom-right) sample value is used) + s3x3_2[6] = 0; + s3x3_2[7] = 0; + s3x3_2[8] = 0; + //if (j==(cols-1)) { //this clearing and save is needed only at the last column but may be done every cycle + s3x3_2[2] = 0; + s3x3_2[5] = 0; + for (int k=0; k<9; k++) bufUram[(i/2)%(WIN_ROW/2)][j/2](k*8+7,k*8) = s3x3_2[k]; + //} + } + } + } + + if (!USE_URAM && j=ishift) + if(i>=ishift && j> mx; mapy >> my; @@ -198,6 +302,16 @@ void xFRemapLI( ya1 = (y/2)%(WIN_ROW/2); DST_T d00, d01, d10, d11; + + if (USE_URAM) { + DST_T d3x3[9]; +#pragma HLS ARRAY_PARTITION variable=d3x3 complete dim=1 + for (int k=0; k<9; k++) d3x3[k] = bufUram[ya1][xa1](k*8+7,k*8); + d00 = d3x3[(y%2 )*3 + x%2 ]; + d01 = d3x3[(y%2 )*3 + x%2+1]; + d10 = d3x3[(y%2+1)*3 + x%2 ]; + d11 = d3x3[(y%2+1)*3 + x%2+1]; + } else { d00=buf[ya0][0][xa0][0]; d01=buf[ya0][0][xa1][1]; d10=buf[ya1][1][xa0][0]; @@ -211,6 +325,7 @@ void xFRemapLI( std::swap(d00,d10); std::swap(d01,d11); } + } ap_ufixed<2*HLS_INTER_BITS + 1, 1> k01 = (1-iv)*( iu); // iu-iu*iv ap_ufixed<2*HLS_INTER_BITS + 1, 1> k10 = ( iv)*(1-iu); // iv-iu*iv ap_ufixed<2*HLS_INTER_BITS + 1, 1> k11 = ( iv)*( iu); // iu*iv @@ -230,7 +345,7 @@ void xFRemapLI( } } -template +template void xFRemapKernel( hls::stream< SRC_T > &src, hls::stream< DST_T > &dst, @@ -240,9 +355,9 @@ void xFRemapKernel( ) { if(INTERPOLATION_TYPE == XF_INTERPOLATION_NN) { - xFRemapNNI(src, dst, mapx,
mapy,rows,cols); + xFRemapNNI(src, dst, mapx, mapy,rows,cols); } else if(INTERPOLATION_TYPE == XF_INTERPOLATION_BILINEAR) { - xFRemapLI(src, dst, mapx, mapy,rows,cols); + xFRemapLI(src, dst, mapx, mapy,rows,cols); } else { assert (((INTERPOLATION_TYPE == XF_INTERPOLATION_NN)||(INTERPOLATION_TYPE == XF_INTERPOLATION_BILINEAR)) && "The INTERPOLATION_TYPE must be either XF_INTERPOLATION_NN or XF_INTERPOLATION_BILINEAR"); @@ -253,7 +368,7 @@ void xFRemapKernel( //#pragma SDS data mem_attribute("_src_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_remapped_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_mapx_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_mapy_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS) #pragma SDS data access_pattern("_src_mat.data":SEQUENTIAL,"_remapped_mat.data":SEQUENTIAL,"_mapx_mat.data":SEQUENTIAL,"_mapy_mat.data":SEQUENTIAL) #pragma SDS data copy("_src_mat.data"[0:"_src_mat.rows*_src_mat.cols"], "_remapped_mat.data"[0:"_remapped_mat.size"],"_mapx_mat.data"[0:"_mapx_mat.size"],"_mapy_mat.data"[0:"_mapy_mat.size"]) -template +template void remap (xf::Mat &_src_mat, xf::Mat &_remapped_mat, xf::Mat &_mapx_mat, xf::Mat &_mapy_mat) { @@ -304,7 +419,7 @@ void remap (xf::Mat &_src_mat, xf::Mat (_src, _remapped, _mapx, _mapy, rows, cols); + xFRemapKernel (_src, _remapped, _mapx, _mapy, rows, cols); xfremap_output_loop: for (int i = 0; i < loop_count; i++) diff --git a/include/imgproc/xf_stereoBM.hpp b/include/imgproc/xf_stereoBM.hpp index 93448c5..b705546 100644 --- a/include/imgproc/xf_stereoBM.hpp +++ b/include/imgproc/xf_stereoBM.hpp @@ -62,99 +62,99 @@ template T xFabsdiff2(T a, T b) { #pragma HLS INLINE - int x = a-b; + int x = a-b; #pragma HLS RESOURCE variable=x core=AddSubnS - T r; - if (x > 0) - { - r = x; - } - else - { - r = -x; - } - return r; + T r; + if (x > 0) + { + r = x; + } + else + { + r = -x; + } + return r; } template class xFMinSAD { public: - template - static void find(T a[SIZE], T_idx &loc, T &val) - { + template + static void find(T 
a[SIZE], T_idx &loc, T &val) + { #pragma HLS INLINE #pragma HLS array_partition variable=a complete dim=0 - T a1[SIZE/2]; - T a2[SIZE-SIZE/2]; + T a1[SIZE/2]; + T a2[SIZE-SIZE/2]; - for(int i = 0; i < SIZE/2; i++) - { + for(int i = 0; i < SIZE/2; i++) + { #pragma HLS UNROLL - a1[i] = a[i]; - } - for(int i = 0; i < SIZE-SIZE/2; i++) - { + a1[i] = a[i]; + } + for(int i = 0; i < SIZE-SIZE/2; i++) + { #pragma HLS UNROLL - a2[i] = a[i+SIZE/2]; - } - - T_idx l1,l2; - T v1,v2; - xFMinSAD::find(a1,l1,v1); - xFMinSAD::find(a2,l2,v2); - - if(v2 <= v1) - { - val = v2; - loc = l2+SIZE/2; - } - else - { - val = v1; - loc = l1; - } - } + a2[i] = a[i+SIZE/2]; + } + + T_idx l1,l2; + T v1,v2; + xFMinSAD::find(a1,l1,v1); + xFMinSAD::find(a2,l2,v2); + + if(v2 <= v1) + { + val = v2; + loc = l2+SIZE/2; + } + else + { + val = v1; + loc = l1; + } + } }; template<> class xFMinSAD<1> { public: - template - static void find(T a[1], T_idx &loc, T &val) - { + template + static void find(T a[1], T_idx &loc, T &val) + { #pragma HLS INLINE - loc = 0; - val = a[0]; - } + loc = 0; + val = a[0]; + } }; template<> class xFMinSAD<2> { public: - template - static void find(T a[2], T_idx &loc, T &val) - { + template + static void find(T a[2], T_idx &loc, T &val) + { #pragma HLS INLINE #pragma HLS array_partition variable=a complete dim=0 - T_idx l1=0, l2=1; - T v1=a[0], v2=a[1]; - if(v2 <= v1) - { - val = v2; - loc = l2; - } - else - { - val = v1; - loc = l1; - } - } + T_idx l1=0, l2=1; + T v1=a[0], v2=a[1]; + if(v2 <= v1) + { + val = v2; + loc = l2; + } + else + { + val = v1; + loc = l1; + } + } }; /* TEXTURE THRESHOLD computation */ @@ -163,32 +163,32 @@ void xFUpdateTextureSum(unsigned char window[WSIZE][L_WIN_COLS],unsigned char l_ { #pragma HLS INLINE - int abs_diff[WSIZE]; - int col_sums = 0; + int abs_diff[WSIZE]; + int col_sums = 0; - text_sum_loop1: - for (int i = 0; i < WSIZE; i++) - { + text_sum_loop1: + for (int i = 0; i < WSIZE; i++) + { #pragma HLS UNROLL - col_sums += (i > row? 
0 : xFabsdiff2((int)(l_tmp[i]), cap)) - (((col < WSIZE) || (i > row) ) ? 0 : xFabsdiff2((int)window[i][WSIZE-1], cap)); - } + col_sums += (i > row? 0 : xFabsdiff2((int)(l_tmp[i]), cap)) - (((col < WSIZE) || (i > row) ) ? 0 : xFabsdiff2((int)window[i][WSIZE-1], cap)); + } - int tmp_prev[2]; - int tmp_int_sums; + int tmp_prev[2]; + int tmp_int_sums; - tmp_prev[0] = col>0 ? (int)text_sum[0]:(int)0; - tmp_prev[1] = col_sums; + tmp_prev[0] = col>0 ? (int)text_sum[0]:(int)0; + tmp_prev[1] = col_sums; - //shift right - for(int j = WSIZE-1; j >= 1; j--) - { + //shift right + for(int j = WSIZE-1; j >= 1; j--) + { #pragma HLS UNROLL - text_sum[j] = text_sum[j-1]; - } + text_sum[j] = text_sum[j-1]; + } - // shift_right, NDISP_UNITS,SAD_COL_SIZE,NPC>(text_sum); - tmp_int_sums = tmp_prev[0] + tmp_prev[1]; - text_sum[0] = tmp_int_sums; + // shift_right, NDISP_UNITS,SAD_COL_SIZE,NPC>(text_sum); + tmp_int_sums = tmp_prev[0] + tmp_prev[1]; + text_sum[0] = tmp_int_sums; } template @@ -196,57 +196,57 @@ void xFShiftRight(T buf[ROWS][COLS]) { #pragma HLS INLINE - shift_right_loop2: - for(unsigned char j = COLS-1; j >= 1; j--) - { + shift_right_loop2: + for(unsigned char j = COLS-1; j >= 1; j--) + { #pragma HLS UNROLL - shift_right_loop1: - for(unsigned char i = 0; i < ROWS; i++) - { + shift_right_loop1: + for(unsigned char i = 0; i < ROWS; i++) + { #pragma HLS UNROLL - buf[i][j] = buf[i][j-1]; - } - } + buf[i][j] = buf[i][j-1]; + } + } } template void xFInsertLeft(T buf[ROWS][COLS],T tmp[ROWS]) { #pragma HLS INLINE - insert_right_loop1: - for(unsigned char i = 0; i < ROWS; i++) - { + insert_right_loop1: + for(unsigned char i = 0; i < ROWS; i++) + { #pragma HLS UNROLL - buf[i][0] = tmp[i]; - } + buf[i][0] = tmp[i]; + } } template short int xFSADComputeInc( - T l_win[WSIZE][L_WIN_COLS], - T r_win_s[WSIZE][R_WIN_COLS], - unsigned char d, - unsigned short col, - short int sad_cols_d[WSIZE]) + T l_win[WSIZE][L_WIN_COLS], + T r_win_s[WSIZE][R_WIN_COLS], + unsigned char d, + unsigned short 
col, + short int sad_cols_d[WSIZE]) { #pragma HLS inline - short int a_sum = 0, b_sum = 0; - // compute new column sads; - for (unsigned char i = 0; i < WSIZE; i++) { - b_sum += __ABS((unsigned char)l_win[i][0] - (unsigned char)r_win_s[i][d]); - } - // valid guard; - if (col < d) b_sum = 0; - // get previous sad_cols value; - a_sum = sad_cols_d[WSIZE-1]; - // shift sad_cols[d]; - for (unsigned char j = WSIZE-1; j > 0; j--) { - sad_cols_d[j] = sad_cols_d[j-1]; - } - // fill in sad_cols with newly computed values; - sad_cols_d[0] = b_sum; - - return (-a_sum+b_sum); + short int a_sum = 0, b_sum = 0; + // compute new column sads; + for (unsigned char i = 0; i < WSIZE; i++) { + b_sum += __ABS((unsigned char)l_win[i][0] - (unsigned char)r_win_s[i][d]); + } + // valid guard; + if (col < d) b_sum = 0; + // get previous sad_cols value; + a_sum = sad_cols_d[WSIZE-1]; + // shift sad_cols[d]; + for (unsigned char j = WSIZE-1; j > 0; j--) { + sad_cols_d[j] = sad_cols_d[j-1]; + } + // fill in sad_cols with newly computed values; + sad_cols_d[0] = b_sum; + + return (-a_sum+b_sum); } @@ -256,264 +256,264 @@ int WSIZE,int NDISP,int NDISP_UNIT, int SWEEP_FACT, int ROW_TC, int COL_TC,int BUF_SIZE, int LWINWIDTH,int RWINWIDTH,int DISPWORDWIDTH,int SADWORDWIDTH> void xFSADBlockMatching( - hls::stream &left, - hls::stream &right, - hls::stream& out, - xf::xFSBMState& state, - short int height, short int width) + hls::stream &left, + hls::stream &right, + hls::stream& out, + xf::xFSBMState& state, + short int height, short int width) { - //create the left and right line buffers. - XF_TNAME(WORDWIDTH_SRC,1) left_line_buf[WSIZE][BUF_SIZE]; -#if PLATFORM_ZCU104 + //create the left and right line buffers. 
+ XF_TNAME(WORDWIDTH_SRC,1) left_line_buf[WSIZE][BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=left_line_buf core=XPM_MEMORY uram #endif #pragma HLS ARRAY_PARTITION variable=left_line_buf complete dim=1 - XF_TNAME(WORDWIDTH_SRC,1) right_line_buf[WSIZE][BUF_SIZE]; -#if PLATFORM_ZCU104 + XF_TNAME(WORDWIDTH_SRC,1) right_line_buf[WSIZE][BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=right_line_buf core=XPM_MEMORY uram #endif #pragma HLS ARRAY_PARTITION variable=right_line_buf complete dim=1 - //create the left and right window buffers. - unsigned char l_window[WSIZE][LWINWIDTH]; + //create the left and right window buffers. + unsigned char l_window[WSIZE][LWINWIDTH]; #pragma HLS ARRAY_PARTITION variable=l_window complete dim=2 #pragma HLS ARRAY_PARTITION variable=l_window complete dim=1 - unsigned char r_window[WSIZE][RWINWIDTH]; + unsigned char r_window[WSIZE][RWINWIDTH]; #pragma HLS ARRAY_PARTITION variable=r_window complete dim=2 #pragma HLS ARRAY_PARTITION variable=r_window complete dim=1 - int TMP_INT_MAX_PACK; - TMP_INT_MAX_PACK = 2147483647; + int TMP_INT_MAX_PACK; + TMP_INT_MAX_PACK = 2147483647; - short int FILTERED = 0;//((state.minDisparity - 1) << 4); - unsigned char cap = state.preFilterCap; - unsigned char l_tmp[WSIZE]; + short int FILTERED = 0;//((state.minDisparity - 1) << 4); + unsigned char cap = state.preFilterCap; + unsigned char l_tmp[WSIZE]; #pragma HLS array_partition variable=l_tmp complete dim=0 - unsigned char r_tmp[WSIZE]; + unsigned char r_tmp[WSIZE]; #pragma HLS array_partition variable=r_tmp complete dim=0 - int text_sum[WSIZE]; + int text_sum[WSIZE]; #pragma HLS ARRAY_PARTITION variable=text_sum complete dim=0 - int sad[NDISP_UNIT]; + int sad[NDISP_UNIT]; #pragma HLS array_partition variable=sad complete dim=0 - short int sad_cols[NDISP_UNIT][WSIZE]; + short int sad_cols[NDISP_UNIT][WSIZE]; #pragma HLS array_partition variable=sad_cols complete dim=0 - int minsad[COLS+WSIZE-1]; -#if PLATFORM_ZCU104 + int 
minsad[COLS+WSIZE-1]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=minsad core=XPM_MEMORY uram #endif - XF_TNAME(WORDWIDTH_DST,1) mind[BUF_SIZE]; -#if PLATFORM_ZCU104 + XF_TNAME(WORDWIDTH_DST,1) mind[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=mind core=XPM_MEMORY uram #endif - bool skip[BUF_SIZE]; -#if PLATFORM_ZCU104 + bool skip[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=skip core=XPM_MEMORY uram #endif - loop_row: - for (unsigned short row = 0; row < height+WSIZE-1; row++) { + loop_row: + for (unsigned short row = 0; row < height+WSIZE-1; row++) { #pragma HLS LOOP_TRIPCOUNT min=ROW_TC max=ROW_TC - loop_mux: - for (unsigned char sweep = 0; sweep < state.sweepFactor; sweep++) { + loop_mux: + for (unsigned char sweep = 0; sweep < state.sweepFactor; sweep++) { #pragma HLS LOOP_TRIPCOUNT min=SWEEP_FACT max=SWEEP_FACT - loop_sad_init: - for (unsigned char d = 0; d < NDISP_UNIT; d++) { + loop_sad_init: + for (unsigned char d = 0; d < NDISP_UNIT; d++) { #pragma HLS unroll - sad[d] = 0; - for (unsigned char i = 0; i < WSIZE; i++) { + sad[d] = 0; + for (unsigned char i = 0; i < WSIZE; i++) { #pragma HLS unroll - sad_cols[d][i] = 0; - } - } - loop_col: - for (unsigned short col = 0; col < width+WSIZE-1; col++) { + sad_cols[d][i] = 0; + } + } + loop_col: + for (unsigned short col = 0; col < width+WSIZE-1; col++) { #pragma HLS LOOP_TRIPCOUNT min=COL_TC max=COL_TC #pragma HLS loop_flatten #pragma HLS pipeline II=1 - unsigned char tmp_l = cap,tmp_r=cap; + unsigned char tmp_l = cap,tmp_r=cap; - if (sweep == 0) { - // load and shifting buffs - // shift down - for(unsigned char sd = WSIZE-1; sd > 0; sd--) { + if (sweep == 0) { + // load and shifting buffs + // shift down + for(unsigned char sd = WSIZE-1; sd > 0; sd--) { #pragma HLS unroll - left_line_buf[sd][col] = left_line_buf[sd-1][col]; - } + left_line_buf[sd][col] = left_line_buf[sd-1][col]; + } - for(unsigned char sd = WSIZE-1; sd > 0; sd--) { + for(unsigned char sd = WSIZE-1; sd > 
0; sd--) { #pragma HLS unroll - right_line_buf[sd][col] = right_line_buf[sd-1][col]; - } - - if (!(row < (WSIZE-1)/2 || row >= height+(WSIZE-1)/2 || col < (WSIZE-1)/2 || col >= width+(WSIZE-1)/2)) { - tmp_l = left.read(); - tmp_r = right.read(); - } - // insert bottom - left_line_buf[0][col] = tmp_l; - right_line_buf[0][col] = tmp_r; - loop_get_data_from_linebuff: - for (unsigned char i = 0; i < WSIZE; i++) { - l_tmp[i] = left_line_buf[i][col]; - r_tmp[i] = right_line_buf[i][col]; - } - } else { - unsigned short offset = sweep * NDISP_UNIT; - loop_get_data_from_linebuff_with_offset: - for (unsigned char i = 0; i < WSIZE; i++) { - l_tmp[i] = left_line_buf[i][col]; - r_tmp[i] = right_line_buf[i][col-offset < 0 ? 0 : col-offset]; - } - } - - xFUpdateTextureSum(l_window,l_tmp,row,col,state.preFilterCap,text_sum); - - xFShiftRight(l_window); - xFShiftRight(r_window); - xFInsertLeft(l_window,l_tmp); - xFInsertLeft(r_window,r_tmp); - - loop_sad_compute: - for (unsigned char d = 0; d < NDISP_UNIT; d++) { - sad[d] += (int)xFSADComputeInc(l_window, r_window, d, col, sad_cols[d]); - } - - int skip_val[BUF_SIZE]; -#if PLATFORM_ZCU104 + right_line_buf[sd][col] = right_line_buf[sd-1][col]; + } + + if (!(row < (WSIZE-1)/2 || row >= height+(WSIZE-1)/2 || col < (WSIZE-1)/2 || col >= width+(WSIZE-1)/2)) { + tmp_l = left.read(); + tmp_r = right.read(); + } + // insert bottom + left_line_buf[0][col] = tmp_l; + right_line_buf[0][col] = tmp_r; + loop_get_data_from_linebuff: + for (unsigned char i = 0; i < WSIZE; i++) { + l_tmp[i] = left_line_buf[i][col]; + r_tmp[i] = right_line_buf[i][col]; + } + } else { + unsigned short offset = sweep * NDISP_UNIT; + loop_get_data_from_linebuff_with_offset: + for (unsigned char i = 0; i < WSIZE; i++) { + l_tmp[i] = left_line_buf[i][col]; + r_tmp[i] = right_line_buf[i][col-offset < 0 ? 
0 : col-offset]; + } + } + + xFUpdateTextureSum(l_window,l_tmp,row,col,state.preFilterCap,text_sum); + + xFShiftRight(l_window); + xFShiftRight(r_window); + xFInsertLeft(l_window,l_tmp); + xFInsertLeft(r_window,r_tmp); + + loop_sad_compute: + for (unsigned char d = 0; d < NDISP_UNIT; d++) { + sad[d] += (int)xFSADComputeInc(l_window, r_window, d, col, sad_cols[d]); + } + + int skip_val[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=skip_val core=XPM_MEMORY uram #endif - int edge_neighbor[BUF_SIZE]; -#if PLATFORM_ZCU104 + int edge_neighbor[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=edge_neighbor core=XPM_MEMORY uram #endif - int edge[BUF_SIZE]; -#if PLATFORM_ZCU104 + int edge[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=edge core=XPM_MEMORY uram #endif - int minsad_p[BUF_SIZE]; -#if PLATFORM_ZCU104 + int minsad_p[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=minsad_p core=XPM_MEMORY uram #endif - int minsad_n[BUF_SIZE]; -#if PLATFORM_ZCU104 + int minsad_n[BUF_SIZE]; +#if PLATFORM_ZCU104 #pragma HLS RESOURCE variable=minsad_n core=XPM_MEMORY uram #endif - // SAD computing and store output - if (row >= WSIZE-1 && col >= WSIZE-1) { - int skip_flag = 0; - if (text_sum[0] < state.textureThreshold) skip_flag = 1; // texture threshold check - if ((row - WSIZE+1) < (WSIZE-1)/2 || (row - WSIZE+1) >= height - (WSIZE-1)/2) skip_flag = 1; // border skip horizontal - if ((col - WSIZE+1) < NDISP-1 + (WSIZE-1)/2 || (col - WSIZE+1) >= width - (WSIZE-1)/2) skip_flag = 1; // border skip vertical - - int gminsad = TMP_INT_MAX_PACK; - XF_TNAME(WORDWIDTH_DST,1) gmind = 0; - bool gskip = 0; - int gskip_val = TMP_INT_MAX_PACK; - int gedge_neighbor = TMP_INT_MAX_PACK; // for uniqueness check - int gedge=0; // for subpixel interpolation - if (NDISP_UNIT != 1) - gedge = sad[1]; - - int lminsad = TMP_INT_MAX_PACK; - XF_TNAME(WORDWIDTH_DST,1) lmind = 0; - int gminsad_p = TMP_INT_MAX_PACK; - int gminsad_n = TMP_INT_MAX_PACK; - - 
if (sweep > 0) { - gminsad = minsad[col]; - gmind = mind[col]; - gskip = skip[col]; - gskip_val = skip_val[col]; - gedge_neighbor = edge_neighbor[col]; - if (sweep == 1 && NDISP_UNIT == 1) - gedge_neighbor = TMP_INT_MAX_PACK; - gedge = edge[col]; - gminsad_p = minsad_p[col]; - gminsad_n = (gmind == sweep*NDISP_UNIT-1 ? sad[0] : minsad_n[col]); - } - - xFMinSAD::find(sad, lmind, lminsad); - - if (lminsad <= gminsad) { - gskip = 0; - if (state.uniquenessRatio > 0) { - int thresh = lminsad + (lminsad * state.uniquenessRatio / 100); - if (gminsad <= thresh && lmind+sweep*NDISP_UNIT > gmind+1) { - gskip = 1; - gskip_val = gminsad; - } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gskip_val <= thresh) { - gskip = 1; - // gskip_val unchanged; - } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gedge_neighbor <= thresh) { - gskip = 1; - gskip_val = gedge_neighbor; - } - loop_unique_search_0: - for (unsigned char d = 0; d < NDISP_UNIT; d++) { - if (sad[d] <= thresh && sad[d] < gskip_val && (d < lmind-1 || d > lmind+1)) { - gskip = 1; - gskip_val = sad[d]; - } - } - } - // update global values; - gminsad_p = (lmind == 0 ? gedge : sad[lmind-1]); - if (NDISP_UNIT == 1) - gminsad_n = sad[lmind == NDISP_UNIT-1 ? 0 : (int)(lmind+1)]; - else - gminsad_n = sad[lmind == NDISP_UNIT-1 ? lmind-1 : lmind+1]; - gminsad = lminsad; - gmind = lmind + sweep*NDISP_UNIT; - } else { - if (state.uniquenessRatio > 0) { - int thresh = gminsad + (gminsad * state.uniquenessRatio / 100); - loop_unique_search_1: - for (unsigned char d = 0; d < NDISP_UNIT; d++) { - if (sad[d] <= thresh && sad[d] < gskip_val && ((gmind == (sweep*NDISP_UNIT-1)) ? 
((sweep*NDISP_UNIT+d) > (gmind+1)) : 1)) { - gskip = 1; - gskip_val = sad[d]; - } - } - } - } - minsad[col] = gminsad; - mind[col] = gmind; - skip[col] = gskip; - skip_val[col] = gskip_val; - if (NDISP_UNIT == 1) - edge_neighbor[col] = edge[col]; - else - edge_neighbor[col] = sad[NDISP_UNIT-2]; - edge[col] = sad[NDISP_UNIT-1]; - minsad_p[col] = gminsad_p; - minsad_n[col] = gminsad_n; - - if (sweep == state.sweepFactor-1) { - ap_int::Value> p = gmind==0?gminsad_n:gminsad_p; - ap_int::Value> n = gmind==NDISP-1?gminsad_p:gminsad_n; - ap_int::Value> k = p + n - 2*gminsad + __ABS((int)p - (int)n); - - ap_int::Value+8> num = p - n; - num = num << 8; - ap_int<10> delta = 0; - if (k != 0) delta = num/k; - XF_TNAME(WORDWIDTH_DST,1) out_disp = ((gmind*256 + delta + 15) >> 4); - - skip_flag |= gskip; - if (skip_flag) out_disp = FILTERED; - out.write(out_disp); - } - } - } - } - } + // SAD computing and store output + if (row >= WSIZE-1 && col >= WSIZE-1) { + int skip_flag = 0; + if (text_sum[0] < state.textureThreshold) skip_flag = 1; // texture threshold check + if ((row - WSIZE+1) < (WSIZE-1)/2 || (row - WSIZE+1) >= height - (WSIZE-1)/2) skip_flag = 1; // border skip horizontal + if ((col - WSIZE+1) < NDISP-1 + (WSIZE-1)/2 || (col - WSIZE+1) >= width - (WSIZE-1)/2) skip_flag = 1; // border skip vertical + + int gminsad = TMP_INT_MAX_PACK; + XF_TNAME(WORDWIDTH_DST,1) gmind = 0; + bool gskip = 0; + int gskip_val = TMP_INT_MAX_PACK; + int gedge_neighbor = TMP_INT_MAX_PACK; // for uniqueness check + int gedge=0; // for subpixel interpolation + if (NDISP_UNIT != 1) + gedge = sad[1]; + + int lminsad = TMP_INT_MAX_PACK; + XF_TNAME(WORDWIDTH_DST,1) lmind = 0; + int gminsad_p = TMP_INT_MAX_PACK; + int gminsad_n = TMP_INT_MAX_PACK; + + if (sweep > 0) { + gminsad = minsad[col]; + gmind = mind[col]; + gskip = skip[col]; + gskip_val = skip_val[col]; + gedge_neighbor = edge_neighbor[col]; + if (sweep == 1 && NDISP_UNIT == 1) + gedge_neighbor = TMP_INT_MAX_PACK; + gedge = edge[col]; + 
gminsad_p = minsad_p[col]; + gminsad_n = (gmind == sweep*NDISP_UNIT-1 ? sad[0] : minsad_n[col]); + } + + xFMinSAD::find(sad, lmind, lminsad); + + if (lminsad <= gminsad) { + gskip = 0; + if (state.uniquenessRatio > 0) { + int thresh = lminsad + (lminsad * state.uniquenessRatio / 100); + if (gminsad <= thresh && lmind+sweep*NDISP_UNIT > gmind+1) { + gskip = 1; + gskip_val = gminsad; + } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gskip_val <= thresh) { + gskip = 1; + // gskip_val unchanged; + } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gedge_neighbor <= thresh) { + gskip = 1; + gskip_val = gedge_neighbor; + } + loop_unique_search_0: + for (unsigned char d = 0; d < NDISP_UNIT; d++) { + if (sad[d] <= thresh && sad[d] < gskip_val && (d < lmind-1 || d > lmind+1)) { + gskip = 1; + gskip_val = sad[d]; + } + } + } + // update global values; + gminsad_p = (lmind == 0 ? gedge : sad[lmind-1]); + if (NDISP_UNIT == 1) + gminsad_n = sad[lmind == NDISP_UNIT-1 ? 0 : (int)(lmind+1)]; + else + gminsad_n = sad[lmind == NDISP_UNIT-1 ? lmind-1 : lmind+1]; + gminsad = lminsad; + gmind = lmind + sweep*NDISP_UNIT; + } else { + if (state.uniquenessRatio > 0) { + int thresh = gminsad + (gminsad * state.uniquenessRatio / 100); + loop_unique_search_1: + for (unsigned char d = 0; d < NDISP_UNIT; d++) { + if (sad[d] <= thresh && sad[d] < gskip_val && ((gmind == (sweep*NDISP_UNIT-1)) ? 
((sweep*NDISP_UNIT+d) > (gmind+1)) : 1)) { + gskip = 1; + gskip_val = sad[d]; + } + } + } + } + minsad[col] = gminsad; + mind[col] = gmind; + skip[col] = gskip; + skip_val[col] = gskip_val; + if (NDISP_UNIT == 1) + edge_neighbor[col] = edge[col]; + else + edge_neighbor[col] = sad[NDISP_UNIT-2]; + edge[col] = sad[NDISP_UNIT-1]; + minsad_p[col] = gminsad_p; + minsad_n[col] = gminsad_n; + + if (sweep == state.sweepFactor-1) { + ap_int::Value> p = gmind==0?gminsad_n:gminsad_p; + ap_int::Value> n = gmind==NDISP-1?gminsad_p:gminsad_n; + ap_int::Value> k = p + n - 2*gminsad + __ABS((int)p - (int)n); + + ap_int::Value+8> num = p - n; + num = num << 8; + ap_int<10> delta = 0; + if (k != 0) delta = num/k; + XF_TNAME(WORDWIDTH_DST,1) out_disp = ((gmind*256 + delta + 15) >> 4); + + skip_flag |= gskip; + if (skip_flag) out_disp = FILTERED; + out.write(out_disp); + } + } + } + } + } } @@ -522,65 +522,65 @@ template void xFImageClipUtility(int i, int j, int k, int height, int width, int *pix) { #pragma HLS INLINE OFF - if (i<1 || i > height-2 || (j*(1< width-2) - *pix = 0; + if (i<1 || i > height-2 || (j*(1< width-2) + *pix = 0; } /* Clips the Output from the Sobel function based on the Cap value input */ template void xFImageClip( - hls::stream& src, - hls::stream& dst, - int cap, short int height, short int width) + hls::stream& src, + hls::stream& dst, + int cap, short int height, short int width) { - loop_row_clip: - for (short i = 0; i < height; i++) - { + loop_row_clip: + for (short i = 0; i < height; i++) + { #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS #pragma HLS LOOP_FLATTEN off - loop_col_clip: - for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++) - { + loop_col_clip: + for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++) + { #pragma HLS PIPELINE II=1 #pragma HLS LOOP_TRIPCOUNT min=COLS_TC max=COLS_TC - XF_TNAME(SRC_T,1) tmp = src.read(); - XF_TNAME(DST_T,1) tmp_out; - for (int k = 0; k < (1<(i,j,k,height,width,&pix); - - XF_PTNAME(DEPTH_DST) p = 
(XF_PTNAME(DEPTH_DST))(pix < -cap ? 0 : pix > cap ? cap*2 : pix + cap); - tmp_out.range((k+1)*XF_PIXELDEPTH(DEPTH_DST)-1,k*XF_PIXELDEPTH(DEPTH_DST)) = (XF_PTNAME(DEPTH_DST))p; - } - dst.write(tmp_out); - } - } + int pix = (XF_PTNAME(DEPTH_SRC))tmp.range((k+1)*XF_PIXELDEPTH(DEPTH_SRC)-1,k*XF_PIXELDEPTH(DEPTH_SRC)); + xFImageClipUtility(i,j,k,height,width,&pix); + + XF_PTNAME(DEPTH_DST) p = (XF_PTNAME(DEPTH_DST))(pix < -cap ? 0 : pix > cap ? cap*2 : pix + cap); + tmp_out.range((k+1)*XF_PIXELDEPTH(DEPTH_DST)-1,k*XF_PIXELDEPTH(DEPTH_DST)) = (XF_PTNAME(DEPTH_DST))p; + } + dst.write(tmp_out); + } + } } /* For reading the Gradient-Y stream, rather than letting the stream dangling */ template void xFReadOutStream( - hls::stream& src, - short int height,short int width) + hls::stream& src, + short int height,short int width) { - loop_row_clip: - for (short i = 0; i < height; i++) - { + loop_row_clip: + for (short i = 0; i < height; i++) + { #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS #pragma HLS LOOP_FLATTEN off - loop_col_clip: - for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++) - { + loop_col_clip: + for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++) + { #pragma HLS PIPELINE II=1 #pragma HLS LOOP_TRIPCOUNT min=COLS_TC max=COLS_TC - XF_TNAME(SRC_T,1) tmp = src.read(); - } - } + XF_TNAME(SRC_T,1) tmp = src.read(); + } + } } @@ -589,138 +589,138 @@ template void xFStereoPreProcess(hls::stream &in_strm, hls::stream& clipped_strm, int preFilterType,int preFilterCap, short int height, short int width) { #pragma HLS INLINE - hls::stream in_sobel_x("in_sobel_x"); - hls::stream in_sobel_y("in_sobel_y"); + hls::stream in_sobel_x("in_sobel_x"); + hls::stream in_sobel_y("in_sobel_y"); - xFSobelFilter(in_strm ,in_sobel_x ,in_sobel_y ,3,XF_BORDER_CONSTANT,height,width); - xFImageClip(in_sobel_x,clipped_strm,preFilterCap,height,width); - xFReadOutStream(in_sobel_y,height,width); + xFSobelFilter(in_strm ,in_sobel_x ,in_sobel_y ,3,XF_BORDER_CONSTANT,height,width); + 
xFImageClip(in_sobel_x,clipped_strm,preFilterCap,height,width); + xFReadOutStream(in_sobel_y,height,width); } /* This function performs preprocessing and disparity computation for NO mode */ template void xFFindStereoCorrespondenceLBMNO_pipeline (hls::stream &_left_strm, - hls::stream &_right_strm, - XF_TNAME(DST_T,NPC) *disp_ptr , - xf::xFSBMState &sbmstate, - short int height, short int width) + hls::stream &_right_strm, + XF_TNAME(DST_T,NPC) *disp_ptr , + xf::xFSBMState &sbmstate, + short int height, short int width) { #pragma HLS INLINE - hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped"); - hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped"); + hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped"); + hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped"); - hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream"); + hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream"); #pragma HLS DATAFLOW - int TC=(ROWS*COLS); + int TC=(ROWS*COLS); - /* Sobel and Clipping */ - xFStereoPreProcess(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); - xFStereoPreProcess(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); + /* Sobel and Clipping */ + xFStereoPreProcess(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); + xFStereoPreProcess(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); - /* SAD and disparity computation */ - xFSADBlockMatching(left_clipped,right_clipped,_disp_strm,sbmstate,height,width); + /* SAD and disparity computation */ + xFSADBlockMatching(left_clipped,right_clipped,_disp_strm,sbmstate,height,width); - for (int i = 0; i < height*width; i++) - { + for (int i = 0; i < height*width; i++) + { #pragma HLS pipeline ii=1 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC - *(disp_ptr + i) = _disp_strm.read(); - } + *(disp_ptr + i) = _disp_strm.read(); + } } /* This 
function performs preprocessing and disparity computation for NO mode */ template void xFFindStereoCorrespondenceLBMNO (XF_TNAME(SRC_T,NPC) *left_ptr, - XF_TNAME(SRC_T,NPC) *right_ptr, - XF_TNAME(DST_T,NPC) *disp_ptr , - xf::xFSBMState &sbmstate, - short int height, short int width) + XF_TNAME(SRC_T,NPC) *right_ptr, + XF_TNAME(DST_T,NPC) *disp_ptr , + xf::xFSBMState &sbmstate, + short int height, short int width) { - hls::stream< XF_TNAME(SRC_T,NPC) > _left_strm; - hls::stream< XF_TNAME(SRC_T,NPC) > _right_strm; + hls::stream< XF_TNAME(SRC_T,NPC) > _left_strm; + hls::stream< XF_TNAME(SRC_T,NPC) > _right_strm; - hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped"); - hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped"); + hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped"); + hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped"); - hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream"); + hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream"); #pragma HLS DATAFLOW - int TC=(ROWS*COLS); - for (int i = 0; i < height*width; i++) - { + int TC=(ROWS*COLS); + for (int i = 0; i < height*width; i++) + { #pragma HLS pipeline ii=1 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC - _left_strm.write(*(left_ptr + i)); - _right_strm.write(*(right_ptr + i)); - } + _left_strm.write(*(left_ptr + i)); + _right_strm.write(*(right_ptr + i)); + } - /* Sobel and Clipping */ - xFStereoPreProcess(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); - xFStereoPreProcess(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); + /* Sobel and Clipping */ + xFStereoPreProcess(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); + xFStereoPreProcess(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width); - /* SAD and disparity computation */ - 
xFSADBlockMatching(left_clipped,right_clipped,_disp_strm,sbmstate,height,width); + /* SAD and disparity computation */ + xFSADBlockMatching(left_clipped,right_clipped,_disp_strm,sbmstate,height,width); - for (int i = 0; i < height*width; i++) - { + for (int i = 0; i < height*width; i++) + { #pragma HLS pipeline ii=1 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC - *(disp_ptr + i) = _disp_strm.read(); - } + *(disp_ptr + i) = _disp_strm.read(); + } } /* Calls the functions based on the PIXEL PARALLELISM configuration */ template void xFFindStereoCorrespondenceLBM_pipeline(hls::stream &_left_strm, - hls::stream &_right_strm, - XF_TNAME(DST_T,NPC) *out_ptr, - xf::xFSBMState &sbmstate, - short int height,short int width) + hls::stream &_right_strm, + XF_TNAME(DST_T,NPC) *out_ptr, + xf::xFSBMState &sbmstate, + short int height,short int width) { #pragma HLS INLINE - assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW "); - assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW "); - assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 "); - assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5' "); - assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width "); - assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT"); - assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number "); - assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative"); - assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63"); - assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE); - - xFFindStereoCorrespondenceLBMNO_pipeline(_left_strm,_right_strm,out_ptr,sbmstate,height,width); + assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW "); + assert((DST_T == XF_16UW) && " 
WORDWIDTH_DST must be XF_16UW "); + assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 "); + assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5' "); + assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width "); + assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT"); + assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number "); + assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative"); + assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63"); + assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE); + + xFFindStereoCorrespondenceLBMNO_pipeline(_left_strm,_right_strm,out_ptr,sbmstate,height,width); } /* Calls the functions based on the PIXEL PARALLELISM configuration */ template void xFFindStereoCorrespondenceLBM(XF_TNAME(SRC_T,NPC) *left_ptr, - XF_TNAME(SRC_T,NPC) *right_ptr, - XF_TNAME(DST_T,NPC) *out_ptr, - xf::xFSBMState &sbmstate, - short int height,short int width) + XF_TNAME(SRC_T,NPC) *right_ptr, + XF_TNAME(DST_T,NPC) *out_ptr, + xf::xFSBMState &sbmstate, + short int height,short int width) { - assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW "); - assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW "); - assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 "); - assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5' "); - assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width "); - assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT"); - assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a 
non-fractional number "); - assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative"); - assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63"); - assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE); - - xFFindStereoCorrespondenceLBMNO(left_ptr,right_ptr,out_ptr,sbmstate,height,width); + assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW "); + assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW "); + assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 "); + assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5' "); + assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width "); + assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT"); + assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number "); + assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative"); + assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63"); + assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE); + + xFFindStereoCorrespondenceLBMNO(left_ptr,right_ptr,out_ptr,sbmstate,height,width); } @@ -731,16 +731,17 @@ void xFFindStereoCorrespondenceLBM(XF_TNAME(SRC_T,NPC) *left_ptr, #pragma SDS data copy("_left_mat.data"[0:"_left_mat.size"]) #pragma SDS data copy("_right_mat.data"[0:"_right_mat.size"]) #pragma SDS data copy("_disp_mat.data"[0:"_disp_mat.size"]) -template + +template void StereoBM(xf::Mat &_left_mat, - xf::Mat &_right_mat, - xf::Mat &_disp_mat, - xf::xFSBMState &sbmstate) + xf::Mat &_right_mat, + xf::Mat &_disp_mat, + xf::xFSBMState &sbmstate) { #pragma HLS INLINE OFF - xFFindStereoCorrespondenceLBM(_left_mat.data,_right_mat.data,_disp_mat.data,sbmstate, - 
_left_mat.rows,_left_mat.cols); + xFFindStereoCorrespondenceLBM(_left_mat.data,_right_mat.data,_disp_mat.data,sbmstate, + _left_mat.rows,_left_mat.cols); } } diff --git a/include/imgproc/xf_stereo_pipeline.hpp b/include/imgproc/xf_stereo_pipeline.hpp index caa5c3c..880f704 100644 --- a/include/imgproc/xf_stereo_pipeline.hpp +++ b/include/imgproc/xf_stereo_pipeline.hpp @@ -123,9 +123,20 @@ void xFInitUndistortRectifyMapInverseKernel ( #pragma HLS ARRAY_PARTITION variable=distCoeffsHLS complete dim=0 #pragma HLS ARRAY_PARTITION variable=iRnewCameraMatrixHLS complete dim=0 - memcpy(cameraMatrixHLS,cameraMatrix,4*CM_SIZE); - memcpy(distCoeffsHLS,distCoeffs,4*N); - memcpy(iRnewCameraMatrixHLS,ir,4*CM_SIZE); +//#NO memcpy(cameraMatrixHLS,cameraMatrix,4*CM_SIZE); +//#NO memcpy(distCoeffsHLS,distCoeffs,4*N); +//#NO memcpy(iRnewCameraMatrixHLS,ir,4*CM_SIZE); + + for(int r = 0; r < CM_SIZE; r++) + { + cameraMatrixHLS[r] = cameraMatrix[r]; + iRnewCameraMatrixHLS[r] = ir[r]; + } + + for(int n = 0; n < N; n++) + { + distCoeffsHLS[n] = distCoeffs[n]; + } MAP_T mx; MAP_T my; diff --git a/include/imgproc/xf_warp_transform.hpp b/include/imgproc/xf_warp_transform.hpp index 2945a08..36df4ad 100644 --- a/include/imgproc/xf_warp_transform.hpp +++ b/include/imgproc/xf_warp_transform.hpp @@ -264,7 +264,126 @@ XF_TNAME(DEPTH,NPC) retrieve_EvOd_image4x1(int i,int j,int A, int B, int C, int return XF_TNAME(DEPTH,NPC)((op_val+(1<<(INTER_REMAP_COEF_BITS-1)))>>INTER_REMAP_COEF_BITS); }; -template + +template +void store_in_UramNN(XF_TNAME(DEPTH,NPC) in_pixel, ap_uint<16> i,ap_uint<16> j, ap_uint<64> bufUram[STORE_LINES][(COLS+7)/8]) +{ +#pragma HLS INLINE + + static XF_TNAME(DEPTH,NPC) sx8[8]; +#pragma HLS ARRAY_PARTITION variable=sx8 complete dim=1 + sx8[j%8] = in_pixel; + for (int k=0; k<8; k++) bufUram[i][j/8](k*8+7,k*8) = sx8[k]; +}; + +template +void store_in_UramBL(hls::stream< XF_TNAME(DEPTH,NPC)>& input_image, ap_uint<16> i,ap_uint<16> j, ap_uint<72> 
bufUram[(STORE_LINES+1)/2][(COLS+1)/2], short img_cols) +{ +#pragma HLS INLINE + + ap_int<16> i_hlf_mns1 = i/2-1; + i_hlf_mns1 = i_hlf_mns1 + (i_hlf_mns1 < 0 ? (STORE_LINES+1)/2 : 0); + + static XF_TNAME(DEPTH,NPC) lineBuf[COLS]; //addtitional cashing as VHLS doesn't support URAM Byte Enables + static XF_TNAME(DEPTH,NPC) s3x3[2][9]; //URAM-wide word is doubled to resolve pipelining read/write dependency +#pragma HLS ARRAY_PARTITION variable=s3x3 complete dim=0 +#pragma HLS dependence variable=s3x3 inter false RAW + static XF_TNAME(DEPTH,NPC) s3x3_2[9]; + static XF_TNAME(DEPTH,NPC) s0,s3; + + static XF_TNAME(DEPTH,NPC) in_pixel; + if (j1) for (int k=0; k<9; k++) bufUram[i_hlf_mns1][j/2-2](k*8+7,k*8) = s3x3[!!(j&2)][k]; + } else if (j0) for (int k=0; k<9; k++) bufUram[i/2][j/2-1](k*8+7,k*8) = s3x3_2[k]; + } else { // odd col + s3x3_2[0] = s0; + s3x3_2[1] = lineBuf[j]; + s3x3_2[3] = s3; + s3x3_2[4] = in_pixel; + + // this clearing is needed only for case of bottom zero padding (curently is not used at all) + s3x3_2[6] = 0; + s3x3_2[7] = 0; + s3x3_2[8] = 0; + //if (j==(img_cols-1)) { //these clearing and save is needed only at last column but may done every cycle + s3x3_2[2] = 0; + s3x3_2[5] = 0; + for (int k=0; k<9; k++) bufUram[i/2][j/2](k*8+7,k*8) = s3x3_2[k]; + //} + } + } +}; + +template +XF_TNAME(DEPTH,NPC) retrieve_UramNN(int i,int j, ap_uint<64> bufUram[STORE_LINES][(COLS+7)/8]) +{ +#pragma HLS INLINE + + i = i > (STORE_LINES - 1)? (i - STORE_LINES) : ((i < 0)? (i + STORE_LINES) : i); + XF_TNAME(DEPTH,NPC) dx8[8]; +#pragma HLS ARRAY_PARTITION variable=dx8 complete dim=1 + for (int k=0; k<8; k++) dx8[k] = bufUram[i][j/8](k*8+7,k*8); + return dx8[j%8]; +}; + +template +XF_TNAME(DEPTH,NPC) retrieve_UramBL(int i,int j,int A, int B, int C, int D, ap_uint<72> bufUram[(STORE_LINES+1)/2][(COLS+1)/2]) +{ +#pragma HLS INLINE + + i = (i > (STORE_LINES - 1))? (i - STORE_LINES) : ((i < 0)? 
(i + STORE_LINES) : i); + + XF_TNAME(DEPTH,NPC) d3x3[9]; +#pragma HLS ARRAY_PARTITION variable=d3x3 complete dim=1 + for (int k=0; k<9; k++) d3x3[k] = bufUram[i/2][j/2](k*8+7,k*8); + XF_TNAME(DEPTH,NPC) const px00 = d3x3[(i%2 )*3 + j%2 ]; + XF_TNAME(DEPTH,NPC) const px01 = d3x3[(i%2 )*3 + j%2+1]; + XF_TNAME(DEPTH,NPC) const px10 = d3x3[(i%2+1)*3 + j%2 ]; + XF_TNAME(DEPTH,NPC) const px11 = d3x3[(i%2+1)*3 + j%2+1]; + + int const op_val = (A*px00) + + (B*px01) + + (C*px10) + + (D*px11); + //returning the computed interpolated output after rounding off the op_val by adding 0.5 + //and shifting to right by INTER_REMAP_COEF_BITS + return XF_TNAME(DEPTH,NPC)((op_val+(1<<(INTER_REMAP_COEF_BITS-1)))>>INTER_REMAP_COEF_BITS); +}; + +//AK(ZoTech): rounding function to substitute one from math.h, consuming 2 BRAMs per call; not used as it is not bitexact with the math.h. +// template +// int round(T x) +// { +// #pragma HLS INLINE +// return (x + (x>=T(0) ? T(0.5) : T(-0.5))); +// }; + +//AK(ZoTech): floor function to substitute one from math.h, consuming 2 BRAMs per call; not used as it is not synthesisable if biexact. +// template +// int floor(T x) +// { +// #pragma HLS INLINE +// return (x - (x>=T(0) ? 
T(0) : T(1)-std::numeric_limits::epsilon() )); +// }; + +template int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::stream< XF_TNAME(DEPTH,NPC) > &output_image, float P_matrix[9], short img_rows, short img_cols) { #pragma HLS INLINE @@ -298,6 +417,14 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls:: #pragma HLS DEPENDENCE variable=store1_pt_2OdR_EvC intra false #pragma HLS DEPENDENCE variable=store1_pt_2OdR_OdC intra false + //URAM based storages + ap_uint<64> bufUramNN[STORE_LINES][(COLS+7)/8]; +#pragma HLS RESOURCE variable=bufUramNN core=XPM_MEMORY uram +#pragma HLS dependence variable=bufUramNN inter false + //URAM storage garnularity for BL inerpolation is 3x3-pel block in 2x2-pel picture grid, it fits to one URAM word + ap_uint<72> bufUramBL[(STORE_LINES+1)/2][(COLS+1)/2]; +#pragma HLS RESOURCE variable=bufUramBL core=XPM_MEMORY uram +#pragma HLS dependence variable=bufUramBL inter false //varables for loop counters ap_uint<16> i=0,j=0,k=0,l=0,m=0,n=0,p=0; @@ -342,7 +469,7 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls:: MAIN_ROWS:for (i=0;i<(img_rows + START_ROW);i++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS - MAIN_COLS:for(j=0;j<(img_cols);j++) + MAIN_COLS:for(j=0;j<(img_cols+3);j++) { #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE @@ -362,12 +489,16 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls:: //function to store the input image stream to //a buffer of size STORE_LINES rows //computing i-l to snap the writes to STORE_LINES size buffer + if (USE_URAM) + if (INTERPOLATION_TYPE) store_in_UramBL(input_image ,i-l,j, bufUramBL, img_cols); + else {if (j(input_image.read() ,i-l,j, bufUramNN);} + else if (j( input_image.read() ,i-l,j, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC); } //condition to compute and stream out the output image //after START_ROW number of rows - 
if(i>=START_ROW) + if(i>=START_ROW && j &input_image, hls:: I1 = I - m; if(INTERPOLATION_TYPE==0) { + if (USE_URAM) + op_val = retrieve_UramNN (I1,J, bufUramNN); + else op_val = retrieve_EvOd_image1(I1,J, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC); } else { //calling the read function with interpolation + if (USE_URAM) + op_val = retrieve_UramBL (I1,J,A,B,C,D, bufUramBL); + else op_val = retrieve_EvOd_image4x1(I1,J,A,B,C,D, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC); } } @@ -497,7 +634,7 @@ return 0; #pragma SDS data access_pattern("_src_mat.data":SEQUENTIAL) #pragma SDS data access_pattern("_dst_mat.data":SEQUENTIAL) #pragma SDS data mem_attribute ("_src_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS, "_dst_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS) -template +template void warpTransform(xf::Mat & _src_mat, xf::Mat & _dst_mat, float P_matrix[9]) { #pragma HLS INLINE OFF @@ -516,7 +653,7 @@ hls::stream< XF_TNAME(TYPE,NPC) > out_stream; } } -xFwarpTransformKernel(in_stream, out_stream, P_matrix, _src_mat.rows, _src_mat.cols); +xFwarpTransformKernel(in_stream, out_stream, P_matrix, _src_mat.rows, _src_mat.cols); for(int i=0; i<_dst_mat.rows;i++) {