diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..11f61b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/**/ide/**/*.*sdf
+/**/ide/**/*.tss
+/**/ide/**/*.suo
+/**/ide/**/Debug
+/**/ide/**/Release
diff --git a/aws_demo/Code_Structure.png b/aws_demo/Code_Structure.png
new file mode 100644
index 0000000..e83b55e
Binary files /dev/null and b/aws_demo/Code_Structure.png differ
diff --git a/aws_demo/README.md b/aws_demo/README.md
new file mode 100644
index 0000000..2503114
--- /dev/null
+++ b/aws_demo/README.md
@@ -0,0 +1,130 @@
+AWS Demo 
+======================
+The AWS Demo is a set of examples that demonstrate how to use the xfOpenCV library in kernels built for the Amazon F1 instance. Each example can be built to run on FPGA (only an F1 instance with the "*FPGA Developer AMI*" can be used) or emulated for debugging purposes in HW or SW emulation mode (any instance with the "*FPGA Developer AMI*" can be used). 
+
+## EXAMPLES FILE HIERARCHY
+Each example is organized into the following folders 
+
+
+|  Folder Name | Contents |
+| :----- | :------ 
+| &lt;example&nbsp;name&gt; | **Root folder of example.** Folder contains input image(s), headers with kernel configuration and declaration, makefile, source code of host application, kernel wrapper and kernel.|
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;hw |**Folder for FPGA flow.**  |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;afi |**Folder for AWS FPGA binary file generation.** After successful build folder will contain the kernel container  binary (`<kernel name>.xclbin`) to generate AWS FPGA binary file for Amazon F1 instance and register AFI. During generation all intermediate files will be stored there. |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;run |**Run folder of the example.** After successful build folder will contain host application executable. All result, intermediate and reference images generated by kernel and application will be stored there. The kernel container (`<kernel name>.awsxclbin`) must be copied there after AFI generation. |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;hw_emu |**Folder for HW emulation flow.**  |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;run |**Emulation folder of the example.** After successful build folder will contain host application executable and kernel container (`<kernel name>.xclbin`) for HW emulation. Emulation logs and data, result, intermediate and reference images generated by kernel emulation and application will be stored there.  |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;sw_emu |**Folder for SW emulation flow.**  |
+| &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;run |**Emulation folder of the example.** After successful build folder will contain host application executable and kernel container (`<kernel name>.xclbin`) for SW emulation. Emulation logs and data, result, intermediate and reference images generated by kernel emulation and application will be stored there.  |
+
+
+## HOW TO BUILD EXAMPLE
+Place xfOpenCV library (`xfopencv` folder) together with Amazon's FPGA framework
+```
+project_data
+ ├─ aws-fpga
+ └─ xfopencv
+```
+
+If you would like to use a different folder structure, you need to tune [`aws_demo/common_makefile`](common_makefile). For more information please see [make_description.md](make_description.md) 
+
+### Prepare environment
+Run following code to prepare environment for build.
+```
+cd $AWS_FPGA_REPO_DIR 
+source sdaccel_setup.sh 
+source $XILINX_SDX/settings64.sh
+```
+
+### Build example for SW/HW emulation
+
+1. Go to root folder of example.
+2. Build whole example (`all`), kernel part only (`krnl`) or host application only (`host`) for HW (`hw_emu`) or SW (`sw_emu`) emulation with the following command:
+```
+make TARGET=hw_emu|sw_emu all|host|krnl
+```
+
+To erase all build data including host application executable and kernel binary files use following command:
+```
+make TARGET=hw_emu|sw_emu clean
+```
+
+### Build example for FPGA
+
+To build examples for FPGA F1 instance you will need access to [**AWS CLI**](https://aws.amazon.com/cli/) and [**S3**](https://aws.amazon.com/s3/). Please refer to **_[What Is the AWS Command Line Interface?](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html)_** and **_[Getting Started with Amazon S3](https://aws.amazon.com/s3/getting-started/)_**.     
+1. Go to root folder of example.
+2. Build whole example (`all`), kernel part only (`krnl`) or host application only (`host`) for FPGA (`hw`) with the following command:
+```
+make TARGET=hw all|host|krnl
+```
+3. After kernel build complete go to ***afi*** folder to generate AWS FPGA binary file for Amazon F1 instance and register AFI
+4. Setup **[AWS CLI](https://aws.amazon.com/cli/)**  (see **_[What Is the AWS Command Line Interface?](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html)_**)
+5. Run script placed in ***afi*** folder to generate AWS FPGA binary file for Amazon F1 instance and register AFI 
+```
+source ./gen_afi.sh
+```
+The script will create an S3 bucket for the FPGA image and launch image generation in a background process. When the script finishes, the FPGA image will not be ready yet.
+6. Wait until FPGA image will be generated. To check generation completion periodically run following command:
+```
+aws ec2 describe-fpga-images --fpga-image-id <afi id>
+```
+You can get `<afi id>` from script message or from file **`*_afi_id.txt`**. During generation you will see following message:
+```
+...
+     "State": {
+                "Code": "pending"
+              },
+...
+```   
+The FPGA image is ready if command print `available`:
+```
+...
+     "State": {
+                "Code": "available"
+              },
+...
+```   
+
+7. Copy `<kernel name>.awsxclbin` into ***hw/run*** folder  
+
+To erase all build data including host application executable but except content of ***afi*** folder use following command:
+```
+make TARGET=hw clean
+```
+
+## HOW TO RUN EXAMPLE
+
+### Prepare environment
+If you relaunch Amazon instance after build you need to repeat environment preparation step:
+```
+cd $AWS_FPGA_REPO_DIR 
+source sdaccel_setup.sh 
+source $XILINX_SDX/settings64.sh
+```
+### Run SW/HW emulation of example 
+
+1. Go to emulation folder of example (**`hw_emu/run`** or **`sw_emu/run`**).
+2. Set desired emulation option in `sdaccel.ini` file
+3. Launch emulation with the following command:
+```
+source run.sh 
+```
+
+### Run example on FPGA 
+
+1. Go to run folder of example (**`hw/run`**).
+2. Launch shell
+```
+sudo sh
+```
+3. Launch application with the following command:
+```
+source run.sh 
+```
+
+## REVISION HISTORY
+
+Date      | Readme Version | Release Notes
+--------  |----------------|-------------------------
+May 2018  | 1.0            | Initial version.
+ 
diff --git a/aws_demo/common_makefile b/aws_demo/common_makefile
new file mode 100644
index 0000000..b996b1e
--- /dev/null
+++ b/aws_demo/common_makefile
@@ -0,0 +1,196 @@
+########################################
+#                                      #
+#             Tools section            #
+#                                      #
+########################################
+
+XILINX_SDX ?= /opt/Xilinx/SDx/2017.1.op
+XILINX_HLS ?= $(XILINX_SDX)/Vivado_HLS
+
+
+SDX_CXX ?= $(XILINX_SDX)/bin/xcpp
+XOCC ?= $(XILINX_SDX)/bin/xocc
+
+RM = rm -f
+RMDIR = rm -rf
+
+ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_1DDR)"
+  XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.1ddr/runtime/lib/x86_64
+else ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_4DDR)"
+  XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.4ddr/runtime/lib/x86_64
+else ifeq "$(AWS_PLATFORM)" "$(AWS_PLATFORM_4DDR_DEBUG)"
+  XILINX_SDX_RUNTIME=/opt/Xilinx/2017.1.rte.4ddr_debug/runtime/lib/x86_64
+endif
+
+XFOPENCV ?= /home/centos/src/project_data/xfopencv
+
+TARGET ?= hw_emu
+
+########################################
+#                                      #
+#             Host section             #
+#                                      #
+########################################
+
+HOST_SDx_SRC ?=  xcl2
+
+SDx_LIB_DIR ?= $(SDACCEL_DIR)/examples/xilinx/libs/xcl2
+
+CXXFLAGS += -DSDX_PLATFORM=$(AWS_PLATFORM) -D__USE_XOPEN2K8 
+CXXFLAGS += -I$(XILINX_SDX)/runtime/include/1_2/ 
+CXXFLAGS += -I$(XILINX_SDX)/include/         
+CXXFLAGS += -I$(XFOPENCV)/include/  
+CXXFLAGS += -I$(SDx_LIB_DIR)/
+CXXFLAGS += -I$(XILINX_HLS)/include
+CXXFLAGS += -O2 -Wall -c -fmessage-length=0 -std=c++14
+
+#--- Specify OpenCV libraries ---#
+
+LDFLAGS += -L$(XILINX_SDX)/lnx64/tools/opencv
+LDFLAGS += -lopencv_core
+LDFLAGS += -lopencv_imgproc
+LDFLAGS += -lopencv_highgui
+
+#--- Specify common libraries ---#
+
+LDFLAGS += -L$(XILINX_SDX)/lib/lnx64.o 
+LDFLAGS += -lstdc++
+LDFLAGS += -lpthread 
+LDFLAGS += -lrt
+
+#--- Specify AWS libraries ---#
+
+LDFLAGS += -L$(XILINX_SDX_RUNTIME) 
+LDFLAGS += -lxilinxopencl
+
+#--- Specify runtime libraries ---#
+
+LDFLAGS += -Wl,-rpath,$(XILINX_SDX)/lnx64/tools/opencv
+LDFLAGS += -Wl,-rpath,$(XILINX_SDX)/lib/lnx64.o 
+LDFLAGS += -Wl,-rpath,$(XILINX_SDX_RUNTIME)
+
+#--- Specify objects ---#
+
+HOST_AWS_DIR = ./
+HOST_BLD_DIR = $(TARGET)/build/host
+HOST_RUN_DIR = $(TARGET)/run
+
+HOST_AWS_OBJ +=  $(addsuffix .o, $(addprefix $(HOST_BLD_DIR)/, $(HOST_AWS_SRC)) )
+HOST_SDx_OBJ +=  $(addsuffix .o, $(addprefix $(HOST_BLD_DIR)/, $(HOST_SDx_SRC)) )
+
+HOST_OBJ = $(HOST_AWS_OBJ) $(HOST_SDx_OBJ)
+
+HOST_EXE ?= $(HOST_RUN_DIR)/$(TEST_NAME)
+
+BUILD_SUBDIRS += $(HOST_BLD_DIR)
+
+
+########################################
+#                                      #
+#            Kernel section            #
+#                                      #
+########################################
+
+XOCC_OPTS += --platform $(AWS_PLATFORM) 
+XOCC_OPTS += --save-temps  
+XOCC_OPTS += --report system
+
+XOCC_INCL += -I$(XFOPENCV)/include
+XOCC_INCL += -I/opt/Xilinx/SDx/2017.4/include/ocv
+
+KERNEL_BLD_DIR = $(TARGET)/build/kernel
+
+ifeq "$(TARGET)" "hw"
+  KERNEL_RUN_DIR = $(TARGET)/afi
+else
+  KERNEL_RUN_DIR = $(TARGET)/run
+  XOCC_OPTS += -g
+endif
+
+BUILD_SUBDIRS += $(KERNEL_BLD_DIR)
+
+KERNEL_OBJ += $(addsuffix .xo    , $(addprefix $(KERNEL_BLD_DIR)/, $(KERNEL)) )
+KERNEL_BIN += $(addsuffix .xclbin, $(addprefix $(KERNEL_RUN_DIR)/, $(KERNEL)) )
+
+
+########################################
+#                                      #
+#            Build section             #
+#                                      #
+########################################
+
+.PHONY: all host krnl
+
+all: host krnl
+
+host: $(HOST_EXE)
+
+krnl: $(KERNEL_BIN)
+
+clean: # removes build output of the selected TARGET; kernel containers (*.xclbin, *.awsxclbin) are globbed in the run directory, not under the executable's path
+	$(RMDIR) $(BUILD_SUBDIRS)
+	$(RMDIR) .Xil
+	$(RMDIR) $(HOST_RUN_DIR)/TempConfig
+	$(RM)    $(HOST_RUN_DIR)/*.jpg $(HOST_RUN_DIR)/*.png $(HOST_RUN_DIR)/*.log $(HOST_RUN_DIR)/*.csv $(HOST_RUN_DIR)/*.html
+	$(RM)    $(KERNEL_BIN)
+	$(RM)    $(HOST_EXE)
+	$(RM)    $(HOST_RUN_DIR)/*.*xclbin
+
+.PHONY: clean
+
+#--- Kernel rules ---#
+
+$(KERNEL_OBJ): $(KERNEL_BLD_DIR)/%.xo : %_kernel_aws.cpp
+	@echo " "
+	@echo "================================================================"
+	@echo "Compilation of $< to $@"
+	@echo "================================================================"
+	@echo " "
+	@mkdir -p $(@D)
+	$(XOCC) -c -t $(TARGET)  $(XOCC_OPTS) $(XOCC_INCL) -k $(*F) --max_memory_ports $(*F) -I$(<D) --xp misc:solution_name=$(KERNEL_BLD_DIR)/$(*F)_compile -o$(@) $(<)
+	
+
+
+$(KERNEL_BIN): $(KERNEL_RUN_DIR)/%.xclbin : $(KERNEL_BLD_DIR)/%.xo
+	@echo " "
+	@echo "================================================================"
+	@echo "Linking of $@  ($(@D))"
+	@echo "================================================================"
+	@echo " "
+	mkdir -p $(KERNEL_RUN_DIR)
+	$(XOCC) -l -t $(TARGET) $(XOCC_OPTS) --jobs 8 --nk $(*F):1 --xp misc:solution_name=$(KERNEL_BLD_DIR)/$(*F)_link -o$(@) $(+)
+
+
+#--- Host rules ---#
+
+$(HOST_AWS_OBJ): $(HOST_BLD_DIR)/%.o : $(HOST_AWS_DIR)/%.cpp 
+	@echo " "
+	@echo "================================================================"
+	@echo "Compilation of $< to $@"
+	@echo "================================================================"
+	@echo " "
+	@mkdir -p $(HOST_BLD_DIR)                                       
+	$(SDX_CXX) $(CXXFLAGS) -o $(@) $(<)
+
+	
+$(HOST_SDx_OBJ): $(HOST_BLD_DIR)/%.o : $(SDx_LIB_DIR)/%.cpp 
+	@echo " "
+	@echo "================================================================"
+	@echo "Compilation of $< to $@"
+	@echo "================================================================"
+	@echo " "
+	@mkdir -p $(HOST_BLD_DIR)                                       
+	$(SDX_CXX) $(CXXFLAGS) -o $(@) $(<)
+
+
+$(HOST_EXE): $(HOST_OBJ)
+	@echo " "
+	@echo "================================================================"
+	@echo "Linking of $@  ($(@D))"
+	@echo "================================================================"
+	@echo " "
+	mkdir -p $(HOST_RUN_DIR)
+	$(SDX_CXX) -o "$@" $(+) $(LDFLAGS) 
+	
+
+
diff --git a/aws_demo/gaussianfilter/Gaussian_Filter_Diagram.png b/aws_demo/gaussianfilter/Gaussian_Filter_Diagram.png
new file mode 100644
index 0000000..0c687f8
Binary files /dev/null and b/aws_demo/gaussianfilter/Gaussian_Filter_Diagram.png differ
diff --git a/aws_demo/gaussianfilter/hw/afi/gen_afi.sh b/aws_demo/gaussianfilter/hw/afi/gen_afi.sh
new file mode 100644
index 0000000..bd8ae2a
--- /dev/null
+++ b/aws_demo/gaussianfilter/hw/afi/gen_afi.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+echo aws s3 rm --recursive s3://xfg
+aws s3 rm --recursive s3://xfg
+
+echo aws s3 rb s3://xfg
+aws s3 rb s3://xfg
+
+
+echo aws s3 mb s3://xfg
+aws s3 mb s3://xfg
+
+aws s3 mb s3://xfg/dcp
+touch FILES_GO_HERE.txt
+aws s3 cp FILES_GO_HERE.txt s3://xfg/dcp/
+
+
+aws s3 mb s3://xfg/log
+touch LOGS_FILES_GO_HERE.txt
+aws s3 cp LOGS_FILES_GO_HERE.txt s3://xfg/log/  
+
+aws s3 ls --recursive s3://xfg
+
+rm -f FILES_GO_HERE.txt
+rm -f LOGS_FILES_GO_HERE.txt
+
+$SDACCEL_DIR/tools/create_sdaccel_afi.sh -xclbin=xf_gaussian_filter.xclbin -s3_bucket=xfg -s3_dcp_key=dcp -s3_logs_key=log
+
+cat *afi_id*
+
+echo "use following command to check afi ready"
+echo "aws ec2 describe-fpga-images --fpga-image-id <afi id>"
diff --git a/aws_demo/gaussianfilter/hw/run/run.sh b/aws_demo/gaussianfilter/hw/run/run.sh
new file mode 100644
index 0000000..fb16389
--- /dev/null
+++ b/aws_demo/gaussianfilter/hw/run/run.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+source /opt/Xilinx/SDx/2017.1.rte.4ddr/setup.sh
+
+./gaussian_filter_test ../../im0.jpg
diff --git a/aws_demo/gaussianfilter/hw_emu/run/run.sh b/aws_demo/gaussianfilter/hw_emu/run/run.sh
new file mode 100644
index 0000000..e8dad87
--- /dev/null
+++ b/aws_demo/gaussianfilter/hw_emu/run/run.sh
@@ -0,0 +1,5 @@
+emconfigutil -f $AWS_PLATFORM
+
+export XCL_EMULATION_MODE=hw_emu
+
+./gaussian_filter_test ../../im0.jpg
diff --git a/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini b/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini
new file mode 100644
index 0000000..63a1cac
--- /dev/null
+++ b/aws_demo/gaussianfilter/hw_emu/run/sdaccel.ini
@@ -0,0 +1,5 @@
+[Debug]
+timeline_trace=true
+device_profile=true
+app_debug=true
+profile=true
diff --git a/aws_demo/gaussianfilter/im0.jpg b/aws_demo/gaussianfilter/im0.jpg
new file mode 100644
index 0000000..bce6d36
Binary files /dev/null and b/aws_demo/gaussianfilter/im0.jpg differ
diff --git a/aws_demo/gaussianfilter/makefile b/aws_demo/gaussianfilter/makefile
new file mode 100644
index 0000000..1a9f186
--- /dev/null
+++ b/aws_demo/gaussianfilter/makefile
@@ -0,0 +1,25 @@
+########################################
+#                                      #
+#             Host section             #
+#                                      #
+########################################
+
+TEST_NAME = gaussian_filter_test
+
+HOST_AWS_SRC  =  xf_gaussian_filter_accel_aws
+HOST_AWS_SRC +=  xf_gaussian_filter_tb
+
+########################################
+#                                      #
+#            Kernel section            #
+#                                      #
+########################################
+
+KERNEL = xf_gaussian_filter
+
+########################################
+
+include ../common_makefile
+
+
+
diff --git a/aws_demo/gaussianfilter/readme.md b/aws_demo/gaussianfilter/readme.md
new file mode 100644
index 0000000..9d22445
--- /dev/null
+++ b/aws_demo/gaussianfilter/readme.md
@@ -0,0 +1,183 @@
+# Gaussian Filter #
+
+This example demonstrates the use of the **`xf::GaussianBlur()`** and **`xf::resize()`** functions of the xfOpenCV library in a pipeline. The example is designed to process a single image once. If you would like to process many images in a loop, you need to extract the FPGA and kernel initialization and finalization operations from the kernel interface wrapper and move them into the host application, before and after the processing loop respectively.
+
+## Code structure ##
+
+![](./../Code_Structure.png)
+
+| Component | Source files |
+| :-        | :-           |
+| *Kernel&nbsp;Configuration*          |**`xf_gaussian_filter_config.h`**<br/>**`xf_config_params.h`**|
+| *Host&nbsp;Application*              |**`xf_gaussian_filter_tb.cpp`**|
+| *Kernel&nbsp;Interface&nbsp;Wrapper* |**`xf_gaussian_filter_accel_aws.cpp`**|
+| *Kernel&nbsp;Driver*                 |**`xcl2.cpp (in SDx library)`**|
+| *Kernel*                             |**`xf_gaussian_filter_kernel_aws.cpp`**|
+
+## Kernel Configuration ##
+
+Following constants in header files define kernel configuration
+
+| Constant | Possible values | Default Value | Description |
+| :-       | :-              | :-            | :-          |
+| **`FILTER_SIZE_3`**<br/>**`FILTER_SIZE_5`**<br/>**`FILTER_SIZE_7`**|**`0, 1`**| **`1`**<br/>**`0`**<br/>**`0`**| Select window size of the Gaussian filter. One of them should be defined as 1. And only one can be defined as 1 - others should be defined as 0 |
+| **`FILTER_WIDTH`**            |-|-|The window size of the Gaussian filter. Value set automatically depending on which **`FILTER_SIZE_n`** set to 1. |
+| **`SIGMA`**                   |-|-|Standard deviation of Gaussian Filter. Value set automatically depending on which **`FILTER_SIZE_n`** set to 1.|
+| **`NPC1`**                    |**`XF_NPPC1`**<br/>**`XF_NPPC8`**|**`XF_NPPC1`**|Select level of parallelism in kernel (number of pixels which kernel process per clock cycle).|
+| **`XF_RESIZE_INTERPOLATION`** |**`XF_INTERPOLATION_NN`**<br/>**`XF_INTERPOLATION_BILINEAR`**<br/>**`XF_INTERPOLATION_AREA`**<br/>|**`XF_INTERPOLATION_NN`**|Types of Interpolation techniques|
+| **`CV_RESIZE_INTERPOLATION`** |**`cv::INTER_NEAREST`**<br/>**`cv::INTER_LINEAR`**<br/>**`cv::INTER_AREA`**<br/>**`others are not suitable`**|**`cv::INTER_NEAREST`**|Types of Interpolation techniques|
+| **`XF_GAUSSIAN_BORDER`**      |**`XF_BORDER_CONSTANT`**<br/>**`XF_BORDER_REPLICATE`**|**`XF_BORDER_CONSTANT`**|The way in which borders will be processed|
+| **`CV_GAUSSIAN_BORDER`**      |**`cv::BORDER_CONSTANT`**<br/>**`cv::BORDER_REPLICATE`**<br/>**`others are not suitable`**|**`cv::BORDER_CONSTANT`**|The way in which borders will be processed|
+| **`COLS_INP`**                |**`multiple of 8`**|**`1920`**|Maximum width  of input image|
+| **`ROWS_INP`**                |**`multiple of 8`**|**`1080`**|Maximum height of input image|
+| **`SCALE`**                   |**`> 0 and !=1`**|**`0.5`**|Define scale factor of image after Gaussian Filter.<br/>**Note: The `xf::resize()` function doesn't support scale factor 1.**  |
+| **`COLS_OUT`**                |**`multiple of 8`**|**`COLS_INP/2`**|Maximum width of output image. Please keep value to correspond to the scale factor (**`SCALE`**). Value should be **`>= ceil(COLS_INP * SCALE)`** and should be multiple of 8.|
+| **`ROWS_OUT`**                |**`> 0`**|**`ROWS_INP/2`**|Maximum height of output image. Please keep value to correspond to the scale factor (**`SCALE`**). Value should be **`>= ceil(ROWS_INP * SCALE)`**|
+
+## Host Application ##
+Host application reads test image from file, process it with help of regular OpenCV library on host, perform same processing with help of FPGA kernel with function from xfOpenCV library and compare result.
+
+The input image of the example, ***im0.jpg***, is placed in the root folder of the example. The first filter applied to the image is **`xf::GaussianBlur()`**, the next is **`xf::resize()`**. Both have counterparts with the same name in the OpenCV library. The application calculates the difference between the result images - the images are considered equal if the difference for each pixel does not exceed 1. The result images are stored in the run folder. 
+
+The following images will be in run folder after execution:
+
+- ***xf_img_out.jpg*** - result of FPGA kernel processing
+- ***cv_img_out.jpg*** - result of OpenCV processing
+- ***error.png*** - contains difference of values for each pixel of result images
+
+
+## Kernel Interface Wrapper ##
+
+In conjunction with xfOpenCV library on host application is convenient to use xf::Mat or cv::Mat class and image manipulation functions. Unfortunately the XOCC kernel compiler doesn't support classes/structures as kernel input/output parameters. To pass xf::Mat to a kernel a wrapper is needed. The kernel interface wrapper convert interface convenient to host application to kernel interface available in Amazon F1 instance.
+
+For this example kernel interface wrapper also perform FPGA initialization, kernel downloading, initialization and finalization.
+
+
+| Parameter&nbsp;Name |Direction|Type | Description |
+| :-                  | :-      | :-  | :-          |
+| **`img_inp`** |Input  | **`xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> &`** | Input image |
+| **`img_out`** |Output | **`xf::Mat<XF_8UC1, ROWS_OUT, COLS_OUT, NPC1> &`** | Output image |
+| **`sigma`**   | Input | **`float`**                                      | Standard deviation of Gaussian Filter |
+
+To forward these parameters to kernel wrapper create 2 buffers in global memory for images data. Wrapper decompose **`img_inp`** and **`img_out`** classes and pass member separately.
+
+
+## Kernel Driver ##
+
+Example use modification of SDx xcl kernel driver v.2 for Amazon F1 instance. Source code of this driver and description could be found in Amazon aws-fpga framework.
+
+## Kernel ##
+
+To apply Gaussian filter and change size of processed image the kernel pipeline functions from xfOpenCV library as shown on the image below.<br/>
+
+![](./Gaussian_Filter_Diagram.png)
+
+The kernel has following parameters:
+
+| Parameter&nbsp;Name |Direction|Type | Description |
+| :-                  | :-      | :-  | :-          |
+| **`img_inp`** |Input  | **`XF_TNAME(XF_8UC1, NPC1) *`** | Pointer to input image buffer |
+| **`img_out`** |Output | **`XF_TNAME(XF_8UC1, NPC1) *`** | Pointer to output image buffer|
+| **`rows_inp`**| Input | **`int`**                       | Height of input image |
+| **`cols_inp`**| Input | **`int`**                       | Width of input image  |
+| **`sigma`**   | Input | **`float`**                     | Standard deviation of Gaussian Filter |
+| **`rows_out`**| Input | **`int`**                       | Height of output image |
+| **`cols_out`**| Input | **`int`**                       | Width of output image  |
+
+During synthesis for FPGA kernel's parameters should be mapped to HW interfaces supported on Amazon F1 instance. To map kernel parameters **`HLS INTERFACE`** pragma should be used. Supported following interfaces: **`m_axi`** and **`s_axilite`**. For **`m_axi`** offset can be set through **`s_axilite`** port only.
+
+Because functions from xfOpenCV library operate with **`xf::Mat`** class as image container kernel's parameters should be packed back to objects of this class. To do this you need following: 
+
+- Declare **`xf::Mat`** variable <br/> ***Note: due to XOCC issues use default constructor only - do not try initialize class members with help of non-default constructors***
+- Assign image size to **`rows`** and **`cols`** members
+- Copy image from input buffer to **`data`** member of **`xf::Mat`** or from **`data`** to output buffer
+
+```cpp
+xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> mi;
+
+mi.rows = rows_inp;
+mi.cols = cols_inp;
+
+for(int i=0; i < rows_inp; i++)
+  {
+    #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP
+
+    for(int j=0; j < (cols_inp >> (XF_BITSHIFT(NPC1))); j++)
+      {
+        #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_INP/pNPC1
+        #pragma HLS PIPELINE
+        #pragma HLS loop_flatten off
+
+        *(mi.data + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j) = *(img_inp + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j);
+      }
+  }
+```
+**Note: `#pragma HLS` doesn't support constants defined through the `#define` directive - use `const int`. In the code above `pROWS_INP`, `pCOLS_INP` and `pNPC1` are `const int` variables which get their values from constants defined in `xf_gaussian_filter_config.h` with the help of the `#define` directive.**
+
+```cpp
+  const int pROWS_INP = ROWS_INP;
+  const int pCOLS_INP = COLS_INP;
+  const int pNPC1     = NPC1;
+```
+
+Simple declaration of **`xf::Mat`** object create buffer to store whole image with maximum defined size. This buffer use FPGA internal memory blocks and even big FPGA devices could not have enough resources. You should use **`#pragma HLS stream`** to ask HLS convert big RAM buffer to small FIFO buffer 
+
+```cpp
+  xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> mi;
+  #pragma HLS stream variable=mi.data depth=pCOLS_INP/pNPC1
+```
+
+Please note that **`#pragma HLS stream`** could be used inside dataflow block, therefore kernel body should be declared as dataflow. This also permit pipeline functions from xfOpenCV library.
+
+```cpp
+void kernel(...)
+{
+  #pragma HLS INTERFACE ...
+  #pragma HLS INTERFACE ...
+  
+  #pragma HLS dataflow
+  ...
+}
+```
+
+## Known Issues
+
+- #### Kernel can't accept class/structure as parameters
+**Solution**: use simple types, pass class/structure members as separate parameters of simple types and compose class/structure object back inside kernel.
+
+- #### Using non-default constructors can cause kernel suspension on FPGA and HW emulation
+**Solution**: use default constructor for object declaration and next assign desired values to the members separately.
+
+```cpp
+xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> mi;
+
+mi.rows = rows_inp;
+mi.cols = cols_inp;
+```
+
+- #### **`#pragma HLS`** doesn't support constants defined through **`#define`** directive.
+**Solution**: use **`const int`** instead
+
+
+```cpp
+#define ROWS_INP 1080
+
+void kernel(...)
+{
+  const int pROWS_INP = ROWS_INP;
+
+  for(int i=0; i < rows_inp; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP
+      ...
+    }
+  ...
+}
+```
+
+
+## Revision History
+
+Date      | Readme Version | Release Notes
+--------  |----------------|-------------------------
+May 2018  | 1.0            | Initial version.
diff --git a/aws_demo/gaussianfilter/sw_emu/run/run.sh b/aws_demo/gaussianfilter/sw_emu/run/run.sh
new file mode 100644
index 0000000..35d1bd4
--- /dev/null
+++ b/aws_demo/gaussianfilter/sw_emu/run/run.sh
@@ -0,0 +1,5 @@
+emconfigutil -f $AWS_PLATFORM
+
+export XCL_EMULATION_MODE=sw_emu
+
+./gaussian_filter_test ../../im0.jpg
diff --git a/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini b/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini
new file mode 100644
index 0000000..63a1cac
--- /dev/null
+++ b/aws_demo/gaussianfilter/sw_emu/run/sdaccel.ini
@@ -0,0 +1,5 @@
+[Debug]
+timeline_trace=true
+device_profile=true
+app_debug=true
+profile=true
diff --git a/aws_demo/gaussianfilter/xf_config_params.h b/aws_demo/gaussianfilter/xf_config_params.h
new file mode 100644
index 0000000..dfd727c
--- /dev/null
+++ b/aws_demo/gaussianfilter/xf_config_params.h
@@ -0,0 +1,3 @@
+#define FILTER_SIZE_3 1
+#define FILTER_SIZE_5 0
+#define FILTER_SIZE_7 0
diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp
new file mode 100644
index 0000000..9e427fc
--- /dev/null
+++ b/aws_demo/gaussianfilter/xf_gaussian_filter_accel_aws.cpp
@@ -0,0 +1,57 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <vector>
+
+#include "xcl2.hpp" 
+
+#include "xf_gaussian_filter_config.h"
+
+#define CL_MIGRATE_MEM_OBJECT_KERNEL 0       //OpenCL define constant to indicate memory object migration to host only, to make program more readable define "counterpart" constant
+// Host-side wrapper: runs the "xf_gaussian_filter" kernel once on the first device returned by xcl::get_xil_devices(), using img_inp/img_out data as host-pointer buffers, and returns after the command queue has finished.
+void gaussian_filter_accel(xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> &img_inp, xf::Mat<XF_8UC1, ROWS_OUT, COLS_OUT, NPC1> &img_out, float sigma)
+{
+    std::vector<cl::Device> devices = xcl::get_xil_devices();
+
+    cl::Device device = devices[0]; // NOTE(review): no check that the device list is non-empty - confirm xcl::get_xil_devices() guarantees at least one entry
+
+    cl::Context context(device);
+
+    cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
+    std::string device_name = device.getInfo<CL_DEVICE_NAME>(); // NOTE(review): device_name is not used below
+
+    std::string binaryFile = (xcl::is_emulation() || xcl::is_hw_emulation ()) ? "xf_gaussian_filter.xclbin" : "xf_gaussian_filter.awsxclbin"; // emulation runs load the .xclbin, otherwise the .awsxclbin
+    
+    std::cout << "======== " <<  binaryFile << " ========" << std::endl;
+    
+    cl::Program::Binaries bins = xcl::import_binary_file(binaryFile);
+    devices.resize(1); // keep only the first device so the program is created for it alone
+    cl::Program program(context, devices, bins);
+    cl::Kernel kernel(program,"xf_gaussian_filter");
+
+    //----------- Create buffers over the host image data (CL_MEM_USE_HOST_PTR); size is rows * cols, which assumes one byte per pixel (XF_8UC1) - TODO confirm -----------//
+
+    cl::Buffer buffer_inp(context,CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY , img_inp.rows * img_inp.cols, img_inp.data);
+    cl::Buffer buffer_out(context,CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, img_out.rows * img_out.cols, img_out.data);
+
+    std::vector<cl::Memory> writeBufVec;
+    writeBufVec.push_back(buffer_inp);
+    
+    //----------- Migrate input data to device global memory (migration flags = 0, i.e. CL_MIGRATE_MEM_OBJECT_KERNEL) -----------//
+    
+    q.enqueueMigrateMemObjects(writeBufVec, CL_MIGRATE_MEM_OBJECT_KERNEL);
+
+    auto krnl = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, int, int, float, int, int >(kernel);
+
+    //----------- Launch the kernel with global and local ranges of 1x1x1; argument order: input buffer, output buffer, input rows, input cols, sigma, output rows, output cols -----------//
+
+    krnl(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)), buffer_inp, buffer_out, img_inp.rows, img_inp.cols, sigma, img_out.rows, img_out.cols);
+    
+    //----------- Migrate the result buffer back to the host (CL_MIGRATE_MEM_OBJECT_HOST), then wait for the queue to finish -----------//
+    
+    std::vector<cl::Memory> readBufVec;
+    readBufVec.push_back(buffer_out);
+
+    q.enqueueMigrateMemObjects(readBufVec,CL_MIGRATE_MEM_OBJECT_HOST);
+
+    q.finish();
+}
diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_config.h b/aws_demo/gaussianfilter/xf_gaussian_filter_config.h
new file mode 100644
index 0000000..0a35c2b
--- /dev/null
+++ b/aws_demo/gaussianfilter/xf_gaussian_filter_config.h
@@ -0,0 +1,80 @@
+/***************************************************************************
+Copyright (c) 2016, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors 
+may be used to endorse or promote products derived from this software 
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#ifndef _XF_GAUSSIAN_FILTER_CONFIG_H_
+//{
+    #define _XF_GAUSSIAN_FILTER_CONFIG_H_
+
+    #include "hls_stream.h"
+    #include "common/xf_common.h"
+    #include "common/xf_utility.h"
+    #include "imgproc/xf_gaussian_filter.hpp"
+    #include "xf_config_params.h"
+
+    typedef unsigned short int  uint16_t;
+    // Ratio of output to input image size; the testbench multiplies the input dimensions by it.
+    #define SCALE    ( 0.5f )
+    // Compile-time image dimensions, used as the xf::Mat template arguments for the input image.
+    #define ROWS_INP ( 1080 )
+    #define COLS_INP ( 1920 )
+    // Output dimensions are half the input in each direction, consistent with SCALE = 0.5.
+    #define ROWS_OUT ( ROWS_INP / 2 )
+    #define COLS_OUT ( COLS_INP / 2 )
+
+    //----------------- Filters parameters -----------------//
+
+    #define XF_RESIZE_INTERPOLATION XF_INTERPOLATION_NN          // Interpolation type for xf::resize() inside kernel
+    #define CV_RESIZE_INTERPOLATION cv::INTER_NEAREST            // Interpolation type for cv::resize() called from testbench
+                                                                    
+    #define XF_GAUSSIAN_BORDER  XF_BORDER_CONSTANT               // Border type of xfopencv Gaussian filter inside kernel
+    #define CV_GAUSSIAN_BORDER  cv::BORDER_CONSTANT              // Border type of   opencv Gaussian filter called from testbench
+    // NOTE: there is no #else branch -- if no FILTER_SIZE_* evaluates true, FILTER_WIDTH and SIGMA stay undefined.
+    #if FILTER_SIZE_3                                            // Set Gaussian filter parameters depending on constant defined in xf_config_params.h
+    //{
+        #define FILTER_WIDTH (  3  )
+        #define SIGMA        ( 0.5f)
+    //}
+    #elif FILTER_SIZE_5
+    //{
+        #define FILTER_WIDTH (    5    )
+        #define SIGMA        ( 0.8333f )
+    //}
+    #elif FILTER_SIZE_7
+    //{
+        #define FILTER_WIDTH (     7    )
+        #define SIGMA        ( 1.16666f )
+    //}
+    #endif
+    // NPC1 is the last template argument of the xf::Mat types used with this configuration.
+    #define NPC1 XF_NPPC1
+    // Entry point called by the testbench (input image, output image, Gaussian sigma).
+    void gaussian_filter_accel(xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> &img_inp, xf::Mat<XF_8UC1, ROWS_OUT, COLS_OUT, NPC1> &img_out, float sigma);
+//}
+#endif //_XF_GAUSSIAN_FILTER_CONFIG_H_
diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp
new file mode 100644
index 0000000..50207c6
--- /dev/null
+++ b/aws_demo/gaussianfilter/xf_gaussian_filter_kernel_aws.cpp
@@ -0,0 +1,96 @@
+//Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "hls_stream.h"
+
+#include "common/xf_common.h"
+
+#include "xf_gaussian_filter_config.h"
+
+#include "imgproc/xf_gaussian_filter.hpp"
+#include "imgproc/xf_resize.hpp"
+
+extern "C" void xf_gaussian_filter(XF_TNAME(XF_8UC1, NPC1) *img_inp, XF_TNAME(XF_8UC1, NPC1) *img_out, int rows_inp, int cols_inp, float sigma, int rows_out, int cols_out);
+// Kernel top function: loads img_inp into mi, runs xf::GaussianBlur (mi -> mf) and xf::resize (mf -> mo), then stores mo to img_out.
+void xf_gaussian_filter(XF_TNAME(XF_8UC1, NPC1) *img_inp, XF_TNAME(XF_8UC1, NPC1) *img_out, int rows_inp, int cols_inp, float sigma, int rows_out, int cols_out)
+{
+  #pragma HLS INTERFACE m_axi     port=img_inp  offset=slave bundle=gmem
+  #pragma HLS INTERFACE m_axi     port=img_out  offset=slave bundle=gmem
+  // Both pointers share the m_axi bundle "gmem"; every argument and the return port also sit on the s_axilite bundle "control".
+  #pragma HLS INTERFACE s_axilite port=img_inp               bundle=control
+  #pragma HLS INTERFACE s_axilite port=img_out               bundle=control
+      
+  #pragma HLS INTERFACE s_axilite port=rows_inp              bundle=control
+  #pragma HLS INTERFACE s_axilite port=cols_inp              bundle=control
+  #pragma HLS INTERFACE s_axilite port=sigma                 bundle=control
+                                                            
+  #pragma HLS INTERFACE s_axilite port=rows_out              bundle=control
+  #pragma HLS INTERFACE s_axilite port=cols_out              bundle=control
+                                                            
+  #pragma HLS INTERFACE s_axilite port=return                bundle=control
+
+  #pragma HLS dataflow
+  // Constant copies of the configuration macros, referenced by the stream-depth and trip-count pragmas below.
+  const int pROWS_INP = ROWS_INP;
+  const int pCOLS_INP = COLS_INP;
+
+  const int pROWS_OUT = ROWS_OUT;
+  const int pCOLS_OUT = COLS_OUT;
+  
+  const int pNPC1 = NPC1;
+  // mi holds the kernel input, mf the Gaussian-filtered image; both use the input dimensions.
+  xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> mi;
+  xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> mf;
+
+  #pragma HLS stream variable=mi.data depth=pCOLS_INP/pNPC1
+  #pragma HLS stream variable=mf.data depth=pCOLS_INP/pNPC1
+  // mo holds the resized result, using the output dimensions.
+  xf::Mat<XF_8UC1, ROWS_OUT, COLS_OUT, NPC1> mo;
+
+  #pragma HLS stream variable=mo.data depth=pCOLS_OUT/pNPC1
+  // Actual image sizes are taken from the kernel arguments at run time.
+  mi.rows = rows_inp;  mi.cols = cols_inp;
+  mf.rows = rows_inp;  mf.cols = cols_inp;
+
+  mo.rows = rows_out;  mo.cols = cols_out;
+
+  /********************************************************/
+  // Copy the input image from img_inp into mi: (cols_inp >> XF_BITSHIFT(NPC1)) elements per row.
+  for(int i=0; i < rows_inp; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_INP
+
+      for(int j=0; j < (cols_inp >> (XF_BITSHIFT(NPC1))); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_INP/pNPC1
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
+
+          *(mi.data + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j) = *(img_inp + i*(cols_inp >> (XF_BITSHIFT(NPC1))) +j);
+        }
+    }
+  // Gaussian filter with the sigma supplied by the caller.
+  xf::GaussianBlur<FILTER_WIDTH, XF_GAUSSIAN_BORDER, XF_8UC1, ROWS_INP, COLS_INP, NPC1>(mi, mf, sigma);
+
+  // Resize the filtered image (mf) into mo, whose size was set from rows_out / cols_out.
+  xf::resize<XF_RESIZE_INTERPOLATION, XF_8UC1, ROWS_INP, COLS_INP, ROWS_OUT, COLS_OUT, NPC1> (mf, mo);
+  // Copy the resized image from mo to img_out: (cols_out >> XF_BITSHIFT(NPC1)) elements per row.
+  for(int i=0; i < rows_out; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS_OUT
+
+      for(int j=0; j < (cols_out >> (XF_BITSHIFT(NPC1))); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS_OUT/pNPC1
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
+
+          *(img_out + i*(cols_out >> (XF_BITSHIFT(NPC1))) +j)  = *(mo.data + i*(cols_out >> (XF_BITSHIFT(NPC1))) +j) ;
+        }
+    }
+
+}
+
+
diff --git a/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp b/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp
new file mode 100644
index 0000000..05307d6
--- /dev/null
+++ b/aws_demo/gaussianfilter/xf_gaussian_filter_tb.cpp
@@ -0,0 +1,138 @@
+/***************************************************************************
+ Copyright (c) 2016, Xilinx, Inc.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ***************************************************************************/
+
+#include "opencv2/opencv.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+#include "common/xf_sw_utils.h"
+
+#include "xf_gaussian_filter_config.h"
+
+using namespace std;
+
+// Testbench: runs the OpenCV reference (GaussianBlur + resize) and the accelerated
+// pipeline on the same image, then compares both results pixel by pixel.
+// Returns 0 when at most 1% of the output pixels differ by more than THRESHOLD, -1 otherwise.
+int main(int argc, char **argv) 
+{
+  if (argc != 2)
+    {
+      printf("Usage: <executable> <input image path> \n");
+      return -1;
+    }
+
+  cv::Mat cv_img_inp, cv_img_out, cv_img_ref;
+  cv::Mat diff;
+
+  int rows_out, cols_out;
+
+  cv_img_inp = cv::imread(argv[1], 0); // flag 0: load the image as single-channel grayscale
+
+  if (!cv_img_inp.data) 
+    {
+      printf("Failed to load the image ... !!!");
+      return -1;
+    }
+
+  rows_out = cv_img_inp.rows * SCALE;
+  cols_out = cv_img_inp.cols * SCALE;
+
+  cv_img_ref.create(cv_img_inp.rows, cv_img_inp.cols, cv_img_inp.depth()); // create memory for the full-size blurred reference image
+
+  cv_img_out.create(rows_out, cols_out, cv_img_inp.depth()); // create memory for the down-scaled OCV output image
+
+  float sigma = SIGMA;
+
+  // OpenCV reference: Gaussian filter followed by resize
+  cv::GaussianBlur(cv_img_inp, cv_img_ref, cv::Size(FILTER_WIDTH, FILTER_WIDTH), SIGMA, SIGMA, CV_GAUSSIAN_BORDER);
+
+  cv::resize(cv_img_ref, cv_img_out, cv::Size(cv_img_out.cols, cv_img_out.rows), 0, 0, CV_RESIZE_INTERPOLATION );
+
+  imwrite("cv_img_out.jpg", cv_img_out);
+
+  diff.create(cv_img_out.rows, cv_img_out.cols, cv_img_out.depth()); // create memory for diff image
+
+  //=====================================================================//
+
+  xf::Mat<XF_8UC1, ROWS_INP, COLS_INP, NPC1> xf_img_inp(cv_img_inp.rows,cv_img_inp.cols);
+  xf::Mat<XF_8UC1, ROWS_OUT, COLS_OUT, NPC1> xf_img_out(cv_img_out.rows,cv_img_out.cols);
+
+  xf_img_inp = xf::imread<XF_8UC1, ROWS_INP, COLS_INP, NPC1>(argv[1], 0);
+
+  gaussian_filter_accel(xf_img_inp, xf_img_out, sigma);
+
+  // Write output image
+  xf::imwrite("xf_img_out.jpg",xf_img_out);
+
+  xf::absDiff(cv_img_out, xf_img_out, diff); // Compute absolute difference image
+
+  imwrite("error.png", diff); // Save the difference image for debugging purpose
+
+  // Find minimum and maximum differences.
+
+  #define THRESHOLD 1
+
+  double minval = 256, maxval = 0;
+  int cnt = 0;
+
+  for( int i = 0; i < diff.rows; i++ ) 
+    {
+      for( int j = 0; j < diff.cols; j++ ) 
+        {
+          uchar v = diff.at<uchar>(i, j);
+
+          if( v > THRESHOLD )  
+            cnt++;
+          
+          if (minval > v) minval = v;
+          if (maxval < v) maxval = v;
+        }
+    }
+
+  // cnt was counted over diff (output-sized), so normalise by the diff pixel count,
+  // not by the larger input image, which would understate the error rate.
+  float err_per = 100.0 * (float) cnt / (diff.rows * diff.cols);
+  
+  printf( "\nMinimum error in intensity = %f\n", minval);
+  printf(   "Maximum error in intensity = %f\n", maxval);
+
+  printf( "\nPercentage of pixels above error threshold = %f\n", err_per);
+        
+  if(err_per > 1) 
+    {  
+      printf("\nTest Failed\n");  
+      return -1; 
+    }
+
+  printf("\nTest Pass\n");  
+  
+  return 0;
+}
diff --git a/aws_demo/ide/vs/Gaussian_Filter.vcxproj b/aws_demo/ide/vs/Gaussian_Filter.vcxproj
new file mode 100644
index 0000000..274464e
--- /dev/null
+++ b/aws_demo/ide/vs/Gaussian_Filter.vcxproj
@@ -0,0 +1,82 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\common_makefile" />
+    <None Include="..\..\gaussianfilter\makefile" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_accel_aws.cpp" />
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_kernel_aws.cpp" />
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_tb.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\gaussianfilter\xf_config_params.h" />
+    <ClInclude Include="..\..\gaussianfilter\xf_gaussian_filter_config.h" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{4F175088-E060-4DD3-B199-92A67421ACE2}</ProjectGuid>
+    <RootNamespace>Gaussian_Filter</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters b/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters
new file mode 100644
index 0000000..bc11928
--- /dev/null
+++ b/aws_demo/ide/vs/Gaussian_Filter.vcxproj.filters
@@ -0,0 +1,25 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="h">
+      <UniqueIdentifier>{b5f48f04-d9bc-4b50-b9a6-c222b80c54d0}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\gaussianfilter\makefile" />
+    <None Include="..\..\common_makefile" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_accel_aws.cpp" />
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_kernel_aws.cpp" />
+    <ClCompile Include="..\..\gaussianfilter\xf_gaussian_filter_tb.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\gaussianfilter\xf_gaussian_filter_config.h">
+      <Filter>h</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\gaussianfilter\xf_config_params.h">
+      <Filter>h</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/ide/vs/Stereo_Pipeline.vcxproj b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj
new file mode 100644
index 0000000..1cc6c15
--- /dev/null
+++ b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj
@@ -0,0 +1,86 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{BE04D816-584B-4836-9C40-FBA3C41593F2}</ProjectGuid>
+    <RootNamespace>Stereo_Pipeline</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>..\..\..\include;..\..\..\examples\gaussianfilter;..\..\..\..\aws-fpga\SDAccel\examples\xilinx\libs\xcl2;..\..\..\..\OpenCV;..\..\..\..\OpenCL;..\..\..\..\SDx\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <None Include="..\..\stereopipeline\makefile" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_accel_aws.cpp" />
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_kernel_aws.cpp" />
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_tb.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\include\imgproc\xf_remap.hpp" />
+    <ClInclude Include="..\..\..\include\imgproc\xf_stereoBM.hpp" />
+    <ClInclude Include="..\..\..\include\imgproc\xf_stereo_pipeline.hpp" />
+    <ClInclude Include="..\..\stereopipeline\cameraParameters.h" />
+    <ClInclude Include="..\..\stereopipeline\xf_config_params.h" />
+    <ClInclude Include="..\..\stereopipeline\xf_headers.h" />
+    <ClInclude Include="..\..\stereopipeline\xf_stereo_pipeline_config.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters
new file mode 100644
index 0000000..f7ce491
--- /dev/null
+++ b/aws_demo/ide/vs/Stereo_Pipeline.vcxproj.filters
@@ -0,0 +1,42 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <None Include="..\..\stereopipeline\makefile" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_tb.cpp" />
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_kernel_aws.cpp" />
+    <ClCompile Include="..\..\stereopipeline\xf_stereo_pipeline_accel_aws.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <Filter Include="xfopencv">
+      <UniqueIdentifier>{c22c4267-6cf2-4761-b9f9-50716799aa60}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="h">
+      <UniqueIdentifier>{4f3f7a8a-73c4-455b-a5de-39d8d14135a4}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\include\imgproc\xf_stereo_pipeline.hpp">
+      <Filter>xfopencv</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\include\imgproc\xf_remap.hpp">
+      <Filter>xfopencv</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\include\imgproc\xf_stereoBM.hpp">
+      <Filter>xfopencv</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\stereopipeline\xf_config_params.h">
+      <Filter>h</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\stereopipeline\xf_headers.h">
+      <Filter>h</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\stereopipeline\xf_stereo_pipeline_config.h">
+      <Filter>h</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\stereopipeline\cameraParameters.h">
+      <Filter>h</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/ide/vs/aws_demo.sln b/aws_demo/ide/vs/aws_demo.sln
new file mode 100644
index 0000000..1b5179c
--- /dev/null
+++ b/aws_demo/ide/vs/aws_demo.sln
@@ -0,0 +1,32 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "examples", "examples.vcxproj", "{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Gaussian_Filter", "Gaussian_Filter.vcxproj", "{4F175088-E060-4DD3-B199-92A67421ACE2}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Stereo_Pipeline", "Stereo_Pipeline.vcxproj", "{BE04D816-584B-4836-9C40-FBA3C41593F2}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Debug|Win32.ActiveCfg = Debug|Win32
+		{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Debug|Win32.Build.0 = Debug|Win32
+		{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Release|Win32.ActiveCfg = Release|Win32
+		{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}.Release|Win32.Build.0 = Release|Win32
+		{4F175088-E060-4DD3-B199-92A67421ACE2}.Debug|Win32.ActiveCfg = Debug|Win32
+		{4F175088-E060-4DD3-B199-92A67421ACE2}.Debug|Win32.Build.0 = Debug|Win32
+		{4F175088-E060-4DD3-B199-92A67421ACE2}.Release|Win32.ActiveCfg = Release|Win32
+		{4F175088-E060-4DD3-B199-92A67421ACE2}.Release|Win32.Build.0 = Release|Win32
+		{BE04D816-584B-4836-9C40-FBA3C41593F2}.Debug|Win32.ActiveCfg = Debug|Win32
+		{BE04D816-584B-4836-9C40-FBA3C41593F2}.Debug|Win32.Build.0 = Debug|Win32
+		{BE04D816-584B-4836-9C40-FBA3C41593F2}.Release|Win32.ActiveCfg = Release|Win32
+		{BE04D816-584B-4836-9C40-FBA3C41593F2}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/aws_demo/ide/vs/examples.vcxproj b/aws_demo/ide/vs/examples.vcxproj
new file mode 100644
index 0000000..922b667
--- /dev/null
+++ b/aws_demo/ide/vs/examples.vcxproj
@@ -0,0 +1,89 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{4ABDD7A8-F903-4D12-B736-4BEF47FB6F4B}</ProjectGuid>
+    <RootNamespace>examples</RootNamespace>
+    <ProjectName>examples</ProjectName>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\examples\accumulate\xf_accumulate_config.h" />
+    <ClInclude Include="..\..\..\examples\accumulate\xf_config_params.h" />
+    <ClInclude Include="..\..\..\examples\accumulate\xf_headers.h" />
+    <ClInclude Include="..\..\..\examples\stereopipeline\cameraParameters.h" />
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_config_params.h" />
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_headers.h" />
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_config.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\examples\accumulate\xf_accumulate_image_accel.cpp" />
+    <ClCompile Include="..\..\..\examples\accumulate\xf_accumulate_image_tb.cpp" />
+    <ClCompile Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_accel.cpp" />
+    <ClCompile Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_tb.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\examples\stereopipeline\description.json" />
+    <None Include="..\..\..\examples\stereopipeline\Makefile" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/ide/vs/examples.vcxproj.filters b/aws_demo/ide/vs/examples.vcxproj.filters
new file mode 100644
index 0000000..877d59f
--- /dev/null
+++ b/aws_demo/ide/vs/examples.vcxproj.filters
@@ -0,0 +1,56 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClInclude Include="..\..\..\examples\accumulate\xf_accumulate_config.h">
+      <Filter>accumulate</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\accumulate\xf_config_params.h">
+      <Filter>accumulate</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\accumulate\xf_headers.h">
+      <Filter>accumulate</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\stereopipeline\cameraParameters.h">
+      <Filter>stereopipeline</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_config_params.h">
+      <Filter>stereopipeline</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_headers.h">
+      <Filter>stereopipeline</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_config.h">
+      <Filter>stereopipeline</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\examples\accumulate\xf_accumulate_image_accel.cpp">
+      <Filter>accumulate</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\examples\accumulate\xf_accumulate_image_tb.cpp">
+      <Filter>accumulate</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_accel.cpp">
+      <Filter>stereopipeline</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\examples\stereopipeline\xf_stereo_pipeline_tb.cpp">
+      <Filter>stereopipeline</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <Filter Include="accumulate">
+      <UniqueIdentifier>{fdb0de10-a233-42a7-908e-33cb0564f49c}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="stereopipeline">
+      <UniqueIdentifier>{8cbe6705-9bb3-4895-918d-c42dcf0d36f6}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\examples\stereopipeline\Makefile">
+      <Filter>stereopipeline</Filter>
+    </None>
+    <None Include="..\..\..\examples\stereopipeline\description.json">
+      <Filter>stereopipeline</Filter>
+    </None>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/aws_demo/make_description.md b/aws_demo/make_description.md
new file mode 100644
index 0000000..90a7c88
--- /dev/null
+++ b/aws_demo/make_description.md
@@ -0,0 +1,71 @@
+# Makefiles description # 
+
+Examples for the Amazon F1 instance use an example-specific makefile and a common makefile. The example-specific makefile is placed in the root folder of the example and includes the common makefile ([`aws_demo/common_makefile`](common_makefile)). 
+
+## Example specific makefile ##
+Example specific makefile contains following variables to list source files for host application and kernel. 
+
+### Variables for host part ###
+ 
+| Variable&nbsp;Name |Necessity | Purpose |
+|       :-           |   :-     |    :-   |
+| **`TEST_NAME`** |Mandatory| Name of the host executable which will be created as successful build result in **`run`** folder |
+| **`HOST_AWS_SRC`** |Mandatory| List of host source files placed in root folder of example |
+| **`HOST_SDx_SRC`** <br/> **`SDx_LIB_DIR`** | Optional | The **`HOST_SDx_SRC`** contains list of SDx kernel driver source files which provide interaction between host and FPGA kernel on Amazon F1 instance and **`SDx_LIB_DIR`** contains path to these sources. Originally all examples use xcl driver v.2. Default values are assigned in [`common_makefile`](common_makefile). If you would like to use other driver you need to do following: <br/> 1) Modify example source code to use desired driver; <br/>2) assign list of appropriate library source files to **`HOST_SDx_SRC`**; <br/>3) setup path to the library in **`SDx_LIB_DIR`** variable. <br/> Settings of these variables in example specific makefile override default values of [`common_makefile`](common_makefile) |
+
+### Variables for kernel part ###
+
+| Variable&nbsp;Name |Necessity | Purpose |
+|       :-           |   :-     |    :-   |
+| **`KERNEL`** |Mandatory| Name of the kernel should be same as kernel source file name (without extension) |
+
+
+## Common makefile ##
+Common makefile contains following variables and makefile's targets for host application and kernel. 
+
+### Variables for host part ###
+ 
+| Variable&nbsp;Name |Default&nbsp;value | Description |
+|       :-           |          :-       |      :-     |
+| **`XILINX_SDX`**        |**`/opt/Xilinx/SDx/2017.1.op`**| Path to Xilinx SDx toolset on Amazon F1 instance |
+| **`XILINX_HLS`**        |**`$(XILINX_SDX)/Vivado_HLS`** | Path to Xilinx Vivado HLS                        |
+| **`SDX_CXX`**           |**`$(XILINX_SDX)/bin/xcpp`**   | Alias for Xilinx SDx compiler                    |
+| **`XOCC`**              |**`$(XILINX_SDX)/bin/xocc`**   | Alias for Xilinx XOCC compiler                   |
+| **`XILINX_SDX_RUNTIME`**| -                             | Set automatically to run-time library of selected platform (value of **`$(AWS_PLATFORM)`**).|   
+| **`XFOPENCV`**          |**`/home/centos/src/project_data/xfopencv`** | Location of xfOpenCV library. <br/>***Note: If you place xfOpenCV library in other location than recommended (default) please update this variable!***                    |
+| **`TARGET`**            |**`hw_emu`**                   | The target flow. Override this variable with the desired target flow (**`hw/sw_emu/hw_emu`**) on the make command line |
+| **`HOST_SDx_SRC`**      |**`xcl2`**                     | List of SDx kernel driver source files which provide interaction between host and FPGA kernel on Amazon F1 instance. Originally all examples use xcl driver v.2.|
+| **`SDx_LIB_DIR`**       |**`$(SDACCEL_DIR)/examples/xilinx/libs/xcl2`**   | Path to SDx kernel driver source files |
+| **`CXXFLAGS`**          |-                              | Contains SDx compiler options. Please see default value in [`common_makefile`](common_makefile) |
+| **`LDFLAGS`**           |-                              | Contains SDx linker options. Please see default value in [`common_makefile`](common_makefile) <br/>***Note: The host application needs a specific version of the run-time shared libraries. It is important to explicitly point the linker at the needed libraries with the `-rpath` option. Keep this in mind when modifying [`common_makefile`](common_makefile).*** |
+| **`HOST_AWS_DIR`**      |**`./`**                       | Root folder of example                           |
+| **`HOST_BLD_DIR`**      |**`$(TARGET)/build/host`**     | Build folder for host application build artifacts|
+| **`HOST_RUN_DIR`**      |**`$(TARGET)/run`**            | Run folder of host application                   |
+| **`HOST_EXE`**          |**`$(HOST_RUN_DIR)/$(TEST_NAME)`** | Host application executable name with path   |
+
+### Variables for kernel part ###
+
+| Variable&nbsp;Name |Default value | Description |
+|       :-           |     :-       |      :-     |
+| **`XOCC_OPTS`**         |-                              | Contains XOCC options. Please see default value in [`common_makefile`](common_makefile) |
+| **`XOCC_INCL`**         |-                              | Contains paths to search header files. Please see default value in [`common_makefile`](common_makefile) |   
+| **`KERNEL_BLD_DIR`**    |**`$(TARGET)/build/kernel`**   | Build folder for kernel build artifacts|
+| **`KERNEL_RUN_DIR`**    |-                              | Folder to store kernel binary (`.xclbin`). Default value depends on target flow. Please see default value in [`common_makefile`](common_makefile)|
+
+
+### Makefile targets ###
+
+| Target&nbsp;label | Description |
+| :-                | :-          |
+| **`all`**         | Build host application and kernel for target flow specified by **`$(TARGET)`** variable |
+| **`host`**        | Build host application only for target flow specified by **`$(TARGET)`** variable |   
+| **`krnl`**        | Build kernel only for target flow specified by **`$(TARGET)`** variable |   
+| **`clean`**       | Clean build artifacts of the target flow specified by the **`$(TARGET)`** variable. <br/>**_Note: the afi folder of the FPGA flow ($(TARGET) == hw) is left untouched. Clean it manually if needed._** |   
+
+
+
+## REVISION HISTORY
+
+Date      | Readme Version | Release Notes
+--------  |----------------|-------------------------
+May 2018  | 1.0            | Initial version.
diff --git a/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png b/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png
new file mode 100644
index 0000000..94c4f14
Binary files /dev/null and b/aws_demo/stereopipeline/Stereo_Pipeline_Diagram.png differ
diff --git a/aws_demo/stereopipeline/cameraParameters.h b/aws_demo/stereopipeline/cameraParameters.h
new file mode 100644
index 0000000..491ace5
--- /dev/null
+++ b/aws_demo/stereopipeline/cameraParameters.h
@@ -0,0 +1,173 @@
+typedef float param_T;  // element type of every calibration table in this header
+// Calibration-set selector: each section below defines the same six arrays, so enabling more than one macro would redefine them.
+//#define OLD720PPARAMS
+//#define JACKSCAMERA_FHD
+//#define JACKSCAMERA_HACKF_720P
+//#define DANSCAMERA_FHD
+//#define DANSCAMERA_720P
+#define DANSCAMERA_HACKF_720P
+// In the set below the left and right tables are identical, distortion is zero, and irA is the plain inverse of cameraMA.
+#ifdef OLD720PPARAMS
+param_T cameraMA_l[9] = { 1000, 0.0, 950, 0.0, 1000, 950, 0, 0, 1 };
+param_T irA_l[9] = { 0.001,0,    -0.95, 0,    0.001,-0.95, 0,0,1 };
+param_T distC_l[5] = { 0,0,0,0,0 };
+param_T cameraMA_r[9] = { 1000, 0.0, 950, 0.0, 1000, 950, 0, 0, 1 };
+param_T irA_r[9] = { 0.001,0,    -0.95, 0,    0.001,-0.95, 0,0,1 };
+param_T distC_r[5] = { 0,0,0,0,0 };
+#endif
+
+
+#ifdef JACKSCAMERA_HACKF_720P  // focal lengths equal the JACKSCAMERA_FHD values scaled by 2/3; distortion coefficients match that set
+param_T cameraMA_l[9] =  // left camera matrix, 3x3 stored row by row
+{  933.6330000000,     0.0000000000,   695.1210000000,
+		0.0000000000,   933.6330000000,   357.9060000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T cameraMA_r[9] =  // right camera matrix, same layout
+{  933.0330000000,     0.0000000000,   713.4870000000,
+		0.0000000000,   933.0330000000,   371.9880000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T distC_l[5] =  // left distortion coefficients; only the first two are non-zero
+{   -0.1742480000,     0.0257726000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T distC_r[5] =  // right distortion coefficients; only the first two are non-zero
+{   -0.1761240000,     0.0290219000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T irA_l[9] =  // presumably inv(rotation * new camera matrix) for the left image, as labelled in the JACKSCAMERA_FHD section - confirm
+{    0.0012029958,     0.0000006953,    -0.8686828369,
+		-0.0000006919,     0.0012030057,    -0.4384867217,
+		0.0000049680,    -0.0000008223,     0.9967224703};
+
+param_T irA_r[9] =  // right-image counterpart of irA_l
+{    0.0012030062,     0.0000000000,    -0.8643069377,
+		-0.0000000000,     0.0012030059,    -0.4403546203,
+		0.0000000000,     0.0000008238,     0.9996986971};
+#endif
+
+
+#ifdef JACKSCAMERA_FHD
+// ZED 2x 1080p Stereo Camera
+// from SN2484.conf [LEFT_CAM_FHD] fx,cx,fy,cy
+// Left camera matrix, 3x3 stored row by row: { fx, 0, cx,  0, fy, cy,  0, 0, 1 }
+param_T cameraMA_l[9] =
+{ 1400.4500000000,     0.0000000000,  1073.2400000000,
+		0.0000000000,  1400.4500000000,   538.8110000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+// Left inverse of (rotation matrix * new camera matrix):
+param_T irA_l[9] = 
+{    0.0008019968,     0.0000004635,    -0.9044744614,
+		-0.0000004613,     0.0008020035,    -0.4432839221,
+		0.0000033120,    -0.0000005482,     0.9965779658};
+
+// from SN2484.conf [LEFT_CAM_FHD] k1,k2; the remaining three coefficients are zero
+param_T distC_l[5] = 
+{ -0.174248, 0.0257726, 0, 0, 0 };
+
+// from SN2484.conf [RIGHT_CAM_FHD] fx,cx,fy,cy
+// Right camera matrix, same layout as the left one
+param_T cameraMA_r[9] = 
+{ 1399.5500000000,     0.0000000000,  1109.9700000000,
+		0.0000000000,  1399.5500000000,   566.9750000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+// Right inverse of (rotation matrix * new camera matrix):
+param_T irA_r[9] = 
+{    0.0008020038,     0.0000000000,    -0.9000960887,
+		-0.0000000000,     0.0008020036,    -0.4451724061,
+		0.0000000000,     0.0000005492,     0.9996953980};
+
+// from SN2484.conf [RIGHT_CAM_FHD] k1,k2; the remaining three coefficients are zero
+param_T distC_r[5] = 
+{ -0.176124, 0.0290219, 0, 0, 0 };
+
+#endif
+
+#ifdef DANSCAMERA_720P  // shares its distortion coefficients with the other two DANSCAMERA sets
+param_T cameraMA_l[9] =  // left camera matrix, 3x3 stored row by row
+{  699.8780000000,     0.0000000000,   663.4510000000,
+		0.0000000000,   699.8780000000,   377.0150000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T cameraMA_r[9] =  // right camera matrix, same layout
+{  700.0990000000,     0.0000000000,   678.2970000000,
+		0.0000000000,   700.0990000000,   359.6230000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T distC_l[5] =  // left distortion coefficients; only the first two are non-zero
+{   -0.1693980000,     0.0227329000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T distC_r[5] =  // right distortion coefficients; only the first two are non-zero
+{   -0.1705810000,     0.0249444000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T irA_l[9] =  // presumably inv(rotation * new camera matrix) for the left image, as labelled in the JACKSCAMERA_FHD section - confirm
+{    0.0024720519,    -0.0000000039,    -1.7464382128,
+		0.0000000015,     0.0024721905,    -0.9765267985,
+		0.0000261810,     0.0000002197,     0.9814731201};
+
+param_T irA_r[9] =  // right-image counterpart of irA_l
+{    0.0024721905,     0.0000000000,    -1.7359468834,
+		-0.0000000000,     0.0024721905,    -0.9763479760,
+		-0.0000000000,    -0.0000002197,     1.0000867727};
+#endif
+
+#ifdef DANSCAMERA_HACKF_720P  // active set: DANSCAMERA_720P principal point and distortion, with the DANSCAMERA_FHD focal lengths scaled by 2/3
+param_T cameraMA_l[9] =  // left camera matrix, 3x3 stored row by row
+{  933.1730000000,     0.0000000000,   663.4510000000,
+		0.0000000000,   933.1730000000,   377.0150000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T cameraMA_r[9] =  // right camera matrix, same layout
+{  933.4670000000,     0.0000000000,   678.2970000000,
+		0.0000000000,   933.4670000000,   359.6230000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T distC_l[5] =  // left distortion coefficients; only the first two are non-zero
+{   -0.1693980000,     0.0227329000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T distC_r[5] =  // right distortion coefficients; only the first two are non-zero
+{   -0.1705810000,     0.0249444000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T irA_l[9] =  // presumably inv(rotation * new camera matrix) for the left image, as labelled in the JACKSCAMERA_FHD section - confirm
+{    0.0011976323,    -0.0000000019,    -0.8153011732,
+		0.0000000007,     0.0011976994,    -0.4422348617,
+		0.0000126839,     0.0000001064,     0.9913820905};
+
+param_T irA_r[9] =  // right-image counterpart of irA_l
+{    0.0011976994,     0.0000000000,    -0.8047567905,
+		-0.0000000000,     0.0011976994,    -0.4420566166,
+		-0.0000000000,    -0.0000001064,     1.0000392898};
+#endif
+
+
+#ifdef DANSCAMERA_FHD  // shares its distortion coefficients with the other two DANSCAMERA sets
+param_T cameraMA_l[9] =  // left camera matrix, 3x3 stored row by row
+{ 1399.7600000000,     0.0000000000,  1009.9000000000,
+		0.0000000000,  1399.7600000000,   577.0300000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T cameraMA_r[9] =  // right camera matrix, same layout
+{ 1400.2000000000,     0.0000000000,  1039.5900000000,
+		0.0000000000,  1400.2000000000,   542.2460000000,
+		0.0000000000,     0.0000000000,     1.0000000000};
+
+param_T distC_l[5] =  // left distortion coefficients; only the first two are non-zero
+{   -0.1693980000,     0.0227329000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T distC_r[5] =  // right distortion coefficients; only the first two are non-zero
+{   -0.1705810000,     0.0249444000,     0.0000000000,     0.0000000000,     0.0000000000};
+
+param_T irA_l[9] =  // presumably inv(rotation * new camera matrix) for the left image, as labelled in the JACKSCAMERA_FHD section - confirm
+{    0.0007984219,    -0.0000000012,    -0.8338509656,
+		0.0000000005,     0.0007984666,    -0.4484861710,
+		0.0000084559,     0.0000000710,     0.9911850779};
+
+param_T irA_r[9] =  // right-image counterpart of irA_l
+{    0.0007984666,     0.0000000000,    -0.8233076329,
+		-0.0000000000,     0.0007984666,    -0.4483079145,
+		-0.0000000000,    -0.0000000710,     1.0000398454};
+#endif
+
+
+
diff --git a/aws_demo/stereopipeline/hw/afi/gen_afi.sh b/aws_demo/stereopipeline/hw/afi/gen_afi.sh
new file mode 100644
index 0000000..d3ec1e7
--- /dev/null
+++ b/aws_demo/stereopipeline/hw/afi/gen_afi.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+echo aws s3 rm --recursive s3://xfsp
+aws s3 rm --recursive s3://xfsp
+
+echo aws s3 rb s3://xfsp
+aws s3 rb s3://xfsp
+
+
+echo aws s3 mb s3://xfsp
+aws s3 mb s3://xfsp
+
+aws s3 mb s3://xfsp/dcp
+touch FILES_GO_HERE.txt
+aws s3 cp FILES_GO_HERE.txt s3://xfsp/dcp/
+
+
+aws s3 mb s3://xfsp/log
+touch LOGS_FILES_GO_HERE.txt
+aws s3 cp LOGS_FILES_GO_HERE.txt s3://xfsp/log/  
+
+aws s3 ls --recursive s3://xfsp
+
+rm -f FILES_GO_HERE.txt
+rm -f LOGS_FILES_GO_HERE.txt
+
+$SDACCEL_DIR/tools/create_sdaccel_afi.sh -xclbin=xf_stereo_pipeline.xclbin -s3_bucket=xfsp -s3_dcp_key=dcp -s3_logs_key=log
+
+cat *afi_id*
+
+echo "use following command to check afi ready"
+echo "aws ec2 describe-fpga-images --fpga-image-id <afi id>"
diff --git a/aws_demo/stereopipeline/hw/run/run.sh b/aws_demo/stereopipeline/hw/run/run.sh
new file mode 100644
index 0000000..08c6dc4
--- /dev/null
+++ b/aws_demo/stereopipeline/hw/run/run.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+source /opt/Xilinx/SDx/2017.1.rte.4ddr/setup.sh
+
+./stereo_pipeline_test ../../left.png ../../right.png
diff --git a/aws_demo/stereopipeline/hw_emu/run/run.sh b/aws_demo/stereopipeline/hw_emu/run/run.sh
new file mode 100644
index 0000000..277ae3d
--- /dev/null
+++ b/aws_demo/stereopipeline/hw_emu/run/run.sh
@@ -0,0 +1,5 @@
+emconfigutil -f $AWS_PLATFORM  # presumably generates the emulation configuration for the platform named by $AWS_PLATFORM - confirm against SDx docs
+# Select the hardware-emulation mode through XCL_EMULATION_MODE.
+export XCL_EMULATION_MODE=hw_emu
+# Run the host test on the input images in the example root, two directories above this run folder.
+./stereo_pipeline_test ../../left.png ../../right.png
diff --git a/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini b/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini
new file mode 100644
index 0000000..63a1cac
--- /dev/null
+++ b/aws_demo/stereopipeline/hw_emu/run/sdaccel.ini
@@ -0,0 +1,5 @@
+[Debug]
+timeline_trace=true
+device_profile=true
+app_debug=true
+profile=true
diff --git a/aws_demo/stereopipeline/left.png b/aws_demo/stereopipeline/left.png
new file mode 100644
index 0000000..2819082
Binary files /dev/null and b/aws_demo/stereopipeline/left.png differ
diff --git a/aws_demo/stereopipeline/makefile b/aws_demo/stereopipeline/makefile
new file mode 100644
index 0000000..1eabcfc
--- /dev/null
+++ b/aws_demo/stereopipeline/makefile
@@ -0,0 +1,22 @@
+########################################
+#                                      #
+#             Host section             #
+#                                      #
+########################################
+# Name of the host executable (make_description.md: created in the run folder on a successful build).
+TEST_NAME = stereo_pipeline_test
+# Host source files located in the example root, listed without file extension.
+HOST_AWS_SRC +=  xf_stereo_pipeline_accel_aws
+HOST_AWS_SRC +=  xf_stereo_pipeline_tb
+
+########################################
+#                                      #
+#            Kernel section            #
+#                                      #
+########################################
+# Kernel name; make_description.md requires it to match the kernel source file name without extension.
+KERNEL = xf_stereo_pipeline
+
+########################################
+# Pull in the makefile shared by all examples in aws_demo.
+include ../common_makefile
diff --git a/aws_demo/stereopipeline/readme.md b/aws_demo/stereopipeline/readme.md
new file mode 100644
index 0000000..42ace91
--- /dev/null
+++ b/aws_demo/stereopipeline/readme.md
@@ -0,0 +1,198 @@
+# Stereo Pipeline #
+
+Disparity map generation is one of the first steps in creating a three dimensional map of the environment. The xfOpenCV library has components to build an image processing pipeline to compute a disparity map given the camera parameters and inputs from a stereo camera setup.
+
+This example demonstrates how to use the **`xf::InitUndistortRectifyMapInverse()`**, **`xf::remap()`** and **`xf::StereoBM()`** functions of the xfOpenCV library in a pipeline to compute a disparity map. The example is designed to process a single image set (one image from each of the left and right cameras). If you would like to process many image sets in a loop, you need to move the FPGA and kernel initialization and finalization operations out of the kernel interface wrapper and into the host application, before and after the processing loop respectively.
+
+## Code structure ##
+
+![](./../Code_Structure.png)
+
+| Component | Source files |
+| :-        | :-           |
+| *Kernel&nbsp;Configuration*          |**`xf_stereo_pipeline_config.h`**<br/>**`xf_config_params.h`**<br/>**`cameraParameters.h`**|
+| *Host&nbsp;Application*              |**`xf_stereo_pipeline_tb.cpp`**|
+| *Kernel&nbsp;Interface&nbsp;Wrapper* |**`xf_stereo_pipeline_accel_aws.cpp`**|
+| *Kernel&nbsp;Driver*                 |**`xcl2.cpp (in SDx library)`**|
+| *Kernel*                             |**`xf_stereo_pipeline_kernel_aws.cpp`**|
+
+## Kernel Configuration #
+
+Following constants in header files define kernel configuration
+
+| Constant | Possible values | Default Value | Description |
+| :-       | :-              | :-            | :-          |
+| **`XF_WIDTH`**             |**`> 0`**|**`1280`**|Maximum width of input image|
+| **`XF_HEIGHT`**            |**`> 0`**|**`720`**|Maximum height of input image|
+| **`XF_CAMERA_MATRIX_SIZE`**|**`9`**|**`9`**|Number of element in camera coordinate system matrix (9 == 3 x 3 matrix)|
+| **`XF_DIST_COEFF_SIZE`**   |**`4, 5, 8`**|**`5`**|Size of array with distortion coefficients|
+| **`NO_OF_DISPARITIES`**    |**`0 < value < XF_WIDTH &&`**<br/>**`value >= PARALLEL_UNITS &&`**<br/>**`value % PARALLEL_UNITS == 0`**| **`48`**| Number of disparities |
+| **`PARALLEL_UNITS`**       |**` > 0`**|**`16`**|Number of disparities to be computed in parallel|
+| **`SAD_WINDOW_SIZE`**      |**`value % 2 == 1 &&`**<br/>**`< min(image height, image width) &&`**<br/>**`< 21`**|**`15`**|Size of the window used for disparity computation|
+| **`XF_REMAP_BUFSIZE`**     |**`> 0`**|**`128`** |Number of input image rows to be buffered inside **`xf::remap()`**|
+
+## Host Application ##
+Host application reads two test images from file (images of left and right cameras) and forward them with cameras matrices, distortion coefficients and transformation matrices predefined in **`cameraParameters.h`** to the kernel for disparity map computation.
+
+The example's input images, **_left.png_** and **_right.png_**, are placed in the root folder of the example. The disparity map calculated by the kernel is written as an image to **_hls_output.png_**.
+
+
+## Kernel Interface Wrapper ##
+
+When working with the xfOpenCV library, it is convenient for the host application to use the xf::Mat or cv::Mat class and the image manipulation functions. Unfortunately, the XOCC kernel compiler doesn't support classes/structures as kernel input/output parameters, so a wrapper is needed to pass an xf::Mat to a kernel. The kernel interface wrapper converts the interface that is convenient for the host application into the kernel interface available on the Amazon F1 instance.
+
+In this example the kernel interface wrapper also performs FPGA initialization, kernel downloading, initialization and finalization.
+
+
+| Parameter&nbsp;Name |Direction|Type | Description |
+| :-                  | :-      | :-  | :-          |
+| **`xf_img_l`**      |Input | **`xf::Mat<XF_8UC1, XF_HEIGHT,`**<br/>**`XF_WIDTH, XF_NPPC1> &`**  | Input image from left camera |
+| **`xf_img_r`**      |Input | **`xf::Mat<XF_8UC1, XF_HEIGHT,`**<br/>**`XF_WIDTH, XF_NPPC1> &`**  | Input image from right camera |
+| **`xf_img_d`**      |Output| **`xf::Mat<XF_16UC1, XF_HEIGHT,`**<br/>**` XF_WIDTH, XF_NPPC1> &`**| Output disparity map |
+| **`bm_state`**      |Input | **`xf::xFSBMState<SAD_WINDOW_SIZE,`**<br/>**`NO_OF_DISPARITIES, PARALLEL_UNITS> &`**       | Set of various parameters regarding the stereo block matching algorithm |
+| **`cameraMA_l_fix`**|Input | **`ap_fixed<32,12>`**| Left  camera parameters matrix        |
+| **`cameraMA_r_fix`**|Input | **`ap_fixed<32,12>`**| Right camera parameters matrix        |
+| **`distC_l_fix`**   |Input | **`ap_fixed<32,12>`**| Left  image distortion coefficients   |
+| **`distC_r_fix`**   |Input | **`ap_fixed<32,12>`**| Right image distortion coefficients   |
+| **`irA_l_fix`**     |Input | **`ap_fixed<32,12>`**| Left  image transformation matrix     |
+| **`irA_r_fix`**     |Input | **`ap_fixed<32,12>`**| Right image transformation matrix     |
+| **`cm_size`**       |Input | **`int`**            | Size of camera parameters matrix      |
+| **`dc_size`**       |Input | **`int`**            | Size of distortion coefficients array |
+
+To forward these parameters to the kernel, the wrapper creates 10 buffers in global memory for the image data, the disparity map and the transformation parameter matrices and arrays. The wrapper decomposes the **`xf_img_l`**, **`xf_img_r`**, **`xf_img_d`** and **`bm_state`** objects and passes their members separately. The wrapper does not transfer every member of **`bm_state`** to the kernel: most members are either fixed by the template parameters or calculated from other members, so the kernel can fully restore the host-side **`bm_state`** from the template and a restricted set of values. The wrapper forwards the following members of **`bm_state`** as regular parameters of type **`int`**: **`preFilterType`**, **`preFilterCap`**, **`minDisparity`**, **`textureThreshold`**, **`uniquenessRatio`**.
+
+
+## Kernel Driver ###
+
+The example uses a modification of the SDx xcl kernel driver v.2 for the Amazon F1 instance. The source code and description of this driver can be found in the Amazon aws-fpga framework.
+
+## Kernel ##
+
+To calculate the disparity map, the kernel pipelines functions from xfOpenCV as shown in the image below.<br/>
+
+![](./Stereo_Pipeline_Diagram.png)
+
+The kernel has following parameters:
+
+| Parameter&nbsp;Name |Direction|Type | Description |
+| :-                  | :-      | :-  | :-          |
+| **`img_l`**         |Input | **`XF_TNAME(XF_8UC1, XF_NPPC1) *`** | Pointer to input image buffer from left  camera       |
+| **`img_r`**         |Input | **`XF_TNAME(XF_8UC1, XF_NPPC1) *`** | Pointer to input image buffer from right camera       |
+| **`cameraMA_l_fix`**|Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with left  camera parameters matrix |
+| **`cameraMA_r_fix`**|Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with right camera parameters matrix |
+| **`distC_l_fix`**   |Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with left  image distortion coefficients |
+| **`distC_r_fix`**   |Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with right image distortion coefficients |
+| **`irA_l_fix`**     |Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with left  image transformation matrix   |
+| **`irA_r_fix`**     |Input | **`ap_fixed<32,12> *`**             | Pointer to buffer with right image transformation matrix   |
+| **`img_d`**         |Output| **`XF_TNAME(XF_16UC1, XF_NPPC1) *`**| Pointer to buffer for output disparity map                 |
+| **`preFilterType`**<br/>**`preFilterCap`**<br/>**`minDisparity`**<br/>**`textureThreshold`**<br/>**`uniquenessRatio`**<br/>| Input | **`int`**|Restricted set members of **`xf::xFSBMState<...>`** structure which have arbitrary values. This set is enough to restore values of all member of **`xf::xFSBMState<...>`** same as on host side|
+| **`cm_size`**       |Input | **`int`**                           | Size of camera parameters matrix     |
+| **`dc_size`**       |Input | **`int`**                           | Size of distortion coefficients array|
+| **`rows`**          |Input | **`int`**                           | Height of images and disparity map|
+| **`cols`**          |Input | **`int`**                           | Width  of images and disparity map |
+
+
+
+During synthesis for the FPGA, the kernel's parameters must be mapped to HW interfaces supported on the Amazon F1 instance. Use the **`HLS INTERFACE`** pragma to map kernel parameters. The following interfaces are supported: **`m_axi`** and **`s_axilite`**. For **`m_axi`**, the offset can be set through an **`s_axilite`** port only.
+
+Because functions from the xfOpenCV library operate on the **`xf::Mat`** class as the image container, the kernel's parameters must be packed back into objects of this class. To do this, you need to do the following: 
+
+- Declare **`xf::Mat`** variable <br/> ***Note: due to XOCC issues use default constructor only - do not try initialize class members with help of non-default constructors***
+- Assign image size to **`rows`** and **`cols`** members
+- Copy image from input buffer to **`data`** member of **`xf::Mat`** or from **`data`** to output buffer
+
+```cpp
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_l;
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_r;
+
+xf_img_l.rows = rows; xf_img_l.cols = cols;
+xf_img_r.rows = rows; xf_img_r.cols = cols;
+
+for(int i=0; i < rows; i++)
+  {
+    #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS
+
+    for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++)
+      {
+        #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC
+        #pragma HLS PIPELINE
+        #pragma HLS loop_flatten off
+
+        *(xf_img_l.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_l + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j);
+        *(xf_img_r.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_r + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j);         
+      }
+  }
+```
+**Note: `#pragma HLS` doesn't support constants defined through the `#define` directive - use `const int` instead. In the code above `pROWS`, `pCOLS` and `pNPC` are `const int` variables which take their values from constants defined in xf_stereo_pipeline_config.h with the `#define` directive.**
+
+```cpp
+const int pROWS = XF_HEIGHT;
+const int pCOLS = XF_WIDTH;
+const int pNPC  = XF_NPPC1;
+```
+
+A plain declaration of an **`xf::Mat`** object creates a buffer that stores a whole image of the maximum defined size. This buffer uses FPGA internal memory blocks, and even big FPGA devices may not have enough resources for it. Use **`#pragma HLS stream`** to ask HLS to convert the big RAM buffer into a small FIFO buffer.
+
+```cpp
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_l;
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_r;
+
+#pragma HLS stream variable=xf_img_l.data  depth=pCOLS/pNPC 
+#pragma HLS stream variable=xf_img_r.data depth=pCOLS/pNPC 
+```
+
+Please note that **`#pragma HLS stream`** is meant to be used inside a dataflow block; therefore the kernel body should be declared as dataflow. This also permits pipelining of functions from the xfOpenCV library.
+
+```cpp
+void kernel(...)
+{
+  #pragma HLS INTERFACE ...
+  #pragma HLS INTERFACE ...
+  
+  #pragma HLS dataflow
+  ...
+}
+```
+
+## Known Issues
+
+- #### Kernel can't accept class/structure as parameters
+**Solution**: use simple types, pass class/structure members as separate parameters of simple types and compose class/structure object back inside kernel.
+
+- #### Using non-default constructors can cause kernel suspension on FPGA and HW emulation
+**Solution**: use default constructor for object declaration and next assign desired values to the members separately.
+
+```cpp
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_l;
+xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_r;
+
+xf_img_l.rows = rows; xf_img_l.cols = cols;
+xf_img_r.rows = rows; xf_img_r.cols = cols;
+```
+
+- #### **`#pragma HLS`** doesn't support constants defined through **`#define`** directive.
+**Solution**: use **`const int`** instead
+
+
+```cpp
+#define XF_HEIGHT 720
+
+void kernel(...)
+{
+  const int pROWS = XF_HEIGHT;
+
+  for(int i=0; i < rows; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS
+      ...
+    }
+  ...
+}
+```
+
+
+## Revision History
+
+Date      | Readme Version | Release Notes
+--------  |----------------|-------------------------
+May 2018  | 1.0            | Initial version.
diff --git a/aws_demo/stereopipeline/right.png b/aws_demo/stereopipeline/right.png
new file mode 100644
index 0000000..4e31067
Binary files /dev/null and b/aws_demo/stereopipeline/right.png differ
diff --git a/aws_demo/stereopipeline/sw_emu/run/run.sh b/aws_demo/stereopipeline/sw_emu/run/run.sh
new file mode 100644
index 0000000..d75a02e
--- /dev/null
+++ b/aws_demo/stereopipeline/sw_emu/run/run.sh
@@ -0,0 +1,5 @@
+emconfigutil -f $AWS_PLATFORM  # presumably generates the emulation configuration for the platform named by $AWS_PLATFORM - confirm against SDx docs
+# Select the software-emulation mode through XCL_EMULATION_MODE.
+export XCL_EMULATION_MODE=sw_emu
+# Run the host test on the input images in the example root, two directories above this run folder.
+./stereo_pipeline_test ../../left.png ../../right.png
\ No newline at end of file
diff --git a/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini b/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini
new file mode 100644
index 0000000..63a1cac
--- /dev/null
+++ b/aws_demo/stereopipeline/sw_emu/run/sdaccel.ini
@@ -0,0 +1,5 @@
+[Debug]
+timeline_trace=true
+device_profile=true
+app_debug=true
+profile=true
diff --git a/aws_demo/stereopipeline/xf_config_params.h b/aws_demo/stereopipeline/xf_config_params.h
new file mode 100644
index 0000000..3f56cdc
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_config_params.h
@@ -0,0 +1,11 @@
+/* NO_OF_DISPARITIES must be greater than 0 and less than the image width */
+#define NO_OF_DISPARITIES	48
+
+/* NO_OF_DISPARITIES must not be less than PARALLEL_UNITS, and NO_OF_DISPARITIES must be an integer multiple of PARALLEL_UNITS */
+#define PARALLEL_UNITS		16
+
+/* SAD window size must be an odd number, less than the smaller of the image height and width, and less than the tested size of 21 */
+#define SAD_WINDOW_SIZE		15
+
+// Configure this according to the number of rows the Remap function needs
+#define XF_REMAP_BUFSIZE   128
diff --git a/aws_demo/stereopipeline/xf_headers.h b/aws_demo/stereopipeline/xf_headers.h
new file mode 100644
index 0000000..793f8da
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_headers.h
@@ -0,0 +1,56 @@
+/***************************************************************************
+ Copyright (c) 2016, Xilinx, Inc.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ***************************************************************************/
+#ifndef _XF_HEADERS_H_
+#define _XF_HEADERS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#undef __ARM_NEON__
+#undef __ARM_NEON
+#include "opencv/cv.h"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/video/tracking.hpp"
+#define __ARM_NEON__
+#define __ARM_NEON
+
+
+#if __SDSCC__  // timing helpers based on sds_clock_counter(); only defined when __SDSCC__ is set
+#include "sds_lib.h"
+#define TIME_STAMP_INIT  unsigned long long clock_start, clock_end;  clock_start = sds_clock_counter();  // declares the counters (full counter width) and starts timing
+#define TIME_STAMP  { clock_end = sds_clock_counter(); printf("elapsed time %llu \n", clock_end-clock_start); clock_start = sds_clock_counter();  }  // prints the ticks since the last stamp, then restarts timing
+#endif
+
+#include "common/xf_sw_utils.h"
+
+#endif//_XF_HEADERS_H_
+
diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp
new file mode 100644
index 0000000..1bae1c1
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_stereo_pipeline_accel_aws.cpp
@@ -0,0 +1,110 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <vector>
+
+#include "xcl2.hpp" 
+
+#include "xf_stereo_pipeline_config.h"
+
+typedef xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> xf_BMState;
+// OpenCL only defines a named flag for migration to the host (CL_MIGRATE_MEM_OBJECT_HOST); passing no flag migrates to the device.
+#define CL_MIGRATE_MEM_OBJECT_KERNEL 0       // readable "counterpart" name for the flag-less, host-to-device direction
+
+void stereo_pipeline_accel
+  (
+    //                      Left                              |                       Right
+    xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_l, xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_r, 
+
+    xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_d,
+
+    xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> &bm_state, 
+
+    ap_fixed<32,12> *cameraMA_l_fix                           , ap_fixed<32,12> *cameraMA_r_fix, 
+    ap_fixed<32,12> *distC_l_fix                              , ap_fixed<32,12> *distC_r_fix   , 
+    ap_fixed<32,12> *irA_l_fix                                , ap_fixed<32,12> *irA_r_fix     , 
+
+    int cm_size, 
+    int dc_size
+  )
+{
+    // Host-side wrapper around the "xf_stereo_pipeline" kernel: loads the kernel binary onto the first Xilinx
+    // device, uploads both images and the calibration arrays, runs the kernel once and reads xf_img_d back.
+    std::vector<cl::Device> devices = xcl::get_xil_devices();
+
+    if (devices.empty())   // devices[0] below would be undefined behaviour on an empty list
+      {
+        fprintf(stderr, "stereo_pipeline_accel: no Xilinx device found\n");
+        exit(EXIT_FAILURE);
+      }
+
+    cl::Device device = devices[0];
+    cl::Context context(device);
+    cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
+
+    // Both emulation modes load the .xclbin container; otherwise the .awsxclbin file is loaded.
+    std::string binaryFile = (xcl::is_emulation() || xcl::is_hw_emulation ()) ? "xf_stereo_pipeline.xclbin" : "xf_stereo_pipeline.awsxclbin";
+
+    std::cout << "======== " <<  binaryFile << " ========" << std::endl;
+
+    cl::Program::Binaries bins = xcl::import_binary_file(binaryFile);
+    devices.resize(1);
+    cl::Program program(context, devices, bins);
+    cl::Kernel kernel(program,"xf_stereo_pipeline");
+
+    //----------- Allocate Buffer in Global Memory -----------//
+    int rows = xf_img_l.rows;
+    int cols = xf_img_l.cols;
+
+    int pixel_qnt = rows * cols;
+
+    // Buffer sizes in bytes, derived from the element types rather than from hard-coded 1 / 2 / 4 factors.
+    const size_t in_bytes  = pixel_qnt * sizeof(*xf_img_l.data);
+    const size_t out_bytes = pixel_qnt * sizeof(*xf_img_d.data);
+    const size_t cm_bytes  = cm_size   * sizeof(ap_fixed<32,12>);   // also used for the irA_* arrays
+    const size_t dc_bytes  = dc_size   * sizeof(ap_fixed<32,12>);
+
+    const cl_mem_flags rd_flags = CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY;    // buffers the kernel reads
+    const cl_mem_flags wr_flags = CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY;   // buffer the kernel writes
+
+    cl::Buffer buffer_l   (context, rd_flags, in_bytes , (void*)xf_img_l.data );
+    cl::Buffer buffer_r   (context, rd_flags, in_bytes , (void*)xf_img_r.data );
+    cl::Buffer buffer_cm_l(context, rd_flags, cm_bytes , (void*)cameraMA_l_fix);
+    cl::Buffer buffer_cm_r(context, rd_flags, cm_bytes , (void*)cameraMA_r_fix);
+    cl::Buffer buffer_dc_l(context, rd_flags, dc_bytes , (void*)distC_l_fix   );
+    cl::Buffer buffer_dc_r(context, rd_flags, dc_bytes , (void*)distC_r_fix   );
+    cl::Buffer buffer_ir_l(context, rd_flags, cm_bytes , (void*)irA_l_fix     );
+    cl::Buffer buffer_ir_r(context, rd_flags, cm_bytes , (void*)irA_r_fix     );
+    cl::Buffer buffer_d   (context, wr_flags, out_bytes, xf_img_d.data);
+
+    std::vector<cl::Memory> kernel_wr_buf;
+    kernel_wr_buf.push_back(buffer_l   );               kernel_wr_buf.push_back(buffer_r   );
+    kernel_wr_buf.push_back(buffer_cm_l);               kernel_wr_buf.push_back(buffer_cm_r);
+    kernel_wr_buf.push_back(buffer_dc_l);               kernel_wr_buf.push_back(buffer_dc_r);
+    kernel_wr_buf.push_back(buffer_ir_l);               kernel_wr_buf.push_back(buffer_ir_r);
+
+    //----------- Migrate  input data to device global memory -----------//
+
+    q.enqueueMigrateMemObjects(kernel_wr_buf, CL_MIGRATE_MEM_OBJECT_KERNEL);
+
+    // Argument order follows the kernel signature: eight input buffers, the output buffer, then the scalars.
+    auto krnl = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                  int, int, int, int, int,   // preFilterType, preFilterCap, minDisparity, textureThreshold, uniquenessRatio
+                                  int, int, int, int         // cm_size, dc_size, rows, cols
+                                 >(kernel);
+
+    //----------- Launch the Kernel -----------//
+
+    krnl(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),
+         buffer_l, buffer_r, buffer_cm_l, buffer_cm_r, buffer_dc_l, buffer_dc_r, buffer_ir_l, buffer_ir_r, buffer_d,
+         bm_state.preFilterType, bm_state.preFilterCap, bm_state.minDisparity, bm_state.textureThreshold, bm_state.uniquenessRatio,
+         cm_size, dc_size, rows, cols);
+
+    //----------- Copy Result from Device Global Memory to Host Local Memory -----------//
+
+    std::vector<cl::Memory> kernel_rd_buf;
+    kernel_rd_buf.push_back(buffer_d);
+
+    q.enqueueMigrateMemObjects(kernel_rd_buf, CL_MIGRATE_MEM_OBJECT_HOST);
+
+    q.finish();
+}
diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_config.h b/aws_demo/stereopipeline/xf_stereo_pipeline_config.h
new file mode 100644
index 0000000..ec9eefc
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_stereo_pipeline_config.h
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2016, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ***************************************************************************/
+
+#ifndef _XF_STEREO_PIPELINE_CONFIG_H_
+#define _XF_STEREO_PIPELINE_CONFIG_H_
+
+#include "hls_stream.h"
+
+#include "common/xf_common.h"
+#include "common/xf_utility.h"
+
+#include "xf_config_params.h"
+
+
+/* config width and height */
+#define XF_HEIGHT  720
+#define XF_WIDTH   1280
+
+#define XF_CAMERA_MATRIX_SIZE 9
+#define XF_DIST_COEFF_SIZE 5
+
+
+void stereo_pipeline_accel   // host-side entry point that runs the stereo pipeline kernel through OpenCL
+  (
+    //                      Left                              |                       Right
+    xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_l, xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_r, 
+    // Output image written by the accelerator (16-bit elements)
+    xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &xf_img_d,
+    // Block-matching settings forwarded to the kernel
+    xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> &bm_state, 
+    // Fixed-point calibration arrays: cameraMA_* and irA_* hold cm_size elements, distC_* hold dc_size elements
+    ap_fixed<32,12> *cameraMA_l_fix                           , ap_fixed<32,12> *cameraMA_r_fix, 
+    ap_fixed<32,12> *distC_l_fix                              , ap_fixed<32,12> *distC_r_fix   , 
+    ap_fixed<32,12> *irA_l_fix                                , ap_fixed<32,12> *irA_r_fix     , 
+    // Element counts of the calibration arrays above
+    int cm_size, 
+    int dc_size
+  );
+
+
+#endif // _XF_STEREO_PIPELINE_CONFIG_H_
+
diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp
new file mode 100644
index 0000000..2080d2d
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_stereo_pipeline_kernel_aws.cpp
@@ -0,0 +1,216 @@
+//Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "xf_stereo_pipeline_config.h"
+
+#include "imgproc/xf_stereo_pipeline.hpp"
+#include "imgproc/xf_remap.hpp"
+#include "imgproc/xf_stereoBM.hpp"
+
+
+extern "C"   // C linkage: the host creates the kernel by its plain name "xf_stereo_pipeline"
+  {                                                      
+    void xf_stereo_pipeline                            
+      (                                     
+        //           Left                 |              Right
+        XF_TNAME(XF_8UC1, XF_NPPC1) *img_l,  XF_TNAME(XF_8UC1, XF_NPPC1) *img_r,       
+        // calibration arrays, one set per camera
+        ap_fixed<32,12> *cameraMA_l_fix   ,  ap_fixed<32,12> *cameraMA_r_fix,                                                     
+        ap_fixed<32,12> *distC_l_fix      ,  ap_fixed<32,12> *distC_r_fix   ,                                                     
+        ap_fixed<32,12> *irA_l_fix        ,  ap_fixed<32,12> *irA_r_fix     ,                                                     
+        // output image
+        XF_TNAME(XF_16UC1, XF_NPPC1) *img_d ,
+        // block-matching settings
+        int preFilterType,
+        int preFilterCap,
+        int minDisparity,
+        int textureThreshold,
+        int uniquenessRatio,
+        // element counts of the calibration arrays
+        int cm_size, 
+        int dc_size,                   
+        // run-time image size
+        int rows,                                     
+        int cols                                      
+      );                                                                                                                                                        
+  }
+
+void xf_stereo_pipeline   // kernel top: copy in -> map init + remap (left, right) -> StereoBM -> copy out
+  (    
+    //           Left                 |              Right                                              
+    XF_TNAME(XF_8UC1, XF_NPPC1) *img_l,  XF_TNAME(XF_8UC1 , XF_NPPC1) *img_r,      
+    // Calibration arrays per camera, passed on to xf::InitUndistortRectifyMapInverse
+    ap_fixed<32,12> *cameraMA_l_fix   ,  ap_fixed<32,12> *cameraMA_r_fix,                                                     
+    ap_fixed<32,12> *distC_l_fix      ,  ap_fixed<32,12> *distC_r_fix   ,                                                     
+    ap_fixed<32,12> *irA_l_fix        ,  ap_fixed<32,12> *irA_r_fix     ,                                                     
+    // Output image produced by xf::StereoBM (16-bit elements)
+    XF_TNAME(XF_16UC1, XF_NPPC1) *img_d,
+    // Block-matching settings, copied into the local bm_state below
+    int preFilterType,
+    int preFilterCap,
+    int minDisparity,
+    int textureThreshold,
+    int uniquenessRatio,
+    // Sizes forwarded together with the calibration arrays
+    int cm_size, 
+    int dc_size,                  
+    // Run-time image size; used as loop bounds and as rows/cols of every local xf::Mat
+    int rows,                                    
+    int cols                                     
+  )
+{
+  #pragma HLS INTERFACE m_axi     port=img_l      offset=slave bundle=gmem_i_l
+  #pragma HLS INTERFACE m_axi     port=img_r      offset=slave bundle=gmem_i_r
+  // m_axi ports: each image has its own bundle; the calibration arrays share gmem_l (left) and gmem_r (right)
+  #pragma HLS INTERFACE m_axi     port=cameraMA_l_fix    offset=slave bundle=gmem_l
+  #pragma HLS INTERFACE m_axi     port=cameraMA_r_fix    offset=slave bundle=gmem_r
+
+  #pragma HLS INTERFACE m_axi     port=distC_l_fix       offset=slave bundle=gmem_l
+  #pragma HLS INTERFACE m_axi     port=distC_r_fix       offset=slave bundle=gmem_r
+
+  #pragma HLS INTERFACE m_axi     port=irA_l_fix         offset=slave bundle=gmem_l
+  #pragma HLS INTERFACE m_axi     port=irA_r_fix         offset=slave bundle=gmem_r
+
+  #pragma HLS INTERFACE m_axi     port=img_d             offset=slave bundle=gmem_s
+
+  // Every pointer, every scalar argument and the return are additionally placed on the s_axilite bundle "control"
+  #pragma HLS INTERFACE s_axilite     port=img_l              bundle=control
+  #pragma HLS INTERFACE s_axilite     port=img_r              bundle=control
+
+  #pragma HLS INTERFACE s_axilite     port=cameraMA_l_fix     bundle=control
+  #pragma HLS INTERFACE s_axilite     port=cameraMA_r_fix     bundle=control
+
+  #pragma HLS INTERFACE s_axilite     port=distC_l_fix        bundle=control
+  #pragma HLS INTERFACE s_axilite     port=distC_r_fix        bundle=control
+
+  #pragma HLS INTERFACE s_axilite     port=irA_l_fix          bundle=control
+  #pragma HLS INTERFACE s_axilite     port=irA_r_fix          bundle=control
+
+
+  #pragma HLS INTERFACE s_axilite     port=img_d              bundle=control
+
+
+  #pragma HLS INTERFACE s_axilite port=preFilterType     bundle=control
+  #pragma HLS INTERFACE s_axilite port=preFilterCap      bundle=control
+  #pragma HLS INTERFACE s_axilite port=minDisparity      bundle=control
+  #pragma HLS INTERFACE s_axilite port=textureThreshold  bundle=control
+  #pragma HLS INTERFACE s_axilite port=uniquenessRatio   bundle=control
+
+  #pragma HLS INTERFACE s_axilite port=cm_size           bundle=control
+  #pragma HLS INTERFACE s_axilite port=dc_size           bundle=control
+
+  #pragma HLS INTERFACE s_axilite port=rows              bundle=control
+  #pragma HLS INTERFACE s_axilite port=cols              bundle=control
+
+  #pragma HLS INTERFACE s_axilite port=return            bundle=control
+
+  // The body below is marked as a non-inlined dataflow region
+  #pragma HLS INLINE OFF
+  #pragma HLS dataflow
+
+  // The #define limits are mirrored in const ints because the pragmas below do not accept #define constants
+  const int pROWS = XF_HEIGHT;
+  const int pCOLS = XF_WIDTH ;
+
+  const int pNPC  = XF_NPPC1;
+
+  xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_l;   // don't use non default constructor xf::Mat<...> xf_img_l(rows, cols) - kernel will suspend on hw emulation and FPGA
+  xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_r;
+  // The data member of every local xf::Mat is declared as an HLS stream of depth pCOLS/pNPC
+  #pragma HLS stream variable=xf_img_l.data  depth=pCOLS/pNPC 
+  #pragma HLS stream variable=xf_img_r.data depth=pCOLS/pNPC 
+
+
+  xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_d;
+
+  #pragma HLS stream variable=xf_img_d.data  depth=pCOLS/pNPC 
+
+  xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_map_x_l;
+  xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_map_y_l;
+
+  #pragma HLS stream variable=xf_map_x_l.data depth=pCOLS/pNPC 
+  #pragma HLS stream variable=xf_map_y_l.data depth=pCOLS/pNPC 
+
+  xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_map_x_r;
+  xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_map_y_r;
+
+  #pragma HLS stream variable=xf_map_x_r.data depth=pCOLS/pNPC 
+  #pragma HLS stream variable=xf_map_y_r.data depth=pCOLS/pNPC 
+
+  xf::Mat<XF_8UC1,  XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_remapped_l;
+  xf::Mat<XF_8UC1,  XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_remapped_r;
+
+  #pragma HLS stream variable=xf_remapped_l.data depth=pCOLS/pNPC 
+  #pragma HLS stream variable=xf_remapped_r.data depth=pCOLS/pNPC 
+
+  xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> bm_state;
+  // Objects are default-constructed, so the run-time size is assigned member by member
+  xf_img_l.rows = rows; xf_img_l.cols = cols;
+  xf_img_r.rows = rows; xf_img_r.cols = cols;
+  xf_img_d.rows = rows; xf_img_d.cols = cols;
+
+  xf_map_x_l.rows = rows; xf_map_x_l.cols = cols;
+  xf_map_y_l.rows = rows; xf_map_y_l.cols = cols;
+  xf_map_x_r.rows = rows; xf_map_x_r.cols = cols;
+  xf_map_y_r.rows = rows; xf_map_y_r.cols = cols;
+
+  xf_remapped_l.rows = rows;  xf_remapped_l.cols = cols;
+  xf_remapped_r.rows = rows;  xf_remapped_r.cols = cols;
+  // Forward the block-matching settings received as scalar arguments
+	bm_state.preFilterType       = preFilterType   ;
+	bm_state.preFilterCap        = preFilterCap    ;
+	bm_state.minDisparity        = minDisparity    ;
+	bm_state.textureThreshold    = textureThreshold;
+	bm_state.uniquenessRatio     = uniquenessRatio ;
+  // Copy both input images from the img_l / img_r pointers into the local xf::Mat objects
+  for(int i=0; i < rows; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS
+
+      for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
+
+          *(xf_img_l.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_l + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j);
+          *(xf_img_r.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(img_r + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j);         
+        }
+    }
+
+  // Left image: build the x/y maps from the left calibration arrays, then remap xf_img_l into xf_remapped_l
+  xf::InitUndistortRectifyMapInverse < XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1 > (cameraMA_l_fix, distC_l_fix, irA_l_fix, xf_map_x_l, xf_map_y_l, cm_size, dc_size);
+
+  xf::remap <XF_REMAP_BUFSIZE, XF_INTERPOLATION_BILINEAR, XF_8UC1, XF_32FC1, XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1, false> ( xf_img_l, xf_remapped_l, xf_map_x_l, xf_map_y_l ); 
+
+  // Right image: the same two steps with the right calibration arrays
+
+  xf::InitUndistortRectifyMapInverse < XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1 > (cameraMA_r_fix, distC_r_fix, irA_r_fix, xf_map_x_r, xf_map_y_r, cm_size, dc_size);
+
+  xf::remap <XF_REMAP_BUFSIZE, XF_INTERPOLATION_BILINEAR, XF_8UC1, XF_32FC1, XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1, false> ( xf_img_r, xf_remapped_r, xf_map_x_r, xf_map_y_r); 
+
+
+  // Block matching on the two remapped images fills xf_img_d
+
+  xf::StereoBM <SAD_WINDOW_SIZE, NO_OF_DISPARITIES, PARALLEL_UNITS, XF_8UC1, XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> ( xf_remapped_l, xf_remapped_r, xf_img_d, bm_state);
+
+
+  // Copy the result from xf_img_d out through the img_d pointer
+
+  for(int i=0; i < rows; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=pROWS
+
+      for(int j=0; j < (cols >> (XF_BITSHIFT(XF_NPPC1))); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=pCOLS/pNPC
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
+
+          *(img_d + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j) = *(xf_img_d.data + i*(cols >> (XF_BITSHIFT(XF_NPPC1))) +j);
+        }
+    }
+
+}
diff --git a/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp b/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp
new file mode 100644
index 0000000..273e67b
--- /dev/null
+++ b/aws_demo/stereopipeline/xf_stereo_pipeline_tb.cpp
@@ -0,0 +1,129 @@
+/***************************************************************************
+Copyright (c) 2016, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "opencv/cv.h"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/video/tracking.hpp"
+
+#include "common/xf_sw_utils.h" 
+
+#include "xf_stereo_pipeline_config.h"
+
+#include "cameraParameters.h"
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+  // Host test bench: runs the accelerated stereo pipeline on an image pair and saves the result as hls_output.png.
+  cv::setUseOptimized(false);
+
+  if(argc != 3)
+  {
+    fprintf(stderr,"Invalid Number of Arguments!\nUsage: <executable> <left image> <right image>\n");
+    return -1;
+  }
+
+  // Decode both inputs first (flag 0 = grayscale) so that unusable input is rejected before anything is allocated.
+  cv::Mat cv_img_l = cv::imread(argv[1], 0);
+  cv::Mat cv_img_r = cv::imread(argv[2], 0);
+  if(cv_img_l.empty() || cv_img_r.empty())
+  {
+    fprintf(stderr,"Failed to read input image(s): %s, %s\n", argv[1], argv[2]);
+    return -1;
+  }
+
+  int rows = cv_img_l.rows;
+  int cols = cv_img_l.cols;
+
+  // stereo_pipeline_accel sizes all image buffers from the left image, so both images must have the same dimensions.
+  if(cv_img_r.rows != rows || cv_img_r.cols != cols || rows > XF_HEIGHT || cols > XF_WIDTH)
+  {
+    fprintf(stderr,"Both images must have the same size and fit into %dx%d pixels\n", XF_WIDTH, XF_HEIGHT);
+    return -1;
+  }
+
+  //////////////////  HLS TOP Function Call  ////////////////////////
+  xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_l(rows, cols);
+  xf::Mat<XF_8UC1 , XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_r(rows, cols);
+  xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> xf_img_d(rows, cols);
+
+  xf_img_l = xf::imread<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(argv[1], 0);
+  xf_img_r = xf::imread<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(argv[2], 0);
+
+  // camera parameters for rectification (heap buffers, released once the kernel has finished)
+  ap_fixed<32,12> *cameraMA_l_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>));
+  ap_fixed<32,12> *cameraMA_r_fix = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>));
+  ap_fixed<32,12> *irA_l_fix      = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>));
+  ap_fixed<32,12> *irA_r_fix      = (ap_fixed<32,12>*)malloc(XF_CAMERA_MATRIX_SIZE*sizeof(ap_fixed<32,12>));
+  ap_fixed<32,12> *distC_l_fix    = (ap_fixed<32,12>*)malloc(XF_DIST_COEFF_SIZE   *sizeof(ap_fixed<32,12>));
+  ap_fixed<32,12> *distC_r_fix    = (ap_fixed<32,12>*)malloc(XF_DIST_COEFF_SIZE   *sizeof(ap_fixed<32,12>));
+
+  xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> bm_state;
+  bm_state.preFilterCap = 31;
+  bm_state.uniquenessRatio = 15;
+  bm_state.textureThreshold = 20;
+  bm_state.minDisparity = 0;
+
+  // copy camera params
+  for(int i=0; i<XF_CAMERA_MATRIX_SIZE; i++) 
+    {
+      cameraMA_l_fix[i] = (ap_fixed<32,12>)cameraMA_l[i];
+      cameraMA_r_fix[i] = (ap_fixed<32,12>)cameraMA_r[i];
+      irA_l_fix     [i] = (ap_fixed<32,12>)irA_l     [i];
+      irA_r_fix     [i] = (ap_fixed<32,12>)irA_r     [i];
+    }
+
+  // copy distortion coefficients
+  for(int i=0; i<XF_DIST_COEFF_SIZE; i++) 
+    {
+      distC_l_fix[i] = (ap_fixed<32,12>)distC_l[i];
+      distC_r_fix[i] = (ap_fixed<32,12>)distC_r[i];
+    }
+
+  printf("starting the kernel...\n");
+  stereo_pipeline_accel(xf_img_l, xf_img_r, xf_img_d, bm_state, cameraMA_l_fix, cameraMA_r_fix, distC_l_fix, distC_r_fix, irA_l_fix, irA_r_fix, XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE);
+
+  free(cameraMA_l_fix); free(cameraMA_r_fix); free(irA_l_fix); free(irA_r_fix); free(distC_l_fix); free(distC_r_fix);
+
+  cv::Mat out_disp_16(rows,cols,CV_16UC1);
+  cv::Mat out_disp_08(rows,cols,CV_8UC1 );
+  out_disp_16.data = xf_img_d.copyFrom();
+  out_disp_16.convertTo(out_disp_08, CV_8U, (256.0/NO_OF_DISPARITIES)/(16.));
+  cv::imwrite("hls_output.png",out_disp_08);
+  printf ("run complete !\n\n");
+  return 0;
+}
+
diff --git a/examples/lkdensepyrof/xf_config_params.h b/examples/lkdensepyrof/xf_config_params.h
index 05cd576..a1d0193 100644
--- a/examples/lkdensepyrof/xf_config_params.h
+++ b/examples/lkdensepyrof/xf_config_params.h
@@ -10,4 +10,6 @@
 #define HEIGHT 1080
 #define WIDTH 1920
 
-#define NUM_LINES_FINDIT 50
\ No newline at end of file
+#define NUM_LINES_FINDIT 50
+
+#define XF_USE_URAM false
diff --git a/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp b/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp
index 1cfb0d7..79653c8 100644
--- a/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp
+++ b/examples/lkdensepyrof/xf_pyr_dense_optical_flow_accel.cpp
@@ -35,10 +35,10 @@ void pyr_dense_optical_flow_pyr_down_accel(xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1
 	{
 	#pragma SDS async(1)
 	#pragma SDS resource(1)
-		xf::pyrDown<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1>(mat_imagepyr1[pyr_comp], mat_imagepyr1[pyr_comp+1]);
+		xf::pyrDown<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1,XF_USE_URAM>(mat_imagepyr1[pyr_comp], mat_imagepyr1[pyr_comp+1]);
 	#pragma SDS async(2)
 	#pragma SDS resource(2)
-		xf::pyrDown<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1>(mat_imagepyr2[pyr_comp], mat_imagepyr2[pyr_comp+1]);
+		xf::pyrDown<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1,XF_USE_URAM>(mat_imagepyr2[pyr_comp], mat_imagepyr2[pyr_comp+1]);
 	#pragma SDS wait(1)
 	#pragma SDS wait(2)	
 	}
@@ -46,6 +46,6 @@ void pyr_dense_optical_flow_pyr_down_accel(xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1
 
 void pyr_dense_optical_flow_accel(xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1> & _current_img, xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1> & _next_image, xf::Mat<XF_32UC1,HEIGHT,WIDTH,XF_NPPC1> & _streamFlowin, xf::Mat<XF_32UC1,HEIGHT,WIDTH,XF_NPPC1> & _streamFlowout, const int level, const unsigned char scale_up_flag, float scale_in, ap_uint<1> init_flag)
 {	
-	xf::densePyrOpticalFlow<NUM_LEVELS, NUM_LINES_FINDIT, WINSIZE_OFLOW, TYPE_FLOW_WIDTH, TYPE_FLOW_INT, XF_8UC1, HEIGHT, WIDTH, XF_NPPC1>(_current_img, _next_image, _streamFlowin, _streamFlowout, level, scale_up_flag, scale_in, init_flag);
+	xf::densePyrOpticalFlow<NUM_LEVELS, NUM_LINES_FINDIT, WINSIZE_OFLOW, TYPE_FLOW_WIDTH, TYPE_FLOW_INT, XF_8UC1, HEIGHT, WIDTH, XF_NPPC1, XF_USE_URAM>(_current_img, _next_image, _streamFlowin, _streamFlowout, level, scale_up_flag, scale_in, init_flag);
 }
 
diff --git a/examples/lknpyroflow/xf_config_params.h b/examples/lknpyroflow/xf_config_params.h
index 6e91f55..c9b25c8 100644
--- a/examples/lknpyroflow/xf_config_params.h
+++ b/examples/lknpyroflow/xf_config_params.h
@@ -1,4 +1,5 @@
 #define MAX_HEIGHT 2160
 #define MAX_WIDTH  3840
 #define WORD_SZ 1
-#define KMED 25
\ No newline at end of file
+#define KMED 25
+#define XF_USE_URAM false
diff --git a/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp b/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp
index cbb1ba8..7980c37 100644
--- a/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp
+++ b/examples/lknpyroflow/xf_dense_npyr_optical_flow_accel.cpp
@@ -31,6 +31,6 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 void dense_non_pyr_of_accel(xf::Mat<XF_8UC1, MAX_HEIGHT, MAX_WIDTH, NPPC> &buf0, xf::Mat<XF_8UC1, MAX_HEIGHT, MAX_WIDTH, NPPC> &buf1, xf::Mat<XF_32FC1,MAX_HEIGHT, MAX_WIDTH, NPPC> &flowx, xf::Mat<XF_32FC1,MAX_HEIGHT, MAX_WIDTH, NPPC> &flowy)
 {	
-	xf::DenseNonPyrLKOpticalFlow<KMED, XF_8UC1, MAX_HEIGHT, MAX_WIDTH, NPPC>(buf0, buf1, flowx, flowy);
+	xf::DenseNonPyrLKOpticalFlow<KMED, XF_8UC1, MAX_HEIGHT, MAX_WIDTH, NPPC, XF_USE_URAM>(buf0, buf1, flowx, flowy);
 }
 
diff --git a/examples/remap/xf_config_params.h b/examples/remap/xf_config_params.h
index 801d559..da08235 100644
--- a/examples/remap/xf_config_params.h
+++ b/examples/remap/xf_config_params.h
@@ -4,3 +4,4 @@
 
 // The type of interpolation, define "XF_REMAP_INTERPOLATION" as either "XF_INTERPOLATION_NN" or "XF_INTERPOLATION_BILINEAR"
 #define XF_REMAP_INTERPOLATION XF_INTERPOLATION_BILINEAR
+#define XF_USE_URAM false
diff --git a/examples/remap/xf_remap_accel.cpp b/examples/remap/xf_remap_accel.cpp
index 37b4d72..b6bb4ee 100644
--- a/examples/remap/xf_remap_accel.cpp
+++ b/examples/remap/xf_remap_accel.cpp
@@ -32,6 +32,6 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 void remap_accel(xf::Mat<TYPE, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &inMat, xf::Mat<TYPE, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &remappedMat,
 	xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapxMat, xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapyMat)
 {
-	xf::remap<XF_WIN_ROWS,XF_REMAP_INTERPOLATION,TYPE,XF_32FC1,TYPE,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(inMat,remappedMat,mapxMat,mapyMat);
+	xf::remap<XF_WIN_ROWS,XF_REMAP_INTERPOLATION,TYPE,XF_32FC1,TYPE,XF_HEIGHT,XF_WIDTH,XF_NPPC1,XF_USE_URAM>(inMat,remappedMat,mapxMat,mapyMat);
 }
 
diff --git a/examples/stereolbm/xf_stereoBM_tb.cpp b/examples/stereolbm/xf_stereoBM_tb.cpp
index 4a98676..b906da7 100644
--- a/examples/stereolbm/xf_stereoBM_tb.cpp
+++ b/examples/stereolbm/xf_stereoBM_tb.cpp
@@ -137,9 +137,9 @@ int main(int argc, char** argv)
 
 	int cnt=0, total = 0;
 
-	for(int i=(SAD_WINDOW_SIZE>>1)+20; i<out_disp_img.rows-((SAD_WINDOW_SIZE>>1)+20); i++)
+	for(int i=SAD_WINDOW_SIZE; i<out_disp_img.rows-SAD_WINDOW_SIZE; i++)
 	{
-		for(int j=(NO_OF_DISPARITIES-1)+(SAD_WINDOW_SIZE>>1)+20; j<out_disp_img.cols-((SAD_WINDOW_SIZE>>1)+20); j++)
+		for(int j=SAD_WINDOW_SIZE; j<out_disp_img.cols-SAD_WINDOW_SIZE; j++)
 		{
 			total ++;
 			int diff = (disp8.at<unsigned char> (i,j))-(out_disp_img.data[i*out_disp_img.cols +j]);
diff --git a/examples/stereopipeline/xf_stereo_pipeline_accel.cpp b/examples/stereopipeline/xf_stereo_pipeline_accel.cpp
index 4436890..ce0f3c7 100644
--- a/examples/stereopipeline/xf_stereo_pipeline_accel.cpp
+++ b/examples/stereopipeline/xf_stereo_pipeline_accel.cpp
@@ -29,18 +29,36 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/
 #include "xf_stereo_pipeline_config.h"
 
-void stereopipeline_accel(xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &leftMat, xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &rightMat, xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &dispMat,
-	xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapxLMat, xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapyLMat, xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapxRMat, 
-	xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapyRMat, xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &leftRemappedMat, xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &rightRemappedMat,
-	xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> &bm_state, ap_fixed<32,12> *cameraMA_l_fix, ap_fixed<32,12> *cameraMA_r_fix, ap_fixed<32,12> *distC_l_fix, ap_fixed<32,12> *distC_r_fix, 
-	ap_fixed<32,12> *irA_l_fix, ap_fixed<32,12> *irA_r_fix, int _cm_size, int _dc_size)
+void stereopipeline_accel
+  (
+    xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &leftMat,   xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &rightMat, 
+    
+    xf::Mat<XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &dispMat,
+	  
+    xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapxLMat, xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapyLMat, 
+    xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapxRMat, xf::Mat<XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &mapyRMat, 
+
+    xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &leftRemappedMat, xf::Mat<XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1> &rightRemappedMat,
+	  
+    xf::xFSBMState<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS> &bm_state, 
+    
+    ap_fixed<32,12> *cameraMA_l_fix,  ap_fixed<32,12> *cameraMA_r_fix, 
+    ap_fixed<32,12> *distC_l_fix   ,  ap_fixed<32,12> *distC_r_fix   , 
+	  ap_fixed<32,12> *irA_l_fix     ,  ap_fixed<32,12> *irA_r_fix     , 
+
+    int _cm_size, int _dc_size
+  )
 {
-	xf::InitUndistortRectifyMapInverse<XF_CAMERA_MATRIX_SIZE,XF_DIST_COEFF_SIZE,XF_32FC1,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(cameraMA_l_fix,distC_l_fix,irA_l_fix,mapxLMat,mapyLMat,_cm_size,_dc_size);
-	xf::remap<XF_REMAP_BUFSIZE,XF_INTERPOLATION_BILINEAR,XF_8UC1,XF_32FC1,XF_8UC1,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(leftMat,leftRemappedMat,mapxLMat,mapyLMat);
+	xf::InitUndistortRectifyMapInverse<XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(cameraMA_l_fix, distC_l_fix, irA_l_fix, mapxLMat, mapyLMat, _cm_size, _dc_size);
+	
+  xf::remap<XF_REMAP_BUFSIZE, XF_INTERPOLATION_BILINEAR, XF_8UC1, XF_32FC1, XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(leftMat, leftRemappedMat, mapxLMat, mapyLMat);
 	
-	xf::InitUndistortRectifyMapInverse<XF_CAMERA_MATRIX_SIZE,XF_DIST_COEFF_SIZE,XF_32FC1,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(cameraMA_r_fix,distC_r_fix,irA_r_fix,mapxRMat,mapyRMat,_cm_size,_dc_size);
-	xf::remap<XF_REMAP_BUFSIZE,XF_INTERPOLATION_BILINEAR,XF_8UC1,XF_32FC1,XF_8UC1,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(leftMat,leftRemappedMat,mapxLMat,mapyLMat);
+
+
+
+	xf::InitUndistortRectifyMapInverse<XF_CAMERA_MATRIX_SIZE, XF_DIST_COEFF_SIZE, XF_32FC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(cameraMA_r_fix, distC_r_fix, irA_r_fix, mapxRMat, mapyRMat, _cm_size, _dc_size);
+	xf::remap<XF_REMAP_BUFSIZE, XF_INTERPOLATION_BILINEAR, XF_8UC1, XF_32FC1, XF_8UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(rightMat, rightRemappedMat, mapxRMat, mapyRMat); // rectify the RIGHT view with the right-camera maps computed just above
 	
-	xf::StereoBM<SAD_WINDOW_SIZE,NO_OF_DISPARITIES,PARALLEL_UNITS,XF_8UC1,XF_16UC1,XF_HEIGHT,XF_WIDTH,XF_NPPC1>(leftRemappedMat, rightRemappedMat, dispMat, bm_state);
+	xf::StereoBM<SAD_WINDOW_SIZE, NO_OF_DISPARITIES, PARALLEL_UNITS, XF_8UC1, XF_16UC1, XF_HEIGHT, XF_WIDTH, XF_NPPC1>(leftRemappedMat,  rightRemappedMat,  dispMat, bm_state);
 }
 
diff --git a/examples/warptransform/xf_config_params.h b/examples/warptransform/xf_config_params.h
index e903c30..8d2ee85 100644
--- a/examples/warptransform/xf_config_params.h
+++ b/examples/warptransform/xf_config_params.h
@@ -16,4 +16,7 @@
 #define INTERPOLATION 1
 
 //transform type 0-AFFINE 1-PERSPECTIVE
-#define TRANSFORM_TYPE 0
\ No newline at end of file
+#define TRANSFORM_TYPE 0
+
+// Whether to use URAMs to implement the internal buffers
+#define XF_USE_URAM false
diff --git a/examples/warptransform/xf_warp_transform_accel.cpp b/examples/warptransform/xf_warp_transform_accel.cpp
index a9d0b65..1295a80 100644
--- a/examples/warptransform/xf_warp_transform_accel.cpp
+++ b/examples/warptransform/xf_warp_transform_accel.cpp
@@ -31,5 +31,5 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 void warp_transform_accel(xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1> &_src, xf::Mat<XF_8UC1,HEIGHT,WIDTH,XF_NPPC1> &_dst, float *R)
 {	
-	xf::warpTransform<NUM_STORE_ROWS, START_PROC, TRANSFORM_TYPE, INTERPOLATION, XF_8UC1, HEIGHT, WIDTH, XF_NPPC1>(_src, _dst, R);
+	xf::warpTransform<NUM_STORE_ROWS, START_PROC, TRANSFORM_TYPE, INTERPOLATION, XF_8UC1, HEIGHT, WIDTH, XF_NPPC1, XF_USE_URAM>(_src, _dst, R);
 }
diff --git a/include/imgproc/xf_dense_npyr_optical_flow.hpp b/include/imgproc/xf_dense_npyr_optical_flow.hpp
index b79a327..791c78d 100644
--- a/include/imgproc/xf_dense_npyr_optical_flow.hpp
+++ b/include/imgproc/xf_dense_npyr_optical_flow.hpp
@@ -170,7 +170,7 @@ namespace xf{
 	// TODO: 
 	// 1. Dont need the entire column for img1Win and img2Win. Need only the kernel
 	// 2. Full line buffer is not needed
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void computeSums16 (hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img1Col [(WINDOW_SIZE+1)], 
 					  hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img2Col [(WINDOW_SIZE+1)], 
 					  hls::stream <int>& ixix_out0, 
@@ -208,34 +208,55 @@ namespace xf{
 	  // For II=1 pipelining, need two read and 1 write ports. Simulating it with
 	  // two arrays that have their write ports tied together.
 	  // TODO need only MAX_WODTH/2. Have to adjust zIdx and nIdx as well
-	  static int csIxixO [COLS], csIxiyO [COLS], csIyiyO [COLS], csDixO [COLS], csDiyO [COLS];
-	  static int csIxixE [COLS], csIxiyE [COLS], csIyiyE [COLS], csDixE [COLS], csDiyE [COLS];
-
-	  static int cbIxixO [COLS], cbIxiyO [COLS], cbIyiyO [COLS], cbDixO [COLS], cbDiyO [COLS];
-	  static int cbIxixE [COLS], cbIxiyE [COLS], cbIyiyE [COLS], cbDixE [COLS], cbDiyE [COLS];
-
-	  int zIdx= - (WINDOW_SIZE-2);   // odd
-	  int zIdx1 = zIdx + 1;   // even
-
-	  int nIdx = zIdx + WINDOW_SIZE-2; // even (0)
-	  int nIdx1 = nIdx + 1;     // odd
-
+	  static int csIxixO [COLS/2], csIxiyO [COLS/2], csIyiyO [COLS/2], csDixO [COLS/2], csDiyO [COLS/2];
+	  static int csIxixE [COLS/2], csIxiyE [COLS/2], csIyiyE [COLS/2], csDixE [COLS/2], csDiyE [COLS/2];
+
+	  static int cbIxixO [COLS/2], cbIxiyO [COLS/2], cbIyiyO [COLS/2], cbDixO [COLS/2], cbDiyO [COLS/2];
+	  static int cbIxixE [COLS/2], cbIxiyE [COLS/2], cbIyiyE [COLS/2], cbDixE [COLS/2], cbDiyE [COLS/2];
+
+	  int zIdx=      - (WINDOW_SIZE/2-1);
+	  int nIdx = zIdx + WINDOW_SIZE/2-1;
+
+	#pragma HLS ARRAY_MAP variable=csIxixO instance=csO vertical
+	#pragma HLS ARRAY_MAP variable=csIxiyO instance=csO vertical
+	#pragma HLS ARRAY_MAP variable=csIyiyO instance=csO vertical
+	#pragma HLS ARRAY_MAP variable=csDixO  instance=csO vertical
+	#pragma HLS ARRAY_MAP variable=csDiyO  instance=csO vertical
+
+	#pragma HLS ARRAY_MAP variable=csIxixE instance=csE vertical
+	#pragma HLS ARRAY_MAP variable=csIxiyE instance=csE vertical
+	#pragma HLS ARRAY_MAP variable=csIyiyE instance=csE vertical
+	#pragma HLS ARRAY_MAP variable=csDixE  instance=csE vertical
+	#pragma HLS ARRAY_MAP variable=csDiyE  instance=csE vertical
+
+	#pragma HLS ARRAY_MAP variable=cbIxixO instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIxiyO instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIyiyO instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDixO  instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDiyO  instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIxixE instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIxiyE instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIyiyE instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDixE  instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDiyE  instance=cb vertical
+
+    if (USE_URAM) {
+	#pragma HLS RESOURCE variable=csIxixO core=XPM_MEMORY uram
+	#pragma HLS RESOURCE variable=csIxixE core=XPM_MEMORY uram
+	#pragma HLS RESOURCE variable=cbIxixO core=XPM_MEMORY uram
+    }
+    else {
 	#pragma HLS RESOURCE variable=csIxixO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIxiyO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIyiyO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDixO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDiyO core=RAM_2P_BRAM
+	#pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM
+	#pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM
+    }
+
 	#pragma HLS DEPENDENCE variable=csIxixO inter WAR false
 	#pragma HLS DEPENDENCE variable=csIxiyO inter WAR false
 	#pragma HLS DEPENDENCE variable=csIyiyO inter WAR false
 	#pragma HLS DEPENDENCE variable=csDixO  inter WAR false
 	#pragma HLS DEPENDENCE variable=csDiyO  inter WAR false
 
-	#pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIxiyE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIyiyE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDixE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDiyE core=RAM_2P_BRAM
 	#pragma HLS DEPENDENCE variable=csIxixE inter WAR false
 	#pragma HLS DEPENDENCE variable=csIxiyE inter WAR false
 	#pragma HLS DEPENDENCE variable=csIyiyE inter WAR false
@@ -243,28 +264,12 @@ namespace xf{
 	#pragma HLS DEPENDENCE variable=csDiyE  inter WAR false
 
 
-	#pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIxiyO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIyiyO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbDixO core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbDiyO core=RAM_2P_BRAM
 	#pragma HLS DEPENDENCE variable=cbIxixO inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIxiyO inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIyiyO inter WAR false
 	#pragma HLS DEPENDENCE variable=cbDixO  inter WAR false
 	#pragma HLS DEPENDENCE variable=cbDiyO  inter WAR false
 
-#if PLATFORM_ZCU104
-	#pragma HLS RESOURCE variable=cbIxixE core=XPM_MEMORY uram
-	#pragma HLS RESOURCE variable=cbIxiyE core=XPM_MEMORY uram
-	#pragma HLS RESOURCE variable=cbIyiyE core=XPM_MEMORY uram
-#else
-	#pragma HLS RESOURCE variable=cbIxixE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIxiyE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIyiyE core=RAM_2P_BRAM
-#endif
-	#pragma HLS RESOURCE variable=cbDixE core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbDiyE core=RAM_2P_BRAM
 	#pragma HLS DEPENDENCE variable=cbIxixE inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIxiyE inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIyiyE inter WAR false
@@ -284,18 +289,18 @@ namespace xf{
 		  int csIxixL1 = 0, csIxiyL1 = 0, csIyiyL1 = 0, csDixL1  = 0, csDiyL1  = 0;
 
 		  if (zIdx >= 0) {
-			csIxixL0 = csIxixO [zIdx];
-			csIxiyL0 = csIxiyO [zIdx];
-			csIyiyL0 = csIyiyO [zIdx];
-			csDixL0  = csDixO [zIdx];
-			csDiyL0  = csDiyO [zIdx];
-		  }
-		  if (zIdx1 >= 0) {
-			csIxixL1 = csIxixE [zIdx1];
-			csIxiyL1 = csIxiyE [zIdx1];
-			csIyiyL1 = csIyiyE [zIdx1];
-			csDixL1  = csDixE [zIdx1];
-			csDiyL1  = csDiyE [zIdx1];
+	        int const zIdxPrev = zIdx==0 ? cols/2-1 : zIdx-1;
+			csIxixL0 = csIxixO [zIdxPrev];
+			csIxiyL0 = csIxiyO [zIdxPrev];
+			csIyiyL0 = csIyiyO [zIdxPrev];
+			csDixL0  = csDixO  [zIdxPrev];
+			csDiyL0  = csDiyO  [zIdxPrev];
+
+			csIxixL1 = csIxixE [zIdx];
+			csIxiyL1 = csIxiyE [zIdx];
+			csIyiyL1 = csIyiyE [zIdx];
+			csDixL1  = csDixE  [zIdx];
+			csDiyL1  = csDiyE  [zIdx];
 		  }
 
 		  for (int wr=0; wr<(WINDOW_SIZE+1); ++wr) {
@@ -344,11 +349,11 @@ namespace xf{
 		  csDixR0  = cbDixE [nIdx]  + delBotR0 * cIxBotR0 - delTopR0 * cIxTopR0;
 		  csDiyR0  = cbDiyE [nIdx]  + delBotR0 * cIyBotR0 - delTopR0 * cIyTopR0;
 
-		  csIxixR1 = cbIxixO [nIdx1] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1;
-		  csIxiyR1 = cbIxiyO [nIdx1] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1;
-		  csIyiyR1 = cbIyiyO [nIdx1] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1;
-		  csDixR1  = cbDixO [nIdx1]  + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1;
-		  csDiyR1  = cbDiyO [nIdx1]  + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1;
+		  csIxixR1 = cbIxixO [nIdx] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1;
+		  csIxiyR1 = cbIxiyO [nIdx] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1;
+		  csIyiyR1 = cbIyiyO [nIdx] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1;
+		  csDixR1  = cbDixO  [nIdx] + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1;
+		  csDiyR1  = cbDiyO  [nIdx] + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1;
 
 		  int tmpixix0 = (csIxixR0 - csIxixL0);
 		  int tmpixix1 = (csIxixR0 - csIxixL0) + (csIxixR1 - csIxixL1);
@@ -415,29 +420,22 @@ namespace xf{
 		  csDixE  [nIdx] = csDixR0;
 		  csDiyE  [nIdx] = csDiyR0;
 
-		  cbIxixO [nIdx1] = csIxixR1;
-		  cbIxiyO [nIdx1] = csIxiyR1;
-		  cbIyiyO [nIdx1] = csIyiyR1;
-		  cbDixO  [nIdx1] = csDixR1;
-		  cbDiyO  [nIdx1] = csDiyR1;
-
-		  csIxixO [nIdx1] = csIxixR1;
-		  csIxiyO [nIdx1] = csIxiyR1;
-		  csIyiyO [nIdx1] = csIyiyR1;
-		  csDixO  [nIdx1] = csDixR1;
-		  csDiyO  [nIdx1] = csDiyR1;
-
-		  // zIdx is always odd, zIdx1 is even
-		  // nIdx is always even, nIdx1 is odd
-		  zIdx += 2;
-		  if (zIdx >= cols) zIdx = 1;
-		  zIdx1 += 2;
-		  if (zIdx1 == cols) zIdx1 = 0;
-
-		  nIdx += 2;
-		  if (nIdx == cols) nIdx = 0;
-		  nIdx1 += 2;
-		  if (nIdx1 >= cols) nIdx1 = 1;
+		  cbIxixO [nIdx] = csIxixR1;
+		  cbIxiyO [nIdx] = csIxiyR1;
+		  cbIyiyO [nIdx] = csIyiyR1;
+		  cbDixO  [nIdx] = csDixR1;
+		  cbDiyO  [nIdx] = csDiyR1;
+
+		  csIxixO [nIdx] = csIxixR1;
+		  csIxiyO [nIdx] = csIxiyR1;
+		  csIyiyO [nIdx] = csIyiyR1;
+		  csDixO  [nIdx] = csDixR1;
+		  csDiyO  [nIdx] = csDiyR1;
+
+		  zIdx ++;
+		  if (zIdx == cols/2) zIdx = 0;
+		  nIdx ++;
+		  if (nIdx == cols/2) nIdx = 0;
 		}
 	  }
 
@@ -446,12 +444,12 @@ namespace xf{
 	  // TODO zero in the line buffer instead, for r < WINDOW_SIZE
 	  for (int r = 0; r < (WINDOW_SIZE+1); r++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
-		#pragma HLS PIPELINE
+		#pragma HLS UNROLL
 		img1Win [r] = 0; img1Win [r+(WINDOW_SIZE+1)] = 0; img2Win [r] = 0;
 		img1Col0 [r] =0; img2Col0 [r] =0;
 		img1Col1 [r] =0; img2Col1 [r] =0;
 	  }
-	  for (int r=0; r < cols; ++r) {
+	  for (int r=0; r < cols/2; ++r) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
 		#pragma HLS PIPELINE
 		csIxixO [r] = 0; csIxiyO [r] = 0; csIyiyO [r] = 0; csDixO [r] = 0; csDiyO [r] = 0;
@@ -534,15 +532,27 @@ namespace xf{
 
 	// line buffer for both input images. Can be split to a fn that models a single
 	// linebuffer
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void lbWrapper16 (hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > >& f0Stream, 
 					hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > >& f1Stream, 
 					hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img1Col[(WINDOW_SIZE+1)], 
 					hls::stream < mywide_t< XF_NPIXPERCYCLE(NPC) > > img2Col[(WINDOW_SIZE+1)], int rows, int cols, int size)
 	{
-	  static  mywide_t< XF_NPIXPERCYCLE(NPC) >  lb1 [(WINDOW_SIZE+1)][COLS/2], lb2 [(WINDOW_SIZE+1)][COLS/2];
-	#pragma HLS ARRAY_PARTITION variable=lb1 complete dim=1
-	#pragma HLS ARRAY_PARTITION variable=lb2 complete dim=1
+	  static pix_t lb1 [(WINDOW_SIZE+1)][COLS/XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)],
+	               lb2 [(WINDOW_SIZE+1)][COLS/XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)];
+
+    #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical
+    #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical
+
+    #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1      
+    #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1
+    #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=3      
+    #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=3
+
+    if (USE_URAM) {
+    #pragma HLS RESOURCE variable=lb1 core=XPM_MEMORY uram
+    #pragma HLS RESOURCE variable=lb2 core=XPM_MEMORY uram
+    }
 
 	  for (int r = 0; r < rows; r++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
@@ -552,43 +562,53 @@ namespace xf{
 		  #pragma HLS pipeline
 		  // shift up both linebuffers at col=c
 		  for (int i = 0; i < ((WINDOW_SIZE+1) - 1); i++) {
-			lb1 [i][c] = lb1 [i + 1][c];
-			img1Col [i]. write (lb1 [i][c]);
-			
-			lb2 [i][c] = lb2 [i+1][c];
-			img2Col [i]. write (lb2 [i][c]);
+			mywide_t< XF_NPIXPERCYCLE(NPC) > lb;
+
+			for (int k = 0; k <XF_NPIXPERCYCLE(NPC); k++) {
+			  lb.data[k] = lb1[i + 1][c][k];
+			  lb1[i][c][k] = lb.data[k];
+		    }
+ 			img1Col[i].write(lb);
+
+			for (int k = 0; k <XF_NPIXPERCYCLE(NPC); k++) {
+			  lb.data[k] = lb2[i + 1][c][k];
+			  lb2[i][c][k] = lb.data[k];
+		    }
+ 			img2Col[i].write(lb);
 		  }
 
 		  // read in the new pixels at col=c and row=bottom_of_lb
 		   mywide_t< XF_NPIXPERCYCLE(NPC) >  pix0 = f0Stream. read ();
-		  lb1 [(WINDOW_SIZE+1) - 1][c] = pix0;
 		  img1Col [(WINDOW_SIZE+1) - 1]. write (pix0);
 
 		   mywide_t< XF_NPIXPERCYCLE(NPC) >  pix1 = f1Stream. read ();
-		  lb2 [(WINDOW_SIZE+1) -1][c] = pix1;
 		  img2Col [(WINDOW_SIZE+1) - 1]. write (pix1);
+
+          for (int k = 0; k <XF_NPIXPERCYCLE(NPC); k++) {
+		    lb1 [(WINDOW_SIZE+1) - 1][c][k] = pix0.data[k];
+		    lb2 [(WINDOW_SIZE+1) - 1][c][k] = pix1.data[k];
+		  }
 		}
 	  }
 
 
 	  // cleanup
-	   mywide_t< XF_NPIXPERCYCLE(NPC) >  tmpClr;
-	  tmpClr. data [0] = 0;
-	  tmpClr. data [1] = 0;
-	  for (int r = 0; r < (WINDOW_SIZE+1); r++) {
-		  #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
-		for (int c = 0; c < cols/2; c++) {
+      for (int c = 0; c < cols/2; c++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2
 		  #pragma HLS PIPELINE
-		  lb1 [r][c] = tmpClr;
-		  lb2 [r][c] = tmpClr;
+	    for (int r = 0; r < (WINDOW_SIZE+1); r++) {
+		  #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
+          for (int k = 0; k <XF_NPIXPERCYCLE(NPC); k++) {
+		    lb1[r][c][k] = 0;
+		    lb2[r][c][k] = 0;
+		  }
 		}
 	  }
 	}
 
 	// top level wrapper to avoid dataflow problems
 	//void flowWrap (mywide_t frame0[NUM_WORDS], mywide_t frame1[NUM_WORDS], rgba2_t framef[NUM_WORDS])
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void flowWrap16 (ap_uint<16> *frame0, ap_uint<16> *frame1, ap_uint<64> *flowx, ap_uint<64> *flowy, int rows, int cols, int size)
 	{
 	//#pragma HLS data_pack variable=frame0
@@ -642,8 +662,8 @@ namespace xf{
 	  readMatRows16<ROWS, COLS, NPC, WINDOW_SIZE> (frame0, f0Stream, rows, cols, size);
 	  readMatRows16<ROWS, COLS, NPC, WINDOW_SIZE> (frame1, f1Stream, rows, cols, size);
 
-	  lbWrapper16<ROWS, COLS, NPC, WINDOW_SIZE> (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);
-	  computeSums16<ROWS, COLS, NPC, WINDOW_SIZE> (img1Col, img2Col, 
+	  lbWrapper16  <ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);
+	  computeSums16<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> (img1Col, img2Col, 
 				   ixix0, ixiy0, iyiy0, dix0, diy0, 
 				   ixix1, ixiy1, iyiy1, dix1, diy1, rows, cols, size);
 
@@ -666,12 +686,12 @@ namespace xf{
 	// ulonglong = 64 bits, 32 bits per color pixel (rgba), so two color pix
 	//void fpga_optflow (unsigned short *frame0, unsigned short *frame1, unsigned long long *framef)
 	//void fpga_optflow (unsigned short frame0[NUM_WORDS], unsigned short frame1[NUM_WORDS], unsigned long long framef[NUM_WORDS])
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void fpga_optflow16 (ap_uint<16> *frame0, ap_uint<16> *frame1, ap_uint<64> *flowx, ap_uint<64> *flowy, int rows, int cols, int size)
 	{
 	#pragma HLS inline off
 
-	  flowWrap16<ROWS, COLS, NPC, WINDOW_SIZE> (frame0, frame1, flowx, flowy, rows, cols, size);
+	  flowWrap16<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> (frame0, frame1, flowx, flowy, rows, cols, size);
 
 	  return;
 
@@ -713,7 +733,7 @@ namespace xf{
 	// TODO: 
 	// 1. Dont need the entire column for img1Win and img2Win. Need only the kernel
 	// 2. Full line buffer is not needed
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void computeSums (hls::stream <pix_t> img1Col [(WINDOW_SIZE+1)], 
 					  hls::stream <pix_t> img2Col [(WINDOW_SIZE+1)], 
 					  hls::stream <int>& ixix_out, 
@@ -742,21 +762,38 @@ namespace xf{
 	  int zIdx= - (WINDOW_SIZE-2);
 	  int nIdx = zIdx + WINDOW_SIZE-2;
 
+	#pragma HLS ARRAY_MAP variable=csIxix instance=cs vertical
+	#pragma HLS ARRAY_MAP variable=csIxiy instance=cs vertical
+	#pragma HLS ARRAY_MAP variable=csIyiy instance=cs vertical
+	#pragma HLS ARRAY_MAP variable=csDix  instance=cs vertical
+	#pragma HLS ARRAY_MAP variable=csDiy  instance=cs vertical
+
+    if (USE_URAM) {
+	#pragma HLS RESOURCE variable=csIxix core=XPM_MEMORY uram
+    }
+	else {
 	#pragma HLS RESOURCE variable=csIxix core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIxiy core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csIyiy core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDix core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=csDiy core=RAM_2P_BRAM
+	}
+	
 	#pragma HLS DEPENDENCE variable=csIxix inter WAR false
 	#pragma HLS DEPENDENCE variable=csIxiy inter WAR false
 	#pragma HLS DEPENDENCE variable=csIyiy inter WAR false
 	#pragma HLS DEPENDENCE variable=csDix  inter WAR false
 	#pragma HLS DEPENDENCE variable=csDiy  inter WAR false
+
+	#pragma HLS ARRAY_MAP variable=cbIxix instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIxiy instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbIyiy instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDix  instance=cb vertical
+	#pragma HLS ARRAY_MAP variable=cbDiy  instance=cb vertical
+
+    if (USE_URAM) {
+	#pragma HLS RESOURCE variable=cbIxix core=XPM_MEMORY uram
+    }
+    else {
 	#pragma HLS RESOURCE variable=cbIxix core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIxiy core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbIyiy core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbDix core=RAM_2P_BRAM
-	#pragma HLS RESOURCE variable=cbDiy core=RAM_2P_BRAM
+    }
+
 	#pragma HLS DEPENDENCE variable=cbIxix inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIxiy inter WAR false
 	#pragma HLS DEPENDENCE variable=cbIyiy inter WAR false
@@ -860,7 +897,7 @@ namespace xf{
 	  // TODO zero in the line buffer instead, for r < WINDOW_SIZE
 	  for (int r = 0; r < (WINDOW_SIZE+1); r++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
-		#pragma HLS PIPELINE
+		#pragma HLS UNROLL
 		img1Win [r] = 0; img1Win [r+(WINDOW_SIZE+1)] = 0; img2Win [r] = 0;
 		img1Col_ [r] =0; img2Col_ [r] =0;
 	  }
@@ -957,15 +994,21 @@ namespace xf{
 
 	// line buffer for both input images. Can be split to a fn that models a single
 	// linebuffer
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void lbWrapper (hls::stream <pix_t>& f0Stream, 
 					hls::stream <pix_t>& f1Stream, 
 					hls::stream <pix_t> img1Col[(WINDOW_SIZE+1)], 
 					hls::stream <pix_t> img2Col[(WINDOW_SIZE+1)], int rows, int cols, int size)
 	{
 	static pix_t lb1 [(WINDOW_SIZE+1)][COLS], lb2 [(WINDOW_SIZE+1)][COLS];
-	#pragma HLS ARRAY_PARTITION variable=lb1 complete dim=1
-	#pragma HLS ARRAY_PARTITION variable=lb2 complete dim=1
+	#pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical
+	#pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical
+	#pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1
+	#pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1
+	if (USE_URAM) {
+	#pragma HLS RESOURCE variable=lb1 core=XPM_MEMORY uram
+	#pragma HLS RESOURCE variable=lb2 core=XPM_MEMORY uram
+	}
 
 	  for (int r = 0; r < rows; r++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
@@ -996,11 +1039,11 @@ namespace xf{
 
 
 	  // cleanup
-	  for (int r = 0; r < (WINDOW_SIZE+1); r++) {
-		  #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
-		for (int c = 0; c < COLS; c++) {
+      for (int c = 0; c < cols; c++) {
 		  #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
 		  #pragma HLS PIPELINE
+        for (int r = 0; r < (WINDOW_SIZE+1); r++) {
+		  #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
 		  lb1 [r][c] = 0;
 		  lb2 [r][c] = 0;
 		}
@@ -1008,7 +1051,7 @@ namespace xf{
 	}
 
 	// top level wrapper to avoid dataflow problems
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void flowWrap (ap_uint<8> *frame0, ap_uint<8> *frame1, float *flowx, float *flowy, int rows, int cols, int size)
 	{
 	#pragma HLS inline off
@@ -1046,9 +1089,9 @@ namespace xf{
 		readMatRows<ROWS, COLS, NPC, WINDOW_SIZE> (frame0, f0Stream, rows, cols, size);
 		readMatRows<ROWS, COLS, NPC, WINDOW_SIZE> (frame1, f1Stream, rows, cols, size);
 
-		lbWrapper<ROWS, COLS, NPC, WINDOW_SIZE> (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);
+		lbWrapper  <ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> (f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);
 	  
-		computeSums<ROWS, COLS, NPC, WINDOW_SIZE> (img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size);
+		computeSums<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> (img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size);
 	  
 		computeFlow<ROWS, COLS, NPC, WINDOW_SIZE> (ixix, ixiy, iyiy, dix, diy, fx, fy, rows, cols, size);
 
@@ -1062,12 +1105,12 @@ namespace xf{
 	//  frame0 - First input frame (grayscale 1 byte per pixel)
 	//  frame1 - Second input frame (grayscale 1 byte per pixel)
 	//  framef - Output frame with flows visualized. 3 bytes per pixel + 1 byte padding 
-	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE>
+	template<int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
 	static void fpga_optflow8 (ap_uint<8> *frame0, ap_uint<8> *frame1, float *flowx, float *flowy, int rows, int cols, int size)
 	{
 	#pragma HLS inline off
 
-	  flowWrap<ROWS, COLS, NPC, WINDOW_SIZE>(frame0, frame1, flowx, flowy, rows, cols, size);
+	  flowWrap<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(frame0, frame1, flowx, flowy, rows, cols, size);
 
 	  return;
 
@@ -1087,16 +1130,16 @@ namespace xf{
 #pragma SDS data copy("frame1.data"[0:"frame1.size"])
 #pragma SDS data copy("flowx.data"[0:"flowx.size"])
 #pragma SDS data copy("flowy.data"[0:"flowy.size"])
-template<int WINDOW_SIZE, int TYPE, int ROWS, int COLS, int NPC>
+template<int WINDOW_SIZE, int TYPE, int ROWS, int COLS, int NPC, bool USE_URAM = false>
 void DenseNonPyrLKOpticalFlow (xf::Mat<TYPE, ROWS, COLS, NPC> & frame0, xf::Mat<TYPE, ROWS, COLS, NPC> & frame1, xf::Mat<XF_32FC1, ROWS, COLS, NPC> & flowx, xf::Mat<XF_32FC1, ROWS, COLS, NPC> & flowy)
 {
 	if(NPC==XF_NPPC1)
 	{
-		fpga_optflow8 <ROWS, COLS, NPC, WINDOW_SIZE> ( (ap_uint<8> *) frame0.data, (ap_uint<8> *)frame1.data, (float *)flowx.data, (float *)flowy.data, frame0.rows, frame0.cols, frame0.size);
+		fpga_optflow8 <ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> ( (ap_uint<8> *) frame0.data, (ap_uint<8> *)frame1.data, (float *)flowx.data, (float *)flowy.data, frame0.rows, frame0.cols, frame0.size);
 	}
 	else
 	{
-		fpga_optflow16 <ROWS, COLS, NPC, WINDOW_SIZE> ( (ap_uint<16> *) frame0.data, (ap_uint<16> *) frame1.data, (ap_uint<64> *)flowx.data, (ap_uint<64> *)flowy.data, frame0.rows, frame0.cols, frame0.size);
+		fpga_optflow16 <ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM> ( (ap_uint<16> *) frame0.data, (ap_uint<16> *) frame1.data, (ap_uint<64> *)flowx.data, (ap_uint<64> *)flowy.data, frame0.rows, frame0.cols, frame0.size);
 	}
 }
 }
diff --git a/include/imgproc/xf_gaussian_filter.hpp b/include/imgproc/xf_gaussian_filter.hpp
index 8dc51d9..242ce0c 100644
--- a/include/imgproc/xf_gaussian_filter.hpp
+++ b/include/imgproc/xf_gaussian_filter.hpp
@@ -1138,45 +1138,49 @@ void xFGaussianFilter(hls::stream< XF_SNAME(WORDWIDTH)> &_src, hls::stream< XF_S
 #pragma SDS data access_pattern("_dst.data":SEQUENTIAL)
 #pragma SDS data copy("_dst.data"[0:"_dst.size"])
 
-template<int FILTER_SIZE, int BORDER_TYPE, int SRC_T, int ROWS, int COLS,int NPC = 1>
-void GaussianBlur(xf::Mat<SRC_T, ROWS, COLS, NPC> & _src, xf::Mat<SRC_T, ROWS, COLS, NPC> & _dst, float sigma)
+template<int FILTER_SIZE, int BORDER_TYPE, int SRC_T, int ROWS, int COLS,int NPC>
+void GaussianBlur(xf::Mat<SRC_T, ROWS, COLS, NPC> &_src, xf::Mat<SRC_T, ROWS, COLS, NPC> &_dst, float sigma)
 {
-#pragma HLS inline off
+  #pragma HLS inline off
 
-#pragma HLS dataflow
+  #pragma HLS dataflow
 
-	hls::stream<XF_TNAME(SRC_T,NPC)>src;
-	hls::stream< XF_TNAME(SRC_T,NPC)> dst;
+  hls::stream<XF_TNAME(SRC_T,NPC)> src;
+  hls::stream<XF_TNAME(SRC_T,NPC)> dst;
 
-	/********************************************************/
+  /********************************************************/
 
-	Read_yuyv_Loop:
-	for(int i=0; i<_src.rows;i++)
-	{
-	#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
-		for(int j=0; j<(_src.cols)>>(XF_BITSHIFT(NPC));j++)
-		{
-	#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC
-			#pragma HLS PIPELINE
-			#pragma HLS loop_flatten off
-			src.write( *(_src.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) );
-		}
-	}
+  Read_yuyv_Loop:
+  for(int i=0; i < _src.rows; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
 
-	xFGaussianFilter< ROWS, COLS, XF_DEPTH(SRC_T,NPC),NPC,XF_WORDWIDTH(SRC_T,NPC)>(src, dst, FILTER_SIZE, BORDER_TYPE, _src.rows,_src.cols,sigma);
+      for(int j=0; j < (_src.cols)>>(XF_BITSHIFT(NPC)); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
 
-	for(int i=0; i<_dst.rows;i++)
-	{
-	#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
-		for(int j=0; j<(_dst.cols)>>(XF_BITSHIFT(NPC));j++)
-		{
-	#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC
-			#pragma HLS PIPELINE
-			#pragma HLS loop_flatten off
-			*(_dst.data + i*(_dst.cols>>(XF_BITSHIFT(NPC))) +j) = dst.read();
+          src.write( *(_src.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) );
+        }
+    }
 
-		}
-	}
+  xFGaussianFilter< ROWS, COLS, XF_DEPTH(SRC_T,NPC),NPC,XF_WORDWIDTH(SRC_T,NPC)>(src, dst, FILTER_SIZE, BORDER_TYPE, _src.rows, _src.cols, sigma);
+
+  for(int i=0; i < _src.rows; i++)
+    {
+      #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
+
+      for(int j=0; j < (_src.cols)>>(XF_BITSHIFT(NPC)); j++)
+        {
+          #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/NPC
+          #pragma HLS PIPELINE
+          #pragma HLS loop_flatten off
+
+          *(_dst.data + i*(_src.cols>>(XF_BITSHIFT(NPC))) +j) = dst.read();
+
+        }
+    }
 }
 }
 #endif //_XF_GAUSSIAN_HPP_
diff --git a/include/imgproc/xf_pyr_dense_optical_flow.hpp b/include/imgproc/xf_pyr_dense_optical_flow.hpp
index d7ce4ff..92079c4 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow.hpp
@@ -232,7 +232,7 @@ void find_flow(hls::stream< ap_fixed<SIXIY_WIDTH,SIXIY_INT> > &strmSigmaIx2, hls
 } // end find_flow()
 
 
-template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int FLOW_WIDTH, int FLOW_INT>
+template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int FLOW_WIDTH, int FLOW_INT, bool USE_URAM>
 void xFLKOpticalFlowDenseKernel(unsigned char *currImg, unsigned char *nextImg, unsigned int *strmFlowin, unsigned int *strmFlow, const unsigned int rows, const unsigned int cols, const unsigned int prev_rows, const unsigned int prev_cols, const int level, const bool scale_up_flag, float scale_in, ap_uint<1> init_flag) {
 
 const int WINDOW_SIZE  = WINDOW_SIZE_FL;
@@ -290,22 +290,20 @@ const int ITCMP_INT    = FLOW_INT+12;
 	split_stream_int_fixed<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT>(strmFlowin, strmFlowU_split, strmFlowV_split, prev_rows, prev_cols, level);
 	
 	//scaling up U and V streams whenever scaleup is enabled
-	scale_up<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>( strmFlowU_split, strmFlowU_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in);
-	scale_up<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>( strmFlowV_split, strmFlowV_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in);
+	scale_up<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT, USE_URAM>( strmFlowU_split, strmFlowU_scaled, strmFlowV_split, strmFlowV_scaled, prev_rows, prev_cols, rows, cols, 2, scale_up_flag, scale_in);
 	
 	//Finding the Temporal and space gradients for the input set of images
-	findGradients<MAXHEIGHT, MAXWIDTH, NUM_PYR_LEVELS, NUM_LINES, WINSIZE, IT_WIDTH, IT_INT, ITCMP_WIDTH, ITCMP_INT, FLOW_WIDTH, FLOW_INT, RMAPPX_WIDTH, RMAPPX_INT>(currImg, nextImg, strmIt_float, strmIx, strmIy, rows, cols, strmFlowU_scaled, strmFlowV_scaled, strmFlowU_in1, strmFlowV_in1, level);
+	findGradients<MAXHEIGHT, MAXWIDTH, NUM_PYR_LEVELS, NUM_LINES, WINSIZE, IT_WIDTH, IT_INT, ITCMP_WIDTH, ITCMP_INT, FLOW_WIDTH, FLOW_INT, RMAPPX_WIDTH, RMAPPX_INT, USE_URAM>(currImg, nextImg, strmIt_float, strmIx, strmIy, rows, cols, strmFlowU_scaled, strmFlowV_scaled, strmFlowU_in1, strmFlowV_in1, level);
 
 	//finding the hessian matrix
-	find_G_and_b_matrix<MAXHEIGHT, MAXWIDTH, WINSIZE, IT_WIDTH, IT_INT, SIXIY_WIDTH, SIXIY_INT, SIXYIT_WIDTH, SIXYIT_INT>(strmIx, strmIy, strmIt_float,  sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, rows, cols, level);
+	find_G_and_b_matrix<MAXHEIGHT, MAXWIDTH, WINSIZE, IT_WIDTH, IT_INT, SIXIY_WIDTH, SIXIY_INT, SIXYIT_WIDTH, SIXYIT_INT, USE_URAM>(strmIx, strmIy, strmIt_float,  sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, rows, cols, level);
 	
 	//computing the the optical flow
 	
 	find_flow<MAXHEIGHT, MAXWIDTH, SIXIY_WIDTH, SIXIY_INT, SIXYIT_WIDTH, SIXYIT_INT, FLOW_WIDTH, FLOW_INT, DET_WIDTH, DET_INT, DIVBY_WIDTH, DIVBY_INT, FLCMP_WIDTH, FLCMP_INT, WINSIZE>(sigmaIx2, sigmaIy2, sigmaIxIy, sigmaIxIt, sigmaIyIt, strmFlowU_in1, strmFlowV_in1, strmFlowU_fil, strmFlowV_fil, flagU, flagV, rows, cols,level,scale_up_flag,init_flag);
 	
 	//filtering the flow vectors using median blur
-	auMedianBlur<MAXHEIGHT, MAXWIDTH, 0, 0, 0, 0, WINDOW_SIZE, WINDOW_SIZE*WINDOW_SIZE, FLOW_WIDTH, FLOW_INT> (strmFlowU_fil, strmFlowU_fil_out, flagU, WINDOW_SIZE,1,rows,cols);
-	auMedianBlur<MAXHEIGHT, MAXWIDTH, 0, 0, 0, 0, WINDOW_SIZE, WINDOW_SIZE*WINDOW_SIZE, FLOW_WIDTH, FLOW_INT> (strmFlowV_fil, strmFlowV_fil_out, flagV, WINDOW_SIZE,1,rows,cols);
+	auMedianBlur<MAXHEIGHT, MAXWIDTH, 0, 0, 0, 0, WINDOW_SIZE, WINDOW_SIZE*WINDOW_SIZE, FLOW_WIDTH, FLOW_INT, USE_URAM> (strmFlowU_fil, strmFlowU_fil_out, flagU, strmFlowV_fil, strmFlowV_fil_out, flagV, WINDOW_SIZE,1,rows,cols);
 	
 	//stitching the U and V flow streams to a single flow stream
 	stitch_stream_fixed_int<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT>(strmFlowU_fil_out, strmFlowV_fil_out, strmFlow, rows, cols, level);
diff --git a/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp b/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp
index 4fcac07..d80d597 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow_find_gradients.hpp
@@ -71,6 +71,8 @@ ap_fixed<IT_WIDTH,IT_INT> findIntensity(unsigned char lineBuffer[NUM_LINES+1][MA
 
 		// Find which location in linebuffers to access
 		int lx0 = tmp_locj;
+		// AK,ZoTech: lx0 + 1 may index past the last column of the current pyramid level (out-of-bounds access); suggested workaround that clamps at the border:
+		// int lx1 = lx0 + ((lx0<(cols-1)) ? 1:0);
 		int lx1 = lx0 + 1;
 
 		ap_fixed<ITCMP_WIDTH,ITCMP_INT> fracx = ap_fixed<ITCMP_WIDTH,ITCMP_INT>(tmp_locj - lx0);
@@ -98,7 +100,7 @@ ap_fixed<IT_WIDTH,IT_INT> findIntensity(unsigned char lineBuffer[NUM_LINES+1][MA
 	
 } // end findIntensity()
 
-template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int IT_WIDTH, int IT_INT, int ITCMP_WIDTH, int ITCMP_INT, int FLOW_WIDTH, int FLOW_INT, int RMAPPX_WIDTH, int RMAPPX_INT>
+template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int IT_WIDTH, int IT_INT, int ITCMP_WIDTH, int ITCMP_INT, int FLOW_WIDTH, int FLOW_INT, int RMAPPX_WIDTH, int RMAPPX_INT, bool USE_URAM>
 void findGradients(unsigned char *currImg3, unsigned char *nextImg, hls::stream< ap_fixed<IT_WIDTH,IT_INT> > &strmIt, hls::stream< ap_int<9> > &strmIx, hls::stream< ap_int<9> > &strmIy,
 		unsigned int rows, unsigned int cols, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &strmFlowUin, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &strmFlowVin,
 		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &strmFlowU_in1, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &strmFlowV_in1, int level) {
@@ -132,11 +134,17 @@ sprintf(name,"gy_hw%d.txt",level);
 	unsigned int read_curimg = 0;
 	unsigned int read_nxtimg = 0;
 	
+    // AK,ZoTech: this buffer is not initialized; initializing it is needed as a workaround to avoid "X" (undefined) values in co-simulation.
 	unsigned char lineBuffer[NUM_LINES+1][MAXWIDTH];
-#pragma HLS array_partition variable=lineBuffer complete dim=1
+#pragma HLS array_reshape variable=lineBuffer complete dim=1
 
 	unsigned char curr_img_buf[2][MAXWIDTH];
-#pragma HLS array_partition variable=curr_img_buf complete dim=1
+#pragma HLS array_reshape variable=curr_img_buf complete dim=1
+
+if (USE_URAM) {	
+#pragma HLS RESOURCE variable=lineBuffer   core=XPM_MEMORY uram
+#pragma HLS RESOURCE variable=curr_img_buf core=XPM_MEMORY uram
+}
 
 	unsigned char effBufferedLines = std::min(NUM_LINES,(1<<(NUM_PYR_LEVELS - 1 - level))*(WINSIZE-1) + 1); /**** Change this appropriately in original function***/
 	ap_uint<8> totalLinesInBuffer = effBufferedLines + 1;
diff --git a/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp b/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp
index e0a4a7b..1ac33d7 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow_median_blur.hpp
@@ -113,43 +113,15 @@ void auMedianProc(
 }
 
 template<int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC, int WIN_SZ, int WIN_SZ_SQ, int FLOW_WIDTH, int FLOW_INT>
-void ProcessMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > & _src_mat,
+void ProcessMedian3x3(
 		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > & _out_mat, hls::stream< bool > &flag,
-		ap_fixed<FLOW_WIDTH,FLOW_INT> buf[WIN_SZ][(COLS >> NPC)], ap_fixed<FLOW_WIDTH,FLOW_INT> src_buf[WIN_SZ][1+(WIN_SZ-1)],
+		ap_fixed<FLOW_WIDTH,FLOW_INT> src_buf[WIN_SZ][1+(WIN_SZ-1)], ap_fixed<FLOW_WIDTH,FLOW_INT> buf_cop[WIN_SZ],
 		ap_fixed<FLOW_WIDTH,FLOW_INT> OutputValues[1],
-		ap_fixed<FLOW_WIDTH,FLOW_INT> &P0, uint16_t img_width,  uint16_t img_height, uint16_t &shift_x,  ap_uint<13> row_ind[WIN_SZ], ap_uint<13> row, ap_uint<8> win_size)
+		ap_fixed<FLOW_WIDTH,FLOW_INT> &P0, uint16_t img_width,  uint16_t img_height, uint16_t &shift_x,  ap_uint<13> row_ind[WIN_SZ], ap_uint<13> row, ap_uint<16> col, ap_uint<8> win_size)
 {
 #pragma HLS INLINE
 
-	ap_fixed<FLOW_WIDTH,FLOW_INT> buf_cop[WIN_SZ];
-#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1
-	
 	uint16_t npc = 1;
-	Col_Loop:
-	for(ap_uint<16> col = 0; col < img_width+(WIN_SZ>>1); col++)
-	{
-#pragma HLS LOOP_TRIPCOUNT min=1 max=TC
-#pragma HLS pipeline
-#pragma HLS LOOP_FLATTEN OFF
-
-		if(row < img_height && col < img_width)
-			buf[row_ind[win_size-1]][col] = _src_mat.read(); // Read data
-		else
-			buf[row_ind[win_size-1]][col] = 0;
-
-		for(int copy_buf_var=0;copy_buf_var<WIN_SZ;copy_buf_var++)
-		{
-#pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ
-#pragma HLS UNROLL
-			if(	(row >(img_height-1)) && (copy_buf_var>(win_size-1-(row-(img_height-1)))))
-			{
-				buf_cop[copy_buf_var] = buf[(row_ind[win_size-1-(row-(img_height-1))])][col];
-			}
-			else
-			{
-				buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col];
-			}
-		}
 		
 		// if(NPC == AU_NPPC8)
 		// {
@@ -167,7 +139,9 @@ void ProcessMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > & _src_mat,
 	#pragma HLS UNROLL
 				if(col<img_width)
 				{
-					src_buf[extract_px][win_size-1] = buf_cop[extract_px];
+                   if((row >(img_height-1)) && (extract_px>(win_size-1-(row-(img_height-1)))))
+                        src_buf[extract_px][win_size-1] = buf_cop[(row_ind[win_size-1-(row-(img_height-1))])];
+                   else src_buf[extract_px][win_size-1] = buf_cop[(row_ind[extract_px])];
 				}
 				else
 				{
@@ -216,14 +190,16 @@ void ProcessMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > & _src_mat,
 				}
 			}
 		}
-	} // Col_Loop
 }
 
 
 
-template<int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC,int WIN_SZ, int WIN_SZ_SQ, int FLOW_WIDTH, int FLOW_INT>
-void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
-		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_out_mat, hls::stream< bool > &flag, ap_uint<8> win_size,
+template<int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC,int WIN_SZ, int WIN_SZ_SQ, int FLOW_WIDTH, int FLOW_INT, bool USE_URAM>
+void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat0,
+                 hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_out_mat0, hls::stream< bool > &flag0,
+                 hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat1,
+                 hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_out_mat1, hls::stream< bool > &flag1,
+        ap_uint<8> win_size,
 		uint16_t img_height, uint16_t img_width)
 {
 	ap_uint<13> row_ind[WIN_SZ];
@@ -234,19 +210,32 @@ void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
 	ap_uint<16> row, col;
 
 
-	ap_fixed<FLOW_WIDTH,FLOW_INT> OutputValues[1];
+	ap_fixed<FLOW_WIDTH,FLOW_INT> OutputValues[2][1];
 #pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
+#pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=2
 
+	ap_fixed<FLOW_WIDTH,FLOW_INT> buf_cop[2][WIN_SZ];
+#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1
+#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=2
+	
 
-	ap_fixed<FLOW_WIDTH,FLOW_INT> src_buf[WIN_SZ][1+(WIN_SZ-1)];
+	ap_fixed<FLOW_WIDTH,FLOW_INT> src_buf[2][WIN_SZ][1+(WIN_SZ-1)];
 #pragma HLS ARRAY_PARTITION variable=src_buf complete dim=1
 #pragma HLS ARRAY_PARTITION variable=src_buf complete dim=2
+#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=3
 // src_buf1 et al merged 
 	ap_fixed<FLOW_WIDTH,FLOW_INT> P0;
 
-	ap_fixed<FLOW_WIDTH,FLOW_INT> buf[WIN_SZ][(COLS >> NPC)];
-#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
+	ap_fixed<FLOW_WIDTH,FLOW_INT> buf[2][WIN_SZ][(COLS >> NPC)];
+#pragma HLS ARRAY_RESHAPE variable=buf complete dim=1
+#pragma HLS ARRAY_RESHAPE variable=buf complete dim=2
+
+if (USE_URAM) {	
+#pragma HLS RESOURCE variable=buf core=XPM_MEMORY uram
+}
+else {
 #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
+}	
 
 //initializing row index
 	
@@ -265,7 +254,8 @@ void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
 	#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
 	#pragma HLS pipeline
 	#pragma HLS LOOP_FLATTEN OFF
-			buf[init_buf][col] = _src_mat.read();
+			buf[0][init_buf][col] = _src_mat0.read();
+			buf[1][init_buf][col] = _src_mat1.read();
 		}
 	}
 	
@@ -277,7 +267,8 @@ void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
 			{
 	#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
 	#pragma HLS UNROLL
-				buf[init_buf][col] = buf[row_ind[win_size>>1]][col];
+				buf[0][init_buf][col] = buf[0][row_ind[win_size>>1]][col];
+				buf[1][init_buf][col] = buf[1][row_ind[win_size>>1]][col];
 			}
 		}
 	
@@ -296,7 +287,36 @@ void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
 			// }
 		// }
 		P0 = 0;
-		ProcessMedian3x3<ROWS, COLS, DEPTH, NPC, WORDWIDTH, TC, WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT>(_src_mat, _out_mat, flag, buf, src_buf,OutputValues, P0, img_width, img_height, shift_x, row_ind, row,win_size);
+	    Col_Loop:
+	    for(ap_uint<16> col = 0; col < img_width+(WIN_SZ>>1); col++)
+	    {
+#pragma HLS LOOP_TRIPCOUNT min=1 max=TC
+#pragma HLS pipeline
+#pragma HLS LOOP_FLATTEN OFF
+
+		for(int copy_buf_var=0;copy_buf_var<WIN_SZ;copy_buf_var++)
+		{
+#pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ
+#pragma HLS UNROLL
+			buf_cop[0][copy_buf_var] = buf[0][copy_buf_var][col];
+			buf_cop[1][copy_buf_var] = buf[1][copy_buf_var][col];
+		}
+
+		if(row < img_height && col < img_width) {
+			buf_cop[0][row_ind[win_size-1]] = _src_mat0.read(); // Read data
+			buf_cop[1][row_ind[win_size-1]] = _src_mat1.read(); // Read data
+        }
+		else {
+			buf_cop[0][row_ind[win_size-1]] = 0;
+			buf_cop[1][row_ind[win_size-1]] = 0;
+        }
+
+		buf[0][row_ind[win_size-1]][col] = buf_cop[0][row_ind[win_size-1]];
+		buf[1][row_ind[win_size-1]][col] = buf_cop[1][row_ind[win_size-1]];
+
+		ProcessMedian3x3<ROWS, COLS, DEPTH, NPC, WORDWIDTH, TC, WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT>(_out_mat0, flag0, src_buf[0], buf_cop[0], OutputValues[0], P0, img_width, img_height, shift_x, row_ind, row,col,win_size);
+		ProcessMedian3x3<ROWS, COLS, DEPTH, NPC, WORDWIDTH, TC, WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT>(_out_mat1, flag1, src_buf[1], buf_cop[1], OutputValues[1], P0, img_width, img_height, shift_x, row_ind, row,col,win_size);
+	    } // Col_Loop
 	
 		//update indices
 		ap_uint<13> zero_ind = row_ind[0];
@@ -311,10 +331,13 @@ void auMedian3x3(hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src_mat,
 	} // Row_Loop
 }
 
-template<int ROWS,int COLS,int DEPTH,int NPC,int WORDWIDTH,int PIPELINEFLAG, int WIN_SZ, int WIN_SZ_SQ, int FLOW_WIDTH, int FLOW_INT>
+template<int ROWS,int COLS,int DEPTH,int NPC,int WORDWIDTH,int PIPELINEFLAG, int WIN_SZ, int WIN_SZ_SQ, int FLOW_WIDTH, int FLOW_INT, bool USE_URAM>
 void auMedianBlur(
-		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src,
-		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_dst, hls::stream< bool > &flag, ap_uint<8> win_size,
+		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src0,
+		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_dst0, hls::stream< bool > &flag0,
+		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_src1,
+		hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &_dst1, hls::stream< bool > &flag1,
+        ap_uint<8> win_size,
 		int _border_type,uint16_t imgheight,uint16_t imgwidth)
 {
 #pragma HLS inline off
@@ -329,7 +352,7 @@ void auMedianBlur(
 	imgwidth = imgwidth >> NPC;
 
 
-	auMedian3x3< ROWS, COLS, DEPTH, NPC, WORDWIDTH, (COLS>>NPC)+(WIN_SZ>>1), WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT>(_src, _dst,flag,WIN_SZ,imgheight,imgwidth);
+	auMedian3x3< ROWS, COLS, DEPTH, NPC, WORDWIDTH, (COLS>>NPC)+(WIN_SZ>>1), WIN_SZ, WIN_SZ_SQ, FLOW_WIDTH, FLOW_INT, USE_URAM>(_src0, _dst0,flag0,_src1, _dst1,flag1,WIN_SZ,imgheight,imgwidth);
 
 
 }
diff --git a/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp b/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp
index 2500114..cdc9a7c 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow_oflow_process.hpp
@@ -29,29 +29,38 @@
  ***************************************************************************/
 #ifndef __XF_PYR_DENSE_OPTICAL_FLOW_OFLOW_PROCESS__
 #define __XF_PYR_DENSE_OPTICAL_FLOW_OFLOW_PROCESS__
-template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int WINSIZE, int IT_WIDTH, int IT_INT, int SIXIY_WIDTH, int SIXIY_INT, int SIXYIT_WIDTH, int SIXYIT_INT>
+template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int WINSIZE, int IT_WIDTH, int IT_INT, int SIXIY_WIDTH, int SIXIY_INT, int SIXYIT_WIDTH, int SIXYIT_INT, bool USE_URAM>
 void find_G_and_b_matrix(hls::stream< ap_int<9> > &strmIx, hls::stream< ap_int<9> > &strmIy, hls::stream< ap_fixed<IT_WIDTH,IT_INT> > &strmIt,
 		hls::stream< ap_fixed<SIXIY_WIDTH,SIXIY_INT> > &sigmaIx2, hls::stream< ap_fixed<SIXIY_WIDTH,SIXIY_INT> > &sigmaIy2, hls::stream< ap_fixed<SIXIY_WIDTH,SIXIY_INT> > &sigmaIxIy,
 		hls::stream< ap_fixed<SIXYIT_WIDTH,SIXYIT_INT> > &sigmaIxIt, hls::stream< ap_fixed<SIXYIT_WIDTH,SIXYIT_INT> > &sigmaIyIt, unsigned int rows, unsigned int cols, int level) {
 #pragma HLS inline off
 	// bufLines is used to buffer Ix, Iy, It in that order
 	ap_int<9> bufLines_ix[WINSIZE][MAXWIDTH+(WINSIZE>>1)];
-#pragma HLS array_partition variable=bufLines_ix complete dim=1
+#pragma HLS array_reshape variable=bufLines_ix complete dim=1
 	ap_int<9> bufLines_iy[WINSIZE][MAXWIDTH+(WINSIZE>>1)];
-#pragma HLS array_partition variable=bufLines_iy complete dim=1
+#pragma HLS array_reshape variable=bufLines_iy complete dim=1
 	ap_fixed<IT_WIDTH,IT_INT> bufLines_it[WINSIZE][MAXWIDTH+(WINSIZE>>1)];
-#pragma HLS array_partition variable=bufLines_it complete dim=1
+#pragma HLS array_reshape variable=bufLines_it complete dim=1
 
 	ap_fixed<SIXIY_WIDTH,SIXIY_INT>  colsum_IxIx[MAXWIDTH+(WINSIZE>>1)];
 	ap_fixed<SIXIY_WIDTH,SIXIY_INT>  colsum_IxIy[MAXWIDTH+(WINSIZE>>1)];
 	ap_fixed<SIXIY_WIDTH,SIXIY_INT>  colsum_IyIy[MAXWIDTH+(WINSIZE>>1)];
 	ap_fixed<SIXYIT_WIDTH,SIXYIT_INT> colsum_IxIt[MAXWIDTH+(WINSIZE>>1)];
 	ap_fixed<SIXYIT_WIDTH,SIXYIT_INT> colsum_IyIt[MAXWIDTH+(WINSIZE>>1)];
-#pragma HLS RESOURCE variable=colsum_IxIx core=RAM_T2P_BRAM
-#pragma HLS RESOURCE variable=colsum_IxIy core=RAM_T2P_BRAM
-#pragma HLS RESOURCE variable=colsum_IyIy core=RAM_T2P_BRAM
-#pragma HLS RESOURCE variable=colsum_IxIt core=RAM_T2P_BRAM
-#pragma HLS RESOURCE variable=colsum_IyIt core=RAM_T2P_BRAM
+
+#pragma HLS ARRAY_MAP variable=bufLines_ix instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=bufLines_iy instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=bufLines_it instance=buffers vertical
+
+#pragma HLS ARRAY_MAP variable=colsum_IxIx instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=colsum_IxIy instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=colsum_IyIy instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=colsum_IxIt instance=buffers vertical
+#pragma HLS ARRAY_MAP variable=colsum_IyIt instance=buffers vertical
+
+if (USE_URAM) {
+#pragma HLS RESOURCE variable=bufLines_ix core=XPM_MEMORY uram
+}
 
 	ap_fixed<SIXIY_WIDTH,SIXIY_INT>  colsum_prevWIN_IxIx[WINSIZE];
 	ap_fixed<SIXIY_WIDTH,SIXIY_INT>  colsum_prevWIN_IxIy[WINSIZE];
diff --git a/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp b/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp
index 42ec06e..9b11634 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow_scale.hpp
@@ -31,7 +31,9 @@
 #define __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__
 
 template<int MAXWIDTH, int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int SCALE_WIDTH, int SCALE_INT>
-void load_data (hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm, ap_fixed<FLOW_WIDTH,FLOW_INT> buf[MAXWIDTH], int rows, int cols, bool &flagLoaded, int i, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleI, ap_fixed<SCCMP_WIDTH,SCCMP_INT> &fracI, int &prevIceil) {
+void load_data (hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm0,
+                hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm1,
+                ap_fixed<FLOW_WIDTH,FLOW_INT> buf[2][MAXWIDTH], int rows, int cols, bool &flagLoaded, int i, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleI, ap_fixed<SCCMP_WIDTH,SCCMP_INT> &fracI, int &prevIceil) {
 #pragma HLS inline off
 	ap_fixed<SCCMP_WIDTH,SCCMP_INT> iSmall = i * scaleI;
 	int iSmallFloor = (int) iSmall;
@@ -42,7 +44,8 @@ void load_data (hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm, ap_fixed<F
 #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
 #pragma HLS pipeline ii=1
 #pragma HLS LOOP_FLATTEN OFF
-			buf[i] = inStrm.read();
+			buf[0][i] = inStrm0.read();
+			buf[1][i] = inStrm1.read();
 		}
 		prevIceil = iSmallFloor + 1;
 	}
@@ -70,13 +73,18 @@ ap_fixed<FLOW_WIDTH,FLOW_INT> compute_result(ap_fixed<SCCMP_WIDTH,SCCMP_INT> fra
 } // end compute_result()
 
 template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int RMAPPX_WIDTH, int RMAPPX_INT, int SCALE_WIDTH, int SCALE_INT>
-void process(ap_fixed<FLOW_WIDTH,FLOW_INT> buf[MAXWIDTH], ap_fixed<FLOW_WIDTH,FLOW_INT> buffer[2][MAXWIDTH], unsigned short int outRows, unsigned short int outCols, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> >& outStrm, bool flagLoaded, int row, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleI, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleJ, ap_fixed<SCCMP_WIDTH,SCCMP_INT> fracI, int mul) {
-#pragma HLS array_partition variable=buffer dim=1 complete
+void process(ap_fixed<FLOW_WIDTH,FLOW_INT> buf[2][MAXWIDTH], ap_fixed<FLOW_WIDTH,FLOW_INT> buffer[2][2][MAXWIDTH], unsigned short int outRows, unsigned short int outCols,
+             hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> >& outStrm0,
+             hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> >& outStrm1,
+             bool flagLoaded, int row, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleI, ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleJ, ap_fixed<SCCMP_WIDTH,SCCMP_INT> fracI, int mul) {
 #pragma HLS inline off
 	int bufCount = 0;
 	ap_fixed<FLOW_WIDTH,FLOW_INT> regLoad;
 	int prevJceil = -1;
-	ap_fixed<FLOW_WIDTH,FLOW_INT> i0=0, i1=0, i2=0, i3=0;
+	ap_fixed<FLOW_WIDTH,FLOW_INT> i0[2]={0,0};
+	ap_fixed<FLOW_WIDTH,FLOW_INT> i1[2]={0,0};
+	ap_fixed<FLOW_WIDTH,FLOW_INT> i2[2]={0,0};
+	ap_fixed<FLOW_WIDTH,FLOW_INT> i3[2]={0,0};
 	L3:for (ap_uint<16> j=0; j<outCols; j++) {
 #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
 #pragma HLS pipeline
@@ -92,63 +100,78 @@ void process(ap_fixed<FLOW_WIDTH,FLOW_INT> buf[MAXWIDTH], ap_fixed<FLOW_WIDTH,FL
 		if (row == 0) {
 			fracI = 1;
 			if (j==0) {
-				ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[bufCount];
-				buffer[1][bufCount] = reg;
-				i3 = reg;
+              for (int k=0; k<2; k++) {
+                ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[k][bufCount];
+                buffer[k][1][bufCount] = reg;
+                i3[k] = reg;
+              }
 				fracI = 1; fracJ = 1;
 				bufCount++;
 				prevJceil = 0;
 			}
 			else if (j<outCols) {
 				if (prevJceil == jSmallFloor) {
-					i2 = i3;
-					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[bufCount];
-					buffer[1][bufCount] = reg;
-					i3 = reg;
+                  for (int k=0; k<2; k++) {
+					i2[k] = i3[k];
+					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[k][bufCount];
+					buffer[k][1][bufCount] = reg;
+					i3[k] = reg;
+                  }
 					bufCount++;
 					prevJceil = jSmallFloor + 1;
 				}
 			}
 			else {
-				i3 = buffer[1][bufCount-1];
+				i3[0] = buffer[0][1][bufCount-1];
+				i3[1] = buffer[1][1][bufCount-1];
 				fracI = 1; fracJ = 1;
 			}
 		}
 		else if (row < outRows-1) {
 			if (j==0) {
-				i0 = 0; i2 = 0;
+				i0[0] = 0; i2[0] = 0;
+				i0[1] = 0; i2[1] = 0;
 				fracJ = 1;
 				if (flagLoaded) {
-					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[bufCount];
-					ap_fixed<FLOW_WIDTH,FLOW_INT> tmp = buffer[1][bufCount];
-					buffer[0][bufCount] =  tmp;
-					i1 = tmp;
-					buffer[1][bufCount] = reg;
-					i3 = reg;
+                  for (int k=0; k<2; k++) {
+					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[k][bufCount];
+					ap_fixed<FLOW_WIDTH,FLOW_INT> tmp = buffer[k][1][bufCount];
+					buffer[k][0][bufCount] =  tmp;
+					i1[k] = tmp;
+					buffer[k][1][bufCount] = reg;
+					i3[k] = reg;
+                  }
 					bufCount++;
 				}
 				else {
-					i1 = buffer[0][bufCount];
-					i3 = buffer[1][bufCount];
+                  for (int k=0; k<2; k++) {
+					i1[k] = buffer[k][0][bufCount];
+					i3[k] = buffer[k][1][bufCount];
+                  }
 					bufCount++;
 				}
 				prevJceil = 0;
 			}
 			else if (j < outCols) {
 				if (prevJceil == jSmallFloor) {
-					i0 = i1; i2 = i3;
+					i0[0] = i1[0]; i2[0] = i3[0];
+					i0[1] = i1[1]; i2[1] = i3[1];
 					if (flagLoaded) {		
-						ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[bufCount];
-						ap_fixed<FLOW_WIDTH,FLOW_INT> tmp = buffer[1][bufCount];
-						buffer[0][bufCount] =  tmp;
-						i1 = tmp;
-						buffer[1][bufCount] = reg;
-						i3 = reg;
+                      for (int k=0; k<2; k++) {
+						ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buf[k][bufCount];
+						ap_fixed<FLOW_WIDTH,FLOW_INT> tmp = buffer[k][1][bufCount];
+						buffer[k][0][bufCount] =  tmp;
+						i1[k] = tmp;
+						buffer[k][1][bufCount] = reg;
+						i3[k] = reg;
+                      }
 						bufCount++;
 					}
 					else {
-						i1 = buffer[0][bufCount];
-						i3 = buffer[1][bufCount];
+                      for (int k=0; k<2; k++) {
+						i1[k] = buffer[k][0][bufCount];
+						i3[k] = buffer[k][1][bufCount];
+                      }
 						bufCount++;
 					}
 					prevJceil = jSmallFloor + 1;
@@ -160,40 +183,55 @@ void process(ap_fixed<FLOW_WIDTH,FLOW_INT> buf[MAXWIDTH], ap_fixed<FLOW_WIDTH,FL
 		}
 		else {
 			if (j==0) {
-				i3 = buffer[1][bufCount];
+				i3[0] = buffer[0][1][bufCount];
+				i3[1] = buffer[1][1][bufCount];
 				fracI = 1; fracJ = 1;
 				bufCount++;
 				prevJceil = 0;
 			}
 			else if (j < outCols) {
 				if (prevJceil == jSmallFloor) {
-					i2 = i3;
-					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buffer[1][bufCount];
-					i3 = reg;
+                  for (int k=0; k<2; k++) {
+					i2[k] = i3[k];
+					ap_fixed<FLOW_WIDTH,FLOW_INT> reg = buffer[k][1][bufCount];
+					i3[k] = reg;
+                  }
 					bufCount++;
 					prevJceil = jSmallFloor + 1;
 				}
 				fracI = 1;
 			}
 			else { 
-				i3 = buffer[1][bufCount-1];
+				i3[0] = buffer[0][1][bufCount-1];
+				i3[1] = buffer[1][1][bufCount-1];
 				fracI = 1; fracJ = 1;
 			}
 
 		} // end else
-		ap_fixed<FLOW_WIDTH,FLOW_INT> resIf = compute_result <FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(fracI, fracJ, i0, i1, i2, i3);
-		outStrm.write(resIf<<1);
+		ap_fixed<FLOW_WIDTH,FLOW_INT> resIf0 = compute_result <FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(fracI, fracJ, i0[0], i1[0], i2[0], i3[0]);
+		outStrm0.write(resIf0<<1);
+		ap_fixed<FLOW_WIDTH,FLOW_INT> resIf1 = compute_result <FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(fracI, fracJ, i0[1], i1[1], i2[1], i3[1]);
+		outStrm1.write(resIf1<<1);
 
 	} // end L3
 } // end process()
-template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int RMAPPX_WIDTH, int RMAPPX_INT, int SCALE_WIDTH, int SCALE_INT>
-void scale_up( hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &outStrm, 
+template<unsigned short MAXHEIGHT, unsigned short MAXWIDTH, int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int RMAPPX_WIDTH, int RMAPPX_INT, int SCALE_WIDTH, int SCALE_INT, bool USE_URAM>
+void scale_up( hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm0, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &outStrm0,
+               hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm1, hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &outStrm1,
 			unsigned short int inRows, unsigned short int inCols, unsigned short int outRows, unsigned short int outCols, int mul, const bool scale_up_flag, float scale_comp) {
 #pragma HLS inline off
 
-	ap_fixed<FLOW_WIDTH,FLOW_INT> buffer[2][MAXWIDTH];
-#pragma HLS array_partition variable=buffer dim=1 complete
-	ap_fixed<FLOW_WIDTH,FLOW_INT> buf0[MAXWIDTH], buf1[MAXWIDTH];
+	ap_fixed<FLOW_WIDTH,FLOW_INT> buffer[2][2][MAXWIDTH];
+#pragma HLS array_reshape variable=buffer dim=1 complete
+#pragma HLS array_reshape variable=buffer dim=2 complete
+	ap_fixed<FLOW_WIDTH,FLOW_INT> buf0[2][MAXWIDTH], buf1[2][MAXWIDTH];
+#pragma HLS array_reshape variable=buf0 dim=1 complete
+#pragma HLS array_reshape variable=buf1 dim=1 complete
+if (USE_URAM) {	
+#pragma HLS RESOURCE variable=buffer core=XPM_MEMORY uram
+#pragma HLS RESOURCE variable=buf0   core=XPM_MEMORY uram
+#pragma HLS RESOURCE variable=buf1   core=XPM_MEMORY uram
+}
 	
 	ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleI = (ap_ufixed<SCALE_WIDTH,SCALE_INT>)scale_comp;
 	ap_ufixed<SCALE_WIDTH,SCALE_INT> scaleJ = (ap_ufixed<SCALE_WIDTH,SCALE_INT>)scale_comp;
@@ -213,32 +251,33 @@ void scale_up( hls::stream< ap_fixed<FLOW_WIDTH,FLOW_INT> > &inStrm, hls::stream
 	#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
 	#pragma HLS pipeline II=1
 	#pragma HLS LOOP_FLATTEN OFF
-				outStrm.write((ap_fixed<FLOW_WIDTH,FLOW_INT>)inStrm.read());
+				outStrm0.write((ap_fixed<FLOW_WIDTH,FLOW_INT>)inStrm0.read());
+				outStrm1.write((ap_fixed<FLOW_WIDTH,FLOW_INT>)inStrm1.read());
 			}
 		}
 	}
 	else{
 		int prevIceil = -1;
-		load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil);
+		load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm0, inStrm1, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil);
 		L2:for (ap_uint<16> i=0; i<outRows-1; i++) {
 	#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXHEIGHT
 			if (flag==0) {
-				load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm, buf1, inRows, inCols, flagLoaded1, i+1, scaleI, fracI1, prevIceil);
-				process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, i, scaleI, scaleJ, fracI0, mul);
+				load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm0, inStrm1, buf1, inRows, inCols, flagLoaded1, i+1, scaleI, fracI1, prevIceil);
+				process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>(buf0, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded0, i, scaleI, scaleJ, fracI0, mul);
 				flag = 1;
 			}
 			else {
-				load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm, buf0, inRows, inCols, flagLoaded0, i+1, scaleI, fracI0, prevIceil);
-				process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, i, scaleI, scaleJ, fracI1, mul);
+				load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(inStrm0, inStrm1, buf0, inRows, inCols, flagLoaded0, i+1, scaleI, fracI0, prevIceil);
+				process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT, SCALE_WIDTH, SCALE_INT>(buf1, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded1, i, scaleI, scaleJ, fracI1, mul);
 				flag = 0;
 			}
 		} // end L2
 
 		if (flag ==0) {
-			process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, outRows-1, scaleI, scaleJ, fracI0, mul);
+			process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(buf0, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded0, outRows-1, scaleI, scaleJ, fracI0, mul);
 		}
 		else {
-			process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, outRows-1, scaleI, scaleJ, fracI1, mul);
+			process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(buf1, buffer, outRows, outCols, outStrm0, outStrm1, flagLoaded1, outRows-1, scaleI, scaleJ, fracI1, mul);
 		}
 	}
 
diff --git a/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp b/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp
index af41219..655aee1 100644
--- a/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp
+++ b/include/imgproc/xf_pyr_dense_optical_flow_wrapper.hpp
@@ -51,11 +51,11 @@ namespace xf{
 #pragma SDS data data_mover("_next_image.data":AXIDMA_SIMPLE)
 #pragma SDS data data_mover("_streamFlowin.data":AXIDMA_SIMPLE)
 #pragma SDS data data_mover("_streamFlowout.data":AXIDMA_SIMPLE)
-template<int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int FLOW_WIDTH, int FLOW_INT, int TYPE, int ROWS, int COLS, int NPC>
+template<int NUM_PYR_LEVELS, int NUM_LINES, int WINSIZE, int FLOW_WIDTH, int FLOW_INT, int TYPE, int ROWS, int COLS, int NPC, bool USE_URAM = false>
 void densePyrOpticalFlow(xf::Mat<XF_8UC1,ROWS,COLS,XF_NPPC1> & _current_img, xf::Mat<XF_8UC1,ROWS,COLS,XF_NPPC1> & _next_image, xf::Mat<XF_32UC1,ROWS,COLS,XF_NPPC1> & _streamFlowin, xf::Mat<XF_32UC1,ROWS,COLS,XF_NPPC1> & _streamFlowout, const int level, const unsigned char scale_up_flag, float scale_in, ap_uint<1> init_flag)
 {
 	#pragma HLS INLINE OFF
-	xFLKOpticalFlowDenseKernel<ROWS, COLS, NUM_PYR_LEVELS, NUM_LINES, WINSIZE, FLOW_WIDTH, FLOW_INT>((unsigned char *)_current_img.data, (unsigned char *)_next_image.data, (unsigned int *)_streamFlowin.data, (unsigned int *)_streamFlowout.data, _current_img.rows, _current_img.cols, _streamFlowin.rows, _streamFlowin.cols, level, scale_up_flag, scale_in, init_flag);
+	xFLKOpticalFlowDenseKernel<ROWS, COLS, NUM_PYR_LEVELS, NUM_LINES, WINSIZE, FLOW_WIDTH, FLOW_INT, USE_URAM>((unsigned char *)_current_img.data, (unsigned char *)_next_image.data, (unsigned int *)_streamFlowin.data, (unsigned int *)_streamFlowout.data, _current_img.rows, _current_img.cols, _streamFlowin.rows, _streamFlowin.cols, level, scale_up_flag, scale_in, init_flag);
 }
 }	
 #endif
diff --git a/include/imgproc/xf_pyr_down.hpp b/include/imgproc/xf_pyr_down.hpp
index 88df8e8..f1c9ef4 100644
--- a/include/imgproc/xf_pyr_down.hpp
+++ b/include/imgproc/xf_pyr_down.hpp
@@ -37,7 +37,7 @@
 
 namespace xf{
 
-template <unsigned int ROWS, unsigned int COLS, unsigned int TYPE, unsigned int NPC>
+template <unsigned int ROWS, unsigned int COLS, unsigned int TYPE, unsigned int NPC, bool USE_URAM>
 void xFpyrDownKernel(XF_TNAME(TYPE,NPC) *in_image, XF_TNAME(TYPE,NPC) *out_image, unsigned short in_rows, unsigned short in_cols)
 {
 #pragma HLS DATAFLOW
@@ -55,7 +55,7 @@ void xFpyrDownKernel(XF_TNAME(TYPE,NPC) *in_image, XF_TNAME(TYPE,NPC) *out_image
 			read_pointer++;
 		}
 	}
-	xFPyrDownGaussianBlur<ROWS,COLS,TYPE, NPC, XF_WORDWIDTH(TYPE,NPC), 0,5,25>(_filter_in, _filter_out, 5, XF_BORDER_CONSTANT,in_rows,in_cols);
+	xFPyrDownGaussianBlur<ROWS,COLS,TYPE, NPC, XF_WORDWIDTH(TYPE,NPC), 0,5,25, USE_URAM>(_filter_in, _filter_out, 5, XF_BORDER_CONSTANT,in_rows,in_cols);
 	unsigned int write_ptr = 0;
 	for(int i=0;i<in_rows;i++)
 	{
@@ -83,13 +83,13 @@ void xFpyrDownKernel(XF_TNAME(TYPE,NPC) *in_image, XF_TNAME(TYPE,NPC) *out_image
 //#pragma SDS data data_mover("_src.data":AXIDMA_SIMPLE)
 //#pragma SDS data data_mover("_dst.data":AXIDMA_SIMPLE)
 #pragma SDS data copy("_src.data"[0:"_src.size"], "_dst.data"[0:"_dst.size"])
-template<int TYPE, int ROWS, int COLS, int NPC> 
+template<int TYPE, int ROWS, int COLS, int NPC, bool USE_URAM = false> 
 void pyrDown (xf::Mat<TYPE, ROWS, COLS, NPC> & _src, xf::Mat<TYPE, ROWS, COLS, NPC> & _dst)
 {
 #pragma HLS INLINE OFF
 	unsigned short input_height = _src.rows;
 	unsigned short input_width = _src.cols;
-	xFpyrDownKernel<ROWS, COLS, TYPE, NPC>(_src.data, _dst.data, input_height, input_width);
+	xFpyrDownKernel<ROWS, COLS, TYPE, NPC, USE_URAM>(_src.data, _dst.data, input_height, input_width);
 	return;
 }
 }
diff --git a/include/imgproc/xf_pyr_down_gaussian_blur.hpp b/include/imgproc/xf_pyr_down_gaussian_blur.hpp
index c87db8e..6c61198 100644
--- a/include/imgproc/xf_pyr_down_gaussian_blur.hpp
+++ b/include/imgproc/xf_pyr_down_gaussian_blur.hpp
@@ -99,29 +99,31 @@ void xFPyrDownprocessgaussian(hls::stream< XF_TNAME(DEPTH,NPC) > & _src_mat,
 #pragma HLS LOOP_FLATTEN OFF
 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC
 #pragma HLS pipeline
-		if(row < img_height && col < img_width)
-			buf[row_ind[win_size-1]][col] = _src_mat.read(); // Read data
-
 		for(int copy_buf_var=0;copy_buf_var<WIN_SZ;copy_buf_var++)
 		{
 #pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ
 #pragma HLS UNROLL
-			if(	(row >(img_height-1)) && (copy_buf_var>(win_size-1-(row-(img_height-1)))))
-			{
-				buf_cop[copy_buf_var] = buf[(row_ind[win_size-1-(row-(img_height-1))])][col];
-			}
-			else
-			{
-				buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col];
-			}
+           buf_cop[copy_buf_var] = buf[copy_buf_var][col];
 		}
+
+        if(row < img_height && col < img_width)
+            buf    [row_ind[win_size-1]][col] =
+            buf_cop[row_ind[win_size-1]]      = _src_mat.read(); // Read data
+
 		for(int extract_px=0;extract_px<WIN_SZ;extract_px++)
 		{
 #pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ
 #pragma HLS UNROLL
 			if(col<img_width)
 			{
-				src_buf[extract_px][win_size-1] = buf_cop[extract_px];
+               if(	(row >(img_height-1)) && (extract_px>(win_size-1-(row-(img_height-1)))))
+               {
+                  src_buf[extract_px][win_size-1] = buf_cop[(row_ind[win_size-1-(row-(img_height-1))])];
+               }
+               else
+               {
+                  src_buf[extract_px][win_size-1] = buf_cop[(row_ind[extract_px])];
+               }
 			}
 			else
 			{
@@ -158,7 +160,7 @@ void xFPyrDownprocessgaussian(hls::stream< XF_TNAME(DEPTH,NPC) > & _src_mat,
 
 
 
-template<int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC,int WIN_SZ, int WIN_SZ_SQ>
+template<int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC,int WIN_SZ, int WIN_SZ_SQ, bool USE_URAM>
 void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat,
 		hls::stream< XF_TNAME(DEPTH,NPC) > &_out_mat, ap_uint<8> win_size,
 		uint16_t img_height, uint16_t img_width)
@@ -181,8 +183,12 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat,
 	XF_TNAME(DEPTH,NPC) P0;
 
 	XF_TNAME(DEPTH,NPC) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))];
-#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
+#pragma HLS ARRAY_RESHAPE variable=buf complete dim=1 	
+if (USE_URAM) {
+#pragma HLS RESOURCE variable=buf core=XPM_MEMORY uram
+} else {
 #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
+}
 
 //initializing row index
 	
@@ -209,11 +215,13 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat,
 		for(col = 0; col < img_width; col++)
 		{
 	#pragma HLS LOOP_TRIPCOUNT min=1 max=TC
+    #pragma HLS pipeline
+			XF_TNAME(DEPTH,NPC) const bufTemp = buf[row_ind[win_size>>1]][col];
 			for(int init_buf=0; init_buf < WIN_SZ>>1;init_buf++)
 			{
 	#pragma HLS LOOP_TRIPCOUNT min=1 max=WIN_SZ
 	#pragma HLS UNROLL
-				buf[init_buf][col] = buf[row_ind[win_size>>1]][col];
+				buf[init_buf][col] = bufTemp;
 			}
 		}
 	
@@ -237,7 +245,7 @@ void xf_pyrdown_gaussian_nxn(hls::stream< XF_TNAME(DEPTH,NPC) > &_src_mat,
 	} // Row_Loop
 }
 
-template<int ROWS,int COLS,int DEPTH,int NPC,int WORDWIDTH,int PIPELINEFLAG, int WIN_SZ, int WIN_SZ_SQ>
+template<int ROWS,int COLS,int DEPTH,int NPC,int WORDWIDTH,int PIPELINEFLAG, int WIN_SZ, int WIN_SZ_SQ, bool USE_URAM>
 void xFPyrDownGaussianBlur(
 		hls::stream< XF_TNAME(DEPTH,NPC) > &_src,
 		hls::stream< XF_TNAME(DEPTH,NPC) > &_dst, ap_uint<8> win_size,
@@ -249,7 +257,7 @@ void xFPyrDownGaussianBlur(
 
 	imgwidth = imgwidth >> XF_BITSHIFT(NPC);
 
-	xf_pyrdown_gaussian_nxn<ROWS,COLS,DEPTH,NPC,WORDWIDTH,(COLS>>XF_BITSHIFT(NPC))+(WIN_SZ>>1),WIN_SZ, WIN_SZ_SQ>(_src, _dst,WIN_SZ,imgheight,imgwidth);
+	xf_pyrdown_gaussian_nxn<ROWS,COLS,DEPTH,NPC,WORDWIDTH,(COLS>>XF_BITSHIFT(NPC))+(WIN_SZ>>1),WIN_SZ, WIN_SZ_SQ, USE_URAM>(_src, _dst,WIN_SZ,imgheight,imgwidth);
 
 }
 
diff --git a/include/imgproc/xf_remap.hpp b/include/imgproc/xf_remap.hpp
index 459e7e1..4ea675a 100644
--- a/include/imgproc/xf_remap.hpp
+++ b/include/imgproc/xf_remap.hpp
@@ -44,7 +44,7 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace xf{
 
-template <int WIN_ROW, int ROWS, int COLS, typename SRC_T, typename DST_T, typename MAP_T>
+template <int WIN_ROW, int ROWS, int COLS, bool USE_URAM, typename SRC_T, typename DST_T, typename MAP_T>
 void xFRemapNNI(
 		hls::stream< SRC_T >   &src,
 		hls::stream< DST_T >   &dst,
@@ -57,6 +57,24 @@ void xFRemapNNI(
 #pragma HLS ARRAY_PARTITION variable=buf complete dim=1
 
 	SRC_T s;
+
+	ap_uint<64> bufUram[WIN_ROW][(COLS+7)/8];
+#pragma HLS RESOURCE variable=bufUram core=XPM_MEMORY uram
+    // Additionally split the URAM buffer into individual URAMs to avoid their built-in cascading, which limits timing
+    // because VHLS is unable to schedule the built-in cascade register (OREG_CAS).
+    enum {
+      BUF_DEPTH = WIN_ROW * ((COLS+7)/8),
+      URAM_DEPTH = 4096,
+      BUF_URAMS = (BUF_DEPTH + URAM_DEPTH-1) / URAM_DEPTH,
+      PART_FACTOR = BUF_URAMS != 2 ? BUF_URAMS : 1 // excluding factor=2 as it leads to II degradation, so built-in cascading is left for the case of just 2 URAMs
+    };
+    if (USE_URAM) {
+      assert(PART_FACTOR <= ((COLS+7)/8));
+      #pragma HLS array_partition variable=bufUram dim=2 factor=PART_FACTOR block
+    }
+	SRC_T sx8[8];
+#pragma HLS ARRAY_PARTITION variable=sx8 complete dim=1
+
 	DST_T d;
 	MAP_T mx_fl;
 	MAP_T my_fl;
@@ -75,14 +93,23 @@ void xFRemapNNI(
 		loop_width: for( int j=0; j< cols; j++)
 		{
 #pragma HLS PIPELINE II=1
-#pragma HLS dependence array inter false
+#pragma HLS dependence variable=buf     inter false
+#pragma HLS dependence variable=bufUram inter false
+#pragma HLS dependence variable=r       inter false
 #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
 
 			if(i<rows&& j<cols)
 			{
 				src >> s;
+
+                if (USE_URAM) {
+			      sx8[j%8] = s;
+			      for (int k=0; k<8; k++) bufUram[i % WIN_ROW][j/8](k*8+7,k*8) = sx8[k];
+		        }
 			}
-			buf[i % WIN_ROW][j] = s;
+
+            if (!USE_URAM)
+			  buf[i % WIN_ROW][j] = s;
 			r[i % WIN_ROW] = i;
 
 			if(i>=ishift)
@@ -94,6 +121,12 @@ void xFRemapNNI(
 
 				bool in_range = (y>=0 && y<rows && r[y%WIN_ROW] == y && x>=0 && x<cols);
 				if(in_range)
+				  if (USE_URAM) {
+				    DST_T dx9[8];
+#pragma HLS ARRAY_PARTITION variable=dx9 complete dim=1
+				    for (int k=0; k<8; k++) dx9[k] = bufUram[y%WIN_ROW][x/8](k*8+7,k*8);
+				    d = dx9[x%8];
+				  } else
 					d = buf[y%WIN_ROW][x];
 				else
 					d = 0;
@@ -106,7 +139,7 @@ void xFRemapNNI(
 
 
 #define TWO_POW_16 65536
-template <int WIN_ROW, int ROWS, int COLS, typename SRC_T, typename DST_T, typename MAP_T>
+template <int WIN_ROW, int ROWS, int COLS, bool USE_URAM, typename SRC_T, typename DST_T, typename MAP_T>
 void xFRemapLI(
 		hls::stream< SRC_T >   &src,
 		hls::stream< DST_T >   &dst,
@@ -116,10 +149,33 @@ void xFRemapLI(
 )
 {
 	// Add one to always get zero for boundary interpolation. Maybe need initialization here?
+    //AK,ZoTech: this buffer needs initialization as workaround for correct boundary filtering, otherwise X are generated in co-sim.
 	DST_T buf[WIN_ROW/2+1][2][COLS/2+1][2];
 #pragma HLS array_partition complete variable=buf dim=2
 #pragma HLS array_partition complete variable=buf dim=4
 	SRC_T s;
+	
+    //URAM storage granularity is a 3x3-pel block on a 2x2-pel picture grid; it fits into one URAM word
+    ap_uint<72> bufUram[(WIN_ROW+1)/2][(COLS+1)/2];
+#pragma HLS RESOURCE variable=bufUram core=XPM_MEMORY uram
+    // Additionally split the URAM buffer into individual URAMs to avoid their built-in cascading, which limits timing
+    // because VHLS is unable to schedule the built-in cascade register (OREG_CAS).
+    enum {
+      BUF_DEPTH = ((WIN_ROW+1)/2) * ((COLS+1)/2),
+      URAM_DEPTH = 4096,
+      BUF_URAMS = (BUF_DEPTH + URAM_DEPTH-1) / URAM_DEPTH,
+      PART_FACTOR = BUF_URAMS != 2 ? BUF_URAMS : 1 // excluding factor=2 as it leads to II degradation, so built-in cascading is left for the case of just 2 URAMs
+    };
+    if (USE_URAM) {
+      assert(PART_FACTOR <= ((COLS+1)/2));
+      #pragma HLS array_partition variable=bufUram dim=2 factor=PART_FACTOR block
+    }
+    SRC_T lineBuf[COLS]; //additional caching as VHLS doesn't support URAM Byte Enables
+	SRC_T s3x3[2][9]; //URAM-wide word is doubled to resolve pipelining read/write dependency
+#pragma HLS ARRAY_PARTITION complete variable=s3x3 dim=0
+	SRC_T s3x3_2[9];
+    SRC_T s0,s3;
+
 	MAP_T mx;
 	MAP_T my;
 
@@ -135,25 +191,73 @@ void xFRemapLI(
 #pragma HLS LOOP_FLATTEN OFF
 #pragma HLS LOOP_TRIPCOUNT min=1 max=row_tripcount
 
-		loop_width: for( int j=0; j< cols; j++)
+		loop_width: for( int j=0; j< cols+3; j++)
 		{
 #pragma HLS PIPELINE II=1
-#pragma HLS dependence array inter false
+#pragma HLS dependence variable=buf     inter false
+#pragma HLS dependence variable=bufUram inter false
+#pragma HLS dependence variable=s3x3    inter false RAW
+#pragma HLS dependence variable=r1      inter false
+#pragma HLS dependence variable=r2      inter false
 #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
 
 			if(i<rows&& j<cols)
 			{
 				src >> s;
 			}
+
+            if (USE_URAM && i<rows+1) {
+			  if (!(i%2)) { // even row, stored in line buffer for 1st row of 3x3 block, and in URAM for 3rd row of 3x3 block
+                if (!(j%2)) { // even col
+                  if (j<cols) lineBuf[j] = s0 = s;
+                  else s0 = 0;
+                  s3x3[!(j&2)][8] = s0;
+				  if ((i/2)>0 && (j/2)>1) for (int k=0; k<9; k++) bufUram[(i/2-1)%(WIN_ROW/2)][j/2-2](k*8+7,k*8) = s3x3[!!(j&2)][k];
+                } else if (j<cols) { // odd col
+                  lineBuf[j] = s;
+                  if ((i/2)>0) {
+                  	for (int k=0; k<6; k++) s3x3[!!(j&2)][k] = bufUram[(i/2-1)%(WIN_ROW/2)][j/2](k*8+7,k*8);
+                    s3x3[!!(j&2)][6] = s0;
+                    s3x3[!!(j&2)][7] = s;
+                    s3x3[!!(j&2)][8] = 0;
+                  }
+   		        }
+              } else if (j<cols) { // odd row: stored in URAM together with the 1st row of the 3x3 block fetched from the line buffer
+                if (!(j%2)) { // even col
+                  s3x3_2[2] = s0 = lineBuf[j];
+                  s3x3_2[5] = s3 = s;
+				  if ((j/2)>0) for (int k=0; k<9; k++) bufUram[(i/2)%(WIN_ROW/2)][j/2-1](k*8+7,k*8) = s3x3_2[k];
+                } else { // odd col
+                  s3x3_2[0] = s0;
+				  s3x3_2[1] = lineBuf[j];
+                  s3x3_2[3] = s3;
+				  s3x3_2[4] = s;
+
+				  // this clearing is needed only for the case of bottom zero padding (currently the last (bottom-right) sample value is used)
+                  s3x3_2[6] = 0;
+                  s3x3_2[7] = 0;
+                  s3x3_2[8] = 0;
+			      //if (j==(cols-1)) { //this clearing and save are needed only at the last column but may be done every cycle
+			      s3x3_2[2] = 0;
+			      s3x3_2[5] = 0;
+			      for (int k=0; k<9; k++) bufUram[(i/2)%(WIN_ROW/2)][j/2](k*8+7,k*8) = s3x3_2[k];
+			      //}
+			    }
+              }
+            }
+
+            if (!USE_URAM && j<cols) {
 			if((i % WIN_ROW) % 2) {
 				buf[(i % WIN_ROW)/2][(i % WIN_ROW) % 2][j/2][j%2] = s;
 			} else {
 				buf[(i % WIN_ROW)/2][(i % WIN_ROW) % 2][j/2][j%2] = s;
 			}
+            }
+
 			r1[i % WIN_ROW] = i;
 			r2[i % WIN_ROW] = i;
 
-			if(i>=ishift)
+			if(i>=ishift && j<cols)
 			{
 				mapx >> mx;
 				mapy >> my;
@@ -198,6 +302,16 @@ void xFRemapLI(
 				ya1 = (y/2)%(WIN_ROW/2);
 
 				DST_T d00, d01, d10, d11;
+
+              if (USE_URAM) {
+                DST_T d3x3[9];
+#pragma HLS ARRAY_PARTITION variable=d3x3 complete dim=1
+                for (int k=0; k<9; k++) d3x3[k] = bufUram[ya1][xa1](k*8+7,k*8);
+				d00 = d3x3[(y%2  )*3 + x%2  ];
+				d01 = d3x3[(y%2  )*3 + x%2+1];
+				d10 = d3x3[(y%2+1)*3 + x%2  ];
+				d11 = d3x3[(y%2+1)*3 + x%2+1];
+			  } else {
 				d00=buf[ya0][0][xa0][0];
 				d01=buf[ya0][0][xa1][1];
 				d10=buf[ya1][1][xa0][0];
@@ -211,6 +325,7 @@ void xFRemapLI(
 					std::swap(d00,d10);
 					std::swap(d01,d11);
 				}
+			  }
 				ap_ufixed<2*HLS_INTER_BITS + 1, 1> k01 = (1-iv)*(  iu); // iu-iu*iv
 				ap_ufixed<2*HLS_INTER_BITS + 1, 1> k10 = (  iv)*(1-iu); // iv-iu*iv
 				ap_ufixed<2*HLS_INTER_BITS + 1, 1> k11 = (  iv)*(  iu); // iu*iv
@@ -230,7 +345,7 @@ void xFRemapLI(
 	}
 }
 
-template <int WIN_ROW, int INTERPOLATION_TYPE, int ROWS, int COLS, typename SRC_T, typename DST_T, typename MAP_T>
+template <int WIN_ROW, int INTERPOLATION_TYPE, int ROWS, int COLS, bool USE_URAM, typename SRC_T, typename DST_T, typename MAP_T>
 void xFRemapKernel(
 		hls::stream< SRC_T >    &src,
 		hls::stream< DST_T >   &dst,
@@ -240,9 +355,9 @@ void xFRemapKernel(
 )
 {
 	if(INTERPOLATION_TYPE == XF_INTERPOLATION_NN) {
-		xFRemapNNI<WIN_ROW,ROWS,COLS>(src, dst, mapx, mapy,rows,cols);
+		xFRemapNNI<WIN_ROW,ROWS,COLS,USE_URAM>(src, dst, mapx, mapy,rows,cols);
 	} else if(INTERPOLATION_TYPE == XF_INTERPOLATION_BILINEAR) {
-		xFRemapLI<WIN_ROW,ROWS,COLS>(src, dst, mapx, mapy,rows,cols);
+		xFRemapLI<WIN_ROW,ROWS,COLS,USE_URAM>(src, dst, mapx, mapy,rows,cols);
 	}
 	else {
 		assert (((INTERPOLATION_TYPE == XF_INTERPOLATION_NN)||(INTERPOLATION_TYPE == XF_INTERPOLATION_BILINEAR)) && "The INTERPOLATION_TYPE must be either XF_INTERPOLATION_NN or XF_INTERPOLATION_BILINEAR");
@@ -253,7 +368,7 @@ void xFRemapKernel(
 //#pragma SDS data mem_attribute("_src_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_remapped_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_mapx_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS,"_mapy_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS)
 #pragma SDS data access_pattern("_src_mat.data":SEQUENTIAL,"_remapped_mat.data":SEQUENTIAL,"_mapx_mat.data":SEQUENTIAL,"_mapy_mat.data":SEQUENTIAL)
 #pragma SDS data copy("_src_mat.data"[0:"_src_mat.rows*_src_mat.cols"], "_remapped_mat.data"[0:"_remapped_mat.size"],"_mapx_mat.data"[0:"_mapx_mat.size"],"_mapy_mat.data"[0:"_mapy_mat.size"])
-template<int WIN_ROWS, int INTERPOLATION_TYPE, int SRC_T, int MAP_T, int DST_T, int ROWS, int COLS, int NPC = XF_NPPC1>
+template<int WIN_ROWS, int INTERPOLATION_TYPE, int SRC_T, int MAP_T, int DST_T, int ROWS, int COLS, int NPC = XF_NPPC1, bool USE_URAM = false>
 void remap (xf::Mat<SRC_T, ROWS, COLS, NPC> &_src_mat, xf::Mat<DST_T, ROWS, COLS, NPC> &_remapped_mat, xf::Mat<MAP_T, ROWS, COLS, NPC> &_mapx_mat,
 		xf::Mat<MAP_T, ROWS, COLS, NPC> &_mapy_mat)
 {
@@ -304,7 +419,7 @@ void remap (xf::Mat<SRC_T, ROWS, COLS, NPC> &_src_mat, xf::Mat<DST_T, ROWS, COLS
 		}
 	}
 
-	xFRemapKernel <WIN_ROWS,INTERPOLATION_TYPE,ROWS,COLS> (_src, _remapped, _mapx, _mapy, rows, cols);
+	xFRemapKernel <WIN_ROWS,INTERPOLATION_TYPE,ROWS,COLS,USE_URAM> (_src, _remapped, _mapx, _mapy, rows, cols);
 
 	xfremap_output_loop:
 	for (int i = 0; i < loop_count; i++)
diff --git a/include/imgproc/xf_stereoBM.hpp b/include/imgproc/xf_stereoBM.hpp
index 93448c5..b705546 100644
--- a/include/imgproc/xf_stereoBM.hpp
+++ b/include/imgproc/xf_stereoBM.hpp
@@ -62,99 +62,99 @@ template<typename T>
 T xFabsdiff2(T a, T b)
 {
 #pragma HLS INLINE
-	int x = a-b;
+  int x = a-b;
 #pragma HLS RESOURCE variable=x core=AddSubnS
-	T r;
-	if (x > 0)
-	{
-		r = x;
-	}
-	else
-	{
-		r = -x;
-	}
-	return r;
+  T r;
+  if (x > 0)
+  {
+    r = x;
+  }
+  else
+  {
+    r = -x;
+  }
+  return r;
 }
 
 template<int SIZE>
 class xFMinSAD
 {
 public:
-	template <typename T, typename T_idx>
-	static void find(T a[SIZE], T_idx &loc, T &val)
-	{
+  template <typename T, typename T_idx>
+  static void find(T a[SIZE], T_idx &loc, T &val)
+  {
 #pragma HLS INLINE
 #pragma HLS array_partition variable=a complete dim=0
 
-		T a1[SIZE/2];
-		T a2[SIZE-SIZE/2];
+    T a1[SIZE/2];
+    T a2[SIZE-SIZE/2];
 
-		for(int i = 0; i < SIZE/2; i++)
-		{
+    for(int i = 0; i < SIZE/2; i++)
+    {
 #pragma HLS UNROLL
-			a1[i] = a[i];
-		}
-		for(int i = 0; i < SIZE-SIZE/2; i++)
-		{
+      a1[i] = a[i];
+    }
+    for(int i = 0; i < SIZE-SIZE/2; i++)
+    {
 #pragma HLS UNROLL
-			a2[i] = a[i+SIZE/2];
-		}
-
-		T_idx l1,l2;
-		T v1,v2;
-		xFMinSAD<SIZE/2>::find(a1,l1,v1);
-		xFMinSAD<SIZE-SIZE/2>::find(a2,l2,v2);
-
-		if(v2 <= v1)
-		{
-			val = v2;
-			loc = l2+SIZE/2;
-		}
-		else
-		{
-			val = v1;
-			loc = l1;
-		}
-	}
+      a2[i] = a[i+SIZE/2];
+    }
+
+    T_idx l1,l2;
+    T v1,v2;
+    xFMinSAD<SIZE/2>::find(a1,l1,v1);
+    xFMinSAD<SIZE-SIZE/2>::find(a2,l2,v2);
+
+    if(v2 <= v1)
+    {
+      val = v2;
+      loc = l2+SIZE/2;
+    }
+    else
+    {
+      val = v1;
+      loc = l1;
+    }
+  }
 };
 
 template<>
 class xFMinSAD<1>
 {
 public:
-	template <typename T, typename T_idx>
-	static void find(T a[1], T_idx &loc, T &val)
-	{
+  template <typename T, typename T_idx>
+  static void find(T a[1], T_idx &loc, T &val)
+  {
 #pragma HLS INLINE
 
-		loc = 0;
-		val = a[0];
-	}
+    loc = 0;
+    val = a[0];
+  }
 };
 
 template<>
 class xFMinSAD<2>
 {
 public:
-	template <typename T, typename T_idx>
-	static void find(T a[2], T_idx &loc, T &val)
-	{
+  template <typename T, typename T_idx>
+  static void find(T a[2], T_idx &loc, T &val)
+  {
 #pragma HLS INLINE
 #pragma HLS array_partition variable=a complete dim=0
 
-		T_idx l1=0, l2=1;
-		T v1=a[0], v2=a[1];
-		if(v2 <= v1)
-		{
-			val = v2;
-			loc = l2;
-		}
-		else
-		{
-			val = v1;
-			loc = l1;
-		}
-	}
+    T_idx l1=0, l2=1;
+    T v1=a[0], v2=a[1];
+    if(v2 <= v1)
+    {
+      val = v2;
+      loc = l2;
+    }
+    else
+    {
+      val = v1;
+      loc = l1;
+    }
+  }
 };
 
 /* TEXTURE THRESHOLD computation */
@@ -163,32 +163,32 @@ void xFUpdateTextureSum(unsigned char window[WSIZE][L_WIN_COLS],unsigned char l_
 {
 #pragma HLS INLINE
 
-	int abs_diff[WSIZE];
-	int col_sums = 0;
+  int abs_diff[WSIZE];
+  int col_sums = 0;
 
-	text_sum_loop1:
-	for (int i = 0; i < WSIZE; i++)
-	{
+  text_sum_loop1:
+  for (int i = 0; i < WSIZE; i++)
+  {
 #pragma HLS UNROLL
-		col_sums += (i > row? 0 : xFabsdiff2((int)(l_tmp[i]), cap)) - (((col < WSIZE) || (i > row) ) ? 0 : xFabsdiff2((int)window[i][WSIZE-1], cap));
-	}
+    col_sums += (i > row? 0 : xFabsdiff2((int)(l_tmp[i]), cap)) - (((col < WSIZE) || (i > row) ) ? 0 : xFabsdiff2((int)window[i][WSIZE-1], cap));
+  }
 
-	int tmp_prev[2];
-	int tmp_int_sums;
+  int tmp_prev[2];
+  int tmp_int_sums;
 
-	tmp_prev[0] = col>0 ? (int)text_sum[0]:(int)0;
-	tmp_prev[1] = col_sums;
+  tmp_prev[0] = col>0 ? (int)text_sum[0]:(int)0;
+  tmp_prev[1] = col_sums;
 
-	//shift right
-	for(int j = WSIZE-1; j >= 1; j--)
-	{
+  //shift right
+  for(int j = WSIZE-1; j >= 1; j--)
+  {
 #pragma HLS UNROLL
-		text_sum[j] = text_sum[j-1];
-	}
+    text_sum[j] = text_sum[j-1];
+  }
 
-	//	shift_right<ap_uint<32>, NDISP_UNITS,SAD_COL_SIZE,NPC>(text_sum);
-	tmp_int_sums = tmp_prev[0] + tmp_prev[1];
-	text_sum[0] = tmp_int_sums;
+  //  shift_right<ap_uint<32>, NDISP_UNITS,SAD_COL_SIZE,NPC>(text_sum);
+  tmp_int_sums = tmp_prev[0] + tmp_prev[1];
+  text_sum[0] = tmp_int_sums;
 }
 
 template<typename T,int ROWS,int COLS>
@@ -196,57 +196,57 @@ void xFShiftRight(T buf[ROWS][COLS])
 {
 #pragma HLS INLINE
 
-	shift_right_loop2:
-	for(unsigned char j = COLS-1; j >= 1; j--)
-	{
+  shift_right_loop2:
+  for(unsigned char j = COLS-1; j >= 1; j--)
+  {
 #pragma HLS UNROLL
-		shift_right_loop1:
-		for(unsigned char i = 0; i < ROWS; i++)
-		{
+    shift_right_loop1:
+    for(unsigned char i = 0; i < ROWS; i++)
+    {
 #pragma HLS UNROLL
-			buf[i][j] = buf[i][j-1];
-		}
-	}
+      buf[i][j] = buf[i][j-1];
+    }
+  }
 }
 
 template<int ROWS,int COLS,typename T>
 void xFInsertLeft(T buf[ROWS][COLS],T tmp[ROWS])
 {
 #pragma HLS INLINE
-	insert_right_loop1:
-	for(unsigned char i = 0; i < ROWS; i++)
-	{
+  insert_right_loop1:
+  for(unsigned char i = 0; i < ROWS; i++)
+  {
 #pragma HLS UNROLL
-		buf[i][0] = tmp[i];
-	}
+    buf[i][0] = tmp[i];
+  }
 }
 
 template<int WSIZE, int L_WIN_COLS, int R_WIN_COLS, typename T>
 short int xFSADComputeInc(
-		T l_win[WSIZE][L_WIN_COLS],
-		T r_win_s[WSIZE][R_WIN_COLS],
-		unsigned char d,
-		unsigned short col,
-		short int sad_cols_d[WSIZE])
+    T l_win[WSIZE][L_WIN_COLS],
+    T r_win_s[WSIZE][R_WIN_COLS],
+    unsigned char d,
+    unsigned short col,
+    short int sad_cols_d[WSIZE])
 {
 #pragma HLS inline
-	short int a_sum = 0, b_sum = 0;
-	// compute new column sads;
-	for (unsigned char i = 0; i < WSIZE; i++) {
-		b_sum += __ABS((unsigned char)l_win[i][0] - (unsigned char)r_win_s[i][d]);
-	}
-	// valid guard;
-	if (col < d) b_sum = 0;
-	// get previous sad_cols value;
-	a_sum = sad_cols_d[WSIZE-1];
-	// shift sad_cols[d];
-	for (unsigned char j = WSIZE-1; j > 0; j--) {
-		sad_cols_d[j] = sad_cols_d[j-1];
-	}
-	// fill in sad_cols with newly computed values;
-	sad_cols_d[0] = b_sum;
-
-	return (-a_sum+b_sum);
+  short int a_sum = 0, b_sum = 0;
+  // compute new column sads;
+  for (unsigned char i = 0; i < WSIZE; i++) {
+    b_sum += __ABS((unsigned char)l_win[i][0] - (unsigned char)r_win_s[i][d]);
+  }
+  // valid guard;
+  if (col < d) b_sum = 0;
+  // get previous sad_cols value;
+  a_sum = sad_cols_d[WSIZE-1];
+  // shift sad_cols[d];
+  for (unsigned char j = WSIZE-1; j > 0; j--) {
+    sad_cols_d[j] = sad_cols_d[j-1];
+  }
+  // fill in sad_cols with newly computed values;
+  sad_cols_d[0] = b_sum;
+
+  return (-a_sum+b_sum);
 }
 
 
@@ -256,264 +256,264 @@ int WSIZE,int NDISP,int NDISP_UNIT, int SWEEP_FACT, int ROW_TC,
 int COL_TC,int BUF_SIZE,
 int LWINWIDTH,int RWINWIDTH,int DISPWORDWIDTH,int SADWORDWIDTH>
 void xFSADBlockMatching(
-		hls::stream<XF_TNAME(WORDWIDTH_SRC,1)> &left,
-		hls::stream<XF_TNAME(WORDWIDTH_SRC,1)> &right,
-		hls::stream<XF_TNAME(WORDWIDTH_DST,1)>& out,
-		xf::xFSBMState<WSIZE, NDISP, NDISP_UNIT>& state,
-		short int height, short int width)
+    hls::stream<XF_TNAME(WORDWIDTH_SRC,1)> &left,  // left input stream; read only during sweep 0
+    hls::stream<XF_TNAME(WORDWIDTH_SRC,1)> &right,  // right input stream; read together with left
+    hls::stream<XF_TNAME(WORDWIDTH_DST,1)>& out,  // output stream; written on the last sweep once row and col reach WSIZE-1
+    xf::xFSBMState<WSIZE, NDISP, NDISP_UNIT>& state,  // supplies preFilterCap, sweepFactor, textureThreshold, uniquenessRatio
+    short int height, short int width)  // image size; the row/col loops run WSIZE-1 iterations beyond these
 {
-	//create the left and right line buffers.
-	XF_TNAME(WORDWIDTH_SRC,1) left_line_buf[WSIZE][BUF_SIZE];
-#if PLATFORM_ZCU104	
+  //create the left and right line buffers (WSIZE rows each; row 0 receives the newest pixel).
+  XF_TNAME(WORDWIDTH_SRC,1) left_line_buf[WSIZE][BUF_SIZE];
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=left_line_buf core=XPM_MEMORY uram
 #endif
 #pragma HLS ARRAY_PARTITION variable=left_line_buf complete dim=1
 
-	XF_TNAME(WORDWIDTH_SRC,1) right_line_buf[WSIZE][BUF_SIZE];
-#if PLATFORM_ZCU104	
+  XF_TNAME(WORDWIDTH_SRC,1) right_line_buf[WSIZE][BUF_SIZE];
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=right_line_buf core=XPM_MEMORY uram
 #endif
 #pragma HLS ARRAY_PARTITION variable=right_line_buf complete dim=1
 
-	//create the left and right window buffers.
-	unsigned char l_window[WSIZE][LWINWIDTH];
+  //create the left and right window buffers (new columns are inserted at index 0).
+  unsigned char l_window[WSIZE][LWINWIDTH];
 #pragma HLS ARRAY_PARTITION variable=l_window complete dim=2
 #pragma HLS ARRAY_PARTITION variable=l_window complete dim=1
-	unsigned char r_window[WSIZE][RWINWIDTH];
+  unsigned char r_window[WSIZE][RWINWIDTH];
 #pragma HLS ARRAY_PARTITION variable=r_window complete dim=2
 #pragma HLS ARRAY_PARTITION variable=r_window complete dim=1
 
-	int TMP_INT_MAX_PACK;
-	TMP_INT_MAX_PACK = 2147483647;
+  int TMP_INT_MAX_PACK;
+  TMP_INT_MAX_PACK = 2147483647;  // 2^31-1; used below as the "nothing found yet" initial value
 
-	short int FILTERED = 0;//((state.minDisparity - 1) << 4);
-	unsigned char cap = state.preFilterCap;
-	unsigned char l_tmp[WSIZE];
+  short int FILTERED = 0;//((state.minDisparity - 1) << 4);
+  unsigned char cap = state.preFilterCap;  // value inserted where no pixel is read from the streams
+  unsigned char l_tmp[WSIZE];  // column of the left line buffer for the current col
 #pragma HLS array_partition variable=l_tmp complete dim=0
-	unsigned char r_tmp[WSIZE];
+  unsigned char r_tmp[WSIZE];  // column of the right line buffer (shifted by the sweep offset on sweeps > 0)
 #pragma HLS array_partition variable=r_tmp complete dim=0
-	int text_sum[WSIZE];
+  int text_sum[WSIZE];  // filled by xFUpdateTextureSum; text_sum[0] is compared with textureThreshold
 #pragma HLS ARRAY_PARTITION variable=text_sum complete dim=0
-	int sad[NDISP_UNIT];
+  int sad[NDISP_UNIT];  // running SAD for each disparity handled in the current sweep
 #pragma HLS array_partition variable=sad complete dim=0
 
-	short int sad_cols[NDISP_UNIT][WSIZE];
+  short int sad_cols[NDISP_UNIT][WSIZE];  // per-disparity history of column sums consumed by xFSADComputeInc
 #pragma HLS array_partition variable=sad_cols complete dim=0
-	int minsad[COLS+WSIZE-1];
-#if PLATFORM_ZCU104	
+  int minsad[COLS+WSIZE-1];  // per-column best SAD carried between sweeps; NOTE(review): sized COLS+WSIZE-1 while mind/skip use BUF_SIZE
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=minsad core=XPM_MEMORY uram
 #endif
-	XF_TNAME(WORDWIDTH_DST,1) mind[BUF_SIZE];
-#if PLATFORM_ZCU104	
+  XF_TNAME(WORDWIDTH_DST,1) mind[BUF_SIZE];  // per-column best disparity carried between sweeps
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=mind core=XPM_MEMORY uram
 #endif
-	bool skip[BUF_SIZE];
-#if PLATFORM_ZCU104	
+  bool skip[BUF_SIZE];  // per-column uniqueness-reject flag carried between sweeps
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=skip core=XPM_MEMORY uram
 #endif
 
-	loop_row:
-	for (unsigned short row = 0; row < height+WSIZE-1; row++) {
+  loop_row:
+  for (unsigned short row = 0; row < height+WSIZE-1; row++) {  // height+WSIZE-1 row iterations
 #pragma HLS LOOP_TRIPCOUNT min=ROW_TC max=ROW_TC
 
-		loop_mux:
-		for (unsigned char sweep = 0; sweep < state.sweepFactor; sweep++) {
+    loop_mux:
+    for (unsigned char sweep = 0; sweep < state.sweepFactor; sweep++) {  // each sweep covers disparities sweep*NDISP_UNIT .. +NDISP_UNIT-1
 #pragma HLS LOOP_TRIPCOUNT min=SWEEP_FACT max=SWEEP_FACT
 
-			loop_sad_init:
-			for (unsigned char d = 0; d < NDISP_UNIT; d++) {
+      loop_sad_init:  // reset the running SADs and column histories before every pass over a row
+      for (unsigned char d = 0; d < NDISP_UNIT; d++) {
 #pragma HLS unroll
-				sad[d] = 0;
-				for (unsigned char i = 0; i < WSIZE; i++) {
+        sad[d] = 0;
+        for (unsigned char i = 0; i < WSIZE; i++) {
 #pragma HLS unroll
-					sad_cols[d][i] = 0;
-				}
-			}
-			loop_col:
-			for (unsigned short col = 0; col < width+WSIZE-1; col++) {
+          sad_cols[d][i] = 0;
+        }
+      }
+      loop_col:
+      for (unsigned short col = 0; col < width+WSIZE-1; col++) {  // width+WSIZE-1 column iterations
 #pragma HLS LOOP_TRIPCOUNT min=COL_TC max=COL_TC
 
 #pragma HLS loop_flatten
 #pragma HLS pipeline II=1
 
-				unsigned char tmp_l = cap,tmp_r=cap;
+        unsigned char tmp_l = cap,tmp_r=cap;  // default to cap; replaced below when a pixel is actually read
 
-				if (sweep == 0) {
-					// load and shifting buffs
-					// shift down
-					for(unsigned char sd = WSIZE-1; sd > 0; sd--) {
+        if (sweep == 0) {  // only the first sweep consumes input and updates the line buffers
+          // load and shifting buffs
+          // shift down: row sd takes the value of row sd-1 for this column
+          for(unsigned char sd = WSIZE-1; sd > 0; sd--) {
 #pragma HLS unroll
-						left_line_buf[sd][col] = left_line_buf[sd-1][col];
-					}
+            left_line_buf[sd][col] = left_line_buf[sd-1][col];
+          }
 
-					for(unsigned char sd = WSIZE-1; sd > 0; sd--) {
+          for(unsigned char sd = WSIZE-1; sd > 0; sd--) {
 #pragma HLS unroll
-						right_line_buf[sd][col] = right_line_buf[sd-1][col];
-					}
-
-					if (!(row < (WSIZE-1)/2 || row >= height+(WSIZE-1)/2 || col < (WSIZE-1)/2 || col >= width+(WSIZE-1)/2)) {
-						tmp_l = left.read();
-						tmp_r = right.read();
-					}
-					// insert bottom
-					left_line_buf[0][col] = tmp_l;
-					right_line_buf[0][col] = tmp_r;
-					loop_get_data_from_linebuff:
-					for (unsigned char i = 0; i < WSIZE; i++) {
-						l_tmp[i] = left_line_buf[i][col];
-						r_tmp[i] = right_line_buf[i][col];
-					}
-				} else {
-					unsigned short offset = sweep * NDISP_UNIT;
-					loop_get_data_from_linebuff_with_offset:
-					for (unsigned char i = 0; i < WSIZE; i++) {
-						l_tmp[i] = left_line_buf[i][col];
-						r_tmp[i] = right_line_buf[i][col-offset < 0 ? 0 : col-offset];
-					}
-				}
-
-				xFUpdateTextureSum<WSIZE,0,LWINWIDTH,WORDWIDTH_SRC>(l_window,l_tmp,row,col,state.preFilterCap,text_sum);
-
-				xFShiftRight<unsigned char,WSIZE,LWINWIDTH>(l_window);
-				xFShiftRight<unsigned char,WSIZE,RWINWIDTH>(r_window);
-				xFInsertLeft<WSIZE,LWINWIDTH,unsigned char>(l_window,l_tmp);
-				xFInsertLeft<WSIZE,RWINWIDTH,unsigned char>(r_window,r_tmp);
-
-				loop_sad_compute:
-				for (unsigned char d = 0; d < NDISP_UNIT; d++) {
-					sad[d] += (int)xFSADComputeInc<WSIZE, LWINWIDTH, RWINWIDTH, unsigned char>(l_window, r_window, d, col, sad_cols[d]);
-				}
-
-				int skip_val[BUF_SIZE];
-#if PLATFORM_ZCU104	
+            right_line_buf[sd][col] = right_line_buf[sd-1][col];
+          }
+
+          if (!(row < (WSIZE-1)/2 || row >= height+(WSIZE-1)/2 || col < (WSIZE-1)/2 || col >= width+(WSIZE-1)/2)) {  // read only for (WSIZE-1)/2 <= row < height+(WSIZE-1)/2, same for col
+            tmp_l = left.read();
+            tmp_r = right.read();
+          }
+          // insert bottom
+          left_line_buf[0][col] = tmp_l;
+          right_line_buf[0][col] = tmp_r;
+          loop_get_data_from_linebuff:
+          for (unsigned char i = 0; i < WSIZE; i++) {
+            l_tmp[i] = left_line_buf[i][col];
+            r_tmp[i] = right_line_buf[i][col];
+          }
+        } else {
+          unsigned short offset = sweep * NDISP_UNIT;  // first disparity handled by this sweep
+          loop_get_data_from_linebuff_with_offset:
+          for (unsigned char i = 0; i < WSIZE; i++) {
+            l_tmp[i] = left_line_buf[i][col];
+            r_tmp[i] = right_line_buf[i][col-offset < 0 ? 0 : col-offset];  // operands promote to int, so indices left of column 0 are clamped to 0
+          }
+        }
+
+        xFUpdateTextureSum<WSIZE,0,LWINWIDTH,WORDWIDTH_SRC>(l_window,l_tmp,row,col,state.preFilterCap,text_sum);  // defined elsewhere; called before l_window is shifted
+
+        xFShiftRight<unsigned char,WSIZE,LWINWIDTH>(l_window);  // slide both windows by one column ...
+        xFShiftRight<unsigned char,WSIZE,RWINWIDTH>(r_window);
+        xFInsertLeft<WSIZE,LWINWIDTH,unsigned char>(l_window,l_tmp);  // ... and place the new columns at index 0
+        xFInsertLeft<WSIZE,RWINWIDTH,unsigned char>(r_window,r_tmp);
+
+        loop_sad_compute:
+        for (unsigned char d = 0; d < NDISP_UNIT; d++) {
+          sad[d] += (int)xFSADComputeInc<WSIZE, LWINWIDTH, RWINWIDTH, unsigned char>(l_window, r_window, d, col, sad_cols[d]);  // apply the incremental SAD change for disparity d
+        }
+
+        int skip_val[BUF_SIZE];  // NOTE(review): this and the four arrays below are declared inside the loop_col body but read on sweep > 0 as state from earlier iterations — confirm the tool keeps them persistent
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=skip_val core=XPM_MEMORY uram
 #endif
-				int edge_neighbor[BUF_SIZE];
-#if PLATFORM_ZCU104	
+        int edge_neighbor[BUF_SIZE];  // per column: sad[NDISP_UNIT-2] of the previous sweep (previous edge[col] when NDISP_UNIT == 1)
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=edge_neighbor core=XPM_MEMORY uram
 #endif
-				int edge[BUF_SIZE];
-#if PLATFORM_ZCU104	
+        int edge[BUF_SIZE];  // per column: sad[NDISP_UNIT-1] of the previous sweep
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=edge core=XPM_MEMORY uram
 #endif
-				int minsad_p[BUF_SIZE];
-#if PLATFORM_ZCU104	
+        int minsad_p[BUF_SIZE];  // per column: SAD stored as the "previous" neighbour of the best disparity
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=minsad_p core=XPM_MEMORY uram
 #endif
-				int minsad_n[BUF_SIZE];
-#if PLATFORM_ZCU104	
+        int minsad_n[BUF_SIZE];  // per column: SAD stored as the "next" neighbour of the best disparity
+#if PLATFORM_ZCU104
 #pragma HLS RESOURCE variable=minsad_n core=XPM_MEMORY uram
 #endif
 
-				// SAD computing and store output
-				if (row >= WSIZE-1 && col >= WSIZE-1) {
-					int skip_flag = 0;
-					if (text_sum[0] < state.textureThreshold) skip_flag = 1; // texture threshold check
-					if ((row - WSIZE+1) < (WSIZE-1)/2 || (row - WSIZE+1) >= height - (WSIZE-1)/2) skip_flag = 1;  // border skip horizontal
-					if ((col - WSIZE+1) < NDISP-1 + (WSIZE-1)/2 || (col - WSIZE+1) >= width - (WSIZE-1)/2) skip_flag = 1; // border skip vertical
-
-					int gminsad = TMP_INT_MAX_PACK;
-					XF_TNAME(WORDWIDTH_DST,1) gmind = 0;
-					bool gskip = 0;
-					int gskip_val = TMP_INT_MAX_PACK;
-					int gedge_neighbor = TMP_INT_MAX_PACK;  // for uniqueness check
-					int gedge=0; // for subpixel interpolation
-					if (NDISP_UNIT != 1)
-						gedge = sad[1];
-
-					int lminsad = TMP_INT_MAX_PACK;
-					XF_TNAME(WORDWIDTH_DST,1) lmind = 0;
-					int gminsad_p = TMP_INT_MAX_PACK;
-					int gminsad_n = TMP_INT_MAX_PACK;
-
-					if (sweep > 0) {
-						gminsad = minsad[col];
-						gmind   = mind[col];
-						gskip = skip[col];
-						gskip_val = skip_val[col];
-						gedge_neighbor = edge_neighbor[col];
-						if (sweep == 1 && NDISP_UNIT == 1)
-							gedge_neighbor = TMP_INT_MAX_PACK;
-						gedge = edge[col];
-						gminsad_p = minsad_p[col];
-						gminsad_n = (gmind == sweep*NDISP_UNIT-1 ? sad[0] : minsad_n[col]);
-					}
-
-					xFMinSAD<NDISP_UNIT>::find(sad, lmind, lminsad);
-
-					if (lminsad <= gminsad) {
-						gskip = 0;
-						if (state.uniquenessRatio > 0) {
-							int thresh = lminsad + (lminsad * state.uniquenessRatio / 100);
-							if (gminsad <= thresh && lmind+sweep*NDISP_UNIT > gmind+1) {
-								gskip = 1;
-								gskip_val = gminsad;
-							} else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gskip_val <= thresh) {
-								gskip = 1;
-								// gskip_val unchanged;
-							} else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gedge_neighbor <= thresh) {
-								gskip = 1;
-								gskip_val = gedge_neighbor;
-							}
-							loop_unique_search_0:
-							for (unsigned char d = 0; d < NDISP_UNIT; d++) {
-								if (sad[d] <= thresh && sad[d] < gskip_val && (d < lmind-1 || d > lmind+1)) {
-									gskip = 1;
-									gskip_val = sad[d];
-								}
-							}
-						}
-						// update global values;
-						gminsad_p = (lmind == 0 ? gedge : sad[lmind-1]);
-						if (NDISP_UNIT == 1)
-							gminsad_n = sad[lmind == NDISP_UNIT-1 ? 0 : (int)(lmind+1)];
-						else
-							gminsad_n = sad[lmind == NDISP_UNIT-1 ? lmind-1 : lmind+1];
-						gminsad = lminsad;
-						gmind = lmind + sweep*NDISP_UNIT;
-					} else {
-						if (state.uniquenessRatio > 0) {
-							int thresh = gminsad + (gminsad * state.uniquenessRatio / 100);
-							loop_unique_search_1:
-							for (unsigned char d = 0; d < NDISP_UNIT; d++) {
-								if (sad[d] <= thresh && sad[d] < gskip_val && ((gmind == (sweep*NDISP_UNIT-1)) ? ((sweep*NDISP_UNIT+d) > (gmind+1)) : 1)) {
-									gskip = 1;
-									gskip_val = sad[d];
-								}
-							}
-						}
-					}
-					minsad[col] = gminsad;
-					mind[col] = gmind;
-					skip[col] = gskip;
-					skip_val[col] = gskip_val;
-					if (NDISP_UNIT == 1)
-						edge_neighbor[col] = edge[col];
-					else
-						edge_neighbor[col] = sad[NDISP_UNIT-2];
-					edge[col] = sad[NDISP_UNIT-1];
-					minsad_p[col] = gminsad_p;
-					minsad_n[col] = gminsad_n;
-
-					if (sweep == state.sweepFactor-1) {
-						ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> p = gmind==0?gminsad_n:gminsad_p;
-						ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> n = gmind==NDISP-1?gminsad_p:gminsad_n;
-						ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> k = p + n - 2*gminsad + __ABS((int)p - (int)n);
-
-						ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value+8> num = p - n;
-						num = num << 8;
-						ap_int<10> delta = 0;
-						if (k != 0) delta = num/k;
-						XF_TNAME(WORDWIDTH_DST,1) out_disp = ((gmind*256 + delta + 15) >> 4);
-
-						skip_flag |= gskip;
-						if (skip_flag) out_disp = FILTERED;
-						out.write(out_disp);
-					}
-				}
-			}
-		}
-	}
+        // SAD computing and store output (only once row and col have both reached WSIZE-1)
+        if (row >= WSIZE-1 && col >= WSIZE-1) {
+          int skip_flag = 0;
+          if (text_sum[0] < state.textureThreshold) skip_flag = 1; // texture threshold check
+          if ((row - WSIZE+1) < (WSIZE-1)/2 || (row - WSIZE+1) >= height - (WSIZE-1)/2) skip_flag = 1;  // border skip horizontal
+          if ((col - WSIZE+1) < NDISP-1 + (WSIZE-1)/2 || (col - WSIZE+1) >= width - (WSIZE-1)/2) skip_flag = 1; // border skip vertical
+
+          int gminsad = TMP_INT_MAX_PACK;  // g* values: best result accumulated over all sweeps so far
+          XF_TNAME(WORDWIDTH_DST,1) gmind = 0;
+          bool gskip = 0;
+          int gskip_val = TMP_INT_MAX_PACK;
+          int gedge_neighbor = TMP_INT_MAX_PACK;  // for uniqueness check
+          int gedge=0; // for subpixel interpolation
+          if (NDISP_UNIT != 1)
+            gedge = sad[1];  // sweep-0 default; replaced by edge[col] on later sweeps
+
+          int lminsad = TMP_INT_MAX_PACK;  // l* values: minimum found within the current sweep
+          XF_TNAME(WORDWIDTH_DST,1) lmind = 0;
+          int gminsad_p = TMP_INT_MAX_PACK;
+          int gminsad_n = TMP_INT_MAX_PACK;
+
+          if (sweep > 0) {  // restore the per-column state stored by the previous sweep
+            gminsad = minsad[col];
+            gmind   = mind[col];
+            gskip = skip[col];
+            gskip_val = skip_val[col];
+            gedge_neighbor = edge_neighbor[col];
+            if (sweep == 1 && NDISP_UNIT == 1)
+              gedge_neighbor = TMP_INT_MAX_PACK;
+            gedge = edge[col];
+            gminsad_p = minsad_p[col];
+            gminsad_n = (gmind == sweep*NDISP_UNIT-1 ? sad[0] : minsad_n[col]);  // best was the last disparity of the previous sweep: its next neighbour is this sweep's sad[0]
+          }
+
+          xFMinSAD<NDISP_UNIT>::find(sad, lmind, lminsad);  // defined elsewhere; presumably returns index/value of the smallest sad[] entry — confirm
+
+          if (lminsad <= gminsad) {  // this sweep's minimum replaces the stored best (ties go to the later sweep)
+            gskip = 0;
+            if (state.uniquenessRatio > 0) {
+              int thresh = lminsad + (lminsad * state.uniquenessRatio / 100);  // lminsad raised by uniquenessRatio percent
+              if (gminsad <= thresh && lmind+sweep*NDISP_UNIT > gmind+1) {  // old best is within thresh and not adjacent to the new best
+                gskip = 1;
+                gskip_val = gminsad;
+              } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gskip_val <= thresh) {  // adjacent to old best: reuse the stored gskip_val
+                gskip = 1;
+                // gskip_val unchanged;
+              } else if (gminsad <= thresh && lmind+sweep*NDISP_UNIT == gmind+1 && gedge_neighbor <= thresh) {  // adjacent to old best: fall back to gedge_neighbor
+                gskip = 1;
+                gskip_val = gedge_neighbor;
+              }
+              loop_unique_search_0:
+              for (unsigned char d = 0; d < NDISP_UNIT; d++) {
+                if (sad[d] <= thresh && sad[d] < gskip_val && (d < lmind-1 || d > lmind+1)) {  // candidates other than lmind and its two direct neighbours
+                  gskip = 1;
+                  gskip_val = sad[d];
+                }
+              }
+            }
+            // update global values;
+            gminsad_p = (lmind == 0 ? gedge : sad[lmind-1]);  // SAD one disparity below the best; gedge when lmind is the first of this sweep
+            if (NDISP_UNIT == 1)
+              gminsad_n = sad[lmind == NDISP_UNIT-1 ? 0 : (int)(lmind+1)];
+            else
+              gminsad_n = sad[lmind == NDISP_UNIT-1 ? lmind-1 : lmind+1];  // at the last disparity of the sweep the lower neighbour is used instead
+            gminsad = lminsad;
+            gmind = lmind + sweep*NDISP_UNIT;  // convert the in-sweep index to an absolute disparity
+          } else {
+            if (state.uniquenessRatio > 0) {
+              int thresh = gminsad + (gminsad * state.uniquenessRatio / 100);  // stored best raised by uniquenessRatio percent
+              loop_unique_search_1:
+              for (unsigned char d = 0; d < NDISP_UNIT; d++) {
+                if (sad[d] <= thresh && sad[d] < gskip_val && ((gmind == (sweep*NDISP_UNIT-1)) ? ((sweep*NDISP_UNIT+d) > (gmind+1)) : 1)) {  // d == 0 is exempt when the stored best is the previous sweep's last disparity
+                  gskip = 1;
+                  gskip_val = sad[d];
+                }
+              }
+            }
+          }
+          minsad[col] = gminsad;  // save the per-column state for the next sweep
+          mind[col] = gmind;
+          skip[col] = gskip;
+          skip_val[col] = gskip_val;
+          if (NDISP_UNIT == 1)
+            edge_neighbor[col] = edge[col];  // NOTE(review): on sweep 0 this reads edge[col] before the write two lines below — confirm
+          else
+            edge_neighbor[col] = sad[NDISP_UNIT-2];
+          edge[col] = sad[NDISP_UNIT-1];
+          minsad_p[col] = gminsad_p;
+          minsad_n[col] = gminsad_n;
+
+          if (sweep == state.sweepFactor-1) {  // last sweep: compute and emit the output value for this position
+            ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> p = gmind==0?gminsad_n:gminsad_p;  // at disparity 0 the "next" value stands in for the missing "previous"
+            ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> n = gmind==NDISP-1?gminsad_p:gminsad_n;  // at disparity NDISP-1 the "previous" value stands in for the missing "next"
+            ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value> k = p + n - 2*gminsad + __ABS((int)p - (int)n);
+
+            ap_int<xFBitWidth<255*WSIZE*WSIZE>::Value+8> num = p - n;
+            num = num << 8;
+            ap_int<10> delta = 0;
+            if (k != 0) delta = num/k;  // delta = 256*(p-n)/k; stays 0 when k == 0
+            XF_TNAME(WORDWIDTH_DST,1) out_disp = ((gmind*256 + delta + 15) >> 4);  // (gmind*256 + delta) with a +15 offset, divided by 16
+
+            skip_flag |= gskip;
+            if (skip_flag) out_disp = FILTERED;  // rejected positions output FILTERED (0)
+            out.write(out_disp);
+          }
+        }
+      }
+    }
+  }
 }
 
 
@@ -522,65 +522,65 @@ template <int NPC>
 void xFImageClipUtility(int i, int j, int k, int height, int width, int *pix)
 {
 #pragma HLS INLINE OFF
-	if (i<1 || i > height-2 || (j*(1<<XF_BITSHIFT(NPC))+k < 1) || (j*(1<<XF_BITSHIFT(NPC))+k) > width-2)
-		*pix = 0;
+  if (i<1 || i > height-2 || (j*(1<<XF_BITSHIFT(NPC))+k < 1) || (j*(1<<XF_BITSHIFT(NPC))+k) > width-2)  // row i or column j*(1<<XF_BITSHIFT(NPC))+k lies on the first/last row or column
+    *pix = 0;  // zero the value on the one-pixel image border; *pix is left untouched otherwise
 }
 
 
 /* Clips the Output from the Sobel function based on the Cap value input */
 template<int ROWS, int COLS, int NPC,int DEPTH_SRC, int DEPTH_DST, int SRC_T, int DST_T,int COLS_TC>
 void xFImageClip(
-		hls::stream<XF_TNAME(SRC_T,1)>& src,
-		hls::stream<XF_TNAME(DST_T,1)>& dst,
-		int cap, short int height, short int width)
+    hls::stream<XF_TNAME(SRC_T,1)>& src,  // input stream; one word read per (row, packed column)
+    hls::stream<XF_TNAME(DST_T,1)>& dst,  // output stream; one word written per word read
+    int cap, short int height, short int width)  // cap: clamp magnitude; height/width: loop bounds
 {
-	loop_row_clip:
-	for (short i = 0; i < height; i++)
-	{
+  loop_row_clip:
+  for (short i = 0; i < height; i++)
+  {
 #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
 #pragma HLS LOOP_FLATTEN off
 
-		loop_col_clip:
-		for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++)
-		{
+    loop_col_clip:
+    for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++)  // width>>XF_BITSHIFT(NPC) words per row
+    {
 #pragma HLS PIPELINE II=1
 #pragma HLS LOOP_TRIPCOUNT min=COLS_TC max=COLS_TC
-			XF_TNAME(SRC_T,1) tmp = src.read();
-			XF_TNAME(DST_T,1) tmp_out;
-			for (int k = 0; k < (1<<XF_BITSHIFT(NPC)); k++)
-			{
+      XF_TNAME(SRC_T,1) tmp = src.read();
+      XF_TNAME(DST_T,1) tmp_out;
+      for (int k = 0; k < (1<<XF_BITSHIFT(NPC)); k++)  // handle each of the 1<<XF_BITSHIFT(NPC) pixels packed in the word
+      {
 #pragma HLS UNROLL
-				int pix = (XF_PTNAME(DEPTH_SRC))tmp.range((k+1)*XF_PIXELDEPTH(DEPTH_SRC)-1,k*XF_PIXELDEPTH(DEPTH_SRC));
-				xFImageClipUtility<NPC>(i,j,k,height,width,&pix);
-
-				XF_PTNAME(DEPTH_DST) p = (XF_PTNAME(DEPTH_DST))(pix < -cap ? 0 : pix > cap ? cap*2 : pix + cap);
-				tmp_out.range((k+1)*XF_PIXELDEPTH(DEPTH_DST)-1,k*XF_PIXELDEPTH(DEPTH_DST)) = (XF_PTNAME(DEPTH_DST))p;
-			}
-			dst.write(tmp_out);
-		}
-	}
+        int pix = (XF_PTNAME(DEPTH_SRC))tmp.range((k+1)*XF_PIXELDEPTH(DEPTH_SRC)-1,k*XF_PIXELDEPTH(DEPTH_SRC));  // extract pixel k from the input word
+        xFImageClipUtility<NPC>(i,j,k,height,width,&pix);  // zeroes pix on the image border
+
+        XF_PTNAME(DEPTH_DST) p = (XF_PTNAME(DEPTH_DST))(pix < -cap ? 0 : pix > cap ? cap*2 : pix + cap);  // clamp pix to [-cap, cap], then add cap so the result lies in [0, 2*cap]
+        tmp_out.range((k+1)*XF_PIXELDEPTH(DEPTH_DST)-1,k*XF_PIXELDEPTH(DEPTH_DST)) = (XF_PTNAME(DEPTH_DST))p;  // pack the result into slot k of the output word
+      }
+      dst.write(tmp_out);
+    }
+  }
 }
 
 
 /* For reading the Gradient-Y stream, rather than letting the stream dangling */
 template<int ROWS, int COLS, int NPC, int DEPTH_SRC, int DEPTH_DST, int SRC_T, int COLS_TC>
 void xFReadOutStream(
-		hls::stream<XF_TNAME(SRC_T,1)>& src,
-		short int height,short int width)
+    hls::stream<XF_TNAME(SRC_T,1)>& src,  // stream to drain
+    short int height,short int width)  // height * (width>>XF_BITSHIFT(NPC)) words are read in total
 {
-	loop_row_clip:
-	for (short i = 0; i < height; i++)
-	{
+  loop_row_clip:
+  for (short i = 0; i < height; i++)
+  {
 #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
 #pragma HLS LOOP_FLATTEN off
-		loop_col_clip:
-		for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++)
-		{
+    loop_col_clip:
+    for (short j = 0; j < (width>>XF_BITSHIFT(NPC)); j++)
+    {
 #pragma HLS PIPELINE II=1
 #pragma HLS LOOP_TRIPCOUNT min=COLS_TC max=COLS_TC
-			XF_TNAME(SRC_T,1) tmp = src.read();
-		}
-	}
+      XF_TNAME(SRC_T,1) tmp = src.read();  // value is intentionally discarded; the read only empties the stream
+    }
+  }
 }
 
 
@@ -589,138 +589,138 @@ template <int ROWS, int COLS, int SRC_T, int FILTER_T, int DST_T>
 void xFStereoPreProcess(hls::stream<XF_TNAME(SRC_T,1)> &in_strm, hls::stream<XF_TNAME(DST_T,1)>& clipped_strm, int preFilterType,int preFilterCap, short int height, short int width)
 {
 #pragma HLS INLINE
-		hls::stream<XF_TNAME(FILTER_T,1)> in_sobel_x("in_sobel_x");
-		hls::stream<XF_TNAME(FILTER_T,1)> in_sobel_y("in_sobel_y");
+    hls::stream<XF_TNAME(FILTER_T,1)> in_sobel_x("in_sobel_x");  // first output of xFSobelFilter; clipped and forwarded
+    hls::stream<XF_TNAME(FILTER_T,1)> in_sobel_y("in_sobel_y");  // second output of xFSobelFilter; only drained
 
-		xFSobelFilter<ROWS,COLS,XF_8UP,XF_16SP,XF_NPPC1,SRC_T,FILTER_T>(in_strm ,in_sobel_x ,in_sobel_y ,3,XF_BORDER_CONSTANT,height,width);
-		xFImageClip<ROWS,COLS,XF_NPPC1,XF_16SP,XF_8UP,FILTER_T,DST_T,COLS>(in_sobel_x,clipped_strm,preFilterCap,height,width);
-		xFReadOutStream<ROWS,COLS,XF_NPPC1,XF_16SP,XF_8UP,FILTER_T,COLS>(in_sobel_y,height,width);
+    xFSobelFilter<ROWS,COLS,XF_8UP,XF_16SP,XF_NPPC1,SRC_T,FILTER_T>(in_strm ,in_sobel_x ,in_sobel_y ,3,XF_BORDER_CONSTANT,height,width);  // defined elsewhere; 3 is presumably the kernel size — confirm. preFilterType is not used in this function
+    xFImageClip<ROWS,COLS,XF_NPPC1,XF_16SP,XF_8UP,FILTER_T,DST_T,COLS>(in_sobel_x,clipped_strm,preFilterCap,height,width);  // clamp with preFilterCap and write to clipped_strm
+    xFReadOutStream<ROWS,COLS,XF_NPPC1,XF_16SP,XF_8UP,FILTER_T,COLS>(in_sobel_y,height,width);  // consume the unused second stream
 }
 
 
 /* This function performs preprocessing and disparity computation for NO mode */
 template <int ROWS, int COLS, int SRC_T, int DST_T, int NPC, int WSIZE, int NDISP, int NDISP_UNIT, int SWEEP_FACT>
 void xFFindStereoCorrespondenceLBMNO_pipeline (hls::stream<XF_TNAME(SRC_T,NPC)> &_left_strm,
-		hls::stream<XF_TNAME(SRC_T,NPC)> &_right_strm,
-		XF_TNAME(DST_T,NPC) *disp_ptr ,
-		xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,
-		short int height, short int width)
+    hls::stream<XF_TNAME(SRC_T,NPC)> &_right_strm,  // right image arrives as a stream, like _left_strm
+    XF_TNAME(DST_T,NPC) *disp_ptr ,  // output buffer; height*width elements are written
+    xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,  // preFilterType/preFilterCap are read here; the whole state is passed to xFSADBlockMatching
+    short int height, short int width)  // image size used by every stage
 {
 #pragma HLS INLINE
 
-	hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped");
-	hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped");
+  hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped");  // pre-processed left image
+  hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped");  // pre-processed right image
 
-	hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream");
+  hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream");  // output of xFSADBlockMatching
 
 #pragma HLS DATAFLOW
 
-	int TC=(ROWS*COLS);
+  int TC=(ROWS*COLS);  // referenced only by the LOOP_TRIPCOUNT pragma below
 
-	/* Sobel and Clipping */
-	xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
-	xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
+  /* Sobel and Clipping, applied identically to the left and right streams */
+  xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
+  xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
 
-	/* SAD and disparity computation */
-	xFSADBlockMatching<ROWS,COLS,SRC_T,DST_T,WSIZE,NDISP,NDISP_UNIT,SWEEP_FACT,ROWS+WSIZE-1,COLS+WSIZE-1,
-	COLS+WSIZE-1,WSIZE,WSIZE+NDISP_UNIT-1,XF_16UW,XF_32UW>(left_clipped,right_clipped,_disp_strm,sbmstate,height,width);
+  /* SAD and disparity computation; the buffer size passed is COLS+WSIZE-1, windows are WSIZE and WSIZE+NDISP_UNIT-1 columns wide */
+  xFSADBlockMatching<ROWS,COLS,SRC_T,DST_T,WSIZE,NDISP,NDISP_UNIT,SWEEP_FACT,ROWS+WSIZE-1,COLS+WSIZE-1,
+  COLS+WSIZE-1,WSIZE,WSIZE+NDISP_UNIT-1,XF_16UW,XF_32UW>(left_clipped,right_clipped,_disp_strm,sbmstate,height,width);
 
-	for (int i = 0; i < height*width; i++)
-	{
+  for (int i = 0; i < height*width; i++)  // copy the disparity stream to memory, one element per iteration
+  {
 #pragma HLS pipeline ii=1
 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC
-		*(disp_ptr + i) = _disp_strm.read();
-	}
+    *(disp_ptr + i) = _disp_strm.read();
+  }
 }
 
 
 /* This function performs preprocessing and disparity computation for NO mode */
 template <int ROWS, int COLS, int SRC_T, int DST_T, int NPC, int WSIZE, int NDISP, int NDISP_UNIT, int SWEEP_FACT>
 void xFFindStereoCorrespondenceLBMNO (XF_TNAME(SRC_T,NPC) *left_ptr,
-		XF_TNAME(SRC_T,NPC) *right_ptr,
-		XF_TNAME(DST_T,NPC) *disp_ptr ,
-		xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,
-		short int height, short int width)
+    XF_TNAME(SRC_T,NPC) *right_ptr,  // right image buffer; height*width elements are read, like left_ptr
+    XF_TNAME(DST_T,NPC) *disp_ptr ,  // output buffer; height*width elements are written
+    xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,  // preFilterType/preFilterCap are read here; the whole state is passed to xFSADBlockMatching
+    short int height, short int width)  // image size used by every stage
 {
-	hls::stream< XF_TNAME(SRC_T,NPC) > _left_strm;
-	hls::stream< XF_TNAME(SRC_T,NPC) > _right_strm;
+  hls::stream< XF_TNAME(SRC_T,NPC) > _left_strm;  // left image converted from memory to a stream
+  hls::stream< XF_TNAME(SRC_T,NPC) > _right_strm;  // right image converted from memory to a stream
 
-	hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped");
-	hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped");
+  hls::stream< XF_TNAME(SRC_T,NPC) > left_clipped("left_clipped");  // pre-processed left image
+  hls::stream< XF_TNAME(SRC_T,NPC) > right_clipped("right_clipped");  // pre-processed right image
 
-	hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream");
+  hls::stream< XF_TNAME(DST_T,NPC) > _disp_strm("disparity stream");  // output of xFSADBlockMatching
 #pragma HLS DATAFLOW
 
-	int TC=(ROWS*COLS);
-	for (int i = 0; i < height*width; i++)
-	{
+  int TC=(ROWS*COLS);  // referenced only by the LOOP_TRIPCOUNT pragmas below
+  for (int i = 0; i < height*width; i++)  // feed both input images into their streams, one element each per iteration
+  {
 #pragma HLS pipeline ii=1
 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC
-		_left_strm.write(*(left_ptr + i));
-		_right_strm.write(*(right_ptr + i));
-	}
+    _left_strm.write(*(left_ptr + i));
+    _right_strm.write(*(right_ptr + i));
+  }
 
-	/* Sobel and Clipping */
-	xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
-	xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
+  /* Sobel and Clipping, applied identically to the left and right streams */
+  xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_left_strm,left_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
+  xFStereoPreProcess<ROWS,COLS,SRC_T,XF_16UW,SRC_T>(_right_strm,right_clipped,sbmstate.preFilterType,sbmstate.preFilterCap,height,width);
 
-	/* SAD and disparity computation */
-	xFSADBlockMatching<ROWS,COLS,SRC_T,DST_T,WSIZE,NDISP,NDISP_UNIT,SWEEP_FACT,ROWS+WSIZE-1,COLS+WSIZE-1,
-	COLS+WSIZE-1,WSIZE,WSIZE+NDISP_UNIT-1,XF_16UW,XF_32UW>(left_clipped,right_clipped,_disp_strm,sbmstate,height,width);
+  /* SAD and disparity computation; the buffer size passed is COLS+WSIZE-1, windows are WSIZE and WSIZE+NDISP_UNIT-1 columns wide */
+  xFSADBlockMatching<ROWS,COLS,SRC_T,DST_T,WSIZE,NDISP,NDISP_UNIT,SWEEP_FACT,ROWS+WSIZE-1,COLS+WSIZE-1,
+  COLS+WSIZE-1,WSIZE,WSIZE+NDISP_UNIT-1,XF_16UW,XF_32UW>(left_clipped,right_clipped,_disp_strm,sbmstate,height,width);
 
-	for (int i = 0; i < height*width; i++)
-	{
+  for (int i = 0; i < height*width; i++)  // copy the disparity stream to memory, one element per iteration
+  {
 #pragma HLS pipeline ii=1
 #pragma HLS LOOP_TRIPCOUNT min=1 max=TC
-		*(disp_ptr + i) = _disp_strm.read();
-	}
+    *(disp_ptr + i) = _disp_strm.read();
+  }
 }
 
 
 /* Calls the functions based on the PIXEL PARALLELISM configuration */
 template<int ROWS, int COLS, int SRC_T, int DST_T, int NPC, int WSIZE, int NDISP, int NDISP_UNIT>
 void xFFindStereoCorrespondenceLBM_pipeline(hls::stream<XF_TNAME(SRC_T,NPC)> &_left_strm,
-		hls::stream<XF_TNAME(SRC_T,NPC)> &_right_strm,
-		XF_TNAME(DST_T,NPC) *out_ptr,
-		xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,
-		short int height,short int width)
+    hls::stream<XF_TNAME(SRC_T,NPC)> &_right_strm,  // right image stream, forwarded unchanged
+    XF_TNAME(DST_T,NPC) *out_ptr,  // output buffer, forwarded unchanged
+    xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,  // checked by the asserts below, then forwarded
+    short int height,short int width)  // checked against WSIZE and NDISP, then forwarded
 {
 #pragma HLS INLINE
 
-	assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW ");
-	assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW ");
-	assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 ");
-	assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5'  ");
-	assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width ");
-	assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT");
-	assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number ");
-	assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative");
-	assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63");
-	assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE);
-
-	xFFindStereoCorrespondenceLBMNO_pipeline<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT,(NDISP/NDISP_UNIT)+((NDISP%NDISP_UNIT)!=0)>(_left_strm,_right_strm,out_ptr,sbmstate,height,width);
+  assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW ");  // configuration checks; they vanish when NDEBUG is defined
+  assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW ");
+  assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 ");
+  assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5'  ");
+  assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width ");
+  assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT");
+  assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number ");  // NDISP must be an exact multiple of NDISP_UNIT
+  assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative");
+  assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63");
+  assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE);  // only this pre-filter type is accepted
+
+  xFFindStereoCorrespondenceLBMNO_pipeline<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT,(NDISP/NDISP_UNIT)+((NDISP%NDISP_UNIT)!=0)>(_left_strm,_right_strm,out_ptr,sbmstate,height,width);  // SWEEP_FACT = NDISP/NDISP_UNIT rounded up
 }
 
 /* Calls the functions based on the PIXEL PARALLELISM configuration */
 template<int ROWS, int COLS, int SRC_T, int DST_T, int NPC, int WSIZE, int NDISP, int NDISP_UNIT>
 void xFFindStereoCorrespondenceLBM(XF_TNAME(SRC_T,NPC) *left_ptr,
-		XF_TNAME(SRC_T,NPC) *right_ptr,
-		XF_TNAME(DST_T,NPC) *out_ptr,
-		xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,
-		short int height,short int width)
+    XF_TNAME(SRC_T,NPC) *right_ptr,  // right image buffer, forwarded unchanged
+    XF_TNAME(DST_T,NPC) *out_ptr,  // output buffer, forwarded unchanged
+    xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate,  // checked by the asserts below, then forwarded
+    short int height,short int width)  // checked against WSIZE and NDISP, then forwarded
 {
-	assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW ");
-	assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW ");
-	assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 ");
-	assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5'  ");
-	assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width ");
-	assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT");
-	assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number ");
-	assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative");
-	assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63");
-	assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE);
-
-	xFFindStereoCorrespondenceLBMNO<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT,(NDISP/NDISP_UNIT)+((NDISP%NDISP_UNIT)!=0)>(left_ptr,right_ptr,out_ptr,sbmstate,height,width);
+  assert((SRC_T == XF_8UW) && " WORDWIDTH_SRC must be XF_8UW ");  // configuration checks; they vanish when NDEBUG is defined
+  assert((DST_T == XF_16UW) && " WORDWIDTH_DST must be XF_16UW ");
+  assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 ");
+  assert((WSIZE%2 == 1) && (WSIZE < __XF_MIN(height,width) && (WSIZE >= 5)) && " WSIZE must be an odd number, less than minimum of height & width and greater than or equal to '5'  ");
+  assert(((NDISP > 1) && (NDISP < width)) && " NDISP must be greater than '1' and less than the image width ");
+  assert((NDISP >= NDISP_UNIT) && " NDISP must not be lesser than NDISP_UNIT");
+  assert((((NDISP/NDISP_UNIT)*NDISP_UNIT) == NDISP) && " NDISP/NDISP_UNIT must be a non-fractional number ");  // NDISP must be an exact multiple of NDISP_UNIT
+  assert(sbmstate.uniquenessRatio >= 0 && "uniqueness ratio must be non-negative");
+  assert(sbmstate.preFilterCap >=1 && sbmstate.preFilterCap <= 63 && "preFilterCap must be within 1..63");
+  assert(sbmstate.preFilterType == XF_STEREO_PREFILTER_SOBEL_TYPE);  // only this pre-filter type is accepted
+
+  xFFindStereoCorrespondenceLBMNO<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT,(NDISP/NDISP_UNIT)+((NDISP%NDISP_UNIT)!=0)>(left_ptr,right_ptr,out_ptr,sbmstate,height,width);  // SWEEP_FACT = NDISP/NDISP_UNIT rounded up
 }
 
 
@@ -731,16 +731,17 @@ void xFFindStereoCorrespondenceLBM(XF_TNAME(SRC_T,NPC) *left_ptr,
 #pragma SDS data copy("_left_mat.data"[0:"_left_mat.size"])
 #pragma SDS data copy("_right_mat.data"[0:"_right_mat.size"])
 #pragma SDS data copy("_disp_mat.data"[0:"_disp_mat.size"])
-template <int WSIZE, int NDISP, int NDISP_UNIT, int SRC_T, int DST_T, int ROWS, int COLS, int NPC = XF_NPPC1>
+
+template <int WSIZE, int NDISP, int NDISP_UNIT, int SRC_T, int DST_T, int ROWS, int COLS, int NPC> // NPC has no default value here; it must be supplied explicitly or deduced from the xf::Mat arguments
 void StereoBM(xf::Mat<SRC_T, ROWS, COLS, NPC> &_left_mat,
-		xf::Mat<SRC_T, ROWS, COLS, NPC> &_right_mat,
-		xf::Mat<DST_T, ROWS, COLS, NPC> &_disp_mat,
-		xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate)
+    xf::Mat<SRC_T, ROWS, COLS, NPC> &_right_mat,
+    xf::Mat<DST_T, ROWS, COLS, NPC> &_disp_mat,
+    xf::xFSBMState<WSIZE,NDISP,NDISP_UNIT> &sbmstate)
 {
 #pragma HLS INLINE OFF
 
-	xFFindStereoCorrespondenceLBM<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT>(_left_mat.data,_right_mat.data,_disp_mat.data,sbmstate,
-			_left_mat.rows,_left_mat.cols);
+  xFFindStereoCorrespondenceLBM<ROWS,COLS,SRC_T,DST_T,NPC,WSIZE,NDISP,NDISP_UNIT>(_left_mat.data,_right_mat.data,_disp_mat.data,sbmstate, // thin wrapper: forwards the raw data pointers of the three Mats and the state object
+      _left_mat.rows,_left_mat.cols); // image height and width are taken from the left image only
 }
 }
 
diff --git a/include/imgproc/xf_stereo_pipeline.hpp b/include/imgproc/xf_stereo_pipeline.hpp
index caa5c3c..880f704 100644
--- a/include/imgproc/xf_stereo_pipeline.hpp
+++ b/include/imgproc/xf_stereo_pipeline.hpp
@@ -123,9 +123,20 @@ void xFInitUndistortRectifyMapInverseKernel (
 #pragma HLS ARRAY_PARTITION variable=distCoeffsHLS complete dim=0
 #pragma HLS ARRAY_PARTITION variable=iRnewCameraMatrixHLS complete dim=0
 
-	memcpy(cameraMatrixHLS,cameraMatrix,4*CM_SIZE);
-	memcpy(distCoeffsHLS,distCoeffs,4*N);
-	memcpy(iRnewCameraMatrixHLS,ir,4*CM_SIZE);
+//#NO	memcpy(cameraMatrixHLS,cameraMatrix,4*CM_SIZE);
+//#NO	memcpy(distCoeffsHLS,distCoeffs,4*N);
+//#NO	memcpy(iRnewCameraMatrixHLS,ir,4*CM_SIZE);
+
+  for(int r = 0; r < CM_SIZE; r++)
+    {
+      cameraMatrixHLS[r]      = cameraMatrix[r];
+      iRnewCameraMatrixHLS[r] = ir[r];
+    }
+
+  for(int n = 0; n < N; n++)
+    {
+      distCoeffsHLS[n] = distCoeffs[n];
+    }
 
 	MAP_T mx;
 	MAP_T my;
diff --git a/include/imgproc/xf_warp_transform.hpp b/include/imgproc/xf_warp_transform.hpp
index 2945a08..36df4ad 100644
--- a/include/imgproc/xf_warp_transform.hpp
+++ b/include/imgproc/xf_warp_transform.hpp
@@ -264,7 +264,126 @@ XF_TNAME(DEPTH,NPC) retrieve_EvOd_image4x1(int i,int j,int A, int B, int C, int
 	return XF_TNAME(DEPTH,NPC)((op_val+(1<<(INTER_REMAP_COEF_BITS-1)))>>INTER_REMAP_COEF_BITS);
 };
 
-template <int NPC, int ROWS, int COLS, int DEPTH, int STORE_LINES, int START_ROW, int TRANSFORM, bool INTERPOLATION_TYPE>
+
+template<int COLS, int STORE_LINES, int DEPTH, int NPC> // Stores in_pixel at row i, column j of bufUram, which packs 8 pixels into each 64-bit word.
+void store_in_UramNN(XF_TNAME(DEPTH,NPC) in_pixel, ap_uint<16> i,ap_uint<16> j, ap_uint<64> bufUram[STORE_LINES][(COLS+7)/8])
+{
+#pragma HLS INLINE
+
+    static XF_TNAME(DEPTH,NPC) sx8[8]; // static: keeps the pixels of the word being assembled across calls; it is never cleared, so slots not yet rewritten hold values from earlier calls
+#pragma HLS ARRAY_PARTITION variable=sx8 complete dim=1
+    sx8[j%8] = in_pixel; // j%8 is the slot of this pixel inside its 8-pixel word
+    for (int k=0; k<8; k++) bufUram[i][j/8](k*8+7,k*8) = sx8[k]; // rewrite the whole word j/8: slot k occupies bits [k*8+7:k*8] (assumes 8-bit pixels - TODO confirm)
+};
+
+template<int COLS, int STORE_LINES, int DEPTH, int NPC> // Reads one pixel from input_image (only when j<img_cols) and stores it into bufUram as part of 9-pixel (3x3) blocks, one block per 72-bit word.
+void store_in_UramBL(hls::stream< XF_TNAME(DEPTH,NPC)>& input_image, ap_uint<16> i,ap_uint<16> j, ap_uint<72> bufUram[(STORE_LINES+1)/2][(COLS+1)/2], short img_cols)
+{
+#pragma HLS INLINE
+
+    ap_int<16> i_hlf_mns1 = i/2-1; // block row preceding the one that row i belongs to
+    i_hlf_mns1 = i_hlf_mns1 + (i_hlf_mns1 < 0 ? (STORE_LINES+1)/2 : 0); // a negative index wraps around to the last block row of the buffer
+
+    static XF_TNAME(DEPTH,NPC) lineBuf[COLS]; //additional caching, as VHLS doesn't support URAM byte enables
+    static XF_TNAME(DEPTH,NPC) s3x3[2][9]; //URAM-wide word is doubled to resolve pipelining read/write dependency
+#pragma HLS ARRAY_PARTITION variable=s3x3 complete dim=0
+#pragma HLS dependence      variable=s3x3 inter false RAW
+    static XF_TNAME(DEPTH,NPC) s3x3_2[9];
+    static XF_TNAME(DEPTH,NPC) s0,s3;
+
+    static XF_TNAME(DEPTH,NPC) in_pixel;
+    if (j<img_cols) in_pixel = input_image.read(); // nothing is read from the stream for columns at or beyond img_cols
+
+    if (!(i%2)) { // even row, stored in line buffer for 1st row of 3x3 block, and in URAM for 3rd row of 3x3 block
+      if (!(j%2)) { // even col
+        if (j<img_cols) lineBuf[j] = s0 = in_pixel;
+        else s0 = 0;
+        s3x3[!(j&2)][8] = s0; // completes the block started at the previous odd column
+        if ((j/2)>1) for (int k=0; k<9; k++) bufUram[i_hlf_mns1][j/2-2](k*8+7,k*8) = s3x3[!!(j&2)][k]; // write back the other copy, i.e. the block two block-columns behind
+      } else if (j<img_cols) { // odd col
+        lineBuf[j] = in_pixel;
+        for (int k=0; k<6; k++) s3x3[!!(j&2)][k] = bufUram[i_hlf_mns1][j/2](k*8+7,k*8); // fetch the first two rows of the block already held in URAM
+        s3x3[!!(j&2)][6] = s0;
+        s3x3[!!(j&2)][7] = in_pixel;
+        s3x3[!!(j&2)][8] = 0;
+   	  }
+    } else if (j<img_cols) { // odd row: stored in URAM together with the 1st row of the 3x3 block fetched from the line buffer
+      if (!(j%2)) { // even col
+        s3x3_2[2] = s0 = lineBuf[j];
+        s3x3_2[5] = s3 = in_pixel;
+        if ((j/2)>0) for (int k=0; k<9; k++) bufUram[i/2][j/2-1](k*8+7,k*8) = s3x3_2[k];
+      } else { // odd col
+        s3x3_2[0] = s0;
+        s3x3_2[1] = lineBuf[j];
+        s3x3_2[3] = s3;
+        s3x3_2[4] = in_pixel;
+
+        // this clearing is needed only for the case of bottom zero padding (currently not used at all)
+        s3x3_2[6] = 0;
+        s3x3_2[7] = 0;
+        s3x3_2[8] = 0;
+        //if (j==(img_cols-1)) { //this clearing and save are needed only at the last column but may be done every cycle
+        s3x3_2[2] = 0;
+        s3x3_2[5] = 0;
+        for (int k=0; k<9; k++) bufUram[i/2][j/2](k*8+7,k*8) = s3x3_2[k];
+        //}
+      }
+    }
+};
+
+template<int COLS, int STORE_LINES, int DEPTH, int NPC> // Returns the pixel at row i, column j from bufUram, which packs 8 pixels into each 64-bit word.
+XF_TNAME(DEPTH,NPC) retrieve_UramNN(int i,int j, ap_uint<64> bufUram[STORE_LINES][(COLS+7)/8])
+{
+#pragma HLS INLINE
+
+	i = i > (STORE_LINES - 1)? (i - STORE_LINES) : ((i < 0)? (i + STORE_LINES) : i); // wrap the row index by a single STORE_LINES step when it falls outside [0, STORE_LINES-1]
+    XF_TNAME(DEPTH,NPC) dx8[8];
+#pragma HLS ARRAY_PARTITION variable=dx8 complete dim=1
+    for (int k=0; k<8; k++) dx8[k] = bufUram[i][j/8](k*8+7,k*8); // unpack all 8 slots of word j/8; slot k is bits [k*8+7:k*8]
+    return dx8[j%8]; // select the slot that holds column j
+};
+
+template<int COLS, int STORE_LINES, int DEPTH, int NPC> // Returns the A,B,C,D-weighted sum of the 2x2 pixels starting at (i,j), rounded and shifted right by INTER_REMAP_COEF_BITS.
+XF_TNAME(DEPTH,NPC) retrieve_UramBL(int i,int j,int A, int B, int C, int D, ap_uint<72> bufUram[(STORE_LINES+1)/2][(COLS+1)/2])
+{
+#pragma HLS INLINE
+
+	i = (i > (STORE_LINES - 1))? (i - STORE_LINES) : ((i < 0)? (i + STORE_LINES) : i); // wrap the row index by a single STORE_LINES step when it falls outside [0, STORE_LINES-1]
+
+    XF_TNAME(DEPTH,NPC) d3x3[9];
+#pragma HLS ARRAY_PARTITION variable=d3x3 complete dim=1
+    for (int k=0; k<9; k++) d3x3[k] = bufUram[i/2][j/2](k*8+7,k*8); // unpack the 3x3 block stored in word [i/2][j/2], row-major, 8 bits per pixel
+    XF_TNAME(DEPTH,NPC) const px00 = d3x3[(i%2  )*3 + j%2  ]; // top-left of the 2x2 window, at offset (i%2, j%2) inside the 3x3 block
+    XF_TNAME(DEPTH,NPC) const px01 = d3x3[(i%2  )*3 + j%2+1]; // top-right
+    XF_TNAME(DEPTH,NPC) const px10 = d3x3[(i%2+1)*3 + j%2  ]; // bottom-left
+    XF_TNAME(DEPTH,NPC) const px11 = d3x3[(i%2+1)*3 + j%2+1]; // bottom-right
+
+    int const op_val = (A*px00)
+                     + (B*px01)
+                     + (C*px10)
+                     + (D*px11);
+    //returning the computed interpolated output after rounding off the op_val by adding 0.5
+    //and shifting to right by INTER_REMAP_COEF_BITS
+    return XF_TNAME(DEPTH,NPC)((op_val+(1<<(INTER_REMAP_COEF_BITS-1)))>>INTER_REMAP_COEF_BITS);
+};
+
+//AK(ZoTech): rounding function intended as a substitute for the one from math.h, which consumes 2 BRAMs per call; not used because it is not bit-exact with the math.h version.
+// template<class T>
+// int round(T x)
+// {
+// #pragma HLS INLINE
+// 	return (x + (x>=T(0) ? T(0.5) : T(-0.5)));
+// };
+
+//AK(ZoTech): floor function intended as a substitute for the one from math.h, which consumes 2 BRAMs per call; not used because a bit-exact version is not synthesizable.
+// template<class T>
+// int floor(T x)
+// {
+// #pragma HLS INLINE
+//     return (x - (x>=T(0) ? T(0) : T(1)-std::numeric_limits<T>::epsilon() ));
+// };
+
+template <int NPC, int ROWS, int COLS, int DEPTH, int STORE_LINES, int START_ROW, int TRANSFORM, bool INTERPOLATION_TYPE, bool USE_URAM>
 int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::stream< XF_TNAME(DEPTH,NPC) > &output_image, float P_matrix[9], short img_rows, short img_cols)
 {
 #pragma HLS INLINE
@@ -298,6 +417,14 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::
 #pragma HLS DEPENDENCE variable=store1_pt_2OdR_EvC intra false
 #pragma HLS DEPENDENCE variable=store1_pt_2OdR_OdC intra false
 
+    //URAM based storages
+	ap_uint<64> bufUramNN[STORE_LINES][(COLS+7)/8];
+#pragma HLS RESOURCE   variable=bufUramNN core=XPM_MEMORY uram
+#pragma HLS dependence variable=bufUramNN inter false
+    //URAM storage granularity for BL interpolation is a 3x3-pel block on a 2x2-pel picture grid; it fits in one URAM word
+    ap_uint<72> bufUramBL[(STORE_LINES+1)/2][(COLS+1)/2];
+#pragma HLS RESOURCE   variable=bufUramBL core=XPM_MEMORY uram
+#pragma HLS dependence variable=bufUramBL inter false
 
 	//varables for loop counters
 	ap_uint<16> i=0,j=0,k=0,l=0,m=0,n=0,p=0;
@@ -342,7 +469,7 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::
 	MAIN_ROWS:for (i=0;i<(img_rows + START_ROW);i++)
 	{
 #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
-		MAIN_COLS:for(j=0;j<(img_cols);j++)
+		MAIN_COLS:for(j=0;j<(img_cols+3);j++)
 		{
 #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
 		#pragma HLS PIPELINE
@@ -362,12 +489,16 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::
 				//function to store the input image stream to
 				//a buffer of size STORE_LINES rows
 				//computing i-l to snap the writes to STORE_LINES size buffer
+				if (USE_URAM)
+				  if (INTERPOLATION_TYPE) store_in_UramBL<COLS,STORE_LINES,DEPTH,NPC>(input_image        ,i-l,j, bufUramBL, img_cols);
+				  else {if (j<img_cols)   store_in_UramNN<COLS,STORE_LINES,DEPTH,NPC>(input_image.read() ,i-l,j, bufUramNN);}
+                else    if (j<img_cols)
 				store_EvOd_image1<COLS,STORE_LINES,DEPTH,NPC>( input_image.read() ,i-l,j, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC);
 			}
 
 			//condition to compute and stream out the output image
 			//after START_ROW number of rows
-			if(i>=START_ROW)
+			if(i>=START_ROW && j<img_cols)
 			{
 				//computing k from i to index the output image from 0
 				k = i - (START_ROW);
@@ -468,11 +599,17 @@ int xFwarpTransformKernel(hls::stream< XF_TNAME(DEPTH,NPC) > &input_image, hls::
 					I1 = I - m;
 					if(INTERPOLATION_TYPE==0)
 					{
+                      if (USE_URAM)
+						op_val = retrieve_UramNN     <COLS,STORE_LINES,DEPTH,NPC>(I1,J, bufUramNN);
+					  else
 						op_val = retrieve_EvOd_image1<COLS,STORE_LINES,DEPTH,NPC>(I1,J, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC);
 					}
 					else
 					{
 						//calling the read function with interpolation
+                      if (USE_URAM)
+						op_val = retrieve_UramBL       <COLS,STORE_LINES,DEPTH,NPC>(I1,J,A,B,C,D, bufUramBL);
+					  else
 						op_val = retrieve_EvOd_image4x1<COLS,STORE_LINES,DEPTH,NPC>(I1,J,A,B,C,D, store1_pt_2EvR_EvC, store1_pt_2EvR_OdC, store1_pt_2OdR_EvC, store1_pt_2OdR_OdC);
 					}
 				}
@@ -497,7 +634,7 @@ return 0;
 #pragma SDS data access_pattern("_src_mat.data":SEQUENTIAL)
 #pragma SDS data access_pattern("_dst_mat.data":SEQUENTIAL)
 #pragma SDS data mem_attribute ("_src_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS, "_dst_mat.data":NON_CACHEABLE|PHYSICAL_CONTIGUOUS)
-template <int STORE_LINES, int START_ROW, int TRANSFORM, bool INTERPOLATION_TYPE, int TYPE, int ROWS, int COLS, int NPC>
+template <int STORE_LINES, int START_ROW, int TRANSFORM, bool INTERPOLATION_TYPE, int TYPE, int ROWS, int COLS, int NPC, bool USE_URAM = false>
 void warpTransform(xf::Mat<TYPE,ROWS,COLS,NPC> & _src_mat, xf::Mat<TYPE,ROWS,COLS,NPC> & _dst_mat, float P_matrix[9])
 {
 	#pragma HLS INLINE OFF
@@ -516,7 +653,7 @@ hls::stream< XF_TNAME(TYPE,NPC) > out_stream;
 		}
 	}
 
-xFwarpTransformKernel<NPC, ROWS, COLS, TYPE, STORE_LINES, START_ROW, TRANSFORM, INTERPOLATION_TYPE>(in_stream, out_stream, P_matrix, _src_mat.rows, _src_mat.cols);
+xFwarpTransformKernel<NPC, ROWS, COLS, TYPE, STORE_LINES, START_ROW, TRANSFORM, INTERPOLATION_TYPE, USE_URAM>(in_stream, out_stream, P_matrix, _src_mat.rows, _src_mat.cols);
 
 	for(int i=0; i<_dst_mat.rows;i++)
 	{