add int8 quantization (wang-xinyu#612)

freedenS · web-flow · commit 9bb2f6771cad · 2021-07-05T11:15:36.000+08:00
* add detr

* Update README.md

* add int8 quantization

fix some known bugs
diff --git a/detr/README.md b/detr/README.md
@@ -52,7 +52,7 @@ sudo ./detr -d detr.engine ../samples
 
 average cost of doInference(in detr.cpp) from second time with batch=1 under the ubuntu environment above
 
-|      | fp32    | fp16    | int8 |
-| ---- | ------- | ------- | ---- |
-| R50  | 19.57ms | 9.424ms | TODO |
+|      | fp32    | fp16    | int8   |
+| ---- | ------- | ------- | ------ |
+| R50  | 19.57ms | 9.424ms | 8.38ms |
 
diff --git a/detr/calibrator.hpp b/detr/calibrator.hpp
@@ -0,0 +1,116 @@
+#pragma once
+
+#include "NvInfer.h"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <iterator>
+#include <fstream>
+#include <algorithm>
+#include "common.hpp"
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+//!  Reads images from a directory, preprocesses them on the host and hands
+//!  them to the builder one batch at a time through getBatch(). The
+//!  calibration table is read from / written to the file named by
+//!  calib_table_name.
+//!
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
+ public:
+    //! \param batchsize        number of images staged per getBatch() call
+    //! \param input_w          width of the network input, in pixels
+    //! \param input_h          height of the network input, in pixels
+    //! \param img_dir          directory holding the calibration images; file names are
+    //!                         appended to it directly, so it should end with a path separator
+    //! \param calib_table_name path of the calibration cache file
+    //! \param input_blob_name  expected name of the first binding; the pointer is stored,
+    //!                         not copied, so it must outlive this object
+    //! \param read_cache       when false, readCalibrationCache() always reports an empty cache
+    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h,
+    const char* img_dir, const char* calib_table_name,
+    const char* input_blob_name, bool read_cache = true);
+
+    virtual ~Int8EntropyCalibrator2();
+
+    // This object owns device_input_ (cudaMalloc in the constructor, cudaFree in
+    // the destructor). A copy would free the same device pointer twice, so
+    // copying is disabled.
+    Int8EntropyCalibrator2(const Int8EntropyCalibrator2&) = delete;
+    Int8EntropyCalibrator2& operator=(const Int8EntropyCalibrator2&) = delete;
+
+    int getBatchSize() const override;
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
+    const void* readCalibrationCache(size_t& length) override;
+    void writeCalibrationCache(const void* cache, size_t length) override;
+
+ private:
+    int batchsize_;                        // images per calibration batch
+    int input_w_;                          // network input width
+    int input_h_;                          // network input height
+    int img_idx_;                          // index of the next unread entry in img_files_
+    std::string img_dir_;                  // directory prefix prepended to each file name
+    std::vector<std::string> img_files_;   // file names found in img_dir_
+    size_t input_count_;                   // floats per batch: 3 * w * h * batchsize
+    std::string calib_table_name_;         // calibration cache file path
+    const char* input_blob_name_;          // non-owning; compared against names[0] in getBatch()
+    bool read_cache_;                      // whether an existing cache file may be used
+    void* device_input_;                   // device buffer of input_count_ floats
+    std::vector<char> calib_cache_;        // backing storage for the cache returned to the builder
+};
+
+//! Stores the calibration settings, allocates the device buffer that holds
+//! one batch of float inputs, and collects the file names found in img_dir.
+Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize,
+int input_w, int input_h, const char* img_dir,
+const char* calib_table_name, const char* input_blob_name,
+bool read_cache)
+    : batchsize_(batchsize),
+      input_w_(input_w),
+      input_h_(input_h),
+      img_idx_(0),
+      img_dir_(img_dir),
+      // 3 channels per pixel, one float per channel value.
+      input_count_(3 * input_w * input_h * batchsize),
+      calib_table_name_(calib_table_name),
+      input_blob_name_(input_blob_name),
+      read_cache_(read_cache) {
+    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
+    read_files_in_dir(img_dir_.c_str(), img_files_);
+}
+
+//! Frees the device buffer that the constructor obtained with cudaMalloc.
+Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
+    CUDA_CHECK(cudaFree(device_input_));
+}
+
+//! Reports the batch size this calibrator was constructed with.
+int Int8EntropyCalibrator2::getBatchSize() const {
+    return this->batchsize_;
+}
+
+//! \brief Loads, preprocesses and uploads the next batch of calibration images.
+//!
+//! Each image is converted by preprocessImg() and then repacked from
+//! interleaved (h, w, c) order into planar (c, h, w) order before the whole
+//! batch is copied to the device buffer.
+//!
+//! \param bindings   output: bindings[0] is set to the device buffer holding the batch
+//! \param names      binding names; names[0] is asserted to equal the input blob name
+//! \param nbBindings number of bindings (not used here)
+//! \return true when a batch was staged; false when fewer than batchsize_
+//!         images remain or an image could not be read.
+bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) {
+    // Only full batches are produced; leftover images are ignored.
+    if (img_idx_ + batchsize_ > static_cast<int>(img_files_.size())) {
+        return false;
+    }
+
+    const size_t plane_size = static_cast<size_t>(input_h_) * input_w_;
+    std::vector<float> input_imgs_(input_count_, 0);
+    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
+        std::cout << img_files_[i] << "  " << i << std::endl;
+        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
+        if (temp.empty()) {
+            std::cerr << "Fatal error: image cannot open!" << std::endl;
+            return false;
+        }
+        // preprocessImg takes (img, newh, neww): height first, then width.
+        // Passing width first resized the image to input_h_ columns by
+        // input_w_ rows, so the (h, w) reads below ran past the row width.
+        preprocessImg(temp, input_h_, input_w_);
+
+        // Start of this image's slot inside the batch buffer.
+        float* dst = input_imgs_.data() + static_cast<size_t>(i - img_idx_) * 3 * plane_size;
+        for (int c = 0; c < 3; c++) {
+            for (int h = 0; h < input_h_; h++) {
+                for (int w = 0; w < input_w_; w++) {
+                    dst[c * plane_size + static_cast<size_t>(h) * input_w_ + w] = temp.at<cv::Vec3f>(h, w)[c];
+                }
+            }
+        }
+    }
+    img_idx_ += batchsize_;
+
+    CUDA_CHECK(cudaMemcpy(device_input_, input_imgs_.data(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
+    assert(!strcmp(names[0], input_blob_name_));
+    bindings[0] = device_input_;
+    return true;
+}
+
+//! \brief Loads the calibration table from disk into calib_cache_.
+//!
+//! \param length output: number of bytes in the returned cache (0 if none)
+//! \return pointer to the cached bytes, or nullptr when reading is disabled,
+//!         the file cannot be opened, or the file is empty.
+const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) {
+    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
+    calib_cache_.clear();
+
+    std::ifstream table(calib_table_name_, std::ios::binary);
+    if (read_cache_ && table.good()) {
+        // Stream-buffer iterators read raw bytes, so whitespace is preserved.
+        calib_cache_.assign(std::istreambuf_iterator<char>(table), std::istreambuf_iterator<char>());
+    }
+
+    length = calib_cache_.size();
+    if (length == 0) {
+        return nullptr;
+    }
+    return calib_cache_.data();
+}
+
+//! \brief Writes the calibration table to the cache file as raw bytes.
+//!
+//! \param cache  pointer to the calibration data to store
+//! \param length number of bytes to write
+void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) {
+    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
+    const char* bytes = static_cast<const char*>(cache);
+    std::ofstream table(calib_table_name_, std::ios::binary);
+    table.write(bytes, length);
+}
diff --git a/detr/common.hpp b/detr/common.hpp
@@ -78,6 +78,16 @@ static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::str
     return 0;
 }
 
+// Converts a BGR image in place into the float RGB form fed to the network:
+// resized to neww x newh, scaled to [0, 1], then normalized per channel.
+//
+// img  : image to convert; overwritten with a CV_32FC3 result
+// newh : target height (rows)
+// neww : target width (cols)
+//
+// Note the argument order is height first, then width.
+// Declared static inline, like the other helpers in this header, so that
+// including the header from several translation units does not produce
+// multiple definitions.
+static inline void preprocessImg(cv::Mat& img, int newh, int neww) {
+    // convert to rgb
+    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
+    // cv::Size is (width, height)
+    cv::resize(img, img, cv::Size(neww, newh));
+    img.convertTo(img, CV_32FC3);
+    img /= 255;
+    // per-channel mean / std normalization (presumably the ImageNet statistics - confirm)
+    img -= cv::Scalar(0.485, 0.456, 0.406);
+    img /= cv::Scalar(0.229, 0.224, 0.225);
+}
+
 #ifndef CUDA_CHECK
 #define CUDA_CHECK(callstr)\
     {\
diff --git a/detr/detr.cpp b/detr/detr.cpp
@@ -3,13 +3,13 @@
 #include <unordered_map>
 #include "./logging.h"
 #include "backbone.hpp"
+#include "calibrator.hpp"
 
 #define DEVICE 0
 #define BATCH_SIZE 1
 
 // 1 / math.sqrt(head_dim) https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/activation.h#623
 static const float SCALING = 0.17677669529663687;
-static const float MIN_SIZE = 800.0;
 static const int INPUT_H = 800;
 static const int INPUT_W = 1066;
 static const int NUM_CLASS = 92;  // include background
@@ -28,25 +28,6 @@ static const float SCORE_THRESH = 0.5;
 const char* INPUT_NODE_NAME = "images";
 const std::vector<std::string> OUTPUT_NAMES = { "scores", "boxes"};
 
-void preprocessImg(cv::Mat& img) {
-    // convert to rgb
-    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
-    float ratio = static_cast<float>(MIN_SIZE) / std::min(img.rows, img.cols);
-    int newh = 0, neww = 0;
-    if (img.rows < img.cols) {
-        newh = MIN_SIZE;
-        neww = ratio * img.cols;
-    } else {
-        newh = ratio * img.rows;
-        neww = MIN_SIZE;
-    }
-    cv::resize(img, img, cv::Size(neww, newh));
-    img.convertTo(img, CV_32FC3);
-    img /= 255;
-    img -= cv::Scalar(0.485, 0.456, 0.406);
-    img /= cv::Scalar(0.229, 0.224, 0.225);
-}
-
 ITensor* PositionEmbeddingSine(
 INetworkDefinition *network,
 std::unordered_map<std::string, Weights>& weightMap,
@@ -555,7 +536,7 @@ const std::string& modelType = "fp16"
     INetworkDefinition* network = builder->createNetworkV2(0U);
 
     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
-    ITensor* data = network->addInput("data", dt, Dims3{ 3, INPUT_H, INPUT_W });
+    ITensor* data = network->addInput(INPUT_NODE_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
 
     // preprocess
     std::unordered_map<std::string, Weights> weightMap;
@@ -605,7 +586,12 @@ const std::string& modelType = "fp16"
     } else if (modelType == "fp16") {
         config->setFlag(BuilderFlag::kFP16);
     } else if (modelType == "int8") {
-        // TODO: test with int8 quantization
+        std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+        assert(builder->platformHasFastInt8());
+        config->setFlag(BuilderFlag::kINT8);
+        Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(BATCH_SIZE, INPUT_W, INPUT_H, "./coco_calib/",
+        "int8calib.table", INPUT_NODE_NAME);
+        config->setInt8Calibrator(calibrator);
     } else {
         throw("does not support model type");
     }
@@ -761,9 +747,9 @@ int main(int argc, char** argv) {
 
         for (int b = 0; b < fcount; b++) {
             cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]);
-            preprocessImg(img);
-            assert(img.cols * img.rows * 3 == input_size);
             if (img.empty()) continue;
+            preprocessImg(img, INPUT_H, INPUT_W);
+            assert(img.cols * img.rows * 3 == input_size);
             for (int c = 0; c < 3; c++) {
                 for (int h = 0; h < img.rows; h++) {
                     for (int w = 0; w < img.cols; w++) {