update blobFromImage

fateshelled · fateshelled · commit 9ed83f2b7261 · 2024-08-23T00:20:00.000+09:00
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
@@ -2,6 +2,8 @@
 #define _YOLOX_CPP_CORE_HPP
 
 #include <opencv2/core/types.hpp>
+#include <opencv2/core/simd_intrinsics.hpp>
+#include <algorithm>
 
 namespace yolox_cpp
 {
@@ -49,15 +51,21 @@ namespace yolox_cpp
         int num_classes_;
         bool p6_;
         std::string model_version_;
-        const std::vector<float> mean_ = {0.485, 0.456, 0.406};
-        const std::vector<float> std_ = {0.229, 0.224, 0.225};
+        // const std::vector<float> mean_ = {0.485, 0.456, 0.406};
+        // const std::vector<float> std_ = {0.229, 0.224, 0.225};
+        const std::vector<float> std255_inv_ = {
+            1.0 / (255.0 * 0.229), 1.0 / (255.0 * 0.224), 1.0 / (255.0 * 0.225)};
+        const std::vector<float> mean_std_ = {
+            -0.485 / 0.229 , -0.456 / 0.224, -0.406 / 0.225};
         const std::vector<int> strides_ = {8, 16, 32};
         const std::vector<int> strides_p6_ = {8, 16, 32, 64};
         std::vector<GridAndStride> grid_strides_;
 
         cv::Mat static_resize(const cv::Mat &img)
         {
-            const float r = std::min(input_w_ / (img.cols * 1.0), input_h_ / (img.rows * 1.0));
+            const float r = std::min(
+                static_cast<float>(input_w_) / (static_cast<float>(img.cols) * 1.0f),
+                static_cast<float>(input_h_) / (static_cast<float>(img.rows) * 1.0f));
             // r = std::min(r, 1.0f);
             const int unpad_w = r * img.cols;
             const int unpad_h = r * img.rows;
@@ -71,69 +79,28 @@ namespace yolox_cpp
         // for NCHW
         void blobFromImage(const cv::Mat &img, float *blob_data)
         {
-            const size_t channels = 3;
-            const size_t img_h = img.rows;
-            const size_t img_w = img.cols;
-            if (this->model_version_ == "0.1.0")
-            {
-                for (size_t c = 0; c < channels; ++c)
-                {
-                    for (size_t h = 0; h < img_h; ++h)
-                    {
-                        for (size_t w = 0; w < img_w; ++w)
-                        {
-                            blob_data[c * img_w * img_h + h * img_w + w] =
-                                ((float)img.ptr<cv::Vec3b>(h)[w][c] / 255.0 - this->mean_[c]) / this->std_[c];
-                        }
-                    }
-                }
-            }
-            else
-            {
-                for (size_t c = 0; c < channels; ++c)
-                {
-                    for (size_t h = 0; h < img_h; ++h)
-                    {
-                        for (size_t w = 0; w < img_w; ++w)
-                        {
-                            blob_data[c * img_w * img_h + h * img_w + w] = (float)img.ptr<cv::Vec3b>(h)[w][c]; // 0.1.1rc0 or later
-                        }
-                    }
-                }
-            }
+            blobFromImage_cpu(img, blob_data);
+            // #if defined(CV_SIMD128) && CV_SIMD128 == 1
+            //     blobFromImage_simd(img, blob_data);
+            // #else
+            //     blobFromImage_cpu(img, blob_data);
+            // #endif
         }
 
         // for NHWC
         void blobFromImage_nhwc(const cv::Mat &img, float *blob_data)
         {
-            const size_t channels = 3;
-            const size_t img_h = img.rows;
-            const size_t img_w = img.cols;
-            if (this->model_version_ == "0.1.0")
-            {
-                for (size_t i = 0; i < img_h * img_w; ++i)
-                {
-                    for (size_t c = 0; c < channels; ++c)
-                    {
-                        blob_data[i * channels + c] =
-                            ((float)img.data[i * channels + c] / 255.0 - this->mean_[c]) / this->std_[c];
-                    }
-                }
-            }
-            else
-            {
-                for (size_t i = 0; i < img_h * img_w; ++i)
-                {
-                    for (size_t c = 0; c < channels; ++c)
-                    {
-                        blob_data[i * channels + c] = (float)img.data[i * channels + c]; // 0.1.1rc0 or later
-                    }
-                }
-            }
+            blobFromImage_nhwc_cpu(img, blob_data);
+            // #if defined(CV_SIMD128) && CV_SIMD128 == 1
+            //     blobFromImage_nhwc_simd(img, blob_data);
+            // #else
+            //     blobFromImage_nhwc_cpu(img, blob_data);
+            // #endif
         }
 
         void generate_grids_and_stride(const int target_w, const int target_h, const std::vector<int> &strides, std::vector<GridAndStride> &grid_strides)
         {
+            grid_strides.clear();
             for (auto stride : strides)
             {
                 const int num_grid_w = target_w / stride;
@@ -151,6 +118,7 @@ namespace yolox_cpp
         void generate_yolox_proposals(const std::vector<GridAndStride> &grid_strides, const float *feat_ptr, const float prob_threshold, std::vector<Object> &objects)
         {
             const int num_anchors = grid_strides.size();
+            objects.clear();
 
             for (int anchor_idx = 0; anchor_idx < num_anchors; ++anchor_idx)
             {
@@ -203,42 +171,6 @@ namespace yolox_cpp
             return inter.area();
         }
 
-        void qsort_descent_inplace(std::vector<Object> &faceobjects, int left, int right)
-        {
-            int i = left;
-            int j = right;
-            float p = faceobjects[(left + right) / 2].prob;
-
-            while (i <= j)
-            {
-                while (faceobjects[i].prob > p)
-                    ++i;
-
-                while (faceobjects[j].prob < p)
-                    --j;
-
-                if (i <= j)
-                {
-                    std::swap(faceobjects[i], faceobjects[j]);
-
-                    ++i;
-                    --j;
-                }
-            }
-            if (left < j)
-                qsort_descent_inplace(faceobjects, left, j);
-            if (i < right)
-                qsort_descent_inplace(faceobjects, i, right);
-        }
-
-        void qsort_descent_inplace(std::vector<Object> &objects)
-        {
-            if (objects.empty())
-                return;
-
-            qsort_descent_inplace(objects, 0, objects.size() - 1);
-        }
-
         void nms_sorted_bboxes(const std::vector<Object> &faceobjects, std::vector<int> &picked, const float nms_threshold)
         {
             picked.clear();
@@ -282,36 +214,207 @@ namespace yolox_cpp
             std::vector<Object> proposals;
             generate_yolox_proposals(grid_strides, prob, bbox_conf_thresh, proposals);
 
-            qsort_descent_inplace(proposals);
+            std::sort(
+                proposals.begin(), proposals.end(),
+                [](const Object& a, const Object& b) {
+                    return a.prob > b.prob; // descent
+                }
+            );
 
             std::vector<int> picked;
             nms_sorted_bboxes(proposals, picked, nms_thresh_);
 
             int count = picked.size();
             objects.resize(count);
+            const float max_x = static_cast<float>(img_w - 1);
+            const float max_y = static_cast<float>(img_h - 1);
 
             for (int i = 0; i < count; ++i)
             {
                 objects[i] = proposals[picked[i]];
 
                 // adjust offset to original unpadded
-                float x0 = (objects[i].rect.x) / scale;
-                float y0 = (objects[i].rect.y) / scale;
-                float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
-                float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
+                float x0 = static_cast<float>(objects[i].rect.x) / scale;
+                float y0 = static_cast<float>(objects[i].rect.y) / scale;
+                float x1 = static_cast<float>(objects[i].rect.x + objects[i].rect.width) / scale;
+                float y1 = static_cast<float>(objects[i].rect.y + objects[i].rect.height) / scale;
 
                 // clip
-                x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
-                y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
-                x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
-                y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+                x0 = std::max(std::min(x0, max_x), 0.f);
+                y0 = std::max(std::min(y0, max_y), 0.f);
+                x1 = std::max(std::min(x1, max_x), 0.f);
+                y1 = std::max(std::min(y1, max_y), 0.f);
 
                 objects[i].rect.x = x0;
                 objects[i].rect.y = y0;
                 objects[i].rect.width = x1 - x0;
                 objects[i].rect.height = y1 - y0;
             }
         }
+
+    private:
+    #if defined(CV_SIMD128) && CV_SIMD128 == 1
+        void blobFromImage_simd(const cv::Mat &img, float *blob_data)
+        {
+            const size_t channels = 3;
+            const size_t img_h = img.rows;
+            const size_t img_w = img.cols;
+            const size_t img_hw = img_h * img_w;
+
+            const size_t step = 4; // load 4 pixel
+            const size_t N = img_hw / step;
+            const size_t remain = img_hw % step;
+
+            float *blob_data_ch0 = blob_data;
+            float *blob_data_ch1 = blob_data + img_hw;
+            float *blob_data_ch2 = blob_data + img_hw * 2;
+
+            if (this->model_version_ == "0.1.0")
+            {
+                cv::Mat img_f32;
+                img.convertTo(img_f32, CV_32FC3);
+                const cv::v_float32x4 mean_std0 = cv::v_setall_f32(-this->mean_std_[0]);
+                const cv::v_float32x4 mean_std1 = cv::v_setall_f32(-this->mean_std_[1]);
+                const cv::v_float32x4 mean_std2 = cv::v_setall_f32(-this->mean_std_[2]);
+                const cv::v_float32x4 std255_inv_0 = cv::v_setall_f32(this->std255_inv_[0]);
+                const cv::v_float32x4 std255_inv_1 = cv::v_setall_f32(this->std255_inv_[1]);
+                const cv::v_float32x4 std255_inv_2 = cv::v_setall_f32(this->std255_inv_[2]);
+
+                for (size_t i = 0; i < N; ++i)
+                {
+                    cv::v_float32x4 ch0_f;
+                    cv::v_float32x4 ch1_f;
+                    cv::v_float32x4 ch2_f;
+                    // load 4 pixel x 3ch
+                    cv::v_load_deinterleave(
+                        reinterpret_cast<const float*>(img_f32.data) + i * (step * channels),
+                        ch0_f, ch1_f, ch2_f);
+
+                    {
+                        ch0_f = ch0_f * std255_inv_0 + mean_std0;
+                        ch1_f = ch1_f * std255_inv_1 + mean_std1;
+                        ch2_f = ch2_f * std255_inv_2 + mean_std2;
+                    }
+
+                    cv::v_store(blob_data_ch0 + i * step, ch0_f);
+                    cv::v_store(blob_data_ch1 + i * step, ch1_f);
+                    cv::v_store(blob_data_ch2 + i * step, ch2_f);
+                }
+            }
+            else
+            {
+                cv::Mat img_f32;
+                img.convertTo(img_f32, CV_32FC3);
+                for (size_t i = 0; i < N; ++i)
+                {
+                    cv::v_float32x4 ch0_f;
+                    cv::v_float32x4 ch1_f;
+                    cv::v_float32x4 ch2_f;
+                    // load 4 pixel x 3ch
+                    cv::v_load_deinterleave(
+                        reinterpret_cast<const float*>(img_f32.data) + i * (step * channels),
+                        ch0_f, ch1_f, ch2_f);
+
+                    cv::v_store(blob_data_ch0 + i * step, ch0_f);
+                    cv::v_store(blob_data_ch1 + i * step, ch1_f);
+                    cv::v_store(blob_data_ch2 + i * step, ch2_f);
+                }
+            }
+
+            if (remain > 0)
+            {
+                const size_t simd_done_num = N * step;
+                if (this->model_version_ == "0.1.0")
+                {
+                    for (size_t i = 0; i < remain; ++i)
+                    {
+                        // HWC -> CHW
+                        const size_t out_idx = simd_done_num + i;
+                        const size_t src_idx = out_idx * channels;
+                        blob_data_ch0[out_idx] = static_cast<float>(img.data[src_idx + 0]) * this->std255_inv_[0] + this->mean_std_[0];
+                        blob_data_ch1[out_idx] = static_cast<float>(img.data[src_idx + 1]) * this->std255_inv_[1] + this->mean_std_[1];
+                        blob_data_ch2[out_idx] = static_cast<float>(img.data[src_idx + 2]) * this->std255_inv_[2] + this->mean_std_[2];
+                    }
+                }
+                else
+                {
+                    for (size_t i = 0; i < remain; ++i)
+                    {
+                        // HWC -> CHW
+                        const size_t out_idx = simd_done_num + i;
+                        const size_t src_idx = out_idx * channels;
+                        blob_data_ch0[out_idx] = static_cast<float>(img.data[src_idx + 0]);
+                        blob_data_ch1[out_idx] = static_cast<float>(img.data[src_idx + 1]);
+                        blob_data_ch2[out_idx] = static_cast<float>(img.data[src_idx + 2]);
+                    }
+
+                }
+            }
+
+        }
+    #endif
+        void blobFromImage_cpu(const cv::Mat &img, float *blob_data)
+        {
+            const size_t channels = 3;
+            const size_t img_h = img.rows;
+            const size_t img_w = img.cols;
+            const size_t img_hw = img_h * img_w;
+            float *blob_data_ch0 = blob_data;
+            float *blob_data_ch1 = blob_data + img_hw;
+            float *blob_data_ch2 = blob_data + img_hw * 2;
+            // HWC -> CHW
+            if (this->model_version_ == "0.1.0")
+            {
+                for (size_t i = 0; i < img_hw; ++i)
+                {
+                    // blob = (img / 255.0 - mean) / std
+                    const size_t src_idx = i * channels;
+                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]) * this->std255_inv_[0] + this->mean_std_[0];
+                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]) * this->std255_inv_[1] + this->mean_std_[1];
+                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]) * this->std255_inv_[2] + this->mean_std_[2];
+                }
+            }
+            else
+            {
+                for (size_t i = 0; i < img_hw; ++i)
+                {
+                    // HWC -> CHW
+                    const size_t src_idx = i * channels;
+                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]);
+                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]);
+                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]);
+                }
+            }
+
+        }
+        void blobFromImage_nhwc_cpu(const cv::Mat &img, float *blob_data)
+        {
+            const size_t channels = 3;
+            const size_t img_h = img.rows;
+            const size_t img_w = img.cols;
+            if (this->model_version_ == "0.1.0")
+            {
+                for (size_t i = 0; i < img_h * img_w; ++i)
+                {
+                    for (size_t c = 0; c < channels; ++c)
+                    {
+                        // blob = (img / 255.0 - mean) / std
+                        blob_data[i * channels + c] =
+                            static_cast<float>(img.data[i * channels + c]) * this->std255_inv_[c] + this->mean_std_[c];
+                    }
+                }
+            }
+            else
+            {
+                for (size_t i = 0; i < img_h * img_w; ++i)
+                {
+                    for (size_t c = 0; c < channels; ++c)
+                    {
+                        blob_data[i * channels + c] = static_cast<float>(img.data[i * channels + c]); // 0.1.1rc0 or later
+                    }
+                }
+            }
+        }
     };
 }
 #endif