22#define _YOLOX_CPP_CORE_HPP
33
44#include < opencv2/core/types.hpp>
5+ #include < opencv2/core/simd_intrinsics.hpp>
6+ #include < algorithm>
57
68namespace yolox_cpp
79{
@@ -49,15 +51,21 @@ namespace yolox_cpp
4951 int num_classes_;
5052 bool p6_;
5153 std::string model_version_;
52- const std::vector<float > mean_ = {0.485 , 0.456 , 0.406 };
53- const std::vector<float > std_ = {0.229 , 0.224 , 0.225 };
54+ // const std::vector<float> mean_ = {0.485, 0.456, 0.406};
55+ // const std::vector<float> std_ = {0.229, 0.224, 0.225};
56+ const std::vector<float > std255_inv_ = {
57+ 1.0 / (255.0 * 0.229 ), 1.0 / (255.0 * 0.224 ), 1.0 / (255.0 * 0.225 )};
58+ const std::vector<float > mean_std_ = {
59+ -0.485 / 0.229 , -0.456 / 0.224 , -0.406 / 0.225 };
5460 const std::vector<int > strides_ = {8 , 16 , 32 };
5561 const std::vector<int > strides_p6_ = {8 , 16 , 32 , 64 };
5662 std::vector<GridAndStride> grid_strides_;
5763
5864 cv::Mat static_resize (const cv::Mat &img)
5965 {
60- const float r = std::min (input_w_ / (img.cols * 1.0 ), input_h_ / (img.rows * 1.0 ));
66+ const float r = std::min (
67+ static_cast <float >(input_w_) / (static_cast <float >(img.cols ) * 1 .0f ),
68+ static_cast <float >(input_h_) / (static_cast <float >(img.rows ) * 1 .0f ));
6169 // r = std::min(r, 1.0f);
6270 const int unpad_w = r * img.cols ;
6371 const int unpad_h = r * img.rows ;
@@ -71,69 +79,28 @@ namespace yolox_cpp
7179 // for NCHW
7280 void blobFromImage (const cv::Mat &img, float *blob_data)
7381 {
74- const size_t channels = 3 ;
75- const size_t img_h = img.rows ;
76- const size_t img_w = img.cols ;
77- if (this ->model_version_ == " 0.1.0" )
78- {
79- for (size_t c = 0 ; c < channels; ++c)
80- {
81- for (size_t h = 0 ; h < img_h; ++h)
82- {
83- for (size_t w = 0 ; w < img_w; ++w)
84- {
85- blob_data[c * img_w * img_h + h * img_w + w] =
86- ((float )img.ptr <cv::Vec3b>(h)[w][c] / 255.0 - this ->mean_ [c]) / this ->std_ [c];
87- }
88- }
89- }
90- }
91- else
92- {
93- for (size_t c = 0 ; c < channels; ++c)
94- {
95- for (size_t h = 0 ; h < img_h; ++h)
96- {
97- for (size_t w = 0 ; w < img_w; ++w)
98- {
99- blob_data[c * img_w * img_h + h * img_w + w] = (float )img.ptr <cv::Vec3b>(h)[w][c]; // 0.1.1rc0 or later
100- }
101- }
102- }
103- }
82+ blobFromImage_cpu (img, blob_data);
83+ // #if defined(CV_SIMD128) && CV_SIMD128 == 1
84+ // blobFromImage_simd(img, blob_data);
85+ // #else
86+ // blobFromImage_cpu(img, blob_data);
87+ // #endif
10488 }
10589
10690 // for NHWC
10791 void blobFromImage_nhwc (const cv::Mat &img, float *blob_data)
10892 {
109- const size_t channels = 3 ;
110- const size_t img_h = img.rows ;
111- const size_t img_w = img.cols ;
112- if (this ->model_version_ == " 0.1.0" )
113- {
114- for (size_t i = 0 ; i < img_h * img_w; ++i)
115- {
116- for (size_t c = 0 ; c < channels; ++c)
117- {
118- blob_data[i * channels + c] =
119- ((float )img.data [i * channels + c] / 255.0 - this ->mean_ [c]) / this ->std_ [c];
120- }
121- }
122- }
123- else
124- {
125- for (size_t i = 0 ; i < img_h * img_w; ++i)
126- {
127- for (size_t c = 0 ; c < channels; ++c)
128- {
129- blob_data[i * channels + c] = (float )img.data [i * channels + c]; // 0.1.1rc0 or later
130- }
131- }
132- }
93+ blobFromImage_nhwc_cpu (img, blob_data);
94+ // #if defined(CV_SIMD128) && CV_SIMD128 == 1
95+ // blobFromImage_nhwc_simd(img, blob_data);
96+ // #else
97+ // blobFromImage_nhwc_cpu(img, blob_data);
98+ // #endif
13399 }
134100
135101 void generate_grids_and_stride (const int target_w, const int target_h, const std::vector<int > &strides, std::vector<GridAndStride> &grid_strides)
136102 {
103+ grid_strides.clear ();
137104 for (auto stride : strides)
138105 {
139106 const int num_grid_w = target_w / stride;
@@ -151,6 +118,7 @@ namespace yolox_cpp
151118 void generate_yolox_proposals (const std::vector<GridAndStride> &grid_strides, const float *feat_ptr, const float prob_threshold, std::vector<Object> &objects)
152119 {
153120 const int num_anchors = grid_strides.size ();
121+ objects.clear ();
154122
155123 for (int anchor_idx = 0 ; anchor_idx < num_anchors; ++anchor_idx)
156124 {
@@ -203,42 +171,6 @@ namespace yolox_cpp
203171 return inter.area ();
204172 }
205173
206- void qsort_descent_inplace (std::vector<Object> &faceobjects, int left, int right)
207- {
208- int i = left;
209- int j = right;
210- float p = faceobjects[(left + right) / 2 ].prob ;
211-
212- while (i <= j)
213- {
214- while (faceobjects[i].prob > p)
215- ++i;
216-
217- while (faceobjects[j].prob < p)
218- --j;
219-
220- if (i <= j)
221- {
222- std::swap (faceobjects[i], faceobjects[j]);
223-
224- ++i;
225- --j;
226- }
227- }
228- if (left < j)
229- qsort_descent_inplace (faceobjects, left, j);
230- if (i < right)
231- qsort_descent_inplace (faceobjects, i, right);
232- }
233-
234- void qsort_descent_inplace (std::vector<Object> &objects)
235- {
236- if (objects.empty ())
237- return ;
238-
239- qsort_descent_inplace (objects, 0 , objects.size () - 1 );
240- }
241-
242174 void nms_sorted_bboxes (const std::vector<Object> &faceobjects, std::vector<int > &picked, const float nms_threshold)
243175 {
244176 picked.clear ();
@@ -282,36 +214,207 @@ namespace yolox_cpp
282214 std::vector<Object> proposals;
283215 generate_yolox_proposals (grid_strides, prob, bbox_conf_thresh, proposals);
284216
285- qsort_descent_inplace (proposals);
217+ std::sort (
218+ proposals.begin (), proposals.end (),
219+ [](const Object& a, const Object& b) {
220+ return a.prob > b.prob ; // descent
221+ }
222+ );
286223
287224 std::vector<int > picked;
288225 nms_sorted_bboxes (proposals, picked, nms_thresh_);
289226
290227 int count = picked.size ();
291228 objects.resize (count);
229+ const float max_x = static_cast <float >(img_w - 1 );
230+ const float max_y = static_cast <float >(img_h - 1 );
292231
293232 for (int i = 0 ; i < count; ++i)
294233 {
295234 objects[i] = proposals[picked[i]];
296235
297236 // adjust offset to original unpadded
298- float x0 = (objects[i].rect .x ) / scale;
299- float y0 = (objects[i].rect .y ) / scale;
300- float x1 = (objects[i].rect .x + objects[i].rect .width ) / scale;
301- float y1 = (objects[i].rect .y + objects[i].rect .height ) / scale;
237+ float x0 = static_cast < float > (objects[i].rect .x ) / scale;
238+ float y0 = static_cast < float > (objects[i].rect .y ) / scale;
239+ float x1 = static_cast < float > (objects[i].rect .x + objects[i].rect .width ) / scale;
240+ float y1 = static_cast < float > (objects[i].rect .y + objects[i].rect .height ) / scale;
302241
303242 // clip
304- x0 = std::max (std::min (x0, ( float )(img_w - 1 ) ), 0 .f );
305- y0 = std::max (std::min (y0, ( float )(img_h - 1 ) ), 0 .f );
306- x1 = std::max (std::min (x1, ( float )(img_w - 1 ) ), 0 .f );
307- y1 = std::max (std::min (y1, ( float )(img_h - 1 ) ), 0 .f );
243+ x0 = std::max (std::min (x0, max_x ), 0 .f );
244+ y0 = std::max (std::min (y0, max_y ), 0 .f );
245+ x1 = std::max (std::min (x1, max_x ), 0 .f );
246+ y1 = std::max (std::min (y1, max_y ), 0 .f );
308247
309248 objects[i].rect .x = x0;
310249 objects[i].rect .y = y0;
311250 objects[i].rect .width = x1 - x0;
312251 objects[i].rect .height = y1 - y0;
313252 }
314253 }
254+
255+ private:
256+ #if defined(CV_SIMD128) && CV_SIMD128 == 1
257+ void blobFromImage_simd (const cv::Mat &img, float *blob_data)
258+ {
259+ const size_t channels = 3 ;
260+ const size_t img_h = img.rows ;
261+ const size_t img_w = img.cols ;
262+ const size_t img_hw = img_h * img_w;
263+
264+ const size_t step = 4 ; // load 4 pixel
265+ const size_t N = img_hw / step;
266+ const size_t remain = img_hw % step;
267+
268+ float *blob_data_ch0 = blob_data;
269+ float *blob_data_ch1 = blob_data + img_hw;
270+ float *blob_data_ch2 = blob_data + img_hw * 2 ;
271+
272+ if (this ->model_version_ == " 0.1.0" )
273+ {
274+ cv::Mat img_f32;
275+ img.convertTo (img_f32, CV_32FC3);
276+ const cv::v_float32x4 mean_std0 = cv::v_setall_f32 (-this ->mean_std_ [0 ]);
277+ const cv::v_float32x4 mean_std1 = cv::v_setall_f32 (-this ->mean_std_ [1 ]);
278+ const cv::v_float32x4 mean_std2 = cv::v_setall_f32 (-this ->mean_std_ [2 ]);
279+ const cv::v_float32x4 std255_inv_0 = cv::v_setall_f32 (this ->std255_inv_ [0 ]);
280+ const cv::v_float32x4 std255_inv_1 = cv::v_setall_f32 (this ->std255_inv_ [1 ]);
281+ const cv::v_float32x4 std255_inv_2 = cv::v_setall_f32 (this ->std255_inv_ [2 ]);
282+
283+ for (size_t i = 0 ; i < N; ++i)
284+ {
285+ cv::v_float32x4 ch0_f;
286+ cv::v_float32x4 ch1_f;
287+ cv::v_float32x4 ch2_f;
288+ // load 4 pixel x 3ch
289+ cv::v_load_deinterleave (
290+ reinterpret_cast <const float *>(img_f32.data ) + i * (step * channels),
291+ ch0_f, ch1_f, ch2_f);
292+
293+ {
294+ ch0_f = ch0_f * std255_inv_0 + mean_std0;
295+ ch1_f = ch1_f * std255_inv_1 + mean_std1;
296+ ch2_f = ch2_f * std255_inv_2 + mean_std2;
297+ }
298+
299+ cv::v_store (blob_data_ch0 + i * step, ch0_f);
300+ cv::v_store (blob_data_ch1 + i * step, ch1_f);
301+ cv::v_store (blob_data_ch2 + i * step, ch2_f);
302+ }
303+ }
304+ else
305+ {
306+ cv::Mat img_f32;
307+ img.convertTo (img_f32, CV_32FC3);
308+ for (size_t i = 0 ; i < N; ++i)
309+ {
310+ cv::v_float32x4 ch0_f;
311+ cv::v_float32x4 ch1_f;
312+ cv::v_float32x4 ch2_f;
313+ // load 4 pixel x 3ch
314+ cv::v_load_deinterleave (
315+ reinterpret_cast <const float *>(img_f32.data ) + i * (step * channels),
316+ ch0_f, ch1_f, ch2_f);
317+
318+ cv::v_store (blob_data_ch0 + i * step, ch0_f);
319+ cv::v_store (blob_data_ch1 + i * step, ch1_f);
320+ cv::v_store (blob_data_ch2 + i * step, ch2_f);
321+ }
322+ }
323+
324+ if (remain > 0 )
325+ {
326+ const size_t simd_done_num = N * step;
327+ if (this ->model_version_ == " 0.1.0" )
328+ {
329+ for (size_t i = 0 ; i < remain; ++i)
330+ {
331+ // HWC -> CHW
332+ const size_t out_idx = simd_done_num + i;
333+ const size_t src_idx = out_idx * channels;
334+ blob_data_ch0[out_idx] = static_cast <float >(img.data [src_idx + 0 ]) * this ->std255_inv_ [0 ] + this ->mean_std_ [0 ];
335+ blob_data_ch1[out_idx] = static_cast <float >(img.data [src_idx + 1 ]) * this ->std255_inv_ [1 ] + this ->mean_std_ [1 ];
336+ blob_data_ch2[out_idx] = static_cast <float >(img.data [src_idx + 2 ]) * this ->std255_inv_ [2 ] + this ->mean_std_ [2 ];
337+ }
338+ }
339+ else
340+ {
341+ for (size_t i = 0 ; i < remain; ++i)
342+ {
343+ // HWC -> CHW
344+ const size_t out_idx = simd_done_num + i;
345+ const size_t src_idx = out_idx * channels;
346+ blob_data_ch0[out_idx] = static_cast <float >(img.data [src_idx + 0 ]);
347+ blob_data_ch1[out_idx] = static_cast <float >(img.data [src_idx + 1 ]);
348+ blob_data_ch2[out_idx] = static_cast <float >(img.data [src_idx + 2 ]);
349+ }
350+
351+ }
352+ }
353+
354+ }
355+ #endif
356+ void blobFromImage_cpu (const cv::Mat &img, float *blob_data)
357+ {
358+ const size_t channels = 3 ;
359+ const size_t img_h = img.rows ;
360+ const size_t img_w = img.cols ;
361+ const size_t img_hw = img_h * img_w;
362+ float *blob_data_ch0 = blob_data;
363+ float *blob_data_ch1 = blob_data + img_hw;
364+ float *blob_data_ch2 = blob_data + img_hw * 2 ;
365+ // HWC -> CHW
366+ if (this ->model_version_ == " 0.1.0" )
367+ {
368+ for (size_t i = 0 ; i < img_hw; ++i)
369+ {
370+ // blob = (img / 255.0 - mean) / std
371+ const size_t src_idx = i * channels;
372+ blob_data_ch0[i] = static_cast <float >(img.data [src_idx + 0 ]) * this ->std255_inv_ [0 ] + this ->mean_std_ [0 ];
373+ blob_data_ch1[i] = static_cast <float >(img.data [src_idx + 1 ]) * this ->std255_inv_ [1 ] + this ->mean_std_ [1 ];
374+ blob_data_ch2[i] = static_cast <float >(img.data [src_idx + 2 ]) * this ->std255_inv_ [2 ] + this ->mean_std_ [2 ];
375+ }
376+ }
377+ else
378+ {
379+ for (size_t i = 0 ; i < img_hw; ++i)
380+ {
381+ // HWC -> CHW
382+ const size_t src_idx = i * channels;
383+ blob_data_ch0[i] = static_cast <float >(img.data [src_idx + 0 ]);
384+ blob_data_ch1[i] = static_cast <float >(img.data [src_idx + 1 ]);
385+ blob_data_ch2[i] = static_cast <float >(img.data [src_idx + 2 ]);
386+ }
387+ }
388+
389+ }
390+ void blobFromImage_nhwc_cpu (const cv::Mat &img, float *blob_data)
391+ {
392+ const size_t channels = 3 ;
393+ const size_t img_h = img.rows ;
394+ const size_t img_w = img.cols ;
395+ if (this ->model_version_ == " 0.1.0" )
396+ {
397+ for (size_t i = 0 ; i < img_h * img_w; ++i)
398+ {
399+ for (size_t c = 0 ; c < channels; ++c)
400+ {
401+ // blob = (img / 255.0 - mean) / std
402+ blob_data[i * channels + c] =
403+ static_cast <float >(img.data [i * channels + c]) * this ->std255_inv_ [c] + this ->mean_std_ [c];
404+ }
405+ }
406+ }
407+ else
408+ {
409+ for (size_t i = 0 ; i < img_h * img_w; ++i)
410+ {
411+ for (size_t c = 0 ; c < channels; ++c)
412+ {
413+ blob_data[i * channels + c] = static_cast <float >(img.data [i * channels + c]); // 0.1.1rc0 or later
414+ }
415+ }
416+ }
417+ }
315418 };
316419}
317420#endif
0 commit comments