Skip to content

Commit 9ed83f2

Browse files
committed
update blobFromImage
1 parent c6312ab commit 9ed83f2

1 file changed

Lines changed: 205 additions & 102 deletions

File tree

  • yolox_ros_cpp/yolox_cpp/include/yolox_cpp

yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp

Lines changed: 205 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#define _YOLOX_CPP_CORE_HPP
33

44
#include <opencv2/core/types.hpp>
5+
#include <opencv2/core/simd_intrinsics.hpp>
6+
#include <algorithm>
57

68
namespace yolox_cpp
79
{
@@ -49,15 +51,21 @@ namespace yolox_cpp
4951
int num_classes_;
5052
bool p6_;
5153
std::string model_version_;
52-
const std::vector<float> mean_ = {0.485, 0.456, 0.406};
53-
const std::vector<float> std_ = {0.229, 0.224, 0.225};
54+
// const std::vector<float> mean_ = {0.485, 0.456, 0.406};
55+
// const std::vector<float> std_ = {0.229, 0.224, 0.225};
56+
const std::vector<float> std255_inv_ = {
57+
1.0 / (255.0 * 0.229), 1.0 / (255.0 * 0.224), 1.0 / (255.0 * 0.225)};
58+
const std::vector<float> mean_std_ = {
59+
-0.485 / 0.229 , -0.456 / 0.224, -0.406 / 0.225};
5460
const std::vector<int> strides_ = {8, 16, 32};
5561
const std::vector<int> strides_p6_ = {8, 16, 32, 64};
5662
std::vector<GridAndStride> grid_strides_;
5763

5864
cv::Mat static_resize(const cv::Mat &img)
5965
{
60-
const float r = std::min(input_w_ / (img.cols * 1.0), input_h_ / (img.rows * 1.0));
66+
const float r = std::min(
67+
static_cast<float>(input_w_) / (static_cast<float>(img.cols) * 1.0f),
68+
static_cast<float>(input_h_) / (static_cast<float>(img.rows) * 1.0f));
6169
// r = std::min(r, 1.0f);
6270
const int unpad_w = r * img.cols;
6371
const int unpad_h = r * img.rows;
@@ -71,69 +79,28 @@ namespace yolox_cpp
7179
// for NCHW
7280
void blobFromImage(const cv::Mat &img, float *blob_data)
7381
{
74-
const size_t channels = 3;
75-
const size_t img_h = img.rows;
76-
const size_t img_w = img.cols;
77-
if (this->model_version_ == "0.1.0")
78-
{
79-
for (size_t c = 0; c < channels; ++c)
80-
{
81-
for (size_t h = 0; h < img_h; ++h)
82-
{
83-
for (size_t w = 0; w < img_w; ++w)
84-
{
85-
blob_data[c * img_w * img_h + h * img_w + w] =
86-
((float)img.ptr<cv::Vec3b>(h)[w][c] / 255.0 - this->mean_[c]) / this->std_[c];
87-
}
88-
}
89-
}
90-
}
91-
else
92-
{
93-
for (size_t c = 0; c < channels; ++c)
94-
{
95-
for (size_t h = 0; h < img_h; ++h)
96-
{
97-
for (size_t w = 0; w < img_w; ++w)
98-
{
99-
blob_data[c * img_w * img_h + h * img_w + w] = (float)img.ptr<cv::Vec3b>(h)[w][c]; // 0.1.1rc0 or later
100-
}
101-
}
102-
}
103-
}
82+
blobFromImage_cpu(img, blob_data);
83+
// #if defined(CV_SIMD128) && CV_SIMD128 == 1
84+
// blobFromImage_simd(img, blob_data);
85+
// #else
86+
// blobFromImage_cpu(img, blob_data);
87+
// #endif
10488
}
10589

10690
// for NHWC
10791
void blobFromImage_nhwc(const cv::Mat &img, float *blob_data)
10892
{
109-
const size_t channels = 3;
110-
const size_t img_h = img.rows;
111-
const size_t img_w = img.cols;
112-
if (this->model_version_ == "0.1.0")
113-
{
114-
for (size_t i = 0; i < img_h * img_w; ++i)
115-
{
116-
for (size_t c = 0; c < channels; ++c)
117-
{
118-
blob_data[i * channels + c] =
119-
((float)img.data[i * channels + c] / 255.0 - this->mean_[c]) / this->std_[c];
120-
}
121-
}
122-
}
123-
else
124-
{
125-
for (size_t i = 0; i < img_h * img_w; ++i)
126-
{
127-
for (size_t c = 0; c < channels; ++c)
128-
{
129-
blob_data[i * channels + c] = (float)img.data[i * channels + c]; // 0.1.1rc0 or later
130-
}
131-
}
132-
}
93+
blobFromImage_nhwc_cpu(img, blob_data);
94+
// #if defined(CV_SIMD128) && CV_SIMD128 == 1
95+
// blobFromImage_nhwc_simd(img, blob_data);
96+
// #else
97+
// blobFromImage_nhwc_cpu(img, blob_data);
98+
// #endif
13399
}
134100

135101
void generate_grids_and_stride(const int target_w, const int target_h, const std::vector<int> &strides, std::vector<GridAndStride> &grid_strides)
136102
{
103+
grid_strides.clear();
137104
for (auto stride : strides)
138105
{
139106
const int num_grid_w = target_w / stride;
@@ -151,6 +118,7 @@ namespace yolox_cpp
151118
void generate_yolox_proposals(const std::vector<GridAndStride> &grid_strides, const float *feat_ptr, const float prob_threshold, std::vector<Object> &objects)
152119
{
153120
const int num_anchors = grid_strides.size();
121+
objects.clear();
154122

155123
for (int anchor_idx = 0; anchor_idx < num_anchors; ++anchor_idx)
156124
{
@@ -203,42 +171,6 @@ namespace yolox_cpp
203171
return inter.area();
204172
}
205173

206-
void qsort_descent_inplace(std::vector<Object> &faceobjects, int left, int right)
207-
{
208-
int i = left;
209-
int j = right;
210-
float p = faceobjects[(left + right) / 2].prob;
211-
212-
while (i <= j)
213-
{
214-
while (faceobjects[i].prob > p)
215-
++i;
216-
217-
while (faceobjects[j].prob < p)
218-
--j;
219-
220-
if (i <= j)
221-
{
222-
std::swap(faceobjects[i], faceobjects[j]);
223-
224-
++i;
225-
--j;
226-
}
227-
}
228-
if (left < j)
229-
qsort_descent_inplace(faceobjects, left, j);
230-
if (i < right)
231-
qsort_descent_inplace(faceobjects, i, right);
232-
}
233-
234-
void qsort_descent_inplace(std::vector<Object> &objects)
235-
{
236-
if (objects.empty())
237-
return;
238-
239-
qsort_descent_inplace(objects, 0, objects.size() - 1);
240-
}
241-
242174
void nms_sorted_bboxes(const std::vector<Object> &faceobjects, std::vector<int> &picked, const float nms_threshold)
243175
{
244176
picked.clear();
@@ -282,36 +214,207 @@ namespace yolox_cpp
282214
std::vector<Object> proposals;
283215
generate_yolox_proposals(grid_strides, prob, bbox_conf_thresh, proposals);
284216

285-
qsort_descent_inplace(proposals);
217+
std::sort(
218+
proposals.begin(), proposals.end(),
219+
[](const Object& a, const Object& b) {
220+
return a.prob > b.prob; // descent
221+
}
222+
);
286223

287224
std::vector<int> picked;
288225
nms_sorted_bboxes(proposals, picked, nms_thresh_);
289226

290227
int count = picked.size();
291228
objects.resize(count);
229+
const float max_x = static_cast<float>(img_w - 1);
230+
const float max_y = static_cast<float>(img_h - 1);
292231

293232
for (int i = 0; i < count; ++i)
294233
{
295234
objects[i] = proposals[picked[i]];
296235

297236
// adjust offset to original unpadded
298-
float x0 = (objects[i].rect.x) / scale;
299-
float y0 = (objects[i].rect.y) / scale;
300-
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
301-
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
237+
float x0 = static_cast<float>(objects[i].rect.x) / scale;
238+
float y0 = static_cast<float>(objects[i].rect.y) / scale;
239+
float x1 = static_cast<float>(objects[i].rect.x + objects[i].rect.width) / scale;
240+
float y1 = static_cast<float>(objects[i].rect.y + objects[i].rect.height) / scale;
302241

303242
// clip
304-
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
305-
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
306-
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
307-
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
243+
x0 = std::max(std::min(x0, max_x), 0.f);
244+
y0 = std::max(std::min(y0, max_y), 0.f);
245+
x1 = std::max(std::min(x1, max_x), 0.f);
246+
y1 = std::max(std::min(y1, max_y), 0.f);
308247

309248
objects[i].rect.x = x0;
310249
objects[i].rect.y = y0;
311250
objects[i].rect.width = x1 - x0;
312251
objects[i].rect.height = y1 - y0;
313252
}
314253
}
254+
255+
private:
256+
#if defined(CV_SIMD128) && CV_SIMD128 == 1
257+
// Fills a planar (CHW) float blob from an interleaved 3-channel image,
// processing four pixels per iteration with OpenCV universal intrinsics.
//
// When model_version_ is "0.1.0" every value is normalized as
//     blob = (pixel / 255 - mean) / std
// which is evaluated as pixel * std255_inv_[c] + mean_std_[c]
// (mean_std_ already holds -mean / std, so it must be ADDED, not negated).
// For any other model version the pixel values are stored as floats unchanged.
//
// img:       source image, read as rows * cols pixels of 3 interleaved channels.
//            NOTE(review): assumes 8-bit data (CV_8UC3) like the scalar path - confirm.
// blob_data: destination buffer; 3 * rows * cols floats are written
//            (channel 0 plane, then channel 1 plane, then channel 2 plane).
void blobFromImage_simd(const cv::Mat &img, float *blob_data)
{
    const size_t channels = 3;
    const size_t img_h = img.rows;
    const size_t img_w = img.cols;
    const size_t img_hw = img_h * img_w;

    const size_t step = 4; // pixels consumed per SIMD iteration
    const size_t N = img_hw / step;

    float *blob_data_ch0 = blob_data;
    float *blob_data_ch1 = blob_data + img_hw;
    float *blob_data_ch2 = blob_data + img_hw * 2;

    // Convert once for both code paths. convertTo() allocates its output, so
    // the float copy is densely packed and can be walked linearly even when
    // `img` itself has padded rows.
    cv::Mat img_f32;
    img.convertTo(img_f32, CV_32FC3);
    const float *src = reinterpret_cast<const float *>(img_f32.data);

    const bool normalize = (this->model_version_ == "0.1.0");

    if (normalize)
    {
        const cv::v_float32x4 mean_std0 = cv::v_setall_f32(this->mean_std_[0]);
        const cv::v_float32x4 mean_std1 = cv::v_setall_f32(this->mean_std_[1]);
        const cv::v_float32x4 mean_std2 = cv::v_setall_f32(this->mean_std_[2]);
        const cv::v_float32x4 std255_inv_0 = cv::v_setall_f32(this->std255_inv_[0]);
        const cv::v_float32x4 std255_inv_1 = cv::v_setall_f32(this->std255_inv_[1]);
        const cv::v_float32x4 std255_inv_2 = cv::v_setall_f32(this->std255_inv_[2]);

        for (size_t i = 0; i < N; ++i)
        {
            cv::v_float32x4 ch0_f;
            cv::v_float32x4 ch1_f;
            cv::v_float32x4 ch2_f;
            // load 4 pixels x 3 channels and split them per channel
            cv::v_load_deinterleave(src + i * (step * channels), ch0_f, ch1_f, ch2_f);

            ch0_f = ch0_f * std255_inv_0 + mean_std0;
            ch1_f = ch1_f * std255_inv_1 + mean_std1;
            ch2_f = ch2_f * std255_inv_2 + mean_std2;

            cv::v_store(blob_data_ch0 + i * step, ch0_f);
            cv::v_store(blob_data_ch1 + i * step, ch1_f);
            cv::v_store(blob_data_ch2 + i * step, ch2_f);
        }
    }
    else
    {
        for (size_t i = 0; i < N; ++i)
        {
            cv::v_float32x4 ch0_f;
            cv::v_float32x4 ch1_f;
            cv::v_float32x4 ch2_f;
            // load 4 pixels x 3 channels and split them per channel
            cv::v_load_deinterleave(src + i * (step * channels), ch0_f, ch1_f, ch2_f);

            cv::v_store(blob_data_ch0 + i * step, ch0_f);
            cv::v_store(blob_data_ch1 + i * step, ch1_f);
            cv::v_store(blob_data_ch2 + i * step, ch2_f);
        }
    }

    // Scalar tail: the (img_hw % 4) pixels the vector loop did not cover.
    // Reads the same packed float copy as the vector loop.
    for (size_t out_idx = N * step; out_idx < img_hw; ++out_idx)
    {
        // HWC -> CHW
        const float *px = src + out_idx * channels;
        if (normalize)
        {
            blob_data_ch0[out_idx] = px[0] * this->std255_inv_[0] + this->mean_std_[0];
            blob_data_ch1[out_idx] = px[1] * this->std255_inv_[1] + this->mean_std_[1];
            blob_data_ch2[out_idx] = px[2] * this->std255_inv_[2] + this->mean_std_[2];
        }
        else
        {
            blob_data_ch0[out_idx] = px[0];
            blob_data_ch1[out_idx] = px[1];
            blob_data_ch2[out_idx] = px[2];
        }
    }
}
355+
#endif
356+
// Scalar conversion of an interleaved 3-channel 8-bit image into a planar
// (CHW) float blob.
//
// When model_version_ is "0.1.0" every value is normalized as
//     blob = (pixel / 255 - mean) / std
// evaluated as pixel * std255_inv_[c] + mean_std_[c]; otherwise the pixel
// values are stored as floats unchanged.
//
// Rows are fetched through cv::Mat::ptr() so that images whose rows are padded
// or that are views into a larger matrix are read correctly; for a densely
// packed image this visits exactly the same bytes as linear indexing.
//
// img:       source image, read as rows * cols pixels of 3 unsigned bytes each.
// blob_data: destination buffer; 3 * rows * cols floats are written
//            (channel 0 plane, then channel 1 plane, then channel 2 plane).
void blobFromImage_cpu(const cv::Mat &img, float *blob_data)
{
    const size_t channels = 3;
    const size_t img_h = img.rows;
    const size_t img_w = img.cols;
    const size_t img_hw = img_h * img_w;
    float *blob_data_ch0 = blob_data;
    float *blob_data_ch1 = blob_data + img_hw;
    float *blob_data_ch2 = blob_data + img_hw * 2;

    const bool normalize = (this->model_version_ == "0.1.0");

    // HWC -> CHW
    for (size_t h = 0; h < img_h; ++h)
    {
        const unsigned char *row = img.ptr<unsigned char>(static_cast<int>(h));
        const size_t row_offset = h * img_w;
        if (normalize)
        {
            for (size_t w = 0; w < img_w; ++w)
            {
                const size_t src_idx = w * channels;
                const size_t dst_idx = row_offset + w;
                blob_data_ch0[dst_idx] = static_cast<float>(row[src_idx + 0]) * this->std255_inv_[0] + this->mean_std_[0];
                blob_data_ch1[dst_idx] = static_cast<float>(row[src_idx + 1]) * this->std255_inv_[1] + this->mean_std_[1];
                blob_data_ch2[dst_idx] = static_cast<float>(row[src_idx + 2]) * this->std255_inv_[2] + this->mean_std_[2];
            }
        }
        else
        {
            for (size_t w = 0; w < img_w; ++w)
            {
                const size_t src_idx = w * channels;
                const size_t dst_idx = row_offset + w;
                blob_data_ch0[dst_idx] = static_cast<float>(row[src_idx + 0]);
                blob_data_ch1[dst_idx] = static_cast<float>(row[src_idx + 1]);
                blob_data_ch2[dst_idx] = static_cast<float>(row[src_idx + 2]);
            }
        }
    }
}
390+
// Scalar conversion of an interleaved 3-channel 8-bit image into an
// interleaved (HWC) float blob; the channel order of the source is kept.
//
// When model_version_ is "0.1.0" every value is normalized as
//     blob = (pixel / 255 - mean) / std
// evaluated as pixel * std255_inv_[c] + mean_std_[c]; otherwise
// (0.1.1rc0 or later) the pixel values are stored as floats unchanged.
//
// Rows are fetched through cv::Mat::ptr() so that images whose rows are padded
// or that are views into a larger matrix are read correctly; for a densely
// packed image this visits exactly the same bytes as linear indexing.
//
// img:       source image, read as rows * cols pixels of 3 unsigned bytes each.
// blob_data: destination buffer; rows * cols * 3 floats are written.
void blobFromImage_nhwc_cpu(const cv::Mat &img, float *blob_data)
{
    const size_t channels = 3;
    const size_t img_h = img.rows;
    const size_t img_w = img.cols;
    const size_t row_len = img_w * channels; // values per image row

    const bool normalize = (this->model_version_ == "0.1.0");

    for (size_t h = 0; h < img_h; ++h)
    {
        const unsigned char *src_row = img.ptr<unsigned char>(static_cast<int>(h));
        float *dst_row = blob_data + h * row_len;
        if (normalize)
        {
            for (size_t w = 0; w < img_w; ++w)
            {
                for (size_t c = 0; c < channels; ++c)
                {
                    const size_t idx = w * channels + c;
                    dst_row[idx] =
                        static_cast<float>(src_row[idx]) * this->std255_inv_[c] + this->mean_std_[c];
                }
            }
        }
        else
        {
            for (size_t idx = 0; idx < row_len; ++idx)
            {
                dst_row[idx] = static_cast<float>(src_row[idx]);
            }
        }
    }
}
315418
};
316419
}
317420
#endif

0 commit comments

Comments
 (0)