#include "yolox_cpp/yolox_tensorrt.hpp"

#include <algorithm>
#include <stdexcept>
#include <vector>
3- namespace yolox_cpp {
3+ namespace yolox_cpp
4+ {
45
56 YoloXTensorRT::YoloXTensorRT (file_name_t path_to_engine, int device,
67 float nms_th, float conf_th, std::string model_version,
78 int num_classes, bool p6)
8- : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
9- DEVICE_ (device)
9+ : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
10+ DEVICE_ (device)
1011 {
1112 cudaSetDevice (this ->DEVICE_ );
1213 // create a model using the API directly and serialize it to a stream
1314 char *trtModelStream{nullptr };
1415 size_t size{0 };
1516
1617 std::ifstream file (path_to_engine, std::ios::binary);
17- if (file.good ()) {
18+ if (file.good ())
19+ {
1820 file.seekg (0 , file.end );
1921 size = file.tellg ();
2022 file.seekg (0 , file.beg );
2123 trtModelStream = new char [size];
2224 assert (trtModelStream);
2325 file.read (trtModelStream, size);
2426 file.close ();
25- }else {
27+ }
28+ else
29+ {
2630 std::cerr << " invalid arguments path_to_engine: " << path_to_engine << std::endl;
2731 return ;
2832 }
@@ -35,28 +39,41 @@ namespace yolox_cpp{
3539 assert (this ->context_ != nullptr );
3640 delete[] trtModelStream;
3741
38- auto input_dims = this ->engine_ ->getBindingDimensions (0 );
42+ const auto input_name = this ->engine_ ->getIOTensorName (this ->inputIndex_ );
43+ const auto input_dims = this ->engine_ ->getTensorShape (input_name);
3944 this ->input_h_ = input_dims.d [2 ];
4045 this ->input_w_ = input_dims.d [3 ];
4146 std::cout << " INPUT_HEIGHT: " << this ->input_h_ << std::endl;
4247 std::cout << " INPUT_WIDTH: " << this ->input_w_ << std::endl;
4348
44- auto out_dims = this ->engine_ ->getBindingDimensions (1 );
49+ const auto output_name = this ->engine_ ->getIOTensorName (this ->outputIndex_ );
50+ auto output_dims = this ->engine_ ->getTensorShape (output_name);
4551 this ->output_size_ = 1 ;
46- for (int j=0 ; j<out_dims.nbDims ; ++j) {
47- this ->output_size_ *= out_dims.d [j];
52+ for (int j = 0 ; j < output_dims.nbDims ; ++j)
53+ {
54+ this ->output_size_ *= output_dims.d [j];
4855 }
4956
5057 // Pointers to input and output device buffers to pass to engine.
5158 // Engine requires exactly IEngine::getNbBindings() number of buffers.
52- assert (this ->engine_ ->getNbBindings () == 2 );
59+ assert (this ->engine_ ->getNbIOTensors () == 2 );
5360 // In order to bind the buffers, we need to know the names of the input and output tensors.
5461 // Note that indices are guaranteed to be less than IEngine::getNbBindings()
55- assert (this ->engine_ ->getBindingDataType (this ->inputIndex_ ) == nvinfer1::DataType::kFLOAT );
56- assert (this ->engine_ ->getBindingDataType (this ->outputIndex_ ) == nvinfer1::DataType::kFLOAT );
62+ assert (this ->engine_ ->getTensorDataType (input_name) == nvinfer1::DataType::kFLOAT );
63+ assert (this ->engine_ ->getTensorDataType (output_name) == nvinfer1::DataType::kFLOAT );
64+
65+ // Create GPU buffers on device
66+ CHECK (cudaMalloc (&this ->inference_buffers_ [this ->inputIndex_ ], 3 * this ->input_h_ * this ->input_w_ * sizeof (float )));
67+ CHECK (cudaMalloc (&this ->inference_buffers_ [this ->outputIndex_ ], this ->output_size_ * sizeof (float )));
68+
69+ assert (this ->context_ ->setInputShape (input_name, input_dims));
70+ assert (this ->context_ ->allInputDimensionsSpecified ());
71+
72+ assert (this ->context_ ->setTensorAddress (input_name, this ->inference_buffers_ [this ->inputIndex_ ]));
73+ assert (this ->context_ ->setTensorAddress (output_name, this ->inference_buffers_ [this ->outputIndex_ ]));
5774
5875 // Prepare GridAndStrides
59- if (this ->p6_ )
76+ if (this ->p6_ )
6077 {
6178 generate_grids_and_stride (this ->input_w_ , this ->input_h_ , this ->strides_p6_ , this ->grid_strides_ );
6279 }
@@ -66,18 +83,24 @@ namespace yolox_cpp{
6683 }
6784 }
6885
69- std::vector<Object> YoloXTensorRT::inference (const cv::Mat& frame)
86+ YoloXTensorRT::~YoloXTensorRT ()
87+ {
88+ CHECK (cudaFree (inference_buffers_[this ->inputIndex_ ]));
89+ CHECK (cudaFree (inference_buffers_[this ->outputIndex_ ]));
90+ }
91+
92+ std::vector<Object> YoloXTensorRT::inference (const cv::Mat &frame)
7093 {
7194 // preprocess
7295 auto pr_img = static_resize (frame);
73- float * input_blob = new float [pr_img.total ()* 3 ];
96+ float * input_blob = new float [pr_img.total () * 3 ];
7497 blobFromImage (pr_img, input_blob);
7598
7699 // inference
77- float * output_blob = new float [this ->output_size_ ];
100+ float * output_blob = new float [this ->output_size_ ];
78101 this ->doInference (input_blob, output_blob);
79102
80- float scale = std::min (this ->input_w_ / (frame.cols * 1.0 ), this ->input_h_ / (frame.rows * 1.0 ));
103+ float scale = std::min (this ->input_w_ / (frame.cols * 1.0 ), this ->input_h_ / (frame.rows * 1.0 ));
81104
82105 std::vector<Object> objects;
83106 decode_outputs (output_blob, this ->grid_strides_ , objects, this ->bbox_conf_thresh_ , scale, frame.cols , frame.rows );
@@ -87,31 +110,25 @@ namespace yolox_cpp{
87110 return objects;
88111 }
89112
90- void YoloXTensorRT::doInference (float * input, float * output)
113+ void YoloXTensorRT::doInference (float * input, float * output)
91114 {
92- // Pointers to input and output device buffers to pass to engine.
93- // Engine requires exactly IEngine::getNbBindings() number of buffers.
94- void * buffers[2 ];
95-
96- // Create GPU buffers on device
97- CHECK (cudaMalloc (&buffers[this ->inputIndex_ ], 3 * this ->input_h_ * this ->input_w_ * sizeof (float )));
98- CHECK (cudaMalloc (&buffers[this ->outputIndex_ ], this ->output_size_ * sizeof (float )));
99-
100115 // Create stream
101116 cudaStream_t stream;
102117 CHECK (cudaStreamCreate (&stream));
103118
104119 // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
105- CHECK (cudaMemcpyAsync (buffers[this ->inputIndex_ ], input, 3 * this ->input_h_ * this ->input_w_ * sizeof (float ), cudaMemcpyHostToDevice, stream));
106- context_->enqueueV2 (buffers, stream, nullptr );
107- CHECK (cudaMemcpyAsync (output, buffers[this ->outputIndex_ ], this ->output_size_ * sizeof (float ), cudaMemcpyDeviceToHost, stream));
108- cudaStreamSynchronize (stream);
109-
110- // Release stream and buffers
111- cudaStreamDestroy (stream);
112- CHECK (cudaFree (buffers[0 ]));
113- CHECK (cudaFree (buffers[1 ]));
120+ CHECK (cudaMemcpyAsync (this ->inference_buffers_ [this ->inputIndex_ ], input, 3 * this ->input_h_ * this ->input_w_ * sizeof (float ), cudaMemcpyHostToDevice, stream));
121+
122+ bool success = context_->enqueueV3 (stream);
123+ if (!success)
124+ throw std::runtime_error (" failed inference" );
125+
126+ CHECK (cudaMemcpyAsync (output, this ->inference_buffers_ [this ->outputIndex_ ], this ->output_size_ * sizeof (float ), cudaMemcpyDeviceToHost, stream));
127+
128+ CHECK (cudaStreamSynchronize (stream));
129+
130+ // Release stream
131+ CHECK (cudaStreamDestroy (stream));
114132 }
115133
116134} // namespace yolox_cpp
117-
0 commit comments