#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append('.')
import os
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import torchvision
from PIL import Image

max_batch_size = 1
onnx_model_path = "/data/disk1/workspace/06_reid/01_fast_reid/02_fast_reid_inference/fastreid.onnx"
TRT_LOGGER = trt.Logger()


def get_img_np_nchw(filename):
    """Read an image and convert it to a normalized NCHW float array."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (256, 128))  # cv2.resize takes (width, height)
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_np = np.array(image_cv, dtype=np.float32) / 255.
    img_np = img_np.transpose((2, 0, 1))  # HWC -> CHW
    img_np -= miu
    img_np /= std
    img_np_nchw = img_np[np.newaxis]
    img_np_nchw = np.tile(img_np_nchw, (max_batch_size, 1, 1, 1))
    return img_np_nchw


class HostDeviceMem(object):
    """Pairs a page-locked host (CPU) buffer with its device (GPU) buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # cpu memory
        self.device = device_mem  # gpu memory

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate host/device buffers and a CUDA stream for every engine binding."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, save_engine=True):
    """Load a serialized engine if it exists, otherwise build one from the ONNX file."""
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize the engine
    else:
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that
        # your network definition must be created with the explicitBatch flag set.
        # For more information, see Working With Dynamic Shapes.
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            builder.max_batch_size = max_batch_size  # largest batch size usable at execution time
            builder.fp16_mode = fp16_mode
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('Loading ONNX file from path {} ...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
                print("Beginning ONNX file parsing")
                parser.parse(model.read())
            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            engine = builder.build_engine(network, config)
            print("Completed creating engine")
            if save_engine:
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize the engine
            return engine


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU to the CPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


if __name__ == '__main__':
    img_np_nchw = get_img_np_nchw("/data/disk1/project/data/01_reid/0_1.png").astype(np.float32)
    fp16_mode = True
    trt_engine_path = "./human_feature{}.trt".format(fp16_mode)

    engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    shape_of_output = (max_batch_size, 2048)
    inputs[0].host = img_np_nchw

    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs,
                               outputs=outputs, stream=stream, batch_size=max_batch_size)
    t2 = time.time()

    print(trt_outputs, trt_outputs[0].shape)
    feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
    print('TensorRT ok')
    print("Inference time with the TensorRT engine: {}".format(t2 - t1))