#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

sys.path.append('.')

max_batch_size = 1
onnx_model_path = "/data/disk1/workspace/06_reid/01_fast_reid/02_fast_reid_inference/fastreid.onnx"
TRT_LOGGER = trt.Logger()
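
# Overall flow: preprocess the query image into an NCHW float32 array, build (or
# deserialize a cached) TensorRT engine from the ONNX model, copy the input to the
# GPU, run inference asynchronously, and copy the feature vector back to the host.
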
def get_img_np_nchw(filename):
    """Read an image and convert it to a normalized NCHW float32 batch."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # cv2.resize takes (width, height), so this produces a 256x128 (WxH) image.
    image_cv = cv2.resize(image_cv, (256, 128))
    # ImageNet mean/std, broadcast over the (C, H, W) layout.
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_np = np.array(image_cv, dtype=np.float32) / 255.
    img_np = img_np.transpose((2, 0, 1))  # HWC -> CHW
    img_np -= miu
    img_np /= std
    img_np_nchw = img_np[np.newaxis]
    img_np_nchw = np.tile(img_np_nchw, (max_batch_size, 1, 1, 1))
    return img_np_nchw


class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its corresponding device buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # pinned CPU memory
        self.device = device_mem  # GPU memory

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate pinned host buffers and device buffers for every engine binding."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


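# get_engine() caches the serialized engine on disk: when engine_file_path already
# exists it is deserialized directly; otherwise the ONNX model is parsed, an engine
# is built, and (if save_engine is True) written out for reuse on the next run.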
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, save_engine=True):
    """Deserialize a cached TensorRT engine, or build one from the ONNX model."""
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize the engine

    # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning
    # the network definition must be created with the EXPLICIT_BATCH flag set.
    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:

        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
        if fp16_mode:
            config.set_flag(trt.BuilderFlag.FP16)

        if not os.path.exists(onnx_file_path):
            quit("ONNX file {} not found!".format(onnx_file_path))
        print('Loading ONNX file from path {} ...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
            print("Beginning ONNX file parsing")
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError("Failed to parse the ONNX file")

        print("Completed parsing of ONNX file")
        print("Building an engine from file {}; this may take a while...".format(onnx_file_path))

        print(network.get_layer(network.num_layers - 1).get_output(0).shape)
        engine = builder.build_engine(network, config)
        print("Completed creating engine")
        if save_engine:
            with open(engine_file_path, 'wb') as f:
                f.write(engine.serialize())  # serialize the engine to disk
        return engine


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the host to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference (explicit-batch engines ignore batch_size here).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU to the host.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


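# End-to-end example: preprocess one image, build or load the engine, run a single
# inference, and reshape the flat output into a (batch, 2048) feature matrix.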
if __name__ == '__main__':
    img_np_nchw = get_img_np_nchw("/data/disk1/project/data/01_reid/0_1.png").astype(np.float32)
    fp16_mode = True
    trt_engine_path = "./human_feature{}.trt".format(fp16_mode)

    engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)

    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    shape_of_output = (max_batch_size, 2048)

    # Overwrite the pinned input buffer with the preprocessed image.
    inputs[0].host = img_np_nchw

    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=max_batch_size)
    t2 = time.time()
    print(trt_outputs, trt_outputs[0].shape)

    feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
    print('TensorRT ok')
    print("Inference time with the TensorRT engine: {}".format(t2 - t1))
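
    # Averaging several runs gives a more representative latency figure, since the
    # single call above also pays one-off CUDA / TensorRT warm-up costs.
    n_runs = 100  # illustrative run count
    t1 = time.time()
    for _ in range(n_runs):
        do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    print("Average inference time over {} runs: {:.6f}s".format(n_runs, (time.time() - t1) / n_runs))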