#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

sys.path.append('.')

max_batch_size = 1
onnx_model_path = "/data/disk1/workspace/06_reid/01_fast_reid/02_fast_reid_inference/fastreid.onnx"
TRT_LOGGER = trt.Logger()
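
# Overall flow: preprocess the query image into an NCHW float32 array, build (or
# deserialize a cached) TensorRT engine from the ONNX model, copy the input to the
# GPU, run inference asynchronously, and copy the feature vector back to the host.
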
def get_img_np_nchw(filename):
    """Read an image and convert it to a normalized NCHW float32 batch."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # cv2.resize takes (width, height), so this produces a 256x128 (WxH) image.
    image_cv = cv2.resize(image_cv, (256, 128))
    # ImageNet mean/std, broadcast over the (C, H, W) layout.
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_np = np.array(image_cv, dtype=np.float32) / 255.
    img_np = img_np.transpose((2, 0, 1))  # HWC -> CHW
    img_np -= miu
    img_np /= std
    img_np_nchw = img_np[np.newaxis]
    img_np_nchw = np.tile(img_np_nchw, (max_batch_size, 1, 1, 1))
    return img_np_nchw


class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its corresponding device buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # pinned CPU memory
        self.device = device_mem  # GPU memory

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate pinned host buffers and device buffers for every engine binding."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


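# get_engine() caches the serialized engine on disk: when engine_file_path already
# exists it is deserialized directly; otherwise the ONNX model is parsed, an engine
# is built, and (if save_engine is True) written out for reuse on the next run.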
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, save_engine=True):
    """Deserialize a cached TensorRT engine, or build one from the ONNX model."""
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize the engine

    # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning
    # the network definition must be created with the EXPLICIT_BATCH flag set.
    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:

        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
        if fp16_mode:
            config.set_flag(trt.BuilderFlag.FP16)

        if not os.path.exists(onnx_file_path):
            quit("ONNX file {} not found!".format(onnx_file_path))
        print('Loading ONNX file from path {} ...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
            print("Beginning ONNX file parsing")
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError("Failed to parse the ONNX file")

        print("Completed parsing of ONNX file")
        print("Building an engine from file {}; this may take a while...".format(onnx_file_path))

        print(network.get_layer(network.num_layers - 1).get_output(0).shape)
        engine = builder.build_engine(network, config)
        print("Completed creating engine")
        if save_engine:
            with open(engine_file_path, 'wb') as f:
                f.write(engine.serialize())  # serialize the engine to disk
        return engine


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the host to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference (explicit-batch engines ignore batch_size here).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU to the host.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


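# End-to-end example: preprocess one image, build or load the engine, run a single
# inference, and reshape the flat output into a (batch, 2048) feature matrix.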
if __name__ == '__main__':
    img_np_nchw = get_img_np_nchw("/data/disk1/project/data/01_reid/0_1.png").astype(np.float32)
    fp16_mode = True
    trt_engine_path = "./human_feature{}.trt".format(fp16_mode)

    engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)

    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    shape_of_output = (max_batch_size, 2048)

    # Overwrite the pinned input buffer with the preprocessed image.
    inputs[0].host = img_np_nchw

    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=max_batch_size)
    t2 = time.time()
    print(trt_outputs, trt_outputs[0].shape)

    feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
    print('TensorRT ok')
    print("Inference time with the TensorRT engine: {}".format(t2 - t1))
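
    # Averaging several runs gives a more representative latency figure, since the
    # single call above also pays one-off CUDA / TensorRT warm-up costs.
    n_runs = 100  # illustrative run count
    t1 = time.time()
    for _ in range(n_runs):
        do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    print("Average inference time over {} runs: {:.6f}s".format(n_runs, (time.time() - t1) / n_runs))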