#!/usr/bin/env python
# -*- coding: utf-8 -*-
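"""Convert a fast-reid ONNX model to a TensorRT engine and run a single
feature-extraction inference with PyCUDA, timing the engine call."""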
import os
import sys
import time

import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401  (importing this initializes the CUDA context)
import pycuda.driver as cuda
import tensorrt as trt

sys.path.append('.')
 
max_batch_size = 1
onnx_model_path = "/data/disk1/workspace/06_reid/01_fast_reid/02_fast_reid_inference/fastreid.onnx"
TRT_LOGGER = trt.Logger()
 
def get_img_np_nchw(filename):
    """Load an image and preprocess it into an NCHW float array (ImageNet normalization)."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (256, 128))  # cv2.resize takes (width, height)
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_np = np.array(image_cv, dtype=np.float32) / 255.  # np.float is deprecated; use np.float32
    img_np = img_np.transpose((2, 0, 1))  # HWC -> CHW
    img_np -= miu
    img_np /= std
    img_np_nchw = img_np[np.newaxis]  # add the batch dimension
    img_np_nchw = np.tile(img_np_nchw, (max_batch_size, 1, 1, 1))
    return img_np_nchw
 
 
class HostDeviceMem(object):
    """Pair of pinned host (CPU) memory and its matching device (GPU) buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
 
def allocate_buffers(engine):
    """Allocate a pinned host buffer and a device buffer for every binding of the engine."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked (pinned) host memory enables asynchronous host<->device copies.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
 
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, save_engine=True):
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize the cached engine
    else:
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning the
        # network definition must be created with the EXPLICIT_BATCH flag set. For more
        # information, see "Working With Dynamic Shapes" in the TensorRT docs.

        with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network,  \
            trt.OnnxParser(network, TRT_LOGGER) as parser:

            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
            if fp16_mode:
                # With a builder config, FP16 is requested via a config flag
                # (builder.fp16_mode is deprecated and ignored by build_engine).
                config.set_flag(trt.BuilderFlag.FP16)

            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('Loading ONNX file from path {} ...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
                print("Beginning ONNX file parsing")
                if not parser.parse(model.read()):
                    for i in range(parser.num_errors):
                        print(parser.get_error(i))
                    quit("Failed to parse the ONNX file.")

            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))

            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            engine = builder.build_engine(network, config)
            print("Completed creating engine")
            if save_engine:
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize the engine to disk for reuse
            return engine
 
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Copy input data from host to device.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. execute_async_v2 ignores batch_size; with an explicit-batch
    # engine the batch dimension is baked into the binding shapes.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Copy results from device back to host.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Wait for all queued work on the stream to finish.
    stream.synchronize()
    return [out.host for out in outputs]
 
 
def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
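
# A minimal sketch (our assumption, not part of the original pipeline) of how the
# extracted 2048-d ReID feature could be compared between two person crops.
# The helper name `cosine_similarity` is hypothetical.
def cosine_similarity(feat_a, feat_b):
    # Normalize both vectors, then take the dot product; 1.0 means identical direction.
    feat_a = feat_a / np.linalg.norm(feat_a)
    feat_b = feat_b / np.linalg.norm(feat_b)
    return float(np.dot(feat_a, feat_b))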
 
if __name__ == '__main__':
    img_np_nchw = get_img_np_nchw("/data/disk1/project/data/01_reid/0_1.png").astype(np.float32)
    fp16_mode = True
    trt_engine_path = "./human_feature{}.trt".format(fp16_mode)

    engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)

    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    shape_of_output = (max_batch_size, 2048)

    # Feed the preprocessed image into the input host buffer.
    inputs[0].host = img_np_nchw

    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=max_batch_size)
    t2 = time.time()
    print(trt_outputs, trt_outputs[0].shape)

    feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
    print('TensorRT ok')
    print("Inference time with the TensorRT engine: {}".format(t2 - t1))