#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append('.')
import os
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import torchvision
from PIL import Image

max_batch_size = 1
onnx_model_path = "/data/disk1/workspace/06_reid/01_fast_reid/02_fast_reid_inference/fastreid.onnx"
TRT_LOGGER = trt.Logger()


def get_img_np_nchw(filename):
    """Read an image and convert it to a normalized NCHW float array."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (256, 128))  # cv2.resize takes (width, height)
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_np = np.array(image_cv, dtype=np.float32) / 255.
    img_np = img_np.transpose((2, 0, 1))  # HWC -> CHW
    img_np -= miu
    img_np /= std
    img_np_nchw = img_np[np.newaxis]
    img_np_nchw = np.tile(img_np_nchw, (max_batch_size, 1, 1, 1))
    return img_np_nchw


class HostDeviceMem(object):
    """Pairs a page-locked host (CPU) buffer with its device (GPU) buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # cpu memory
        self.device = device_mem  # gpu memory

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate host/device buffers and a CUDA stream for every engine binding."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, save_engine=True):
    """Load a serialized engine if it exists, otherwise build one from the ONNX file."""
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize the engine
    else:
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that
        # your network definition must be created with the explicitBatch flag set.
        # For more information, see Working With Dynamic Shapes.
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            builder.max_batch_size = max_batch_size  # largest batch size usable at execution time
            builder.fp16_mode = fp16_mode
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('Loading ONNX file from path {} ...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
                print("Beginning ONNX file parsing")
                parser.parse(model.read())
            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            engine = builder.build_engine(network, config)
            print("Completed creating engine")
            if save_engine:
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize the engine
            return engine


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU to the CPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


if __name__ == '__main__':
    img_np_nchw = get_img_np_nchw("/data/disk1/project/data/01_reid/0_1.png").astype(np.float32)
    fp16_mode = True
    trt_engine_path = "./human_feature{}.trt".format(fp16_mode)

    engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    shape_of_output = (max_batch_size, 2048)
    inputs[0].host = img_np_nchw

    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs,
                               outputs=outputs, stream=stream, batch_size=max_batch_size)
    t2 = time.time()

    print(trt_outputs, trt_outputs[0].shape)
    feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
    print('TensorRT ok')
    print("Inference time with the TensorRT engine: {}".format(t2 - t1))