Created
October 10, 2021 03:27
-
-
Save HoangTienDuc/b15c3f45a560f8613a4f495fbe727c4b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycuda.driver as cuda | |
import pycuda.autoinit | |
import numpy as np | |
from decoder import CifCafDecoder | |
import tensorrt as trt | |
import cv2 | |
import openpifpaf | |
import torch | |
TRT_LOGGER = trt.Logger() | |
# Simple helper data class that's a little nicer to use than a 2-tuple. | |
class HostDeviceMem(object):
    """Pair a host-side buffer with its device-side counterpart.

    The two objects are stored exactly as given; this class performs no
    allocation or copying of its own.
    """

    def __init__(self, host_mem, device_mem):
        # Bind both halves of the pair in a single statement.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Multi-line rendering: the host buffer first, then the device handle.
        pieces = ("Host:\n", str(self.host), "\nDevice:\n", str(self.device))
        return "".join(pieces)

    def __repr__(self):
        # repr() deliberately mirrors str().
        return str(self)
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs. | |
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: Engine object that exposes ``max_batch_size``, iterates over
            its binding names, and provides ``get_binding_shape``,
            ``get_binding_dtype`` and ``binding_is_input``.

    Returns:
        An 8-tuple ``(inputs, outputs, bindings, stream, input_shapes,
        out_shapes, out_names, max_batch_size)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        integer device addresses in binding order, and the shape lists hold
        the shapes exactly as reported by the engine (before the -1 fix-up).
    """
    stream = cuda.Stream()
    max_batch_size = engine.max_batch_size

    inputs, outputs, bindings = [], [], []
    input_shapes, out_shapes, out_names = [], [], []

    for name in engine:
        dims = engine.get_binding_shape(name)
        # A leading -1 marks a dynamic first dimension; size the buffer as if
        # it were 1 and let the max_batch_size multiplier below scale it.
        # NOTE(review): only the first dimension is fixed up here — confirm
        # no other dimension of the engine is dynamic.
        if dims[0] == -1:
            dims = (1,) + dims[1:]
        element_count = trt.volume(dims) * max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # Page-locked host buffer plus a device allocation of the same size.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))

        pair = HostDeviceMem(host_buf, device_buf)
        if not engine.binding_is_input(name):
            outputs.append(pair)
            # Keep the engine-reported shape and the name of each output.
            out_shapes.append(engine.get_binding_shape(name))
            out_names.append(name)
        else:
            inputs.append(pair)
            input_shapes.append(engine.get_binding_shape(name))

    return (
        inputs,
        outputs,
        bindings,
        stream,
        input_shapes,
        out_shapes,
        out_names,
        max_batch_size,
    )
# This function is generalized for multiple inputs/outputs. | |
# inputs and outputs are expected to be lists of HostDeviceMem objects. | |
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one asynchronous inference pass and return the host outputs.

    Args:
        context: Execution context providing ``execute_async_v2``.
        bindings: Device addresses passed through to ``execute_async_v2``.
        inputs: Iterable of objects with ``host`` and ``device`` attributes;
            each host buffer is copied to its device buffer first.
        outputs: Iterable of objects with ``host`` and ``device`` attributes;
            each device buffer is copied back to its host buffer afterwards.
        stream: CUDA stream on which the copies and the execution are
            enqueued; it is synchronized before returning.

    Returns:
        List with the ``host`` buffer of every entry in ``outputs``.
    """
    # Plain loops: these calls are made for their side effects only, so there
    # is no reason to build throwaway lists of their return values.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait until everything enqueued on the stream has finished.
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]
class TrtModel(object):
    """Lazy wrapper around a serialized TensorRT engine file.

    The engine is deserialized and its buffers are allocated by ``build()``,
    which ``run()`` calls automatically on first use.
    """

    # Shapes applied to the first two outputs when ``run(deflatten=True)``.
    # NOTE(review): these are hard-coded rather than taken from
    # ``self.out_shapes``; presumably they match the specific engine this
    # file loads — confirm before using another model.
    _DEFLATTEN_SHAPES = ((17, 5, 47, 81), (19, 9, 47, 81))

    def __init__(self, model):
        """Remember the engine path and create a CUDA context on device 0.

        Args:
            model: Path to the serialized engine file.
        """
        self.engine_file = model
        self.engine = None
        self.inputs = None
        self.outputs = None
        self.bindings = None
        self.stream = None
        self.context = None
        self.input_shapes = None
        self.out_shapes = None
        # Filled in by build(); initialised here so the attribute always exists.
        self.out_names = None
        self.max_batch_size = 1
        self.cuda_ctx = cuda.Device(0).make_context()
        # NOTE(review): per the PyCUDA docs make_context() already makes the
        # new context current, so this push is an additional stack entry.
        # build() pops one entry; callers are expected to pop the remaining
        # one themselves when they are done with the model.
        if self.cuda_ctx:
            self.cuda_ctx.push()

    def build(self):
        """Deserialize the engine, allocate buffers and create the context."""
        with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        (self.inputs, self.outputs, self.bindings, self.stream,
         self.input_shapes, self.out_shapes, self.out_names,
         self.max_batch_size) = allocate_buffers(self.engine)

        self.context = self.engine.create_execution_context()
        self.context.active_optimization_profile = 0
        # Balances the push performed in __init__.
        if self.cuda_ctx:
            self.cuda_ctx.pop()

    def run(self, input, deflatten: bool = True, as_dict=False):
        """Run inference on ``input`` and return the engine outputs.

        Args:
            input: Array-like batch; it is converted with ``np.asarray``,
                flattened in C order and cast to float32 before being copied
                into the first input buffer.
            deflatten: When true, reshape the flat outputs with the
                hard-coded ``_DEFLATTEN_SHAPES`` and wrap them as torch
                tensors; otherwise return the flat host buffers.
            as_dict: When true, return ``{output_name: output}`` instead of
                a list.

        Returns:
            List (or dict, see ``as_dict``) of outputs.

        Raises:
            ValueError: If ``input`` has more elements than the allocated
                input buffer can hold.
        """
        # Lazy load implementation.
        if self.engine is None:
            self.build()

        if self.cuda_ctx:
            self.cuda_ctx.push()
        try:
            input = np.asarray(input)
            flat = input.flatten(order='C').astype(np.float32)
            host_buffer = self.inputs[0].host
            if flat.size > host_buffer.size:
                raise ValueError(
                    f"input has {flat.size} elements but the allocated input "
                    f"buffer only holds {host_buffer.size}"
                )
            host_buffer[:flat.size] = flat
            self.context.set_binding_shape(0, input.shape)
            trt_outputs = do_inference(
                self.context, bindings=self.bindings,
                inputs=self.inputs, outputs=self.outputs, stream=self.stream)
        finally:
            # Always restore the context stack, even when inference fails.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

        # Reshape TRT outputs instead of returning flattened arrays.
        if deflatten:
            trt_outputs = [
                torch.from_numpy(output.reshape(shape))
                for output, shape in zip(trt_outputs, self._DEFLATTEN_SHAPES)
            ]
        if as_dict:
            return {name: trt_outputs[i] for i, name in enumerate(self.out_names)}
        return trt_outputs
# Demo script: run the OpenPifPaf TensorRT engine on one image and write the
# decoded keypoints and skeleton lines to "result.jpg".
import random

# ``import PIL`` alone does not load the Image submodule; import it explicitly.
import PIL.Image

engine = TrtModel("/data/data/tensorrt/openpifpaf_resnet50_641_369_d16.trt")
try:
    engine.build()

    image = cv2.imread("/data/warmup.jpg")
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError("could not read image: /data/warmup.jpg")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img_normalized = cv2.normalize(image_rgb, None, 0, 255, cv2.NORM_MINMAX)
    img_normalized = cv2.resize(img_normalized, (641, 369))

    # Let openpifpaf's own dataset/collate code turn the image into a batch.
    pil_im = PIL.Image.fromarray(img_normalized)
    preprocess = None
    data = openpifpaf.datasets.PilImageList([pil_im], preprocess=preprocess)
    loader = torch.utils.data.DataLoader(
        data, batch_size=1, shuffle=False, pin_memory=True,
        collate_fn=openpifpaf.datasets.collate_images_anns_meta)
    # The dataset holds exactly one image, so this loop runs once.
    for images_batch, _, __ in loader:
        np_img = images_batch.numpy()

    trt_outputs = engine.run(np_img)
    decoder = CifCafDecoder()
    predictions = decoder.decode(trt_outputs)

    img_vis = cv2.resize(image, (641, 369))
    for pred_object in predictions:
        pred = pred_object.data
        # Keep only keypoints whose third column (index 2) is positive.
        pred_visible = pred[pred[:, 2] > 0]
        xs = pred_visible[:, 0]
        ys = pred_visible[:, 1]
        # Skip predictions with no visible keypoints (min() of an empty
        # sequence raises ValueError) or with negative coordinates.
        if len(xs) == 0 or min(xs) < 0 or min(ys) < 0:
            continue
        color = (random.randint(60, 200), random.randint(0, 255), random.randint(0, 255))
        for x, y in zip(xs, ys):
            cv2.circle(img_vis, (int(x), int(y)), 2, color, -1)
        # Draw a line for every decoded connection that is part of the
        # skeleton; the skeleton indices are offset by one from (a, b).
        for a, b, start, end in pred_object.decoding_order:
            if (a + 1, b + 1) in pred_object.skeleton or (b + 1, a + 1) in pred_object.skeleton:
                x1, y1, _ = start
                x2, y2, _ = end
                # Cast to int, matching the circle drawing above.
                cv2.line(img_vis, (int(x1), int(y1)), (int(x2), int(y2)), color, 1)
    cv2.imwrite("result.jpg", img_vis)
finally:
    # Pop the context entry left on the stack by TrtModel.__init__, even when
    # something above failed.
    engine.cuda_ctx.pop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment