@chuanqi129 · Created June 21, 2023 01:35
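This gist patches Ultralytics YOLOv5 for CPU inference benchmarking: detect.py gains flags for precision (float32/bfloat16), channels-last (NHWC) memory format, Intel Extension for PyTorch (IPEX) optimization, TorchScript compilation, warmup/iteration control, torch.profiler tracing, and a latency/throughput summary with P50/P90/P99 percentiles. A typical invocation might look like the following (the weights and source paths are placeholders, not part of the patch):

    python detect.py --weights yolov5s.pt --source data/images --precision bfloat16 --channels_last 1 --num_iter 100 --num_warmup 10 --ipex --jit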
diff --git a/detect.py b/detect.py
index 8feb07d..d65f3f3 100644
--- a/detect.py
+++ b/detect.py
@@ -74,6 +74,13 @@ def run(
         hide_conf=False,  # hide confidences
         half=False,  # use FP16 half-precision inference
         dnn=False,  # use OpenCV DNN for ONNX inference
+        precision="bfloat16",  # benchmark precision: float32 or bfloat16
+        channels_last=1,  # use channels-last (NHWC) memory format
+        num_iter=0,  # benchmark iterations (0 = run the whole dataset)
+        num_warmup=0,  # warmup iterations excluded from timing
+        profile=False,  # collect a torch.profiler trace
+        ipex=False,  # optimize the model with intel_extension_for_pytorch
+        jit=False,  # compile the model with torch.jit.script
 ):
     source = str(source)
     save_img = not nosave and not source.endswith('.txt')  # save inference images
@@ -90,6 +97,30 @@ def run(
     # Load model
     device = select_device(device)
     model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
+    if channels_last:
+        try:
+            model = model.to(memory_format=torch.channels_last)
+            print("---- Use NHWC model")
+        except Exception:
+            print("---- Use normal model")
+    if ipex:
+        model.eval()
+        import intel_extension_for_pytorch as ipex
+        if precision == "bfloat16":
+            model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
+        else:
+            model = ipex.optimize(model, dtype=torch.float32, inplace=True)
+        print("Running IPEX ...")
+    if jit:
+        try:
+            # scripting can fail for some backends; fall back to eager mode below
+            model = torch.jit.script(model)
+            print("---- With JIT enabled.")
+            if ipex:
+                model = torch.jit.freeze(model)
+        except Exception:
+            print("---- With JIT disabled.")
+
     stride, names, pt = model.stride, model.names, model.pt
     imgsz = check_img_size(imgsz, s=stride)  # check image size
 
@@ -107,7 +138,11 @@ def run(
     # Run inference
     model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0
+    iter_count = 0
+    batch_time_list = []
     for path, im, im0s, vid_cap, s in dataset:
+        if iter_count > num_iter and num_iter > 0:  # stop early when an iteration cap is set
+            break
         t1 = time_sync()
         im = torch.from_numpy(im).to(device)
         im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
@@ -115,20 +150,47 @@ def run(
         if len(im.shape) == 3:
             im = im[None]  # expand for batch dim
         t2 = time_sync()
-        dt[0] += t2 - t1
+        if iter_count >= num_warmup:  # exclude warmup iterations from timing
+            dt[0] += t2 - t1
 
         # Inference
         visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
-        pred = model(im, augment=augment, visualize=visualize)
+        if profile:
+            with torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CPU],
+                record_shapes=True,
+            ) as p:
+                pred = model(im, augment=augment, visualize=visualize)
+            profile_iter = len(dataset) - 1 if num_iter <= 0 or num_iter >= len(dataset) else num_iter
+            if iter_count == profile_iter:  # dump the trace on the last profiled iteration
+                output = p.key_averages().table(sort_by="self_cpu_time_total")
+                print(output)
+                import pathlib
+                timeline_dir = str(pathlib.Path.cwd()) + '/timeline/'
+                if not os.path.exists(timeline_dir):
+                    try:
+                        os.makedirs(timeline_dir)
+                    except Exception:
+                        pass
+                timeline_file = timeline_dir + 'timeline-' + str(torch.backends.quantized.engine) + '-' + \
+                    'yolo5-' + str(iter_count) + '-' + str(os.getpid()) + '.json'
+                p.export_chrome_trace(timeline_file)
+        else:
+            pred = model(im, augment=augment, visualize=visualize)
         t3 = time_sync()
-        dt[1] += t3 - t2
+        print("Iteration: {}, inference time: {} sec.".format(iter_count, t3 - t2), flush=True)
+        if iter_count >= num_warmup:
+            dt[1] += t3 - t2
+            batch_time_list.append((t3 - t2) * 1000)
 
         # NMS
         pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
-        dt[2] += time_sync() - t3
+        if iter_count >= num_warmup:
+            dt[2] += time_sync() - t3
 
         # Second-stage classifier (optional)
         # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
+        iter_count += 1
 
         # Process predictions
         for i, det in enumerate(pred):  # per image
@@ -199,6 +261,19 @@ def run(
         LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')
 
     # Print results
+    print("\n", "-" * 20, "Summary", "-" * 20)
+    latency = dt[1] / (iter_count - num_warmup) * 1000  # only post-warmup iterations were timed
+    throughput = (iter_count - num_warmup) / dt[1]
+    print("Latency:\t {:.3f} ms".format(latency))
+    print("Throughput:\t {:.2f} samples/s".format(throughput))
+    # P50/P90/P99 over per-iteration inference times (ms)
+    batch_time_list.sort()
+    p50_latency = batch_time_list[int(len(batch_time_list) * 0.50) - 1]
+    p90_latency = batch_time_list[int(len(batch_time_list) * 0.90) - 1]
+    p99_latency = batch_time_list[int(len(batch_time_list) * 0.99) - 1]
+    print('Latency P50:\t %.3f ms\nLatency P90:\t %.3f ms\nLatency P99:\t %.3f ms\n'
+          % (p50_latency, p90_latency, p99_latency))
+
     t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
     LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
     if save_txt or save_img:
@@ -236,6 +311,13 @@ def parse_opt():
     parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
     parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
     parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
+    parser.add_argument('--precision', type=str, default="bfloat16", help='benchmark precision: float32 or bfloat16')
+    parser.add_argument('--channels_last', type=int, default=1, help='use channels-last (NHWC) memory format')
+    parser.add_argument('--num_iter', type=int, default=0, help='benchmark iterations (0 = run the whole dataset)')
+    parser.add_argument('--num_warmup', type=int, default=0, help='warmup iterations excluded from timing')
+    parser.add_argument('--profile', action='store_true', help='collect a torch.profiler trace')
+    parser.add_argument('--ipex', action='store_true', help='optimize the model with intel_extension_for_pytorch')
+    parser.add_argument('--jit', action='store_true', help='compile the model with torch.jit.script')
     opt = parser.parse_args()
     opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
     print_args(vars(opt))
@@ -244,7 +326,11 @@ def parse_opt():
 
 def main(opt):
     check_requirements(exclude=('tensorboard', 'thop'))
-    run(**vars(opt))
+    if opt.precision == "bfloat16":
+        with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+            run(**vars(opt))
+    else:
+        run(**vars(opt))
 
 
 if __name__ == "__main__":
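Outside the diff, the model-preparation path above follows the usual IPEX CPU inference recipe: channels-last layout, ipex.optimize for the target dtype, then TorchScript compilation and freezing under bfloat16 autocast. A minimal standalone sketch of the same pipeline, assuming intel_extension_for_pytorch is installed and using torchvision's resnet50 as a stand-in for the YOLOv5 model (the patch scripts the model with torch.jit.script; tracing is used here for simplicity):

    import torch
    import intel_extension_for_pytorch as ipex
    from torchvision.models import resnet50

    model = resnet50().eval()
    model = model.to(memory_format=torch.channels_last)  # NHWC layout
    model = ipex.optimize(model, dtype=torch.bfloat16)   # weight prepacking, operator fusion

    x = torch.randn(1, 3, 640, 640).to(memory_format=torch.channels_last)
    with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
        model = torch.jit.trace(model, x)  # trace under autocast, per the IPEX BF16 recipe
        model = torch.jit.freeze(model)
        y = model(x)  # the first calls warm up the frozen graph
    print(tuple(y.shape), y.dtype)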
diff --git a/requirements.txt b/requirements.txt
index 1937b93..7782196 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,8 +9,6 @@ Pillow>=7.1.2
 PyYAML>=5.3.1
 requests>=2.23.0
 scipy>=1.4.1  # Google Colab version
-torch>=1.7.0
-torchvision>=0.8.1
 tqdm>=4.41.0
 protobuf<=3.20.1  # https://github.com/ultralytics/yolov5/issues/8012
 
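The pinned torch/torchvision requirements are removed because IPEX must be paired with a matching PyTorch build, so both are expected to be installed together beforehand rather than resolved from this file. For example (versions illustrative only, not specified by the patch):

    pip install torch==1.13.1 torchvision==0.14.1
    pip install intel-extension-for-pytorch==1.13.100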
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b1b107e..5cfad3d 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -199,7 +199,7 @@ def fuse_conv_and_bn(conv, bn):
                           stride=conv.stride,
                           padding=conv.padding,
                           groups=conv.groups,
-                          bias=True).requires_grad_(False).to(conv.weight.device)
+                          bias=True).requires_grad_(True).to(conv.weight.device)
 
     # Prepare filters
     w_conv = conv.weight.clone().view(conv.out_channels, -1)

@@ -297,7 +297,7 @@ class ModelEMA:
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
-            p.requires_grad_(False)
+            p.requires_grad_(True)
 
     def update(self, model):
         # Update EMA parameters
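As a cross-check on the summary block in detect.py, the same percentile statistics can be computed with numpy instead of sorted-list indexing; a minimal sketch with made-up example timings:

    import numpy as np

    batch_time_list = [12.1, 11.8, 13.0, 12.4, 50.2]  # per-iteration inference times (ms), warmup excluded
    p50, p90, p99 = np.percentile(batch_time_list, [50, 90, 99])
    print('Latency P50:\t %.3f ms\nLatency P90:\t %.3f ms\nLatency P99:\t %.3f ms' % (p50, p90, p99))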