@chuanqi129 · Created June 21, 2023 01:35
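This gist patches Ultralytics YOLOv5 for CPU inference benchmarking: detect.py gains flags for precision (float32/bfloat16), channels-last (NHWC) memory format, Intel Extension for PyTorch (IPEX) optimization, TorchScript compilation, warmup/iteration control, torch.profiler tracing, and a latency/throughput summary with P50/P90/P99 percentiles. A typical invocation might look like the following (the weights and source paths are placeholders, not part of the patch):

    python detect.py --weights yolov5s.pt --source data/images --precision bfloat16 --channels_last 1 --num_iter 100 --num_warmup 10 --ipex --jit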
diff --git a/detect.py b/detect.py
index 8feb07d..d65f3f3 100644
--- a/detect.py
+++ b/detect.py
@@ -74,6 +74,13 @@ def run(
         hide_conf=False,  # hide confidences
         half=False,  # use FP16 half-precision inference
         dnn=False,  # use OpenCV DNN for ONNX inference
+        precision="bfloat16",  # benchmark precision: float32 or bfloat16
+        channels_last=1,  # use channels-last (NHWC) memory format
+        num_iter=0,  # benchmark iterations (0 = run the whole dataset)
+        num_warmup=0,  # warmup iterations excluded from timing
+        profile=False,  # collect a torch.profiler trace
+        ipex=False,  # optimize the model with intel_extension_for_pytorch
+        jit=False,  # compile the model with torch.jit.script
 ):
     source = str(source)
     save_img = not nosave and not source.endswith('.txt')  # save inference images
@@ -90,6 +97,30 @@ def run(
     # Load model
     device = select_device(device)
     model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
+    if channels_last:
+        try:
+            model = model.to(memory_format=torch.channels_last)
+            print("---- Use NHWC model")
+        except Exception:
+            print("---- Use normal model")
+    if ipex:
+        model.eval()
+        import intel_extension_for_pytorch as ipex
+        if precision == "bfloat16":
+            model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
+        else:
+            model = ipex.optimize(model, dtype=torch.float32, inplace=True)
+        print("Running IPEX ...")
+    if jit:
+        try:
+            # scripting can fail for some backends; fall back to eager mode below
+            model = torch.jit.script(model)
+            print("---- With JIT enabled.")
+            if ipex:
+                model = torch.jit.freeze(model)
+        except Exception:
+            print("---- With JIT disabled.")
+
     stride, names, pt = model.stride, model.names, model.pt
     imgsz = check_img_size(imgsz, s=stride)  # check image size
 
@@ -107,7 +138,11 @@ def run(
     # Run inference
     model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0
+    iter_count = 0
+    batch_time_list = []
     for path, im, im0s, vid_cap, s in dataset:
+        if iter_count > num_iter and num_iter > 0:  # stop early when an iteration cap is set
+            break
         t1 = time_sync()
         im = torch.from_numpy(im).to(device)
         im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
@@ -115,20 +150,47 @@ def run(
         if len(im.shape) == 3:
             im = im[None]  # expand for batch dim
         t2 = time_sync()
-        dt[0] += t2 - t1
+        if iter_count >= num_warmup:  # exclude warmup iterations from timing
+            dt[0] += t2 - t1
 
         # Inference
         visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
-        pred = model(im, augment=augment, visualize=visualize)
+        if profile:
+            with torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CPU],
+                record_shapes=True,
+            ) as p:
+                pred = model(im, augment=augment, visualize=visualize)
+            profile_iter = len(dataset) - 1 if num_iter <= 0 or num_iter >= len(dataset) else num_iter
+            if iter_count == profile_iter:  # dump the trace on the last profiled iteration
+                output = p.key_averages().table(sort_by="self_cpu_time_total")
+                print(output)
+                import pathlib
+                timeline_dir = str(pathlib.Path.cwd()) + '/timeline/'
+                if not os.path.exists(timeline_dir):
+                    try:
+                        os.makedirs(timeline_dir)
+                    except Exception:
+                        pass
+                timeline_file = timeline_dir + 'timeline-' + str(torch.backends.quantized.engine) + '-' + \
+                    'yolo5-' + str(iter_count) + '-' + str(os.getpid()) + '.json'
+                p.export_chrome_trace(timeline_file)
+        else:
+            pred = model(im, augment=augment, visualize=visualize)
         t3 = time_sync()
-        dt[1] += t3 - t2
+        print("Iteration: {}, inference time: {} sec.".format(iter_count, t3 - t2), flush=True)
+        if iter_count >= num_warmup:
+            dt[1] += t3 - t2
+            batch_time_list.append((t3 - t2) * 1000)
 
         # NMS
         pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
-        dt[2] += time_sync() - t3
+        if iter_count >= num_warmup:
+            dt[2] += time_sync() - t3
 
         # Second-stage classifier (optional)
         # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
+        iter_count += 1
 
         # Process predictions
         for i, det in enumerate(pred):  # per image
@@ -199,6 +261,19 @@ def run(
         LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')
 
     # Print results
+    print("\n", "-" * 20, "Summary", "-" * 20)
+    latency = dt[1] / (iter_count - num_warmup) * 1000  # only post-warmup iterations were timed
+    throughput = (iter_count - num_warmup) / dt[1]
+    print("Latency:\t {:.3f} ms".format(latency))
+    print("Throughput:\t {:.2f} samples/s".format(throughput))
+    # P50/P90/P99 over per-iteration inference times (ms)
+    batch_time_list.sort()
+    p50_latency = batch_time_list[int(len(batch_time_list) * 0.50) - 1]
+    p90_latency = batch_time_list[int(len(batch_time_list) * 0.90) - 1]
+    p99_latency = batch_time_list[int(len(batch_time_list) * 0.99) - 1]
+    print('Latency P50:\t %.3f ms\nLatency P90:\t %.3f ms\nLatency P99:\t %.3f ms\n'
+          % (p50_latency, p90_latency, p99_latency))
+
     t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
     LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
     if save_txt or save_img:
@@ -236,6 +311,13 @@ def parse_opt():
     parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
     parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
     parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
+    parser.add_argument('--precision', type=str, default="bfloat16", help='benchmark precision: float32 or bfloat16')
+    parser.add_argument('--channels_last', type=int, default=1, help='use channels-last (NHWC) memory format')
+    parser.add_argument('--num_iter', type=int, default=0, help='benchmark iterations (0 = run the whole dataset)')
+    parser.add_argument('--num_warmup', type=int, default=0, help='warmup iterations excluded from timing')
+    parser.add_argument('--profile', action='store_true', help='collect a torch.profiler trace')
+    parser.add_argument('--ipex', action='store_true', help='optimize the model with intel_extension_for_pytorch')
+    parser.add_argument('--jit', action='store_true', help='compile the model with torch.jit.script')
     opt = parser.parse_args()
     opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
     print_args(vars(opt))
@@ -244,7 +326,11 @@ def parse_opt():
 
 def main(opt):
     check_requirements(exclude=('tensorboard', 'thop'))
-    run(**vars(opt))
+    if opt.precision == "bfloat16":
+        with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+            run(**vars(opt))
+    else:
+        run(**vars(opt))
 
 
 if __name__ == "__main__":
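Outside the diff, the model-preparation path above follows the usual IPEX CPU inference recipe: channels-last layout, ipex.optimize for the target dtype, then TorchScript compilation and freezing under bfloat16 autocast. A minimal standalone sketch of the same pipeline, assuming intel_extension_for_pytorch is installed and using torchvision's resnet50 as a stand-in for the YOLOv5 model (the patch scripts the model with torch.jit.script; tracing is used here for simplicity):

    import torch
    import intel_extension_for_pytorch as ipex
    from torchvision.models import resnet50

    model = resnet50().eval()
    model = model.to(memory_format=torch.channels_last)  # NHWC layout
    model = ipex.optimize(model, dtype=torch.bfloat16)   # weight prepacking, operator fusion

    x = torch.randn(1, 3, 640, 640).to(memory_format=torch.channels_last)
    with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
        model = torch.jit.trace(model, x)  # trace under autocast, per the IPEX BF16 recipe
        model = torch.jit.freeze(model)
        y = model(x)  # the first calls warm up the frozen graph
    print(tuple(y.shape), y.dtype)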
diff --git a/requirements.txt b/requirements.txt
index 1937b93..7782196 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,8 +9,6 @@ Pillow>=7.1.2
 PyYAML>=5.3.1
 requests>=2.23.0
 scipy>=1.4.1  # Google Colab version
-torch>=1.7.0
-torchvision>=0.8.1
 tqdm>=4.41.0
 protobuf<=3.20.1  # https://github.com/ultralytics/yolov5/issues/8012
 
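The pinned torch/torchvision requirements are removed because IPEX must be paired with a matching PyTorch build, so both are expected to be installed together beforehand rather than resolved from this file. For example (versions illustrative only, not specified by the patch):

    pip install torch==1.13.1 torchvision==0.14.1
    pip install intel-extension-for-pytorch==1.13.100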
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b1b107e..5cfad3d 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -199,7 +199,7 @@ def fuse_conv_and_bn(conv, bn):
                           stride=conv.stride,
                           padding=conv.padding,
                           groups=conv.groups,
-                          bias=True).requires_grad_(False).to(conv.weight.device)
+                          bias=True).requires_grad_(True).to(conv.weight.device)
 
     # Prepare filters
     w_conv = conv.weight.clone().view(conv.out_channels, -1)

@@ -297,7 +297,7 @@ class ModelEMA:
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
-            p.requires_grad_(False)
+            p.requires_grad_(True)
 
     def update(self, model):
         # Update EMA parameters
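As a cross-check on the summary block in detect.py, the same percentile statistics can be computed with numpy instead of sorted-list indexing; a minimal sketch with made-up example timings:

    import numpy as np

    batch_time_list = [12.1, 11.8, 13.0, 12.4, 50.2]  # per-iteration inference times (ms), warmup excluded
    p50, p90, p99 = np.percentile(batch_time_list, [50, 90, 99])
    print('Latency P50:\t %.3f ms\nLatency P90:\t %.3f ms\nLatency P99:\t %.3f ms' % (p50, p90, p99))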