Gist by chuanqi129 (created June 21, 2023): a patch against YOLOv5 adding CPU benchmarking hooks to detect.py — bfloat16/IPEX inference, channels-last memory layout, TorchScript, warmup-aware timing, and torch.profiler support.
diff --git a/detect.py b/detect.py
index 8feb07d..d65f3f3 100644
--- a/detect.py
+++ b/detect.py
@@ -74,6 +74,13 @@ def run(
         hide_conf=False,  # hide confidences
         half=False,  # use FP16 half-precision inference
         dnn=False,  # use OpenCV DNN for ONNX inference
+        precision="bfloat16",
+        channels_last=1,
+        num_iter=0,
+        num_warmup=0,
+        profile=False,
+        ipex=False,
+        jit=False
 ):
     source = str(source)
     save_img = not nosave and not source.endswith('.txt')  # save inference images
@@ -90,6 +97,30 @@ def run(
     # Load model
     device = select_device(device)
     model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
+    if channels_last:
+        try:
+            model = model.to(memory_format=torch.channels_last)
+            print("---- Use NHWC model")
+        except:
+            print("---- Use normal model")
+    if ipex:
+        model.eval()
+        import intel_extension_for_pytorch as ipex
+        if precision == "bfloat16":
+            model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
+        else:
+            model = ipex.optimize(model, dtype=torch.float32, inplace=True)
+        print("Running IPEX ...")
+    if jit:
+        try:
+            # model = torch.jit.script(model)
+            model = torch.jit.script(model)
+            print("---- With JIT enabled.")
+            if ipex:
+                model = torch.jit.freeze(model)
+        except:
+            print("---- With JIT disabled.")
+
     stride, names, pt = model.stride, model.names, model.pt
     imgsz = check_img_size(imgsz, s=stride)  # check image size
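In isolation, the model-preparation pattern this hunk adds looks like the sketch below; it uses a placeholder torchvision model rather than YOLOv5's DetectMultiBackend, and it assumes intel_extension_for_pytorch is installed. One quirk worth noting in the hunk itself: `import intel_extension_for_pytorch as ipex` shadows the boolean `ipex` flag, which happens to stay truthy, so the later `if ipex: model = torch.jit.freeze(model)` check still behaves as intended.

    import torch
    import torchvision

    # Placeholder model standing in for YOLOv5's DetectMultiBackend (assumption)
    model = torchvision.models.resnet50(weights=None).eval()
    example = torch.randn(1, 3, 640, 640)

    # Channels-last (NHWC) memory layout for both model and input
    model = model.to(memory_format=torch.channels_last)
    example = example.to(memory_format=torch.channels_last)

    # IPEX operator/layout optimizations for bfloat16 inference
    import intel_extension_for_pytorch as ipex
    model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)

    # TorchScript + freeze under bf16 autocast; the patch uses torch.jit.script,
    # torch.jit.trace is shown here only because it is simpler for a toy model
    with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
        model = torch.jit.trace(model, example)
        model = torch.jit.freeze(model)
        out = model(example)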
@@ -107,7 +138,11 @@ def run(
     # Run inference
     model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
     dt, seen = [0.0, 0.0, 0.0], 0
+    iter_count = 0
+    batch_time_list = []
     for path, im, im0s, vid_cap, s in dataset:
+        if iter_count > num_iter and num_iter > 0:
+            break
         t1 = time_sync()
         im = torch.from_numpy(im).to(device)
         im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
@@ -115,20 +150,47 @@ def run(
         if len(im.shape) == 3:
             im = im[None]  # expand for batch dim
         t2 = time_sync()
-        dt[0] += t2 - t1
+        if iter_count >= num_warmup:
+            dt[0] += t2 - t1

         # Inference
         visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
-        pred = model(im, augment=augment, visualize=visualize)
+        if profile:
+            with torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CPU],
+                record_shapes=True,
+            ) as p:
+                pred = model(im, augment=augment, visualize=visualize)
+            profile_iter = len(dataset) if num_iter > len(dataset) or num_iter <= 0 else num_iter
+            if iter_count == profile_iter:
+                output = p.key_averages().table(sort_by="self_cpu_time_total")
+                print(output)
+                import pathlib
+                timeline_dir = str(pathlib.Path.cwd()) + '/timeline/'
+                if not os.path.exists(timeline_dir):
+                    try:
+                        os.makedirs(timeline_dir)
+                    except:
+                        pass
+                timeline_file = timeline_dir + 'timeline-' + str(torch.backends.quantized.engine) + '-' + \
+                    'yolo5-' + str(iter_count) + '-' + str(os.getpid()) + '.json'
+                p.export_chrome_trace(timeline_file)
+        else:
+            pred = model(im, augment=augment, visualize=visualize)
         t3 = time_sync()
-        dt[1] += t3 - t2
+        print("Iteration: {}, inference time: {} sec.".format(iter_count, t3 - t2), flush=True)
+        if iter_count >= num_warmup:
+            dt[1] += t3 - t2
+            batch_time_list.append((t3 - t2) * 1000)

         # NMS
         pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
-        dt[2] += time_sync() - t3
+        if iter_count >= num_warmup:
+            dt[2] += time_sync() - t3

         # Second-stage classifier (optional)
         # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
+        iter_count += 1

         # Process predictions
         for i, det in enumerate(pred):  # per image
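The profiling branch wraps each forward pass in torch.profiler and, on one chosen iteration, prints an operator table and exports a Chrome trace. A minimal standalone sketch of that pattern (toy model and output path are assumptions, not the gist's code):

    import os
    import pathlib

    import torch

    model = torch.nn.Conv2d(3, 16, 3).eval()
    x = torch.randn(1, 3, 64, 64)

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        record_shapes=True,
    ) as p:
        with torch.no_grad():
            model(x)

    # Operator-level summary, sorted the same way as in the patch
    print(p.key_averages().table(sort_by="self_cpu_time_total"))

    # Chrome trace, viewable at chrome://tracing or https://ui.perfetto.dev
    timeline_dir = pathlib.Path.cwd() / 'timeline'
    timeline_dir.mkdir(exist_ok=True)
    p.export_chrome_trace(str(timeline_dir / 'timeline-{}.json'.format(os.getpid())))

Note that in the hunk the export is gated on iter_count == profile_iter, and iter_count only takes values below len(dataset) inside the loop, so the trace appears to be written only when --num_iter is set to a positive value smaller than the dataset length.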
@@ -199,6 +261,19 @@ def run(
         LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')

     # Print results
+    print("\n", "-"*20, "Summary", "-"*20)
+    latency = dt[1] / iter_count * 1000
+    throughput = iter_count / dt[1]
+    print("Latency:\t {:.3f} ms".format(latency))
+    print("Throughput:\t {:.2f} samples/s".format(throughput))
+    # P50
+    batch_time_list.sort()
+    p50_latency = batch_time_list[int(len(batch_time_list) * 0.50) - 1]
+    p90_latency = batch_time_list[int(len(batch_time_list) * 0.90) - 1]
+    p99_latency = batch_time_list[int(len(batch_time_list) * 0.99) - 1]
+    print('Latency P50:\t %.3f ms\nLatency P90:\t %.3f ms\nLatency P99:\t %.3f ms\n' \
+        % (p50_latency, p90_latency, p99_latency))
+
     t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
     LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
     if save_txt or save_img:
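One caveat in this summary: dt[1] only accumulates iterations at or after num_warmup, while the divisor iter_count counts every iteration including warmup, so the reported average latency is understated (and throughput overstated) whenever --num_warmup > 0. A consistent version built on the already warmup-filtered batch_time_list, keeping the patch's nearest-rank percentile indexing, might look like this (a sketch, not the gist's code):

    def summarize(batch_time_list):
        # batch_time_list holds per-iteration inference times in ms, warmup excluded
        assert batch_time_list, "no measured iterations (num_warmup too large?)"
        measured = len(batch_time_list)
        mean_ms = sum(batch_time_list) / measured
        print("Latency:\t {:.3f} ms".format(mean_ms))
        print("Throughput:\t {:.2f} samples/s".format(1000.0 / mean_ms))
        ordered = sorted(batch_time_list)
        for q in (0.50, 0.90, 0.99):
            # nearest-rank percentile, as in the patch, clamped for very short runs
            idx = max(int(measured * q) - 1, 0)
            print("Latency P{:.0f}:\t {:.3f} ms".format(q * 100, ordered[idx]))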
@@ -236,6 +311,13 @@ def parse_opt():
     parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
     parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
     parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
+    parser.add_argument('--precision', type=str, default="bfloat16")
+    parser.add_argument('--channels_last', type=int, default=1)
+    parser.add_argument('--num_iter', type=int, default=0)
+    parser.add_argument('--num_warmup', type=int, default=0)
+    parser.add_argument('--profile', action='store_true', help='profile')
+    parser.add_argument('--ipex', action='store_true', help='ipex')
+    parser.add_argument('--jit', action='store_true', help='jit')
     opt = parser.parse_args()
     opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
     print_args(vars(opt))
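Put together, a benchmarking invocation with the new flags might look like the following (illustrative; the weights file and source path are YOLOv5 defaults, not taken from the gist):

    python detect.py --weights yolov5s.pt --source data/images \
        --precision bfloat16 --channels_last 1 --ipex --jit \
        --num_iter 100 --num_warmup 10

Adding --profile would additionally print the operator table and dump a Chrome trace on the final measured iteration, per the gating discussed above.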
@@ -244,7 +326,11 @@ def parse_opt():

 def main(opt):
     check_requirements(exclude=('tensorboard', 'thop'))
-    run(**vars(opt))
+    if opt.precision == "bfloat16":
+        with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+            run(**vars(opt))
+    else:
+        run(**vars(opt))


 if __name__ == "__main__":
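Because the bfloat16 path wraps the entire run() call, autocast covers preprocessing and NMS as well as the model forward. The context manager in isolation (torch.cpu.amp.autocast is the CPU-specific spelling; recent PyTorch also accepts the generic torch.autocast('cpu', ...)):

    import torch

    model = torch.nn.Linear(8, 4).eval()
    x = torch.randn(2, 8)

    # Autocast-eligible ops (matmul, linear, conv) run in bfloat16, others keep float32
    with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
        y = model(x)
    print(y.dtype)  # torch.bfloat16 for autocast-eligible outputs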
diff --git a/requirements.txt b/requirements.txt
index 1937b93..7782196 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,8 +9,6 @@ Pillow>=7.1.2
 PyYAML>=5.3.1
 requests>=2.23.0
 scipy>=1.4.1  # Google Colab version
-torch>=1.7.0
-torchvision>=0.8.1
 tqdm>=4.41.0
 protobuf<=3.20.1  # https://github.com/ultralytics/yolov5/issues/8012
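Dropping the torch and torchvision pins presumably leaves installing PyTorch to the benchmarking environment, since intel_extension_for_pytorch releases must match the installed torch version; the gist does not state this, so treat it as an inference.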
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b1b107e..5cfad3d 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -199,7 +199,7 @@ def fuse_conv_and_bn(conv, bn):
                           stride=conv.stride,
                           padding=conv.padding,
                           groups=conv.groups,
-                          bias=True).requires_grad_(False).to(conv.weight.device)
+                          bias=True).requires_grad_(True).to(conv.weight.device)

     # Prepare filters
     w_conv = conv.weight.clone().view(conv.out_channels, -1)
@@ -297,7 +297,7 @@ class ModelEMA:
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
-            p.requires_grad_(False)
+            p.requires_grad_(True)

     def update(self, model):
         # Update EMA parameters
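These last two hunks reverse YOLOv5's usual freezing of gradients on fused-conv and EMA parameters. The gist gives no rationale; a plausible guess is a workaround for an error hit on this benchmarking path (for example from torch.jit.script or ipex.optimize over frozen parameters), so it is safest to treat the change as benchmark-only rather than something to carry into training code.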