Created
May 13, 2021 09:12
-
-
Save albanie/49a1e0c844ed91c473648ee5803846f4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A minimal profiler to compare speed differences between reading a video with opencv | |
and DALI. | |
For both dataloaders, the goal is to read an mp4 video and then load batches of frames | |
onto the GPU. | |
""" | |
import os | |
import logging | |
import time | |
import subprocess | |
import argparse | |
import platform | |
from pathlib import Path | |
from collections import defaultdict | |
from datetime import datetime | |
import cv2 | |
import numpy as np | |
import torch | |
import nvidia.dali | |
import nvidia.dali.fn as fn | |
from nvidia.dali import pipeline_def | |
from nvidia.dali.plugin.pytorch import DALIGenericIterator | |
def fetch_and_prep_movie(
    url: str,
    dest_path: Path,
    refresh: bool,
    height: int,
    width: int,
    trim: str,
):
    """Fetch a video from the web and prepare it for profiling.

    The raw download is stored next to `dest_path` (with a `-raw` suffix on the
    stem) and then re-encoded by ffmpeg into `dest_path`, rescaled and trimmed.

    Args:
        url: the location of the video
        dest_path: location where the prepared video will be stored
        refresh: whether to overwrite an existing video
        height: the height (in pixels) that the video will be resized to
        width: the width (in pixels) that the video will be resized to
        trim: the video will be cropped to this duration, specified as an ffmpeg-style
            formatted string (i.e. the `HH:MM:SS.mmm` format)

    Raises:
        FileNotFoundError: if `wget` or `ffmpeg` is not available on the PATH.
        subprocess.CalledProcessError: if the download or the re-encode fails.
    """
    if dest_path.exists() and not refresh:
        # Report the local file that was found, not the remote URL
        print(f"Found existing sintel movie at {dest_path}")
        return

    print(f"Fetching original sintel movie from {url}")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    raw_video = dest_path.parent / f"{dest_path.stem}-raw{dest_path.suffix}"

    # Argument lists (rather than shell strings) keep URLs/paths containing spaces or
    # shell metacharacters intact, and check=True surfaces failures immediately
    # instead of leaving a missing/corrupt video for the profiler to trip over.
    subprocess.run(["wget", url, "-O", str(raw_video)], check=True)
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", str(raw_video),
            # the ffmpeg scale filter expects width first, then height
            "-vf", f"scale={width}:{height}",
            "-to", trim,
            str(dest_path),
        ],
        check=True,
    )
def mini_cv2_dataloader(
    video_path: Path,
    batch_size: int,
    sequence_length: int,
    height: int,
    width: int,
):
    """A minimalist opencv2 dataloader

    Args:
        video_path: location of the video to be read
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        height: the height (in pixels) of the video
        width: the width (in pixels) of the video

    Yields:
        A single-element list holding a dict whose "data" entry is a batch of video
        frames, stored as a pytorch tensor on the GPU with shape
        (batch_size, sequence_length, height, width, 3). This mimics the structure
        of the DALI iterator output.

    NOTE: This is not meant to be in any way optimised - it's the simplest way
    to load frames into a tensor (that I could think of).
    NOTE: Iteration stops as soon as a frame cannot be read, so a trailing,
    partially filled batch is discarded rather than yielded.
    """
    video_capture = cv2.VideoCapture(str(video_path))
    try:
        while video_capture.isOpened():
            next_batch = torch.zeros(
                (batch_size, sequence_length, height, width, 3),
                device="cuda",
            )
            for batch_idx in range(batch_size):
                for seq_idx in range(sequence_length):
                    _, im = video_capture.read()
                    if im is None:
                        # End of stream (or a read failure): stop iterating
                        return
                    # from_numpy produces a CPU tensor; assigning it into the CUDA
                    # batch copies the frame to the GPU to match DALI outputs
                    next_batch[batch_idx, seq_idx] = torch.from_numpy(im)
            yield [{"data": next_batch}]  # mimic DALI pipeline output
    finally:
        # Always free the decoder handle, including when the consumer abandons
        # the generator before the video has been fully read
        video_capture.release()
def profile(
    mode: str,
    video_path: Path,
    test_vid_width: int,
    test_vid_height: int,
    batch_size: int,
    sequence_length: int,
    num_threads: int,
    initial_prefetch_size: int,
    warmup: int,
):
    """Profile video readers

    Args:
        mode: whether to use cv2 or dali as the data loader
        video_path: location of the video to be read
        test_vid_width: the width (in pixels) of the video
        test_vid_height: the height (in pixels) of the video
        batch_size: the batch dimension of the tensor to be loaded
        sequence_length: the sequence dimension of the tensor to be loaded
        num_threads: the number of threads to use for reading (DALI only)
        initial_prefetch_size: DALI dataloader parameter
        warmup: how many iterations to run before starting to profile speed

    Returns:
        The average speed (in frames per second) at which frames were read, or 0.0
        if the loader produced too few batches to take a measurement after warmup.

    Raises:
        ValueError: if `mode` is neither "cv2" nor "dali".
    """
    if mode == "cv2":
        dataloader = mini_cv2_dataloader(
            video_path=video_path,
            batch_size=batch_size,
            sequence_length=sequence_length,
            width=test_vid_width,
            height=test_vid_height,
        )
    elif mode == "dali":

        @pipeline_def
        def video_pipe():
            video = fn.readers.video(
                name="video",
                device="gpu",
                filenames=[str(video_path)],
                sequence_length=sequence_length,
                shard_id=0,
                num_shards=1,
                random_shuffle=False,
                pad_last_batch=True,
                initial_fill=initial_prefetch_size,
                stride=1,
                # step == sequence_length gives non-overlapping sequences, so the
                # frames consumed per batch match the cv2 loader
                step=sequence_length,
            )
            return video

        pipe = video_pipe(  # pylint: disable = unexpected-keyword-arg
            batch_size=batch_size,
            prefetch_queue_depth=1,
            num_threads=num_threads,
            device_id=0,
            seed=0,
        )
        pipe.build()
        dataloader = DALIGenericIterator(pipe, ["data"], reader_name='video')
    else:
        raise ValueError(f"Unknown mode {mode!r}, expected 'cv2' or 'dali'")

    # A negative warmup would otherwise skip starting the timer entirely
    warmup = max(warmup, 0)

    # Defaults cover the case where the loader ends before any timed batch arrives
    avg_hz = 0.0
    start = 0.0
    total_frames = 0

    # Compare video reading speed
    for ii, batch in enumerate(dataloader):
        if ii == warmup:
            # The clock starts once the final warmup batch has been received; that
            # batch itself is not counted towards the total
            start = time.perf_counter()
            total_frames = 0
        if ii > warmup:
            total_time = time.perf_counter() - start
            shape = batch[0]["data"].shape
            total_frames += shape[0] * shape[1]  # batch dim * sequence dim
            avg_hz = total_frames / max(total_time, 1E-5)  # guard against zero elapsed
            if ii % 10 == 0:
                print(f"Speed {avg_hz:.1f} hz, {shape}")
    return avg_hz
def main():
    """Parse CLI options, prepare the test video, profile each requested reader
    over the grid of configurations and log a summary of the results.

    For every selected mode, each (batch size, sequence length) pair is profiled
    `--num_runs` times; DALI is additionally swept over `--num_threads`. Timings
    are grouped per configuration tag and reported as mean +/- std.
    """
    # pylint: disable=line-too-long
    # flake8: noqa: E501
    import sys  # local import: only needed for the platform summary below

    parser = argparse.ArgumentParser()
    parser.add_argument("--src_url", default="http://peach.themazzone.com/durian/movies/sintel-1024-surround.mp4")
    parser.add_argument("--video_path", type=Path, default="data/dali_mwp/sintel.mp4")
    parser.add_argument("--batch_sizes", type=int, nargs="+", default=[1, 10])
    parser.add_argument("--log_dir", type=Path, default="data/dali_mwp/logs")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--modes", type=str, nargs="+", default=["cv2", "dali"], choices=["cv2", "dali"])
    parser.add_argument("--test_vid_height", type=int, default=256)
    parser.add_argument("--test_vid_width", type=int, default=256)
    parser.add_argument("--test_vid_duration", type=str, default="00:03:00")
    parser.add_argument("--num_threads", type=int, nargs="+", default=[1, 4, 8, 16])
    parser.add_argument("--sequence_lengths", nargs="+", type=int, default=[8, 16, 32, 64])
    parser.add_argument("--num_runs", type=int, default=3)
    parser.add_argument("--initial_prefetch_size", type=int, default=0)
    args = parser.parse_args()

    # fetch sintel movie for profiling
    fetch_and_prep_movie(
        url=args.src_url,
        dest_path=args.video_path,
        refresh=args.refresh,
        width=args.test_vid_width,
        height=args.test_vid_height,
        trim=args.test_vid_duration,
    )

    # Set up logging
    log_name = f"timing-log-{datetime.now().strftime('%y-%m-%d_%H:%M:%S.txt')}"
    args.log_dir.mkdir(exist_ok=True, parents=True)
    handlers = [logging.FileHandler(str(args.log_dir / log_name)), logging.StreamHandler()]
    logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=handlers)
    logging.info(f"Launched profiler with args\n: {args}")

    # Only profile the readers requested via --modes, always running opencv first
    modes = [mode for mode in ("cv2", "dali") if mode in args.modes]

    # Compute some stats about how many runs will be profiled. opencv has fewer
    # configuration options than dali, which is additionally swept over thread counts
    base_runs = len(args.batch_sizes) * len(args.sequence_lengths) * args.num_runs
    runs_per_mode = {"cv2": base_runs, "dali": base_runs * len(args.num_threads)}
    total_runs = sum(runs_per_mode[mode] for mode in modes)
    logging.info(f"Running the profiler on {total_runs} configurations")

    # Keep track of current run
    curr_run_idx = 1

    # Store timing results in a defaultdict, with one list of timings per configuration
    # to make it easy to average across runs
    results = defaultdict(list)

    for mode in modes:
        # The thread count only affects DALI; opencv is profiled with a single setting
        thread_counts = args.num_threads if mode == "dali" else [1]
        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for _ in range(args.num_runs):
                    for num_thread in thread_counts:
                        tag = f"{mode}-batch_size-{batch_size}-seq_len-{sequence_length}"
                        if mode == "dali":
                            tag += f"-num_threads-{num_thread}"
                        print(f"{tag} run {len(results[tag])} [{curr_run_idx}/{total_runs}]")
                        avg_hz = profile(
                            video_path=args.video_path,
                            test_vid_width=args.test_vid_width,
                            test_vid_height=args.test_vid_height,
                            batch_size=batch_size,
                            sequence_length=sequence_length,
                            num_threads=num_thread,
                            initial_prefetch_size=args.initial_prefetch_size,
                            warmup=args.warmup,
                            mode=mode,
                        )
                        results[tag].append(avg_hz)
                        curr_run_idx += 1

    # check_output returns bytes; decode so the log shows text, not a bytes repr
    gpu_summary = subprocess.check_output(["nvidia-smi", "-L"]).decode(errors="replace").strip()

    logging.info("===========================================")
    logging.info("Hardware summary (note, only using one GPU)")
    logging.info(gpu_summary)
    logging.info("===========================================")
    logging.info("Platform summary:")
    logging.info(sys.version)
    logging.info("===========================================")
    logging.info(f"DALI version: {nvidia.dali.__version__}")
    logging.info(f"Torch cuda: {torch.version.cuda}")
    logging.info("===========================================")
    logging.info(f"Timing results (averages across {args.num_runs}):")
    for tag, speeds in results.items():
        logging.info(f"{tag}: {np.mean(speeds):.1f} Hz +/- {np.std(speeds):.1f}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Outputs: