### DATALOADERS ##################################################################
# When building DataLoaders, set `num_workers > 0` and `pin_memory=True` (the
# latter only helps on GPU: page-locked host memory speeds up host-to-device copies).
from torch.utils.data import DataLoader

DataLoader(dataset, num_workers=8, pin_memory=True)

### num_workers ##################################################################
# The best num_workers depends on the batch size and the machine.
# A general place to start is num_workers = number of CPU cores on the machine.
# Increasing num_workers also increases CPU and memory usage (each worker holds its own copy of the dataset object).
# BEST TIP: increase num_workers gradually and stop once throughput no longer improves (see the sketch below).
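
# A minimal benchmarking sketch (hypothetical `dataset`; it iterates the loader
# once with no model work, so it measures pure data-loading throughput):
import time
from torch.utils.data import DataLoader

for n in (0, 2, 4, 8, 16):
    loader = DataLoader(dataset, batch_size=64, num_workers=n, pin_memory=True)
    start = time.perf_counter()
    for _ in loader:          # drain one epoch of batches
        pass
    print(f"num_workers={n}: {time.perf_counter() - start:.1f}s")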

### spawn ##################################################################
# PyTorch dataloading has known issues when `num_workers > 0` is combined with the
# `spawn` start method (used by the `ddp_spawn` strategy, and the default on
# Windows/macOS): workers are re-created every epoch, which can be very slow.
# Prefer the plain `ddp` strategy where possible; see the mitigation sketch below.
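
# A sketch, assuming PyTorch >= 1.7 and a hypothetical `dataset`: if you are stuck
# with spawn, `persistent_workers=True` keeps workers alive across epochs, so the
# spawn startup cost is paid only once.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, num_workers=8, pin_memory=True, persistent_workers=True)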

### .item(), .numpy(), .cpu() ##################################################################
# DO NOT call .item(), .numpy(), or .cpu() unnecessarily in your script; each one
# forces a GPU sync or a device-to-host copy. If you only need to cut the autograd
# graph, use .detach() instead. PL takes care of the rest (e.g. when logging).
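
# A minimal sketch of a `training_step` inside your LightningModule (names are
# made up): return the loss tensor as-is and let `self.log` handle the syncing.
import torch.nn.functional as F

def training_step(self, batch, batch_idx):
    x, y = batch
    loss = F.cross_entropy(self(x), y)
    self.log("train_loss", loss)    # no .item() needed; PL syncs lazily
    return loss                     # return the tensor, not loss.item()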

### empty_cache() ##################################################################
# DO NOT call torch.cuda.empty_cache() unnecessarily. It forces a device
# synchronization, and PyTorch's caching allocator already reuses freed memory.

### TENSOR CREATION ##################################################################
# Construct TENSORS directly on the target device; inside a LightningModule,
# `self.device` always points to the device the module currently lives on.

t = torch.rand(2, 2).cuda()                 ## BAD
t = torch.rand(2, 2, device=self.device)    ## GOOD

## For tensors that need to be MODEL ATTRIBUTES, it's best to register them as buffers
## in the module's __init__() method: buffers move with the module and are saved in its state_dict.

t = torch.rand(2, 2, device=self.device)        ## BAD
self.register_buffer("t", torch.rand(2, 2))     ## GOOD
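
# Both tips in one minimal sketch (module name and shapes are made up):
import torch
import pytorch_lightning as pl

class MyModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # Buffer: moves with the module on .to()/.cuda() and lands in checkpoints.
        self.register_buffer("scale", torch.ones(2, 2))

    def forward(self, x):
        # Created directly on whatever device the module currently lives on.
        noise = torch.rand(2, 2, device=self.device)
        return x * self.scale + noise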

### DDP v/s DP ##################################################################
# Use DDP instead of DP.
# DP is single-process and GIL-bound, and it replicates the model on every forward
# pass; DDP runs one process per GPU and only syncs gradients, so it is much faster.
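
# A sketch, assuming PyTorch Lightning 1.6+ Trainer flags (note that the DP
# strategy, `strategy="dp"`, was removed entirely in Lightning 2.0):
import pytorch_lightning as pl

trainer = pl.Trainer(accelerator="gpu", devices=4, strategy="ddp")  # one process per GPU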
