ogrisel · May 27, 2016 13:33
diff --git a/haswell-n1-highmem-32-xeon-2.30GHz.txt b/haswell-n1-highmem-32-xeon-2.30GHz.txt
 model name      : Intel(R) Xeon(R) CPU @ 2.30GHz                                                                                   
 Loading arrays to memory                                                                                                           
 Loading speed: 0.670GB/s                                                                                                           
                                                                                                                                   
 timing bandwidth for sequential memory access                                                                                      
 bandwidth: 9.0 GB/s                                                                                                                
                                                                                                                                   
 n_workers=2 (threads)                                                                                                              
 bandwidth: 17.4 GB/s (1.9x)                                                                                                        
 n_workers=4 (threads)                                                                                                              
 bandwidth: 29.5 GB/s (3.3x)                                                                                                        
 n_workers=8 (threads)                                                                                                              
 bandwidth: 36.6 GB/s (4.1x)                                                                                                        
 n_workers=16 (threads)                                                                                                             
 bandwidth: 44.8 GB/s (5.0x)                                                                                                        
 n_workers=32 (threads)                                                                                                             
 bandwidth: 60.4 GB/s (6.7x) 
diff --git a/intel-core-i7-6560U-2.20GHz.txt b/intel-core-i7-6560U-2.20GHz.txt
 model name	: Intel(R) Core(TM) i7-6560U CPU @ 2.20GHz
 Loading arrays to memory
 Loading speed: 0.530GB/s

 timing bandwidth for sequential memory access
 bandwidth: 5.8 GB/s

 n_workers=2 (threads)
 bandwidth: 8.5 GB/s (1.5x)
 n_workers=4 (threads)
 bandwidth: 8.3 GB/s (1.4x)
 n_workers=8 (threads)
 bandwidth: 7.9 GB/s (1.4x)
diff --git a/memory_bandwidth.py b/memory_bandwidth.py
 from time import time
 import os
 import os.path as op
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor


 def prepare_data_files(n=8):
    """Ensure ``n`` files of random data exist on disk and return their names.

    Files are named ``random_data_<i>.npy`` for ``i`` in ``range(n)`` and are
    looked up / created relative to the current working directory.  A file
    that already exists is reused untouched; a missing one is filled with
    ``int(1e9 / 8)`` samples drawn with ``np.random.randn`` (about 1 GB).

    Parameters
    ----------
    n : int
        Number of data files to make available.

    Returns
    -------
    list of str
        The file names, in index order.
    """
    fnames = []
    for i in range(n):
        fname = 'random_data_%d.npy' % i
        if not op.exists(fname):
            print('generating %s' % fname)
            a = np.random.randn(int(1e9 / 8))  # 1GB
            # Write under a temporary name and rename only once the data is
            # fully on disk: an interrupted run must not leave a truncated
            # file that the existence check above would later accept.
            tmp_fname = fname + '.tmp'
            with open(tmp_fname, 'wb') as f:
                np.save(f, a)
            os.replace(tmp_fname, fname)
        fnames.append(fname)
    return fnames



 def run_bench_bandwidth(data_size_gb=8):
    """Print array loading speed and ``np.max`` bandwidth for 1..N threads.

    The CPU model line is printed first (via ``/proc/cpuinfo``).  Then
    ``data_size_gb`` data files are obtained from ``prepare_data_files`` and
    loaded, and ``np.max`` is timed over all arrays: once sequentially on the
    calling thread, then through thread pools of 2, 4, 8, ... workers for as
    long as the worker count does not exceed ``data_size_gb``.

    Parameters
    ----------
    data_size_gb : int
        Number of data files to load.  The same number is used as the data
        volume in GB when converting elapsed times to GB/s.
    """
    # Identify the machine the numbers belong to.
    os.system("cat /proc/cpuinfo  | grep 'model name' | uniq")
    paths = prepare_data_files(n=data_size_gb)

    print('Loading arrays to memory')
    started = time()
    arrays = [np.load(path) for path in paths]
    load_seconds = time() - started
    print("Loading speed: %0.3fGB/s\n" % (data_size_gb / load_seconds))

    # Baseline: reduce the arrays one after the other on the calling thread.
    print("timing bandwidth for sequential memory access")
    started = time()
    for array in arrays:
        np.max(array)
    sequential_seconds = time() - started
    sequential_bandwidth = data_size_gb / sequential_seconds
    print("bandwidth: %0.1f GB/s" % sequential_bandwidth)
    print()

    # Same reductions dispatched to thread pools of doubling size.
    n_workers = 2
    while n_workers <= data_size_gb:
        print('n_workers=%d (threads)' % n_workers)
        with ThreadPoolExecutor(n_workers) as pool:
            started = time()
            # Drain the result iterator so every reduction has finished
            # before the clock is read.
            for _ in pool.map(np.max, arrays):
                pass
            threaded_seconds = time() - started
            threaded_bandwidth = data_size_gb / threaded_seconds
            speedup = threaded_bandwidth / sequential_bandwidth
            print("bandwidth: %0.1f GB/s (%0.1fx)" %
                  (threaded_bandwidth, speedup))
        n_workers *= 2
    print()


 if __name__ == "__main__":
    import sys
    # Exactly one command-line argument overrides the default of 8;
    # any other argument count falls back to the default.
    cli_args = sys.argv[1:]
    n_gb = int(cli_args[0]) if len(cli_args) == 1 else 8
    run_bench_bandwidth(n_gb)
	model name : Intel(R) Xeon(R) CPU @ 2.30GHz
	Loading arrays to memory
	Loading speed: 0.670GB/s

	timing bandwidth for sequential memory access
	bandwidth: 9.0 GB/s

	n_workers=2 (threads)
	bandwidth: 17.4 GB/s (1.9x)
	n_workers=4 (threads)
	bandwidth: 29.5 GB/s (3.3x)
	n_workers=8 (threads)
	bandwidth: 36.6 GB/s (4.1x)
	n_workers=16 (threads)
	bandwidth: 44.8 GB/s (5.0x)
	n_workers=32 (threads)
	bandwidth: 60.4 GB/s (6.7x)
	model name : Intel(R) Core(TM) i7-6560U CPU @ 2.20GHz
	Loading arrays to memory
	Loading speed: 0.530GB/s

	timing bandwidth for sequential memory access
	bandwidth: 5.8 GB/s

	n_workers=2 (threads)
	bandwidth: 8.5 GB/s (1.5x)
	n_workers=4 (threads)
	bandwidth: 8.3 GB/s (1.4x)
	n_workers=8 (threads)
	bandwidth: 7.9 GB/s (1.4x)
	from time import time
	import os
	import os.path as op
	import numpy as np
	from concurrent.futures import ThreadPoolExecutor


	def prepare_data_files(n=8):
	    """Ensure ``n`` files of random data exist on disk and return their names.

	    Files are named ``random_data_<i>.npy`` for ``i`` in ``range(n)`` and are
	    looked up / created relative to the current working directory.  A file
	    that already exists is reused untouched; a missing one is filled with
	    ``int(1e9 / 8)`` samples drawn with ``np.random.randn`` (about 1 GB).

	    Parameters
	    ----------
	    n : int
	        Number of data files to make available.

	    Returns
	    -------
	    list of str
	        The file names, in index order.
	    """
	    fnames = []
	    for i in range(n):
	        fname = 'random_data_%d.npy' % i
	        if not op.exists(fname):
	            print('generating %s' % fname)
	            a = np.random.randn(int(1e9 / 8))  # 1GB
	            # Write under a temporary name and rename only once the data
	            # is fully on disk: an interrupted run must not leave a
	            # truncated file that the existence check would later accept.
	            tmp_fname = fname + '.tmp'
	            with open(tmp_fname, 'wb') as f:
	                np.save(f, a)
	            os.replace(tmp_fname, fname)
	        fnames.append(fname)
	    return fnames



	def run_bench_bandwidth(data_size_gb=8):
	    """Print array loading speed and ``np.max`` bandwidth for 1..N threads.

	    The CPU model line is printed first (via ``/proc/cpuinfo``).  Then
	    ``data_size_gb`` data files are obtained from ``prepare_data_files`` and
	    loaded, and ``np.max`` is timed over all arrays: once sequentially on
	    the calling thread, then through thread pools of 2, 4, 8, ... workers
	    for as long as the worker count does not exceed ``data_size_gb``.

	    Parameters
	    ----------
	    data_size_gb : int
	        Number of data files to load.  The same number is used as the data
	        volume in GB when converting elapsed times to GB/s.
	    """
	    # Plain shell pipes: a backslash-escaped "|" would be handed to ``cat``
	    # as a literal file name instead of chaining the three commands.
	    os.system("cat /proc/cpuinfo | grep 'model name' | uniq")
	    fnames = prepare_data_files(n=data_size_gb)

	    print('Loading arrays to memory')
	    t0 = time()
	    arrays = [np.load(fname) for fname in fnames]
	    duration = time() - t0
	    print("Loading speed: %0.3fGB/s\n" % (data_size_gb / duration))

	    # Baseline: reduce the arrays one after the other on the calling thread.
	    print("timing bandwidth for sequential memory access")
	    t0 = time()
	    list(map(np.max, arrays))
	    sequential_access_time = time() - t0
	    sequential_bandwidth = data_size_gb / sequential_access_time
	    print("bandwidth: %0.1f GB/s" % sequential_bandwidth)
	    print()

	    # Same reductions dispatched to thread pools of doubling size.
	    n_workers = 2
	    while n_workers <= data_size_gb:
	        print('n_workers=%d (threads)' % n_workers)
	        with ThreadPoolExecutor(n_workers) as e:
	            t0 = time()
	            # list() drains the result iterator so every reduction has
	            # finished before the clock is read.
	            list(e.map(np.max, arrays))
	            access_time = time() - t0
	            bandwidth = data_size_gb / access_time
	            print("bandwidth: %0.1f GB/s (%0.1fx)" %
	                  (bandwidth, bandwidth / sequential_bandwidth))
	        n_workers *= 2
	    print()


	if __name__ == "__main__":
	    import sys
	    # Exactly one command-line argument overrides the default of 8;
	    # any other argument count falls back to the default.
	    if len(sys.argv) == 2:
	        n_gb = int(sys.argv[1])
	    else:
	        n_gb = 8
	    run_bench_bandwidth(n_gb)