Tensor Serialization Performance Comparison
Benchmarking serialization methods...
Format: numpy().tobytes() / data_ptr() / torch.save
------------------------------------------------------------
Tensor shape: (100,)
Number of iterations: 10000
Average times per operation (ms):
Serialization: ['0.001', '0.000', '0.020']
Deserialization: ['0.001', '0.001', '0.024']
Throughput (MB/s):
Serialization: ['724.3', '1305.5', '18.8']
Deserialization: ['470.9', '444.4', '15.9']
Buffer sizes:
- numpy().tobytes(): 0.4 KB
- data_ptr(): 0.4 KB
- torch.save: 1.5 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (1000,)
Number of iterations: 10000
Average times per operation (ms):
Serialization: ['0.001', '0.000', '0.021']
Deserialization: ['0.001', '0.001', '0.024']
Throughput (MB/s):
Serialization: ['6582.6', '10595.4', '178.9']
Deserialization: ['4526.6', '4340.6', '157.8']
Buffer sizes:
- numpy().tobytes(): 3.9 KB
- data_ptr(): 3.9 KB
- torch.save: 5.0 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (100, 100)
Number of iterations: 5000
Average times per operation (ms):
Serialization: ['0.001', '0.001', '0.029']
Deserialization: ['0.001', '0.001', '0.025']
Throughput (MB/s):
Serialization: ['42424.0', '59640.4', '1315.9']
Deserialization: ['31161.4', '30357.5', '1531.3']
Buffer sizes:
- numpy().tobytes(): 39.1 KB
- data_ptr(): 39.1 KB
- torch.save: 40.2 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (1000, 1000)
Number of iterations: 1000
Average times per operation (ms):
Serialization: ['0.178', '0.174', '0.960']
Deserialization: ['0.172', '0.174', '0.199']
Throughput (MB/s):
Serialization: ['21479.6', '21910.3', '3972.7']
Deserialization: ['22117.4', '21879.9', '19125.0']
Buffer sizes:
- numpy().tobytes(): 3906.2 KB
- data_ptr(): 3906.2 KB
- torch.save: 3907.4 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (5000, 5000)
Number of iterations: 100
Average times per operation (ms):
Serialization: ['4.664', '4.730', '24.671']
Deserialization: ['4.846', '4.943', '5.403']
Throughput (MB/s):
Serialization: ['20446.0', '20163.4', '3865.6']
Deserialization: ['19679.6', '19293.1', '17649.9']
Buffer sizes:
- numpy().tobytes(): 97656.2 KB
- data_ptr(): 97656.2 KB
- torch.save: 97657.4 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
import torch
import numpy as np
from io import BytesIO
import timeit
import ctypes
import gc
# Method 1: numpy().tobytes()
def tensor_to_buffer_numpy(tensor: torch.Tensor) -> bytes:
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    return tensor.numpy().tobytes()


def buffer_to_tensor_numpy(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    # Respect the dtype argument instead of hard-coding float32
    np_dtype = torch.empty((), dtype=dtype).numpy().dtype
    array = np.frombuffer(buffer, dtype=np_dtype).copy()
    return torch.from_numpy(array.reshape(shape))
# Method 2: data_ptr()
def tensor_to_buffer_ptr(tensor: torch.Tensor) -> bytes:
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    # data_ptr() exposes raw memory, which only matches the logical layout
    # for contiguous tensors
    tensor = tensor.contiguous()
    nbytes = tensor.nelement() * tensor.element_size()
    ptr = tensor.data_ptr()
    return ctypes.string_at(ptr, nbytes)


def buffer_to_tensor_ptr(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    # bytearray yields a writable buffer, which torch.frombuffer prefers
    return torch.frombuffer(bytearray(buffer), dtype=dtype).reshape(shape)
# Method 3: torch.save with BytesIO
def tensor_to_buffer_save(tensor: torch.Tensor) -> bytes:
    buffer = BytesIO()
    torch.save(tensor, buffer)
    return buffer.getvalue()


def buffer_to_tensor_save(buffer: bytes) -> torch.Tensor:
    return torch.load(BytesIO(buffer))
def benchmark_serialization(
    tensor: torch.Tensor, number: int = 1000, warmup: int = 100
) -> tuple[float, float, float]:
    # Warmup
    for _ in range(warmup):
        tensor_to_buffer_numpy(tensor)
        tensor_to_buffer_ptr(tensor)
        tensor_to_buffer_save(tensor)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark tensor_to_buffer
    time_numpy = timeit.timeit(lambda: tensor_to_buffer_numpy(tensor), number=number)
    time_ptr = timeit.timeit(lambda: tensor_to_buffer_ptr(tensor), number=number)
    time_save = timeit.timeit(lambda: tensor_to_buffer_save(tensor), number=number)
    return time_numpy, time_ptr, time_save
def benchmark_deserialization(
    buffer_numpy: bytes,
    buffer_ptr: bytes,
    buffer_save: bytes,
    shape: tuple,
    dtype: torch.dtype,
    number: int = 1000,
    warmup: int = 100,
) -> tuple[float, float, float]:
    # Warmup
    for _ in range(warmup):
        buffer_to_tensor_numpy(buffer_numpy, shape, dtype)
        buffer_to_tensor_ptr(buffer_ptr, shape, dtype)
        buffer_to_tensor_save(buffer_save)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark buffer_to_tensor
    time_numpy = timeit.timeit(lambda: buffer_to_tensor_numpy(buffer_numpy, shape, dtype), number=number)
    time_ptr = timeit.timeit(lambda: buffer_to_tensor_ptr(buffer_ptr, shape, dtype), number=number)
    time_save = timeit.timeit(lambda: buffer_to_tensor_save(buffer_save), number=number)
    return time_numpy, time_ptr, time_save
# Run benchmarks with different sizes and iterations
sizes = [
    ((100,), 10000),       # Small tensor, many iterations
    ((1000,), 10000),      # Medium tensor, many iterations
    ((100, 100), 5000),    # Medium tensor, medium iterations
    ((1000, 1000), 1000),  # Large tensor, fewer iterations
    ((5000, 5000), 100),   # Very large tensor, few iterations
]

print("Benchmarking serialization methods...")
print("Format: numpy().tobytes() / data_ptr() / torch.save")
print("-" * 60)

for size, iterations in sizes:
    x = torch.randn(size, dtype=torch.float32)
    print(f"\nTensor shape: {size}")
    print(f"Number of iterations: {iterations}")

    # Get buffers for deserialization benchmark
    buffer_numpy = tensor_to_buffer_numpy(x)
    buffer_ptr = tensor_to_buffer_ptr(x)
    buffer_save = tensor_to_buffer_save(x)

    # Run benchmarks
    ser_times = benchmark_serialization(x, iterations)
    deser_times = benchmark_deserialization(buffer_numpy, buffer_ptr, buffer_save, x.shape, x.dtype, iterations)

    # Print results
    print("\nAverage times per operation (ms):")
    print(f"Serialization: {[f'{t*1000/iterations:.3f}' for t in ser_times]}")
    print(f"Deserialization: {[f'{t*1000/iterations:.3f}' for t in deser_times]}")

    # Calculate throughput in MB/s
    tensor_size_mb = x.nelement() * x.element_size() / (1024 * 1024)
    throughputs_ser = [tensor_size_mb / (t / iterations) for t in ser_times]
    throughputs_deser = [tensor_size_mb / (t / iterations) for t in deser_times]
    print("\nThroughput (MB/s):")
    print(f"Serialization: {[f'{t:.1f}' for t in throughputs_ser]}")
    print(f"Deserialization: {[f'{t:.1f}' for t in throughputs_deser]}")

    # Compare buffer sizes
    print("\nBuffer sizes:")
    print(f"- numpy().tobytes(): {len(buffer_numpy)/1024:.1f} KB")
    print(f"- data_ptr(): {len(buffer_ptr)/1024:.1f} KB")
    print(f"- torch.save: {len(buffer_save)/1024:.1f} KB")

    # Verify correctness
    restored_numpy = buffer_to_tensor_numpy(buffer_numpy, x.shape, x.dtype)
    restored_ptr = buffer_to_tensor_ptr(buffer_ptr, x.shape, x.dtype)
    restored_save = buffer_to_tensor_save(buffer_save)
    print("\nCorrectness check (max absolute difference):")
    print(f"- numpy().tobytes(): {torch.max(torch.abs(x - restored_numpy)).item():.3e}")
    print(f"- data_ptr(): {torch.max(torch.abs(x - restored_ptr)).item():.3e}")
    print(f"- torch.save: {torch.max(torch.abs(x - restored_save)).item():.3e}")

    # Clean up
    del x, restored_numpy, restored_ptr, restored_save
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
Analysis of Results

  1. Buffer sizes:
  • numpy().tobytes() and data_ptr() produce the same size (the raw data size)
  • torch.save carries noticeable overhead for small tensors (about 4x the raw size); see the sketch after this list
  • For a very large tensor (5000x5000) the overhead is negligible (<0.001%)
  2. Serialization performance:
  • data_ptr() is fastest for small tensors (up to around 100x100)
  • numpy().tobytes() and data_ptr() are comparable for large tensors (1000x1000 and up)
  • torch.save is always slower, most markedly for small tensors
  3. Deserialization performance:
  • All three methods are comparable for small to medium tensors
  • For large tensors, numpy().tobytes() and data_ptr() are comparable, and torch.save holds up well
  • torch.save deserialization is surprisingly fast compared with its serialization
  4. Throughput:
  • data_ptr() peaks at about 60 GB/s on small tensors
  • All methods settle at roughly 20 GB/s on large tensors
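
The fixed overhead that torch.save adds can be checked directly. The following is a minimal sketch, not part of the original benchmark, that subtracts the raw payload size from the serialized size; the exact byte counts depend on the PyTorch version and its zip-based container format.

import io

import torch

# Minimal sketch: measure the fixed container/metadata overhead that
# torch.save adds on top of the raw tensor payload.
for n in (100, 1000, 1_000_000):
    t = torch.randn(n, dtype=torch.float32)
    buf = io.BytesIO()
    torch.save(t, buf)
    raw = t.nelement() * t.element_size()  # raw payload in bytes
    overhead = len(buf.getvalue()) - raw   # container + metadata bytes
    print(f"n={n}: payload {raw} B, overhead {overhead} B ({overhead / raw:.2%})")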

Recommendations

  • For plain tensor conversion, data_ptr() is the best fit (simple and fast)
  • Use torch.save when metadata (dtype, device, etc.) has to travel with the data; a raw-bytes alternative is sketched below
  • Use numpy().tobytes() when round-tripping with numpy arrays is required
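
If the speed of raw bytes is attractive but shape and dtype still need to travel with the payload, one option is to pack a small header by hand. pack_tensor and unpack_tensor below are hypothetical helpers, not part of this gist, and assume CPU tensors with one of the listed dtypes.

import ctypes
import struct

import torch

# Hypothetical helpers (not from the gist): prepend a struct-packed header
# (dtype code, ndim, shape) to the raw data_ptr() payload.
_DTYPE_CODES = {torch.float32: 0, torch.float64: 1, torch.int64: 2}
_CODE_DTYPES = {v: k for k, v in _DTYPE_CODES.items()}

def pack_tensor(t: torch.Tensor) -> bytes:
    t = t.detach().cpu().contiguous()
    header = struct.pack(f"<BB{t.dim()}q", _DTYPE_CODES[t.dtype], t.dim(), *t.shape)
    return header + ctypes.string_at(t.data_ptr(), t.nelement() * t.element_size())

def unpack_tensor(buf: bytes) -> torch.Tensor:
    code, ndim = struct.unpack_from("<BB", buf)
    shape = struct.unpack_from(f"<{ndim}q", buf, 2)
    payload = bytearray(buf[2 + 8 * ndim:])
    return torch.frombuffer(payload, dtype=_CODE_DTYPES[code]).reshape(shape)

x = torch.randn(3, 4)
assert torch.equal(x, unpack_tensor(pack_tensor(x)))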
