Last active
February 23, 2025 02:32
-
-
Save Geson-anko/6cf73a9cb56fe473718cdd11e976e335 to your computer and use it in GitHub Desktop.
Tensor Serialization Performance Comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Benchmarking serialization methods... | |
Format: numpy().tobytes() / data_ptr() / torch.save | |
------------------------------------------------------------ | |
Tensor shape: (100,) | |
Number of iterations: 10000 | |
Average times per operation (ms): | |
Serialization: ['0.001', '0.000', '0.020'] | |
Deserialization: ['0.001', '0.001', '0.024'] | |
Throughput (MB/s): | |
Serialization: ['724.3', '1305.5', '18.8'] | |
Deserialization: ['470.9', '444.4', '15.9'] | |
Buffer sizes: | |
- numpy().tobytes(): 0.4 KB | |
- data_ptr(): 0.4 KB | |
- torch.save: 1.5 KB | |
Correctness check (max absolute difference): | |
- numpy().tobytes(): 0.000e+00 | |
- data_ptr(): 0.000e+00 | |
- torch.save: 0.000e+00 | |
Tensor shape: (1000,) | |
Number of iterations: 10000 | |
Average times per operation (ms): | |
Serialization: ['0.001', '0.000', '0.021'] | |
Deserialization: ['0.001', '0.001', '0.024'] | |
Throughput (MB/s): | |
Serialization: ['6582.6', '10595.4', '178.9'] | |
Deserialization: ['4526.6', '4340.6', '157.8'] | |
Buffer sizes: | |
- numpy().tobytes(): 3.9 KB | |
- data_ptr(): 3.9 KB | |
- torch.save: 5.0 KB | |
Correctness check (max absolute difference): | |
- numpy().tobytes(): 0.000e+00 | |
- data_ptr(): 0.000e+00 | |
- torch.save: 0.000e+00 | |
Tensor shape: (100, 100) | |
Number of iterations: 5000 | |
Average times per operation (ms): | |
Serialization: ['0.001', '0.001', '0.029'] | |
Deserialization: ['0.001', '0.001', '0.025'] | |
Throughput (MB/s): | |
Serialization: ['42424.0', '59640.4', '1315.9'] | |
Deserialization: ['31161.4', '30357.5', '1531.3'] | |
Buffer sizes: | |
- numpy().tobytes(): 39.1 KB | |
- data_ptr(): 39.1 KB | |
- torch.save: 40.2 KB | |
Correctness check (max absolute difference): | |
- numpy().tobytes(): 0.000e+00 | |
- data_ptr(): 0.000e+00 | |
- torch.save: 0.000e+00 | |
Tensor shape: (1000, 1000) | |
Number of iterations: 1000 | |
Average times per operation (ms): | |
Serialization: ['0.178', '0.174', '0.960'] | |
Deserialization: ['0.172', '0.174', '0.199'] | |
Throughput (MB/s): | |
Serialization: ['21479.6', '21910.3', '3972.7'] | |
Deserialization: ['22117.4', '21879.9', '19125.0'] | |
Buffer sizes: | |
- numpy().tobytes(): 3906.2 KB | |
- data_ptr(): 3906.2 KB | |
- torch.save: 3907.4 KB | |
Correctness check (max absolute difference): | |
- numpy().tobytes(): 0.000e+00 | |
- data_ptr(): 0.000e+00 | |
- torch.save: 0.000e+00 | |
Tensor shape: (5000, 5000) | |
Number of iterations: 100 | |
Average times per operation (ms): | |
Serialization: ['4.664', '4.730', '24.671'] | |
Deserialization: ['4.846', '4.943', '5.403'] | |
Throughput (MB/s): | |
Serialization: ['20446.0', '20163.4', '3865.6'] | |
Deserialization: ['19679.6', '19293.1', '17649.9'] | |
Buffer sizes: | |
- numpy().tobytes(): 97656.2 KB | |
- data_ptr(): 97656.2 KB | |
- torch.save: 97657.4 KB | |
Correctness check (max absolute difference): | |
- numpy().tobytes(): 0.000e+00 | |
- data_ptr(): 0.000e+00 | |
- torch.save: 0.000e+00 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import numpy as np | |
from io import BytesIO | |
import timeit | |
import ctypes | |
import gc | |
# Method 1: numpy().tobytes()
def tensor_to_buffer_numpy(tensor: torch.Tensor) -> bytes:
    """Serialize a tensor's data to raw bytes via its numpy view.

    Moves the tensor to CPU if needed. `.detach()` is required because
    `Tensor.numpy()` raises on tensors that require grad (bug fix).
    `tobytes()` always emits a C-ordered copy, so non-contiguous tensors
    serialize correctly here.
    """
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    return tensor.detach().numpy().tobytes()
def buffer_to_tensor_numpy(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    """Reconstruct a tensor of `shape`/`dtype` from raw bytes via numpy.

    Bug fix: the original hard-coded np.float32 and ignored the `dtype`
    argument, corrupting any non-float32 payload. The torch dtype is now
    mapped to its numpy equivalent before decoding.
    """
    # Derive the matching numpy dtype from a zero-size tensor of `dtype`.
    np_dtype = torch.empty((), dtype=dtype).numpy().dtype
    # np.frombuffer returns a read-only view into `buffer`; .copy() makes
    # the array writable and owned so torch.from_numpy is safe.
    array = np.frombuffer(buffer, dtype=np_dtype).copy()
    return torch.from_numpy(array.reshape(shape))
# Method 2: data_ptr()
def tensor_to_buffer_ptr(tensor: torch.Tensor) -> bytes:
    """Serialize a tensor by copying its storage bytes via its data pointer.

    Bug fix: reading nbytes straight from data_ptr() assumes the logical
    element order matches memory order. For a non-contiguous tensor (e.g.
    a transpose) that copied the wrong bytes, so the tensor is made
    contiguous first. `.detach()` keeps autograd tensors working too.
    """
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    # Ensure memory layout matches logical element order before raw copy.
    tensor = tensor.detach().contiguous()
    nbytes = tensor.nelement() * tensor.element_size()
    ptr = tensor.data_ptr()
    return ctypes.string_at(ptr, nbytes)
def buffer_to_tensor_ptr(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    """Rebuild a tensor of `shape`/`dtype` from raw bytes.

    The bytes are copied into a bytearray because torch.frombuffer needs a
    writable buffer; the resulting flat tensor is then reshaped.
    """
    writable = bytearray(buffer)
    flat = torch.frombuffer(writable, dtype=dtype)
    return flat.reshape(shape)
# Method 3: torch.save with BytesIO
def tensor_to_buffer_save(tensor: torch.Tensor) -> bytes:
    """Serialize a tensor with torch.save into an in-memory byte string."""
    with BytesIO() as stream:
        torch.save(tensor, stream)
        return stream.getvalue()
def buffer_to_tensor_save(buffer: bytes) -> torch.Tensor:
    """Deserialize a tensor produced by torch.save from an in-memory buffer.

    Security fix: torch.load uses pickle under the hood, so loading an
    untrusted buffer without restriction allows arbitrary code execution.
    weights_only=True limits unpickling to tensor data, which is all this
    pipeline ever stores.
    """
    return torch.load(BytesIO(buffer), weights_only=True)
def benchmark_serialization(tensor: torch.Tensor, number: int = 1000, warmup: int = 100) -> tuple[float, float, float]:
    """Time the three serialization paths on `tensor`.

    Returns total seconds over `number` runs for (numpy, data_ptr,
    torch.save), in that order. A warmup phase runs first so caches and
    allocators settle before measurement.
    """
    serializers = (
        tensor_to_buffer_numpy,
        tensor_to_buffer_ptr,
        tensor_to_buffer_save,
    )
    # Warmup: exercise every path before timing anything.
    for _ in range(warmup):
        for serialize in serializers:
            serialize(tensor)
    gc.collect()
    torch.cuda.empty_cache()
    # Time each serializer; bind `serialize` as a default arg so each
    # lambda captures its own function, not the loop variable.
    return tuple(
        timeit.timeit(lambda serialize=serialize: serialize(tensor), number=number)
        for serialize in serializers
    )
def benchmark_deserialization(buffer_numpy: bytes, buffer_ptr: bytes, buffer_save: bytes,
                              shape: tuple, dtype: torch.dtype, number: int = 1000, warmup: int = 100) -> tuple[float, float, float]:
    """Time the three deserialization paths on pre-built buffers.

    Returns total seconds over `number` runs for (numpy, data_ptr,
    torch.save), in that order, after a warmup phase.
    """
    runners = (
        lambda: buffer_to_tensor_numpy(buffer_numpy, shape, dtype),
        lambda: buffer_to_tensor_ptr(buffer_ptr, shape, dtype),
        lambda: buffer_to_tensor_save(buffer_save),
    )
    # Warmup: exercise every path before timing anything.
    for _ in range(warmup):
        for run in runners:
            run()
    gc.collect()
    torch.cuda.empty_cache()
    # Each runner already closes over its own buffer, so it can be handed
    # to timeit directly.
    return tuple(timeit.timeit(run, number=number) for run in runners)
# Benchmark matrix: (tensor shape, timing iterations). Iteration counts
# shrink as tensors grow so each configuration takes comparable wall time.
sizes = [
    ((100,), 10000),       # Small tensor, many iterations
    ((1000,), 10000),      # Medium tensor, many iterations
    ((100, 100), 5000),    # Medium tensor, medium iterations
    ((1000, 1000), 1000),  # Large tensor, fewer iterations
    ((5000, 5000), 100),   # Very large tensor, few iterations
]
print("Benchmarking serialization methods...")
print("Format: numpy().tobytes() / data_ptr() / torch.save")
print("-" * 60)
for size, iterations in sizes:
    x = torch.randn(size, dtype=torch.float32)
    print(f"\nTensor shape: {size}")
    print(f"Number of iterations: {iterations}")

    # Pre-build one buffer per method; reused for the deserialization
    # benchmark, the size report, and the correctness check below.
    buffer_numpy = tensor_to_buffer_numpy(x)
    buffer_ptr = tensor_to_buffer_ptr(x)
    buffer_save = tensor_to_buffer_save(x)

    ser_times = benchmark_serialization(x, iterations)
    deser_times = benchmark_deserialization(
        buffer_numpy, buffer_ptr, buffer_save, x.shape, x.dtype, iterations
    )

    # Per-operation latency in milliseconds.
    ser_ms = [f'{t*1000/iterations:.3f}' for t in ser_times]
    deser_ms = [f'{t*1000/iterations:.3f}' for t in deser_times]
    print("\nAverage times per operation (ms):")
    print(f"Serialization: {ser_ms}")
    print(f"Deserialization: {deser_ms}")

    # Throughput = payload size / per-operation time.
    tensor_size_mb = x.nelement() * x.element_size() / (1024 * 1024)
    ser_mbs = [f'{tensor_size_mb / (t/iterations):.1f}' for t in ser_times]
    deser_mbs = [f'{tensor_size_mb / (t/iterations):.1f}' for t in deser_times]
    print("\nThroughput (MB/s):")
    print(f"Serialization: {ser_mbs}")
    print(f"Deserialization: {deser_mbs}")

    print("\nBuffer sizes:")
    print(f"- numpy().tobytes(): {len(buffer_numpy)/1024:.1f} KB")
    print(f"- data_ptr(): {len(buffer_ptr)/1024:.1f} KB")
    print(f"- torch.save: {len(buffer_save)/1024:.1f} KB")

    # Round-trip every buffer and report the max deviation from the source.
    restored_numpy = buffer_to_tensor_numpy(buffer_numpy, x.shape, x.dtype)
    restored_ptr = buffer_to_tensor_ptr(buffer_ptr, x.shape, x.dtype)
    restored_save = buffer_to_tensor_save(buffer_save)
    print("\nCorrectness check (max absolute difference):")
    print(f"- numpy().tobytes(): {torch.max(torch.abs(x - restored_numpy)).item():.3e}")
    print(f"- data_ptr(): {torch.max(torch.abs(x - restored_ptr)).item():.3e}")
    print(f"- torch.save: {torch.max(torch.abs(x - restored_save)).item():.3e}")

    # Drop the large tensors before the next (possibly larger) run.
    del x, restored_numpy, restored_ptr, restored_save
    gc.collect()
    torch.cuda.empty_cache()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
結果の分析

バッファサイズ:
- numpy().tobytes() と data_ptr() は同じサイズ(生のデータサイズ)
- torch.save は小さいテンソルで顕著にオーバーヘッドあり(約4倍)

シリアライズ速度:
- data_ptr() が最速
- numpy().tobytes() と data_ptr() が同程度
- torch.save は常に遅い(特に小規模テンソルで顕著)

デシリアライズ速度:
- numpy().tobytes() と data_ptr() が同程度で、torch.save も遜色なし
- torch.save のデシリアライズは意外と高速(シリアライズに比べて)

スループット:
- data_ptr() が最高で約 60GB/s

推奨:
- data_ptr() が最適(シンプルで高速)
- 代替案: torch.save / numpy().tobytes()