Tensor Serialization Performance Comparison
Benchmarking serialization methods...
Format: numpy().tobytes() / data_ptr() / torch.save
------------------------------------------------------------
Tensor shape: (100,)
Number of iterations: 10000
Average times per operation (ms):
Serialization: ['0.001', '0.000', '0.020']
Deserialization: ['0.001', '0.001', '0.024']
Throughput (MB/s):
Serialization: ['724.3', '1305.5', '18.8']
Deserialization: ['470.9', '444.4', '15.9']
Buffer sizes:
- numpy().tobytes(): 0.4 KB
- data_ptr(): 0.4 KB
- torch.save: 1.5 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (1000,)
Number of iterations: 10000
Average times per operation (ms):
Serialization: ['0.001', '0.000', '0.021']
Deserialization: ['0.001', '0.001', '0.024']
Throughput (MB/s):
Serialization: ['6582.6', '10595.4', '178.9']
Deserialization: ['4526.6', '4340.6', '157.8']
Buffer sizes:
- numpy().tobytes(): 3.9 KB
- data_ptr(): 3.9 KB
- torch.save: 5.0 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (100, 100)
Number of iterations: 5000
Average times per operation (ms):
Serialization: ['0.001', '0.001', '0.029']
Deserialization: ['0.001', '0.001', '0.025']
Throughput (MB/s):
Serialization: ['42424.0', '59640.4', '1315.9']
Deserialization: ['31161.4', '30357.5', '1531.3']
Buffer sizes:
- numpy().tobytes(): 39.1 KB
- data_ptr(): 39.1 KB
- torch.save: 40.2 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (1000, 1000)
Number of iterations: 1000
Average times per operation (ms):
Serialization: ['0.178', '0.174', '0.960']
Deserialization: ['0.172', '0.174', '0.199']
Throughput (MB/s):
Serialization: ['21479.6', '21910.3', '3972.7']
Deserialization: ['22117.4', '21879.9', '19125.0']
Buffer sizes:
- numpy().tobytes(): 3906.2 KB
- data_ptr(): 3906.2 KB
- torch.save: 3907.4 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
Tensor shape: (5000, 5000)
Number of iterations: 100
Average times per operation (ms):
Serialization: ['4.664', '4.730', '24.671']
Deserialization: ['4.846', '4.943', '5.403']
Throughput (MB/s):
Serialization: ['20446.0', '20163.4', '3865.6']
Deserialization: ['19679.6', '19293.1', '17649.9']
Buffer sizes:
- numpy().tobytes(): 97656.2 KB
- data_ptr(): 97656.2 KB
- torch.save: 97657.4 KB
Correctness check (max absolute difference):
- numpy().tobytes(): 0.000e+00
- data_ptr(): 0.000e+00
- torch.save: 0.000e+00
import torch
import numpy as np
from io import BytesIO
import timeit
import ctypes
import gc
# Method 1: numpy().tobytes()
def tensor_to_buffer_numpy(tensor: torch.Tensor) -> bytes:
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    return tensor.numpy().tobytes()


def buffer_to_tensor_numpy(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    # Respect the dtype argument instead of hard-coding float32
    np_dtype = torch.empty((), dtype=dtype).numpy().dtype
    array = np.frombuffer(buffer, dtype=np_dtype).copy()
    return torch.from_numpy(array.reshape(shape))
# Method 2: data_ptr()
def tensor_to_buffer_ptr(tensor: torch.Tensor) -> bytes:
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    # data_ptr() exposes raw memory, which only matches the logical layout
    # for contiguous tensors
    tensor = tensor.contiguous()
    nbytes = tensor.nelement() * tensor.element_size()
    ptr = tensor.data_ptr()
    return ctypes.string_at(ptr, nbytes)


def buffer_to_tensor_ptr(buffer: bytes, shape: tuple[int, ...], dtype: torch.dtype) -> torch.Tensor:
    # bytearray yields a writable buffer, which torch.frombuffer prefers
    return torch.frombuffer(bytearray(buffer), dtype=dtype).reshape(shape)
# Method 3: torch.save with BytesIO
def tensor_to_buffer_save(tensor: torch.Tensor) -> bytes:
    buffer = BytesIO()
    torch.save(tensor, buffer)
    return buffer.getvalue()


def buffer_to_tensor_save(buffer: bytes) -> torch.Tensor:
    return torch.load(BytesIO(buffer))
def benchmark_serialization(
    tensor: torch.Tensor, number: int = 1000, warmup: int = 100
) -> tuple[float, float, float]:
    # Warmup
    for _ in range(warmup):
        tensor_to_buffer_numpy(tensor)
        tensor_to_buffer_ptr(tensor)
        tensor_to_buffer_save(tensor)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark tensor_to_buffer
    time_numpy = timeit.timeit(lambda: tensor_to_buffer_numpy(tensor), number=number)
    time_ptr = timeit.timeit(lambda: tensor_to_buffer_ptr(tensor), number=number)
    time_save = timeit.timeit(lambda: tensor_to_buffer_save(tensor), number=number)
    return time_numpy, time_ptr, time_save
def benchmark_deserialization(
    buffer_numpy: bytes,
    buffer_ptr: bytes,
    buffer_save: bytes,
    shape: tuple,
    dtype: torch.dtype,
    number: int = 1000,
    warmup: int = 100,
) -> tuple[float, float, float]:
    # Warmup
    for _ in range(warmup):
        buffer_to_tensor_numpy(buffer_numpy, shape, dtype)
        buffer_to_tensor_ptr(buffer_ptr, shape, dtype)
        buffer_to_tensor_save(buffer_save)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark buffer_to_tensor
    time_numpy = timeit.timeit(lambda: buffer_to_tensor_numpy(buffer_numpy, shape, dtype), number=number)
    time_ptr = timeit.timeit(lambda: buffer_to_tensor_ptr(buffer_ptr, shape, dtype), number=number)
    time_save = timeit.timeit(lambda: buffer_to_tensor_save(buffer_save), number=number)
    return time_numpy, time_ptr, time_save
# Run benchmarks with different sizes and iterations
sizes = [
    ((100,), 10000),       # Small tensor, many iterations
    ((1000,), 10000),      # Medium tensor, many iterations
    ((100, 100), 5000),    # Medium tensor, medium iterations
    ((1000, 1000), 1000),  # Large tensor, fewer iterations
    ((5000, 5000), 100),   # Very large tensor, few iterations
]

print("Benchmarking serialization methods...")
print("Format: numpy().tobytes() / data_ptr() / torch.save")
print("-" * 60)

for size, iterations in sizes:
    x = torch.randn(size, dtype=torch.float32)
    print(f"\nTensor shape: {size}")
    print(f"Number of iterations: {iterations}")

    # Get buffers for deserialization benchmark
    buffer_numpy = tensor_to_buffer_numpy(x)
    buffer_ptr = tensor_to_buffer_ptr(x)
    buffer_save = tensor_to_buffer_save(x)

    # Run benchmarks
    ser_times = benchmark_serialization(x, iterations)
    deser_times = benchmark_deserialization(buffer_numpy, buffer_ptr, buffer_save, x.shape, x.dtype, iterations)

    # Print results
    print("\nAverage times per operation (ms):")
    print(f"Serialization: {[f'{t*1000/iterations:.3f}' for t in ser_times]}")
    print(f"Deserialization: {[f'{t*1000/iterations:.3f}' for t in deser_times]}")

    # Calculate throughput in MB/s
    tensor_size_mb = x.nelement() * x.element_size() / (1024 * 1024)
    throughputs_ser = [tensor_size_mb / (t / iterations) for t in ser_times]
    throughputs_deser = [tensor_size_mb / (t / iterations) for t in deser_times]
    print("\nThroughput (MB/s):")
    print(f"Serialization: {[f'{t:.1f}' for t in throughputs_ser]}")
    print(f"Deserialization: {[f'{t:.1f}' for t in throughputs_deser]}")

    # Compare buffer sizes
    print("\nBuffer sizes:")
    print(f"- numpy().tobytes(): {len(buffer_numpy)/1024:.1f} KB")
    print(f"- data_ptr(): {len(buffer_ptr)/1024:.1f} KB")
    print(f"- torch.save: {len(buffer_save)/1024:.1f} KB")

    # Verify correctness
    restored_numpy = buffer_to_tensor_numpy(buffer_numpy, x.shape, x.dtype)
    restored_ptr = buffer_to_tensor_ptr(buffer_ptr, x.shape, x.dtype)
    restored_save = buffer_to_tensor_save(buffer_save)
    print("\nCorrectness check (max absolute difference):")
    print(f"- numpy().tobytes(): {torch.max(torch.abs(x - restored_numpy)).item():.3e}")
    print(f"- data_ptr(): {torch.max(torch.abs(x - restored_ptr)).item():.3e}")
    print(f"- torch.save: {torch.max(torch.abs(x - restored_save)).item():.3e}")

    # Clean up
    del x, restored_numpy, restored_ptr, restored_save
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
Analysis of Results

  1. Buffer sizes:
  • numpy().tobytes() and data_ptr() produce the same size (the raw data size)
  • torch.save carries noticeable overhead for small tensors (about 4x the raw size); see the sketch after this list
  • For a very large tensor (5000x5000) the overhead is negligible (<0.001%)
  2. Serialization performance:
  • data_ptr() is fastest for small tensors (up to around 100x100)
  • numpy().tobytes() and data_ptr() are comparable for large tensors (1000x1000 and up)
  • torch.save is always slower, most markedly for small tensors
  3. Deserialization performance:
  • All three methods are comparable for small to medium tensors
  • For large tensors, numpy().tobytes() and data_ptr() are comparable, and torch.save holds up well
  • torch.save deserialization is surprisingly fast compared with its serialization
  4. Throughput:
  • data_ptr() peaks at about 60 GB/s on small tensors
  • All methods settle at roughly 20 GB/s on large tensors
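
The fixed overhead that torch.save adds can be checked directly. The following is a minimal sketch, not part of the original benchmark, that subtracts the raw payload size from the serialized size; the exact byte counts depend on the PyTorch version and its zip-based container format.

import io

import torch

# Minimal sketch: measure the fixed container/metadata overhead that
# torch.save adds on top of the raw tensor payload.
for n in (100, 1000, 1_000_000):
    t = torch.randn(n, dtype=torch.float32)
    buf = io.BytesIO()
    torch.save(t, buf)
    raw = t.nelement() * t.element_size()  # raw payload in bytes
    overhead = len(buf.getvalue()) - raw   # container + metadata bytes
    print(f"n={n}: payload {raw} B, overhead {overhead} B ({overhead / raw:.2%})")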

Recommendations

  • For plain tensor conversion, data_ptr() is the best fit (simple and fast)
  • Use torch.save when metadata (dtype, device, etc.) has to travel with the data; a raw-bytes alternative is sketched below
  • Use numpy().tobytes() when round-tripping with numpy arrays is required
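
If the speed of raw bytes is attractive but shape and dtype still need to travel with the payload, one option is to pack a small header by hand. pack_tensor and unpack_tensor below are hypothetical helpers, not part of this gist, and assume CPU tensors with one of the listed dtypes.

import ctypes
import struct

import torch

# Hypothetical helpers (not from the gist): prepend a struct-packed header
# (dtype code, ndim, shape) to the raw data_ptr() payload.
_DTYPE_CODES = {torch.float32: 0, torch.float64: 1, torch.int64: 2}
_CODE_DTYPES = {v: k for k, v in _DTYPE_CODES.items()}

def pack_tensor(t: torch.Tensor) -> bytes:
    t = t.detach().cpu().contiguous()
    header = struct.pack(f"<BB{t.dim()}q", _DTYPE_CODES[t.dtype], t.dim(), *t.shape)
    return header + ctypes.string_at(t.data_ptr(), t.nelement() * t.element_size())

def unpack_tensor(buf: bytes) -> torch.Tensor:
    code, ndim = struct.unpack_from("<BB", buf)
    shape = struct.unpack_from(f"<{ndim}q", buf, 2)
    payload = bytearray(buf[2 + 8 * ndim:])
    return torch.frombuffer(payload, dtype=_CODE_DTYPES[code]).reshape(shape)

x = torch.randn(3, 4)
assert torch.equal(x, unpack_tensor(pack_tensor(x)))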
