Last active
January 27, 2025 21:01
-
-
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A quick benchmark comparing the performance of: | |
- msgspec: https://github.com/jcrist/msgspec | |
- pydantic V1: https://docs.pydantic.dev/1.10/ | |
- pydantic V2: https://docs.pydantic.dev/dev-v2/ | |
The benchmark is modified from the one in the msgspec repo here: | |
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py | |
I make no claims that it's illustrative of all use cases. I wrote this up | |
mostly to get an understanding of how msgspec's performance compares with that | |
of pydantic V2. | |
""" | |
from __future__ import annotations | |
import datetime | |
import random | |
import string | |
import timeit | |
import uuid | |
from typing import List, Literal, Union, Annotated | |
import msgspec | |
import pydantic | |
import pydantic.v1 | |
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    The tree is made of plain dicts, lists, strings and ints, so it can be fed
    to any of the validation libraries being benchmarked.

    Args:
        capacity: Number of nodes (files plus directories, root included) to
            generate. For ``capacity >= 1`` the returned tree contains exactly
            ``capacity`` nodes; smaller values yield just an empty root.

    Returns:
        The root directory node. Every node is a dict with the keys ``type``
        (``"directory"`` or ``"file"``), ``name``, ``created_by``,
        ``created_at`` and ``updated_at`` (ISO 8601 strings in UTC).
        Directories additionally have ``contents`` (a list of child nodes),
        files have ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    # NOTE: uuid4() is not driven by ``rand``, so the ``created_by`` values
    # differ from run to run even though everything else is seeded.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(lo, hi):
        # randint() only accepts integers (floats raise TypeError on 3.12+),
        # so truncate the float timestamps explicitly.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Build the datetime directly in UTC. Going through local time and
        # then relabelling it as UTC would shift the value by the machine's
        # UTC offset and could move it outside the [lo, hi] range.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(lo=None, hi=None):
        # With both bounds given the length is drawn from [lo, hi];
        # with only ``lo`` the length is exactly ``lo``.
        if hi is not None:
            lo = rand.randint(lo, hi)
        return "".join(rand.choices(string.ascii_letters, k=lo))

    def make_node(is_dir):
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget *before* creating
            # them, so nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself counts against the budget.
    capacity -= 1
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time JSON encoding and decoding for one library.

    Args:
        raw_data: Plain-Python input data.
        dumps: Callable turning the converted object into JSON.
        loads: Callable turning that JSON back into an object.
        convert: Callable turning ``raw_data`` into the library's typed object.

    Returns:
        A ``(dumps_time, loads_time)`` tuple with the average number of
        seconds per call, as measured by ``timeit.Timer.autorange``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)

    # Sanity check before timing anything: a round trip must be lossless.
    roundtrip = loads(encoded)
    assert typed == roundtrip
    del roundtrip

    per_call = []
    for func, arg in ((dumps, typed), (loads, encoded)):
        timer = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        loops, elapsed = timer.autorange()
        per_call.append(elapsed / loops)

    dumps_time, loads_time = per_call
    return dumps_time, loads_time
############################################################################# | |
# msgspec # | |
############################################################################# | |
class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node of the generated filesystem tree.

    Tagged ``"file"`` so it can be told apart from ``Directory`` inside the
    ``contents`` union.
    """

    # Must be non-empty (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be non-negative (ge=0).
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node of the generated filesystem tree.

    Tagged ``"directory"``; this is the root type the msgspec benchmark
    decodes into.
    """

    # Must be non-empty (min_length=1).
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the self-reference relies on postponed annotations
    # (``from __future__ import annotations`` at the top of the file).
    contents: List[Union[File, Directory]]
def bench_msgspec(data):
    """Run the benchmark with msgspec.

    A JSON encoder and a ``Directory``-typed decoder are created once up
    front and their bound methods are what gets timed.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )
############################################################################# | |
# pydantic V2 # | |
############################################################################# | |
class FileModel(pydantic.BaseModel):
    """pydantic V2 model for a file node of the generated filesystem tree."""

    # Constant tag; matches the "type" key of file nodes in the input data.
    type: Literal["file"] = "file"
    # Must be non-empty (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 model for a directory node; root type of the V2 benchmark."""

    # Constant tag; matches the "type" key of directory nodes in the input.
    type: Literal["directory"] = "directory"
    # Must be non-empty (min_length=1).
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the self-reference relies on postponed annotations
    # (``from __future__ import annotations`` at the top of the file).
    contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Run the benchmark with pydantic V2 models.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        return model.model_dump_json()

    def from_dict(raw):
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.model_validate_json, from_dict)
############################################################################# | |
# pydantic V1 # | |
############################################################################# | |
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 (``pydantic.v1``) model for a file node of the generated tree."""

    # Constant tag; matches the "type" key of file nodes in the input data.
    type: Literal["file"] = "file"
    # Must be non-empty (min_length=1).
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a directory node; root type of the V1 benchmark."""

    # Constant tag; matches the "type" key of directory nodes in the input.
    type: Literal["directory"] = "directory"
    # Must be non-empty (min_length=1).
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Child nodes; the self-reference relies on postponed annotations
    # (``from __future__ import annotations`` at the top of the file).
    contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Run the benchmark with pydantic V1 (``pydantic.v1``) models.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        return model.json()

    def from_dict(raw):
        return DirectoryModelV1(**raw)

    return bench(data, to_json, DirectoryModelV1.parse_raw, from_dict)
if __name__ == "__main__":
    # Every library is benchmarked on the same generated tree.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first: its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads

    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    print(f"dumps: {ms_dumps * 1e6:.1f} us")
    print(f"loads: {ms_loads * 1e6:.1f} us")
    print(f"total: {ms_total * 1e6:.1f} us")

    contenders = (
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    )
    for title, func in contenders:
        # Print the header before benchmarking so progress is visible.
        print()
        print(title, "-" * len(title), sep="\n")
        dumps, loads = func(data)
        total = dumps + loads
        print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
        print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
        print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
Updated results with python 3.12 and latest available versions of pydantic and msgspec:
msgspec 0.18.6
--------------
dumps: 178.8 us
loads: 509.6 us
total: 688.4 us
pydantic 2.9.2
--------------
dumps: 9064.2 us (50.7x slower)
loads: 10563.7 us (20.7x slower)
total: 19627.9 us (28.5x slower)
pydantic 1.10.18
----------------
dumps: 13753.4 us (76.9x slower)
loads: 53922.3 us (105.8x slower)
total: 67675.7 us (98.3x slower)
I tested this myself and did not see a 10x+ difference.
In my test, msgspec is about 70% faster than pydantic.
# Stand-alone benchmark: decode one small JSON order into plain dataclasses,
# once with pydantic's TypeAdapter and once with msgspec's JSON decoder.
import json
from dataclasses import dataclass
from time import perf_counter
from typing import List, Optional

from msgspec.json import decode
from pydantic import TypeAdapter


# Step 1: Target types, defined as standard-library dataclasses
@dataclass
class Item:
    product_id: int
    name: str
    quantity: int
    price: float


@dataclass
class Order:
    order_id: str
    customer_name: str
    customer_email: str
    items: List[Item]
    shipping_address: str
    payment_status: str
    total_amount: float
    discount: Optional[float] = 0.0


# Step 2: Example body data in dictionary format
order_data = {
    "order_id": "ORD12345",
    "customer_name": "Jane Doe",
    # Placeholder address: the original value was redacted by the hosting page.
    "customer_email": "jane.doe@example.com",
    "items": [
        {"product_id": 101, "name": "Laptop", "quantity": 1, "price": 1200.00},
        {"product_id": 102, "name": "Mouse", "quantity": 2, "price": 25.50},
        {"product_id": 103, "name": "Keyboard", "quantity": 1, "price": 75.75},
    ],
    "shipping_address": "1234 Elm Street, Springfield, IL",
    "payment_status": "Paid",
    "total_amount": 1400.75,
    "discount": 100.0,  # optional discount
}
data = json.dumps(order_data).encode()

rounds = 10000
# Build the adapter once, outside the timed loop.
order_adapter = TypeAdapter(Order)

# Time pydantic: validate the JSON bytes straight into an Order.
p1 = perf_counter()
for _ in range(rounds):
    porder = order_adapter.validate_json(data)
p2 = perf_counter()
r1 = round(p2 - p1, 6)

# Time msgspec on the same bytes and the same target type.
p1 = perf_counter()
for _ in range(rounds):
    morder = decode(data, type=Order)
p2 = perf_counter()
r2 = round(p2 - p1, 6)

print(f"pydantic costs {r1} seconds")
print(f"msgspec costs {r2} seconds")
print(f"pydantic is {round(r1/r2,3)}x slower")
pydantic costs 0.023508 seconds
msgspec costs 0.013425 seconds
pydantic is 1.751x slower
PYDANTIC_VERSION = '2.10.6'
MSGSPEC_VERSION = '0.19.0'
========== Update =========
When msgspec.Struct and pydantic.BaseModel are used to define the models instead of dataclasses, msgspec is much more performant than pydantic.
Interestingly, it is even faster to use a TypeAdapter(list[dataclasses.dataclass])
than a TypeAdapter(list[pydantic.BaseModel]).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Created another benchmark that uses custom types
https://gist.github.com/nrbnlulu/e983ab23bed5806cff5bb8ba97434d6d
results are quite surprising