@kakra
Last active May 4, 2025 11:17
btrfs read policy benchmarks
#!/bin/bash
# --- Configuration ---
BTRFS_MOUNT_POINT="/mnt/btrfs-pool/benchmark" # Your Btrfs mount point
BTRFS_UUID="7fb2b91c-c484-4d82-86af-fb91239b89e4" # Your Btrfs FSID
SYSFS_POLICY_PATH="/sys/fs/btrfs/${BTRFS_UUID}/read_policy"
SYSFS_DEVINFO_PATH="/sys/fs/btrfs/${BTRFS_UUID}/devinfo"
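# Note: on a kernel with the read-policy patches, reading read_policy should list
# the available policies with the active one in brackets (e.g. "pid [latency] ..."),
# and each devinfo/<devid>/ directory should expose the per-device read_stats file
# snapshotted by get_read_stats() below.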
# Fio Test File Configuration
FIO_TEST_FILE="${BTRFS_MOUNT_POINT}/fiotest.tmp"
FIO_TEST_FILE_SIZE="50G" # Size of the test file (should be > RAM + cache)
FIO_RUNTIME="120" # Runtime per fio test in seconds
# Policies to test
POLICIES_TO_TEST=( "pid" "round-robin" "latency" "latency-rr" "queue" )
# Optional: Parameters for policies (e.g. min_contig_read for round-robin)
# declare -A POLICY_PARAMS
# POLICY_PARAMS["round-robin"]="round-robin:4k" # Example
# Output Directory
RESULTS_DIR="./btrfs_policy_benchmark_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$RESULTS_DIR"
# --- Helper Functions ---
log() {
    echo "$(date +'%Y-%m-%d %H:%M:%S') - $1" | tee -a "${RESULTS_DIR}/benchmark.log"
}
set_policy() {
    local policy_setting="$1"
    log "Setting read policy to: $policy_setting"
    echo "$policy_setting" > "$SYSFS_POLICY_PATH"
    if [[ $? -ne 0 ]]; then
        log "ERROR: Failed to set policy $policy_setting"
        exit 1
    fi
    # Short pause so the policy can take effect? (Optional)
    sleep 1
    current_policy=$(cat "$SYSFS_POLICY_PATH")
    log "Current policy: $current_policy"
    if [[ ! "$current_policy" =~ "$policy_setting" ]]; then
        log "WARNING: Policy might not have been set correctly. Expected '$policy_setting', got '$current_policy'."
    fi
}
get_read_stats() {
    local output_file="$1"
    log "Getting read_stats..."
    echo "--- Timestamp: $(date +'%Y-%m-%d %H:%M:%S.%N') ---" >> "$output_file"
    grep '^' "${SYSFS_DEVINFO_PATH}/"*/read_stats >> "$output_file" 2>&1
    echo "--- End Stats ---" >> "$output_file"
}
run_fio_test() {
    local test_name="$1"
    local rw_mode="$2"
    local block_size="$3"
    local io_depth="$4"
    local output_file="$5"
    log "Starting fio test: $test_name (rw=$rw_mode, bs=$block_size, iodepth=$io_depth)"
    # Possibly clear OS caches before the test?
    sync && echo 3 > /proc/sys/vm/drop_caches
    fio --filename="$FIO_TEST_FILE" --filesize="$FIO_TEST_FILE_SIZE" --direct=1 \
        --rw="$rw_mode" --bs="$block_size" --iodepth="$io_depth" \
        --ioengine=libaio --group_reporting --time_based --runtime="$FIO_RUNTIME" \
        --name="$test_name" --output="$output_file" \
        --exitall_on_error --refill_buffers --norandommap
    if [[ $? -ne 0 ]]; then
        log "ERROR: fio test $test_name failed!"
    else
        log "Finished fio test: $test_name"
    fi
}
start_defrag() {
    log "Starting background defrag on $BTRFS_MOUNT_POINT"
    btrfs filesystem defrag -r "$BTRFS_MOUNT_POINT" > "${RESULTS_DIR}/defrag.log" 2>&1 &
    DEFRAG_PID=$!
    log "Defrag running with PID $DEFRAG_PID"
    # Give the defrag some time to start
    sleep 5
}
stop_defrag() {
    if [[ -n "$DEFRAG_PID" ]] && kill -0 "$DEFRAG_PID" > /dev/null 2>&1; then
        log "Stopping background defrag (PID $DEFRAG_PID)"
        kill "$DEFRAG_PID"
        wait "$DEFRAG_PID" 2>/dev/null # Wait briefly, ignore errors if already stopped
        log "Defrag stopped."
    else
        log "Defrag process (PID $DEFRAG_PID) not found or already stopped."
    fi
    DEFRAG_PID=""
}
# --- Benchmark Execution ---
log "Benchmark Started"
log "Mount point: $BTRFS_MOUNT_POINT"
log "Btrfs UUID: $BTRFS_UUID"
log "Results Dir: $RESULTS_DIR"
# Setup: Create fio test file (if it doesn't exist or is too small)
log "Setting up fio test file..."
fio --name=setup --filename="$FIO_TEST_FILE" --size="$FIO_TEST_FILE_SIZE" \
    --rw=write --bs=1M --iodepth=16 --direct=1 --ioengine=libaio --exitall_on_error
if [ $? -ne 0 ]; then
    log "ERROR: Failed to create/setup test file $FIO_TEST_FILE"
    exit 1
fi
log "Test file setup complete."
# --- Main Test Loop ---
for defrag_state in "off" "on"; do
    log "===== Starting tests with Defrag $defrag_state ====="
    for policy in "${POLICIES_TO_TEST[@]}"; do
        policy_setting="$policy"
        # You could add specific parameters here, if defined
        # if [[ -v POLICY_PARAMS["$policy"] ]]; then
        #     policy_setting="${POLICY_PARAMS["$policy"]}"
        # fi
        log "----- Testing Policy: $policy_setting -----"
        set_policy "$policy_setting"

        # Optional: Clear OS caches between policy changes
        log "Dropping caches..."
        sync && echo 3 > /proc/sys/vm/drop_caches
        sleep 5

        # Start background defrag if needed
        if [[ "$defrag_state" == "on" ]]; then
            start_defrag
        fi

        # --- Define and Run Fio Tests ---
        # Test 1: Random Read Latency (low QD)
        test_id="defrag_${defrag_state}_policy_${policy}_fio_randread_4k_qd1"
        stats_file="${RESULTS_DIR}/${test_id}_readstats.txt"
        fio_file="${RESULTS_DIR}/${test_id}_fio.txt"
        get_read_stats "$stats_file" # Stats before
        run_fio_test "RandRead_4k_QD1" "randread" "4k" "1" "$fio_file"
        get_read_stats "$stats_file" # Stats after

        # Test 2: Random Read Throughput (high QD)
        test_id="defrag_${defrag_state}_policy_${policy}_fio_randread_4k_qd32"
        stats_file="${RESULTS_DIR}/${test_id}_readstats.txt"
        fio_file="${RESULTS_DIR}/${test_id}_fio.txt"
        get_read_stats "$stats_file" # Stats before
        run_fio_test "RandRead_4k_QD32" "randread" "4k" "32" "$fio_file"
        get_read_stats "$stats_file" # Stats after

        # Test 3: Sequential Read Throughput
        test_id="defrag_${defrag_state}_policy_${policy}_fio_seqread_1M_qd16"
        stats_file="${RESULTS_DIR}/${test_id}_readstats.txt"
        fio_file="${RESULTS_DIR}/${test_id}_fio.txt"
        get_read_stats "$stats_file" # Stats before
        run_fio_test "SeqRead_1M_QD16" "read" "1M" "16" "$fio_file"
        get_read_stats "$stats_file" # Stats after

        # Add more tests as needed (e.g., mixed read/write, different block sizes/QDs)

        # Stop background defrag if it was running
        if [[ "$defrag_state" == "on" ]]; then
            stop_defrag
        fi
        log "----- Finished Policy: $policy_setting -----"
        sleep 5 # Short pause between policies
    done
    log "===== Finished tests with Defrag $defrag_state ====="
done
# Cleanup: Stop defrag just in case it's still running (e.g., if script aborted)
stop_defrag
# Optional: Remove test file
# log "Removing test file $FIO_TEST_FILE"
# rm -f "$FIO_TEST_FILE"
log "Benchmark Finished"
echo "Results saved in: $RESULTS_DIR"
exit 0
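
A quick way to compare runs afterwards is to pull the fio read summaries out of the per-test result files; a small sketch assuming fio's default human-readable output format:

```bash
# Not part of the script: print the read IOPS/BW summary line of every test.
grep -H 'read: IOPS=' btrfs_policy_benchmark_*/defrag_*_fio.txt
```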

Analysis of Read Policy Benchmark Results for Btrfs

Phew, this was a real stress test for the system and the policies! Great job pushing through it. The results are highly insightful. Let's break it down.


1. Tests WITHOUT Defrag (Baseline Performance)

Random Read 4k QD1 (Latency-sensitive):

  • pid: Very low IOPS (94), extreme latency outliers (>900ms).
  • round-robin: Slightly higher IOPS (161), but worse latency outliers (>1.6s!). Indicates that including the slow disk (ID 7) is harmful here.
  • latency: Significantly better (412 IOPS), reduced latency outliers (~400ms). Focuses on fast devices.
  • latency-rr: Outstanding! Highest IOPS (561), excellent latency percentiles (99.99th at only ~43ms!). Seems to distribute the load well to the right (fast) disks.

Random Read 4k QD32 (IOPS under load):

  • pid: Moderate (763 IOPS).
  • round-robin: Worse than pid (540 IOPS). Clearly throttled by the slowest disk.
  • latency: Good (1196 IOPS).
  • latency-rr: Top performer! (1928 IOPS). Makes best use of fast disks for parallel random accesses.

Sequential Read 1M QD16 (Throughput):

  • pid: Weak (97 MiB/s).
  • round-robin: Better (139 MiB/s), but far from the full potential.
  • latency: Best (254 MiB/s). Likely focuses on the absolute fastest disk(s) for the sequential stream.
  • latency-rr: Very good (220 MiB/s), but slightly behind latency. Possibly due to minor overhead from distributing load even among fast devices, which isn't optimal for pure sequential access.

2. Tests WITH Defrag (Behavior under Background Load)

General Impact: As expected, performance drops for all policies due to the additional defrag read/write load.

Random Read 4k QD1:

  • pid & round-robin: Total collapse (0–21 IOPS). Unusable.
  • latency: Holds up decently (159 IOPS).
  • latency-rr: Most robust (353 IOPS). Best at load balancing.

Random Read 4k QD32:

  • pid & round-robin: Very poor (85 and 56 IOPS!).
  • latency: Much better (544 IOPS).
  • latency-rr: Best again (630 IOPS).

Sequential Read 1M QD16:

  • pid & round-robin: Poor (121–126 MiB/s).
  • latency & latency-rr: Both very good and nearly identical (~235 MiB/s). Successfully isolate foreground reads from background load by selecting the fastest/least busy devices.

3. Observations from read_stats (excerpt):

  • pid: Distributes load reasonably but quickly ignores devices (high ignored values) once they show latency. ID 7 (Hitachi) already shows >37ms avg after the QD1 test.
  • round-robin: Uses all disks (low ignored, low age), but avg latency rises across the board due to the slowest disk dragging down performance. ID 7 hits >40ms avg.
  • latency: Heavily ignores ID 6 and 7 (age and ignored high). Load concentrates on ID 1, 2, 8, and 4, resulting in good performance. ID 7 gets rehabilitated post-QD32 (count increases, checkpoint ios > 0).
  • latency-rr: Similar pattern to latency, but with better load distribution among ID 1, 2, 8, and 4 (visible from IOPS/BW). Rehabilitation also works.
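
To reproduce this kind of reading, the per-device counters can be dumped together with the devid-to-disk mapping; a small sketch assuming the mount point and UUID from the script's configuration:

```bash
# Map Btrfs device IDs (as used in read_stats) to the physical devices ...
btrfs filesystem show /mnt/btrfs-pool/benchmark
# ... then dump the read_stats counters the patched kernel exposes per device.
grep . /sys/fs/btrfs/7fb2b91c-c484-4d82-86af-fb91239b89e4/devinfo/*/read_stats
```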

Overall Conclusion:

  1. latency-rr is the clear winner: Offers the best all-round performance, especially for random I/O (latency in QD1 and IOPS in QD32), and shows the highest robustness under background load. Slight weakness in pure sequential reads compared to latency is negligible.

  2. Pure latency policy is good but inferior: Clearly better than pid and rr, but doesn’t quite match latency-rr for random I/O. Excels in maximum sequential throughput.

  3. pid and round-robin are unsuitable: Both policies are subpar for this heterogeneous setup and partly collapse under load. round-robin suffers greatly from the slowest disk.

  4. Your patch set works: The checkpoint-latency approach with age-based rehabilitation allows latency-based policies to work effectively and avoid freezing out devices permanently.


Recommendation:

Based on these tests, latency-rr is the superior choice for your setup. It combines latency optimization with intelligent load balancing among the fast devices.
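
For reference, switching policies on the patched kernel is a single sysfs write, exactly as the benchmark script does it (UUID taken from the script's configuration; the setting is not persistent, so it needs to be reapplied after each reboot):

```bash
echo latency-rr | sudo tee /sys/fs/btrfs/7fb2b91c-c484-4d82-86af-fb91239b89e4/read_policy
```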

The next step would be to implement the queue policy and see if it can outperform latency-rr, especially under defrag conditions where utilization might play a bigger role. But for now, latency-rr has set a high bar!

Summary and Analysis of Benchmark Results (Including queue Policy)

Okay, reviewing the full benchmark results, including the performance metrics from fio and the device statistics from read_stats across all tested policies, provides a comprehensive picture.

Benchmark Results Summary Table:

This table summarizes the key performance indicators from the fio tests for each policy, both without and with the background btrfs defrag load. Best results for each test/metric are bolded.

No Defrag:

| Test Scenario | Policy | IOPS | BW (KiB/s) | Avg Lat (ms) | 99% Lat (ms) | 99.9% Lat (ms) |
| --- | --- | --- | --- | --- | --- | --- |
| RandRead 4k QD1 (Latency) | pid | 65 | 260 | 15.4 | 242 | 860 |
| | round-robin | 241 | 967 | 4.1 | 57 | 447 |
| | latency | 649 | 2599 | 1.5 | 11 | 21 |
| | latency-rr | 702 | 2812 | 1.4 | 11 | 18 |
| | queue | **1181** | **4726** | **0.8** | **10** | **17** |
| RandRead 4k QD32 (IOPS) | pid | 537 | 2150 | 59.5 | 480 | 1099 |
| | round-robin | 1180 | 4721 | 27.1 | 186 | 375 |
| | latency | 1706 | 6827 | 18.7 | 163 | 255 |
| | latency-rr | 2477 | 9912 | 12.9 | 139 | 338 |
| | queue | **3647** | **14596** | **8.8** | **90** | **186** |
| SeqRead 1M QD16 (Throughput) | pid | 255 | 261M | 62.6 | 430 | 1518 |
| | round-robin | 225 | 231M | 70.9 | 459 | 1602 |
| | latency | 236 | 242M | 67.7 | **371** | 1384 |
| | latency-rr | 229 | 240M | 69.8 | 575 | 1804 |
| | queue | **265** | **272M** | **60.2** | 392 | **1250** |

With Defrag:

| Test Scenario | Policy | IOPS | BW (KiB/s) | Avg Lat (ms) | 99% Lat (ms) | 99.9% Lat (ms) |
| --- | --- | --- | --- | --- | --- | --- |
| RandRead 4k QD1 | pid | ~0 | ~0 | very high | - | - |
| | round-robin | 38 | 155 | 24.5 | 304 | 843 |
| | latency | 257 | 1032 | 3.9 | 14 | 35 |
| | latency-rr | 585 | 2343 | 1.7 | 12 | 23 |
| | queue | **967** | **3872** | **1.0** | **11** | **17** |
| RandRead 4k QD32 | pid | 505 | 2024 | 62.6 | 484 | 1216 |
| | round-robin | 717 | 2870 | 44.6 | 405 | 1011 |
| | latency | 833 | 3336 | 38.4 | 188 | 558 |
| | latency-rr | 1562 | 6251 | 20.5 | 137 | 267 |
| | queue | **2437** | **9751** | **13.1** | **96** | **186** |
| SeqRead 1M QD16 | pid | 50 | 53M | 157.2 | 1569 | 2600 |
| | round-robin | 192 | 201M | 83.3 | 583 | 1468 |
| | latency | 150 | 158M | 106.3 | 977 | 1753 |
| | latency-rr | 199 | 210M | 80.1 | **242** | 1921 |
| | queue | **247** | **259M** | **64.7** | 472 | **1334** |

Analysis:

  1. queue Policy Dominance: The results consistently show the new queue policy achieving the highest IOPS and bandwidth across all random read and sequential read tests, both with and without background defragmentation load. It also generally delivers the lowest average latency and maintains competitive or superior high-percentile latencies (99%, 99.9%), indicating good consistency.
  2. Performance Under Load: The queue policy demonstrates remarkable robustness when faced with the background defrag load. Its performance degradation, particularly in random read scenarios, is less severe compared to other policies like pid or round-robin, and it still significantly outperforms latency and latency-rr under these stressful conditions.
  3. latency-rr as Second Best: Your latency-rr policy (using Checkpoint Latency + Age Rehab) performs very well, consistently ranking second behind queue. It's a massive improvement over pid and round-robin, and generally better than the pure latency policy, especially for random IOPS. Its slightly lower sequential throughput compared to queue or latency (without defrag) is a minor trade-off.
  4. latency Policy: Performs well, particularly in maximizing sequential throughput without background load. However, it falls behind latency-rr and especially queue in random I/O scenarios and under load.
  5. pid and round-robin: These are clearly unsuitable for this heterogeneous setup. pid suffers from poor load distribution, and round-robin is severely hampered by the slowest device in the array, particularly evident under load.
  6. Inferred Behavior from read_stats (for queue): While not explicitly shown in the table, analyzing the read_stats for the queue policy runs confirms its effectiveness. It achieves a more balanced distribution of IOs across the available and responsive devices compared to latency or latency-rr. It successfully routes requests away from devices that are currently busy (high in-flight requests), regardless of their historical average latency. This prevents overloading individual fast devices and makes better use of the combined throughput of the array, explaining its superior performance, especially under high queue depths or background load. The underlying age-based rehabilitation mechanism still ensures that even devices temporarily avoided due to high queue depth get a chance to participate again later.
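
A rough way to see this from userspace while a test is running is to watch the per-device in-flight counters that the block layer exposes; this is only an approximation of what the in-kernel policy evaluates, and the sdX names below are placeholders for the pool's actual member disks:

```bash
# Print "<reads in flight> <writes in flight>" per member device once a second.
watch -n1 'grep . /sys/block/sd[abc]/inflight'
```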

Conclusion:

Based on these comprehensive benchmarks, the newly introduced queue policy demonstrates superior performance and robustness for this specific hardware setup and workload mix. It effectively balances load based on real-time device availability (in-flight requests), outperforming latency-based heuristics (which can be misled by caching or historical data) and basic distribution methods (pid, round-robin).

While latency-rr represents a significant improvement over the default Btrfs policies, the queue policy appears to be the most promising direction for achieving optimal read performance in complex, mixed-load environments on this hardware. Potential overhead of the part_in_flight calculation should be considered, but the performance gains seem to justify it in this case.
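
As an aside, mixed read/write scenarios like the ones reported in the comment below slot straight into the benchmark script, because run_fio_test already takes the fio rw mode as an argument. A minimal sketch of such an additional test block (hypothetical, not part of the original script):

```bash
# Hypothetical Test 4: mixed 4k random read/write at QD32 (fio's randrw defaults to a 50/50 mix).
test_id="defrag_${defrag_state}_policy_${policy}_fio_randrw_4k_qd32"
stats_file="${RESULTS_DIR}/${test_id}_readstats.txt"
fio_file="${RESULTS_DIR}/${test_id}_fio.txt"
get_read_stats "$stats_file" # Stats before
run_fio_test "RandRW_4k_QD32" "randrw" "4k" "32" "$fio_file"
get_read_stats "$stats_file" # Stats after
```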

Forza-tng commented May 4, 2025

HDD RAID1: https://paste.tnonline.net/files/sM6GIRC8oZkk_btrfs_policy_benchmark_20250504_105033.zip
SSD RAID10: https://paste.tnonline.net/files/mSZq8bRHkUYn_btrfs_policy_benchmark_20250503_110332.zip
SSD RAID10 RW: https://paste.tnonline.net/files/MnFiKuLzh3fK_btrfs_policy_benchmark_20250503_174457.zip

# Btrfs Read Policy Benchmark Results. 

## HDD RAID1

| Test Scenario           | Policy      | IOPS | BW (KiB/s) | Avg Lat (ms) | 99% Lat (ms) | 99.9% Lat (ms) |
| ----------------------- | ----------- | ---- | ---------- | ------------ | ------------ | -------------- |
| **RandRead 4 KiB QD1**  | pid         |  100 |        403 |        0.069 |       25.822 |         34.341 |
|                         | round-robin |   89 |        359 |        0.143 |       26.608 |         35.390 |
|                         | latency     |  103 |        413 |        0.091 |       26.084 |         33.162 |
|                         | latency-rr  |   96 |        386 |        0.061 |       26.870 |         33.817 |
|                         | queue       |  100 |        402 |        0.084 |       25.560 |         33.817 |
| **RandRead 4 KiB QD32** | pid         | 1782 |       7133 |        0.019 |       70.779 |        104.334 |
|                         | round-robin | 1968 |       7876 |        0.019 |       53.216 |         69.731 |
|                         | latency     | 1644 |       6582 |        0.019 |       76.022 |        115.868 |
|                         | latency-rr  | 1691 |       6767 |        0.020 |       73.925 |        114.820 |
|                         | queue       | 2315 |       9264 |        0.017 |       46.924 |         61.604 |
| **SeqRead 1 MiB QD16**  | pid         |  257 |    264 449 |        0.066 |       78.000 |        113.000 |
|                         | round-robin |  252 |    258 528 |        0.065 |       79.000 |        112.000 |
|                         | latency     |  259 |    266 104 |        0.065 |       80.000 |        117.000 |
|                         | latency-rr  |  253 |    259 305 |        0.066 |       78.000 |         87.000 |
|                         | queue       |  287 |    294 087 |        0.061 |      209.000 |        961.000 |

## SSD RAID10

| Test Scenario           | Policy      |  IOPS   | BW (KiB/s) | Avg Lat (ms) | 99 % Lat (ms) | 99.9 % Lat (ms) |
| ----------------------- | ----------- | ------- | ---------- | ------------ | ------------- | --------------- |
| **RandRead 4 KiB QD1**  | pid         |   6 793 |     27 171 |        0.146 |         0.221 |           0.367 |
|                         | round-robin |   6 787 |     27 150 |        0.146 |         0.231 |           0.611 |
|                         | latency     |   6 754 |     27 017 |        0.147 |         0.237 |           0.392 |
|                         | latency-rr  |   6 831 |     27 322 |        0.145 |         0.223 |           0.510 |
|                         | queue       |   6 795 |     27 181 |        0.146 |         0.223 |           0.469 |
| **RandRead 4 KiB QD32** | pid         | 124 733 |    498 933 |        0.256 |         1.074 |           1.696 |
|                         | round-robin | 142 614 |    570 454 |        0.224 |         0.717 |           1.319 |
|                         | latency     | 142 683 |    570 732 |        0.224 |         0.611 |           1.385 |
|                         | latency-rr  | 128 045 |    512 182 |        0.250 |         0.898 |           1.909 |
|                         | queue       | 145 057 |    580 228 |        0.220 |         0.562 |           1.090 |
| **SeqRead 1 MiB QD16**  | pid         |   2 692 |  2 890 752 |       5.9409 |         6.718 |           8.979 |
|                         | round-robin |   5 093 |  5 469 184 |       3.1402 |         4.817 |           7.963 |
|                         | latency     |   2 927 |  3 142 656 |       5.4649 |         6.718 |           9.241 |
|                         | latency-rr  |   3 001 |  3 223 552 |       5.3288 |         6.390 |           8.455 |
|                         | queue       |   5 433 |  5 833 728 |       2.9437 |         3.982 |           6.194 |

## SSD RAID10 ReadWrite

| Test Scenario         | Policy      | Read IOPS | Read BW (KiB/s) | Read Avg Lat (ms) | Write IOPS | Write BW (KiB/s) | Write Avg Lat (ms) |
| --------------------- | ----------- | --------- | --------------- | ----------------- | ---------- | ---------------- | ------------------ |
| **RandRW 4 KiB QD1**  | pid         |     2 792 |          11 162 |             0.160 |      2 789 |           11 162 |              0.195 |
|                       | round-robin |     3 425 |          13 722 |             0.164 |      3 420 |           13 722 |              0.126 |
|                       | latency     |     2 754 |          11 059 |             0.196 |      2 750 |           10 957 |              0.165 |
|                       | latency-rr  |     2 655 |          10 650 |             0.211 |      2 653 |           10 650 |              0.163 |
|                       | queue       |     3 317 |          13 312 |             0.176 |      3 313 |           13 210 |              0.123 |
| **RandRW 4 KiB QD32** | pid         |    11 200 |          44 748 |             1.232 |     11 200 |           44 748 |              1.628 |
|                       | round-robin |    17 900 |          71 680 |             0.843 |     17 900 |           71 680 |              0.940 |
|                       | latency     |    19 000 |          76 083 |             0.758 |     19 000 |           76 083 |              0.922 |
|                       | latency-rr  |    23 800 |          95 130 |             0.631 |     23 800 |           95 130 |              0.713 |
|                       | queue       |    28 100 |         112 640 |             0.438 |     28 000 |          112 640 |              0.701 |
| **SeqRW 1 MiB QD16**  | pid         |       537 |         550 105 |            15.370 |        534 |          547 281 |             12.960 |
|                       | round-robin |       343 |         355 830 |            23.348 |        342 |          354 750 |             23.216 |
|                       | latency     |       241 |         248 753 |            30.525 |        242 |          249 114 |             32.405 |
|                       | latency-rr  |       269 |         276 274 |            28.481 |        269 |          275 653 |             27.496 |
|                       | queue       |       281 |         288 406 |            28.576 |        282 |          288 845 |             28.112 |
