```csharp
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace Polly.Utilities
{
    /// <summary>
    /// Defines operations for locks used by Polly policies.
    /// </summary>
    public interface ILockProviderAsync
    {
        /// <summary>
        /// Waits to acquire the lock.
        /// </summary>
        /// <param name="key">A string key being used by the execution.</param>
        /// <param name="context">The Polly execution context consuming this lock.</param>
        /// <param name="cancellationToken">A cancellation token to cancel waiting to acquire the lock.</param>
        /// <throws>OperationCanceledException, if the passed <paramref name="cancellationToken"/> is signaled before the lock is acquired.</throws>
        /// <throws>InvalidOperationException, invalid lock state</throws>
        ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken);
    }

    /// <summary>
    /// Defines operations for locks used by Polly policies.
    /// </summary>
    public interface ILockProvider
    {
        /// <summary>
        /// Waits to acquire the lock.
        /// </summary>
        /// <param name="key">A string key being used by the execution.</param>
        /// <param name="context">The Polly execution context consuming this lock.</param>
        /// <throws>InvalidOperationException, invalid lock state</throws>
        IDisposable AcquireLock(string key, Context context);
    }

    /// <summary>
    /// Lock provider that locks on a key per process. The locking mechanism is designed so that
    /// the lock can be acquired and released on different threads if needed.
    /// </summary>
    public class ProcessLockProviderAsync : ILockProviderAsync
    {
        // TODO: Pass via IOC or other method instead of hard-coding static
        internal static readonly int[] keyLocks = new int[1024];

        private class ProcessLockAsync : IDisposable
        {
            private uint hash;
            private bool gotLock;

            [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
            public void Dispose()
            {
                if (gotLock)
                {
                    gotLock = false;
                    ProcessLockProviderAsync.keyLocks[hash] = 0;
                }
                // else we do not care; this can be disposed in an error case and we simply ignore that the key locks were not touched
            }

            [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
            public async ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken)
            {
                // Monitor.Enter and Monitor.Exit are tied to a specific thread and are slower than
                // this spin lock, which does not care about threads and executes very quickly
                // regardless of lock contention.
                // https://stackoverflow.com/questions/11001760/monitor-enter-and-monitor-exit-in-different-threads
                // Hash the key to pick a specific int in the array to lock on. The array is small
                // enough not to use much memory, but large enough to avoid most collisions.
                // Even when a collision occurs, it is resolved very quickly.
                hash = (uint)key.GetHashCode() % (uint)ProcessLockProviderAsync.keyLocks.Length;

                // To get the lock, we must change the int at the hash index from 0 to 1.
                // CompareExchange swaps in 1 only if the existing value is 0, and returns the
                // original value; a return of 1 means another execution holds the lock and we
                // must retry. Contention here is very unlikely, and when it does happen the
                // spin cycle should be very short.
                while (!cancellationToken.IsCancellationRequested && Interlocked.CompareExchange(ref ProcessLockProviderAsync.keyLocks[hash], 1, 0) == 1)
                {
                    // yield, so we can get back and try to acquire the lock again very quickly
                    await Task.Yield();
                }
                if (cancellationToken.IsCancellationRequested)
                {
                    throw new OperationCanceledException(cancellationToken);
                }
                gotLock = true;
                return this;
            }
        }

        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
        public ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken)
        {
            return new ProcessLockAsync().AcquireLockAsync(key, context, cancellationToken);
        }
    }

    /// <summary>
    /// Lock provider that locks on a key per process. The locking mechanism is designed so that
    /// the lock can be acquired and released on different threads if needed.
    /// </summary>
    public class ProcessLockProvider : ILockProvider
    {
        private class ProcessLock : IDisposable
        {
            private uint hash;
            private bool gotLock;

            [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
            public void Dispose()
            {
                if (gotLock)
                {
                    gotLock = false;
                    ProcessLockProviderAsync.keyLocks[hash] = 0;
                }
                // If the constructor threw, the object was never created, so Dispose cannot run; if the
                // constructor succeeded, gotLock is always true. We still check gotLock to guard against
                // multiple Dispose calls.
            }

            [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
            public ProcessLock(string key, Context context)
            {
                hash = (uint)key.GetHashCode() % (uint)ProcessLockProviderAsync.keyLocks.Length;
                while (Interlocked.CompareExchange(ref ProcessLockProviderAsync.keyLocks[hash], 1, 0) == 1)
                {
                    // Thread.Yield, not Task.Yield().GetAwaiter().GetResult(): the latter is a
                    // no-op when called synchronously (see the discussion further down).
                    Thread.Yield();
                }
                gotLock = true;
            }
        }

        /// <inheritdoc />
        public IDisposable AcquireLock(string key, Context context)
        {
            return new ProcessLock(key, context);
        }
    }
}
```
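A hypothetical usage sketch of the sync provider (it assumes a Polly `Context` instance named `context` is in scope; the key is illustrative):

```csharp
ILockProvider provider = new ProcessLockProvider();

// The IDisposable returned by AcquireLock releases the key's slot on Dispose,
// so the critical section is simply a using block.
using (provider.AcquireLock("customer:12345", context))
{
    // At most one execution per key hash slot runs here at a time.
}
```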
Test using Interlocked.CompareExchange for LockPerKey
I tested both values of LockPerKey, with and without Thread.Yield... short answer: Thread.Yield is necessary :)
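For clarity, these are the two spin-loop variants being compared (a sketch; `keyLock` is an illustrative field standing in for one slot of the lock array):

```csharp
using System.Threading;

static class SpinVariants
{
    private static int keyLock; // 0 = free, 1 = held

    static void AcquireWithYield()
    {
        // On contention, give up the time slice so the lock holder can make progress.
        while (Interlocked.CompareExchange(ref keyLock, 1, 0) == 1) { Thread.Yield(); }
    }

    static void AcquireBusySpin()
    {
        // Pure busy-spin: burns a core under contention and can starve the lock holder.
        while (Interlocked.CompareExchange(ref keyLock, 1, 0) == 1) { }
    }
}
```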
// * Summary *
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
AMD Ryzen Threadripper 1950X, 1 CPU, 32 logical and 16 physical cores
.NET Core SDK=3.1.100
[Host] : .NET Core 3.1.0 (CoreCLR 4.700.19.56402, CoreFX 4.700.19.56404), X64 RyuJIT
Contention100 : .NET Core 3.1.0 (CoreCLR 4.700.19.56402, CoreFX 4.700.19.56404), X64 RyuJIT
Job=Contention100 IterationCount=50 LaunchCount=1
WarmupCount=3
| Method | Work | DifferentKeys | LockPerKey | Mean | Error | StdDev | Median |
|------- |------------ |-------------- |----------- |-----------:|----------:|----------:|-----------:|
| PCLOCD | NoWork | False | False | 405.2 us | 1.98 us | 3.94 us | 405.8 us |
| PCLOCD | NoWork | False | True | 399.6 us | 9.53 us | 18.81 us | 401.7 us |
| PCLOCD | NoWork | True | False | 416.2 us | 2.07 us | 4.18 us | 415.9 us |
| PCLOCD | NoWork | True | True | 402.7 us | 10.22 us | 20.64 us | 407.1 us |
| PCLOCD | NoExtraLock | False | False | 402.9 us | 2.05 us | 4.10 us | 402.9 us |
| PCLOCD | NoExtraLock | False | True | 401.0 us | 17.20 us | 34.35 us | 408.2 us |
| PCLOCD | NoExtraLock | True | False | 450.3 us | 1.22 us | 2.43 us | 449.9 us |
| PCLOCD | NoExtraLock | True | True | 433.2 us | 1.60 us | 3.23 us | 433.6 us |
| PCLOCD | ExtraLock | False | False | 489.6 us | 48.57 us | 97.00 us | 438.3 us |
| PCLOCD | ExtraLock | False | True | 464.0 us | 42.35 us | 83.59 us | 417.1 us |
| PCLOCD | ExtraLock | True | False | 3,193.8 us | 427.48 us | 823.60 us | 3,067.3 us |
| PCLOCD | ExtraLock | True | True | 477.8 us | 1.89 us | 3.81 us | 477.7 us |
// * Warnings *
MultimodalDistribution
Benchmarks.PCLOCD: Contention100 -> It seems that the distribution is bimodal (mValue = 4.18)
MinIterationTime
Benchmarks.PCLOCD: Contention100 -> The minimum observed iteration time is 1.7141 ms which is very small. It's recommended to increase it.
// * Hints *
Outliers
Benchmarks.PCLOCD: Contention100 -> 1 outlier was removed, 3 outliers were detected (393.07 us, 395.51 us, 414.95 us)
Benchmarks.PCLOCD: Contention100 -> 2 outliers were removed, 4 outliers were detected (276.70 us, 386.52 us, 417.56 us, 418.56 us)
Benchmarks.PCLOCD: Contention100 -> 3 outliers were detected (305.68 us..362.29 us)
Benchmarks.PCLOCD: Contention100 -> 1 outlier was removed (414.74 us)
Benchmarks.PCLOCD: Contention100 -> 1 outlier was removed, 5 outliers were detected (193.38 us..393.40 us, 419.50 us)
Benchmarks.PCLOCD: Contention100 -> 1 outlier was removed, 2 outliers were detected (443.79 us, 458.09 us)
Benchmarks.PCLOCD: Contention100 -> 1 outlier was detected (422.52 us)
Benchmarks.PCLOCD: Contention100 -> 1 outlier was removed (1.34 ms)
Benchmarks.PCLOCD: Contention100 -> 2 outliers were removed (777.81 us, 827.17 us)
Benchmarks.PCLOCD: Contention100 -> 4 outliers were removed (5.62 ms..183.76 ms)
// * Legends *
Work : Value of the 'Work' parameter
DifferentKeys : Value of the 'DifferentKeys' parameter
LockPerKey : Value of the 'LockPerKey' parameter
Mean : Arithmetic mean of all measurements
Error : Half of 99.9% confidence interval
StdDev : Standard deviation of all measurements
Median : Value separating the higher half of all measurements (50th percentile)
1 us : 1 Microsecond (0.000001 sec)
// ***** BenchmarkRunner: End *****
// ** Remained 0 benchmark(s) to run **
Run time: 00:07:25 (445.53 sec), executed benchmarks: 12
Global total time: 00:07:29 (449.63 sec), executed benchmarks: 12
// * Artifacts cleanup *
Observations:

- On my system, for the price of 4K of RAM, using `Interlocked.CompareExchange` with an array of 1024 int, hashing on the key to pick a slot, and looping with `Thread.Yield`, there are significant performance gains from locking per key versus not locking per key (compare the last two rows of each test). There was no significant difference locking per key using `lock`, except that the memory required was 4K, versus using .NET objects, where a 1024-element array of object would require roughly 4K * 3 or 4K * 6 bytes depending on CPU architecture (https://stackoverflow.com/questions/14286421/c-sharp-object-size-overhead). Also reference my baseline test: 696us (single lock) vs 451us (lock per key).
- Replacing `lock` with an `Interlocked.CompareExchange` and `Thread.Yield` loop on a single global int yielded significant performance gains vs. `lock` on a single object. See the second-to-last row of my baseline test and compare it against my second-to-last test: 696us vs. 507us.
- It appears that the higher the core/thread count, the more is saved by locking per key. This makes sense, especially with `lock`, where contention causes a kernel context switch. On a busy server handling hundreds or thousands of requests per second with a high core count, the CPU usage and wait time on a single lock will be significant. Imagine a busy C# DNS server using Polly with a key collapser policy: a single lock would make it unusable.
- When using multiple keys, there is no significant difference or overhead from adding the additional lock per key versus not locking at all when using `ConcurrentDictionary` (sketched below), which is awesome!
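For context, the collapser pattern under test combines `ConcurrentDictionary<,>.GetOrAdd` with `Lazy<T>` so the factory runs at most once per key even when many callers race; `DoExpensiveWork` below is an illustrative stand-in:

```csharp
using System;
using System.Collections.Concurrent;
using System.Threading;

static class CollapserSketch
{
    private static readonly ConcurrentDictionary<string, Lazy<object>> collapser =
        new ConcurrentDictionary<string, Lazy<object>>();

    public static object CollapsedExecute(string key)
    {
        // Every racing caller may construct a Lazy, but GetOrAdd guarantees only one
        // instance is stored, and ExecutionAndPublication guarantees its factory runs
        // at most once. Losing Lazy instances are discarded without ever running.
        return collapser.GetOrAdd(key, new Lazy<object>(
            () => DoExpensiveWork(), // hypothetical expensive operation being collapsed
            LazyThreadSafetyMode.ExecutionAndPublication)).Value;
    }

    private static object DoExpensiveWork() => new object(); // stand-in for real work
}
```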
Questions from above:

- Is 4K of RAM too much? No, and it won't be allocated if the user never creates a collapser policy.
- Is the code more complex when locking per key? No, unless you consider `uint hashCode = (uint)key.GetHashCode() % 1024;` and `while (Interlocked.CompareExchange(ref arrayIntLock[hashCode], 1, 0) == 1) { Thread.Yield(); }` complex.
- Is .NET `lock` ever an issue if there is a need to await inside the `lock`? Yes: `await` is not allowed inside a `lock` block. The `Interlocked.CompareExchange` approach does not have this issue (see the sketch below).
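To illustrate that last point, a minimal sketch (illustrative names) of a CompareExchange-based lock whose critical section can contain `await`, which the C# compiler forbids inside a `lock` statement:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

static class AwaitableSpinLockExample
{
    private static int spinLock; // 0 = free, 1 = held

    public static async Task RunLockedAsync(Func<Task> work)
    {
        // Acquire: flip 0 -> 1. CompareExchange returns the original value,
        // so a return of 1 means someone else holds the lock and we retry.
        while (Interlocked.CompareExchange(ref spinLock, 1, 0) == 1)
        {
            await Task.Yield();
        }
        try
        {
            // Awaiting here is fine: the lock is not tied to a thread, so the
            // continuation may resume on a different thread and still release it.
            await work();
        }
        finally
        {
            Interlocked.Exchange(ref spinLock, 0); // release
        }
    }
}
```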
My final analysis:
Core counts are increasing rapidly; AMD seems to have been doubling them every year or two. When I make framework or architecture decisions, I try to plan in years, even decades. I think the default behavior of this feature should be equally performant on many cores as on few. I'd be OK using `Interlocked.CompareExchange` on a single global int, but this may break down in a few years, especially if 32/64/128-core systems become the norm in the data center. The lock per key only uses 4K of RAM, minuscule compared to default CLR overhead. I propose that lock per key using `Interlocked.CompareExchange` over an array of 1024 int be the default behavior. Again, this 4K is only allocated if someone creates a key collapser policy.
Bottom line: a single lock per process performs increasingly poorly as core counts and requests per second grow.
Source code:
I've pasted in my source below, using `Interlocked.CompareExchange` instead of `lock` calls, so you can see if it makes a difference on your machine. I would be curious to know the results.
```csharp
using System;
using System.Collections.Concurrent;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using BenchmarkDotNet.Attributes;

namespace ConcurrentDictionaryLockContention
{
    [Config(typeof(BenchmarkConfig))]
    public class Benchmarks
    {
        public enum TheWork
        {
            NoWork,
            NoExtraLock,
            ExtraLock,
        }

        [ParamsAllValues]
        public TheWork Work { get; set; }

        [ParamsAllValues]
        public bool DifferentKeys { get; set; }

        [ParamsAllValues]
        public bool LockPerKey { get; set; }

        private string GetKey(int seed) => DifferentKeys ? seed.ToString() : commonKey;
        private object GetLockObject(int seed) => LockPerKey ? lockObjects[seed] : commonLockObject;

        public static int ParallelTasks = 100;
        public readonly Task<Lazy<object>>[] tasks;
        public readonly object[] lockObjects;
        public readonly int[] lockObjectInts = new int[1024];
        public int lockObjectInt;
        public Task[] tasksCast;
        public ConcurrentDictionary<string, Lazy<object>> collapser = new ConcurrentDictionary<string, Lazy<object>>(
            /*ParallelTasks, Enumerable.Empty<KeyValuePair<string, Lazy<object>>>(), EqualityComparer<string>.Default*/ // Explicitly setting the concurrencyLevel at creation did not significantly alter this benchmark.
        );

        private readonly Func<object, CancellationToken, object> commonAction = (o, token) => new object();
        private readonly object commonInputObject = new object();
        private readonly CancellationToken commonToken = default;
        private readonly object commonLockObject = new object();
        private const string commonKey = "SameKey";
        private ManualResetEventSlim startSignal;
        // Not readonly: SpinWait is a mutable struct, and SpinOnce()/Reset() on a readonly field
        // would operate on a defensive copy rather than this field.
        private SpinWait spinner = new SpinWait();
        private int WaitingForStartSignal = 0;
        /*private int ActualContentionEncountered = 0;*/
        private readonly Lazy<object> flyweightWork = new Lazy<object>();

        public Benchmarks()
        {
            tasks = new Task<Lazy<object>>[ParallelTasks];
            lockObjects = new object[ParallelTasks];
            for (int i = 0; i < ParallelTasks; i++)
            {
                lockObjects[i] = new object();
            }
            // Avoid thread-pool starvation influencing test results.
            ThreadPool.GetMinThreads(out _, out int completionPortThreads);
            ThreadPool.SetMinThreads(ParallelTasks + 25, completionPortThreads);
        }

        public void Setup()
        {
            collapser = new ConcurrentDictionary<string, Lazy<object>>(
                /*ParallelTasks, Enumerable.Empty<KeyValuePair<string, Lazy<object>>>(), EqualityComparer<string>.Default*/ // Explicitly setting the concurrencyLevel at creation did not significantly alter this benchmark.
            );
            startSignal = new ManualResetEventSlim(false);
            Interlocked.Exchange(ref WaitingForStartSignal, 0);
            /*Interlocked.Exchange(ref ActualContentionEncountered, 0);*/

            Lazy<object> BenchmarkedActivity(string key, object lockObject)
            {
                Interlocked.Increment(ref WaitingForStartSignal);
                startSignal.WaitHandle.WaitOne();
                if (Work == TheWork.ExtraLock)
                {
                    // The commented-out line below can be commented back in (in conjunction with its Console.WriteLine and other related lines) to get a visual indication of how many tasks actually experience lock contention.
                    /*if (!Monitor.TryEnter(lockObject, 0)) { Interlocked.Increment(ref ActualContentionEncountered); } else { Monitor.Exit(lockObject); }*/
                    uint hashCode = 0;
                    if (LockPerKey)
                    {
                        unchecked
                        {
                            hashCode = (uint)key.GetHashCode() % 1024;
                        }
                        while (Interlocked.CompareExchange(ref lockObjectInts[hashCode], 1, 0) == 1)
                        {
                            System.Threading.Thread.Yield();
                        }
                    }
                    else
                    {
                        while (Interlocked.CompareExchange(ref lockObjectInt, 1, 0) == 1)
                        {
                            System.Threading.Thread.Yield();
                        }
                    }
                    try
                    {
                        return collapser.GetOrAdd(key, new Lazy<object>(
                            () => commonAction(commonInputObject, commonToken), /* we don't care what this factory does; we are not running it. We are benchmarking locking over the ConcurrentDictionary<,>.GetOrAdd(). But for similarity to proposed Polly code, it is a delegate of the same form Polly uses. */
                            LazyThreadSafetyMode.ExecutionAndPublication));
                    }
                    finally
                    {
                        if (LockPerKey)
                        {
                            lockObjectInts[hashCode] = 0;
                        }
                        else
                        {
                            lockObjectInt = 0;
                        }
                    }
                }
                else if (Work == TheWork.NoExtraLock)
                {
                    return collapser.GetOrAdd(key, new Lazy<object>(
                        () => commonAction(commonInputObject, commonToken), /* we don't care what this factory does; we are not running it. We are benchmarking locking over the ConcurrentDictionary<,>.GetOrAdd(). But for similarity to proposed Polly code, it is a delegate of the same form Polly uses. */
                        LazyThreadSafetyMode.ExecutionAndPublication));
                }
                else if (Work == TheWork.NoWork) { return flyweightWork; }
                else throw new InvalidOperationException($"Unknown value for {nameof(TheWork)}");
            }

            // Set up actions which will contend in parallel.
            for (int i = 0; i < ParallelTasks; i++)
            {
                var key = GetKey(i);
                var lockObject = GetLockObject(i);
                tasks[i] = Task.Run(() =>
                {
                    return BenchmarkedActivity(key, lockObject);
                });
            }
            tasksCast = tasks.Select(t => t as Task).ToArray();
            /*Console.WriteLine($"Potential contention after queueing tasks: {WaitingForStartSignal}");*/

            // To maximise contention, ensure all Tasks have actually started and are gated at the ManualResetEvent, before proceeding.
            while (WaitingForStartSignal < ParallelTasks)
            {
                spinner.SpinOnce();
                Thread.Yield();
            }
            /*Console.WriteLine($"Potential contention at the starting gate: {WaitingForStartSignal}");*/
        }

        public void TearDown()
        {
            for (int i = 0; i < ParallelTasks; i++)
            {
                tasks[i] = null;
            }
            collapser = null;
            /*Console.WriteLine($"Actual lock contention encountered: {ActualContentionEncountered} ");*/
            startSignal.Dispose();
            spinner.Reset();
        }

        [Benchmark]
        public void PCLOCD() // ParallelContendLockOverConcurrentDictionary
        {
            Setup();
            startSignal.Set();
            Task.WaitAll(tasksCast);
            TearDown();
        }

        /*
        [Benchmark]
        public object PureConcurrentDictionaryGetOrAdd()
        {
            return collapser.GetOrAdd(commonKey, new Lazy<object>(
                () => commonAction(commonInputObject, commonToken), // for the purposes of the benchmark we don't care what this factory does; we are not running it. We are benchmarking locking over the ConcurrentDictionary<,>.GetOrAdd()
                LazyThreadSafetyMode.ExecutionAndPublication));
        }
        */
    }
}
```
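The gist doesn't include `BenchmarkConfig` or an entry point; a minimal sketch consistent with the `Job=Contention100 IterationCount=50 LaunchCount=1 WarmupCount=3` settings in the summary above might look like this (the exact config the author used is not shown, so treat this as an assumption):

```csharp
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;

namespace ConcurrentDictionaryLockContention
{
    // Assumed config: reconstructs the job settings reported in the summary above.
    public class BenchmarkConfig : ManualConfig
    {
        public BenchmarkConfig()
        {
            AddJob(Job.Default
                .WithId("Contention100")
                .WithWarmupCount(3)
                .WithIterationCount(50)
                .WithLaunchCount(1));
        }
    }

    public static class Program
    {
        public static void Main() => BenchmarkRunner.Run<Benchmarks>();
    }
}
```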
I've shrunk the int array from the tests above down to 1024 ints (4K of RAM) and performance is basically the same, so we can get away with just 4K of RAM overhead...
Gist source code updated to only release the lock if it was acquired, along with a 1024 int array instead of 8192.
@jjxtra The changes with `hasLock` look as if they introduce a number of bugs. `hasLock` is not utilised in a concurrency-safe way (races can invalidate the intent), and `hasLock` is scoped across all locks in the `keyLocks[]` array, invalidating the design of having `keyLocks[]` at all.
@reisenberger Lock state is now a flag: it can only move from 0 to 1 and from 1 to 2. Once at 2, `InvalidOperationException` is always thrown (sketched below). The key array is now static; it would probably be better to pass it in via dependency injection, but this is just a proof of concept. As any `ILockProvider*` returned from a lock factory would be a new instance and is not considered thread safe (i.e. multiple threads entering `AcquireLock` on the same instance of an `ILockProvider` would cause undefined behavior), I think I'm good with how it works now. Multiple entries into `AcquireLock` from the same thread would throw an exception.
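A minimal sketch of that state-flag idea (illustrative names, not the exact gist revision):

```csharp
using System;
using System.Threading;

// Sketch: lock state may only advance 0 -> 1 (acquired) -> 2 (released);
// any out-of-order transition throws InvalidOperationException.
internal sealed class OneShotLockState
{
    private int state; // 0 = not acquired, 1 = acquired, 2 = released

    public void MarkAcquired()
    {
        if (Interlocked.CompareExchange(ref state, 1, 0) != 0)
        {
            throw new InvalidOperationException("Invalid lock state: already acquired or released.");
        }
    }

    public void MarkReleased()
    {
        if (Interlocked.CompareExchange(ref state, 2, 1) != 1)
        {
            throw new InvalidOperationException("Invalid lock state: not currently acquired.");
        }
    }
}
```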
@reisenberger I went ahead and implemented the factory pattern. Let me know what you think! I went back to a simple `bool gotLock` because the factory creates a new instance each time and the contract returned (`IDisposable`) does not allow attempting to re-acquire the lock. Also, the returned `IDisposable` is not guaranteed to be thread safe, so calling Dispose from multiple threads is unsupported.
@jjxtra Again, thanks for everything on the locking!
Some notes just on what turned out different in the main Polly implementation, compared to what was discussed previously and what we have in this gist:

- I was wrong in this case about the benefit of using a `struct` for the releasers. The compiler does have a special transform for when a `using` statement directly references a `struct` (ie `using (<some-struct>) { }`), to avoid boxing, but here we are returning an `IDisposable` from a method due to the lock interfaces, so it gets boxed to `IDisposable` first anyway. So I switched back to `class` in the v0.1.0 basic lock implementation.
- There were a couple of places in my early spike which used the non-thread-safe members on `ConcurrentDictionary<,>`, incl. outside the lock - switched to thread-safe.
- I'm thinking that `AcquireLockAsync` needs to return `IAsyncDisposable`, not just `ValueTask<IDisposable>`. If the lock release goes across network I/O (like to Redis), we want the implementation of that release to be able to be fully async. `IAsyncDisposable` uses `ValueTask` as the return type of its release method, so when we are just wrapping a sync implementation, we still get the benefit of `ValueTask` (see the sketch after this list).
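A sketch of what that async interface could look like; this mirrors the suggestion above, not a confirmed Polly API:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

public interface ILockProviderAsync
{
    // Context is Polly's execution context type, as in the gist above.
    // Returning IAsyncDisposable lets the release itself be async (e.g. network
    // I/O to Redis). DisposeAsync returns ValueTask, so a purely synchronous
    // release can still complete without allocating.
    ValueTask<IAsyncDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken);
}
```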
Assuming you push on forward to bring the stripe-locking idea into `Polly.Contrib.DuplicateRequestCollapser`:

- We probably wouldn't need the async implementation `ProcessLockProviderAsync`? There's nothing async in the work it does (setting aside the need to yield asynchronously, given it's async), so we could just use a shallow async-over-sync wrapper. (Just running it async when it doesn't have any need for async work will create extra allocations for the state machines, async continuations etc.)
- I dug into the source code around `Task.Yield()`, to be sure I understood what `Task.Yield().GetAwaiter().GetResult()` would do. It doesn't offer any yielding behaviour if used synchronously like that: `.GetAwaiter()` returns a `YieldAwaitable`, and `GetResult()` on that is a no-op. The benefit of `Task.Yield()` only comes if it's genuinely used in an `async` context; the `await` then schedules the async continuation back onto one or other `TaskScheduler`, which is the mechanism by which `await Task.Yield()` gets its value - that async continuation has to compete with other async tasks in the scheduler (see the sketch after this list). NB Moot now that we can drop .NET Standard 1.x (:+1:), just sharing what I found cos I had dug into this.
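A short sketch contrasting the two call patterns described above:

```csharp
using System.Threading.Tasks;

static class YieldExamples
{
    static void SyncCall()
    {
        // No yield happens here: GetAwaiter() returns a YieldAwaitable whose
        // GetResult() is a no-op, so this line simply returns immediately.
        Task.Yield().GetAwaiter().GetResult();
    }

    static async Task AsyncCall()
    {
        // Here the awaitable reports IsCompleted == false, so the continuation
        // is queued back onto the TaskScheduler and must compete with other
        // scheduled work - a genuine yield of the current thread.
        await Task.Yield();
    }
}
```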
Thanks again for everything you are doing on the locking.
Yep, PR onto Polly.Contrib.DuplicateRequestCollapser.
My recommendation would be a PR to add a new `ISyncLockingProvider`, not replace/modify `InstanceScopedLockProvider`, leaving both as options. Curating a widely-used OSS project really brings home (if it is not already obvious) that different users have different needs. Some will benefit from a striped-lock provider; some will have no need. Giving users options (provided they are not confusing), rather than pre-deciding on one option to suit all users, can work well.
Thanks again for helping drive this forward!
You are welcome!
The single lock with `Interlocked.CompareExchange` is interesting; it is performing a bit better than `lock (obj) {}`...
I am running another test without a `Thread.Yield` to see if it changes things...