-
-
Save jjxtra/f6116180b2ef5c1550e60567af506c2a to your computer and use it in GitHub Desktop.
using System; | |
using System.Collections.Generic; | |
using System.Text; | |
using System.Threading; | |
using System.Threading.Tasks; | |
namespace Polly.Utilities | |
{ | |
/// <summary>
/// Defines operations for asynchronous locks used by Polly policies.
/// </summary>
public interface ILockProviderAsync
{
    /// <summary>
    /// Waits asynchronously to acquire the lock for the given key.
    /// </summary>
    /// <param name="key">A string key being used by the execution.</param>
    /// <param name="context">The Polly execution context consuming this lock.</param>
    /// <param name="cancellationToken">A cancellation token to cancel waiting to acquire the lock.</param>
    /// <returns>An <see cref="IDisposable"/> that releases the lock when disposed.</returns>
    /// <exception cref="OperationCanceledException">The passed <paramref name="cancellationToken"/> was signaled before the lock was acquired.</exception>
    /// <exception cref="InvalidOperationException">The lock is in an invalid state.</exception>
    ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken);
}
/// <summary>
/// Defines operations for synchronous locks used by Polly policies.
/// </summary>
public interface ILockProvider
{
    /// <summary>
    /// Waits to acquire the lock for the given key.
    /// </summary>
    /// <param name="key">A string key being used by the execution.</param>
    /// <param name="context">The Polly execution context consuming this lock.</param>
    /// <returns>An <see cref="IDisposable"/> that releases the lock when disposed.</returns>
    /// <exception cref="InvalidOperationException">The lock is in an invalid state.</exception>
    IDisposable AcquireLock(string key, Context context);
}
/// <summary>
/// Lock provider that locks on a key per process. The locking mechanism is designed to be able
/// to be requested and released on different threads if needed.
/// </summary>
public class ProcessLockProviderAsync : ILockProviderAsync
{
    // TODO: Pass via IOC or other method instead of hard-coding static.
    // 1024 int slots = 4K of RAM: small enough to be negligible overhead, large
    // enough that hash collisions between distinct keys are rare (and a collision
    // only causes brief extra contention, never incorrect behavior).
    internal static readonly int[] keyLocks = new int[1024];

    /// <summary>
    /// Per-acquisition lock handle; releases the claimed slot on <see cref="Dispose"/>.
    /// </summary>
    private class ProcessLockAsync : IDisposable
    {
        // Index of the slot in keyLocks claimed by this instance.
        private uint hash;
        // True only after the slot was actually claimed; guards against releasing a
        // slot we never took (e.g. acquisition was cancelled) and against double-dispose.
        private bool gotLock;

        /// <summary>
        /// Releases the key lock, if it was acquired. Safe to call multiple times;
        /// disposing a handle whose acquisition failed is a no-op.
        /// </summary>
        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
        public void Dispose()
        {
            if (gotLock)
            {
                gotLock = false;
                // BUG FIX: was a plain array store, which carries no release semantics;
                // use a volatile write so the release is promptly visible to spinners.
                Volatile.Write(ref ProcessLockProviderAsync.keyLocks[hash], 0);
            }
        }

        /// <summary>
        /// Waits asynchronously to acquire the lock for <paramref name="key"/>.
        /// </summary>
        /// <param name="key">Key to lock on; hashed to select a slot.</param>
        /// <param name="context">The Polly execution context (unused by this provider).</param>
        /// <param name="cancellationToken">Cancels waiting for the lock.</param>
        /// <returns>This instance, as the disposable lock releaser.</returns>
        /// <exception cref="OperationCanceledException">Cancellation was signaled before the lock was acquired.</exception>
        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
        public async ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken)
        {
            // Monitor.Enter and Monitor.Exit are tied to a specific thread and are
            // slower than this spin lock, which does not care about threads and will execute very
            // quickly, regardless of lock contention
            // https://stackoverflow.com/questions/11001760/monitor-enter-and-monitor-exit-in-different-threads
            // Get a hash based on the key, use this to lock on a specific int in the array. The array is designed
            // to be small enough to not use very much memory, but large enough to avoid collisions.
            // Even if there is a collision, it will be resolved very quickly.
            hash = (uint)key.GetHashCode() % (uint)ProcessLockProviderAsync.keyLocks.Length;

            // To get the lock, we must change the int at hash index from a 0 to a 1. If the value is
            // already a 1, we don't get the lock; CompareExchange returns the original value, so a
            // return of 0 means we won the slot. Contention here should be rare and short-lived.
            while (true)
            {
                // BUG FIX: cancellation is now checked only BEFORE attempting the CAS.
                // The original checked IsCancellationRequested again after the loop, so a
                // token signaled just after a successful CAS threw OperationCanceledException
                // while the slot was held with gotLock still false — leaking the slot forever.
                cancellationToken.ThrowIfCancellationRequested();
                if (Interlocked.CompareExchange(ref ProcessLockProviderAsync.keyLocks[hash], 1, 0) == 0)
                {
                    // Slot claimed; record it so Dispose releases exactly once.
                    gotLock = true;
                    return this;
                }
                // give up our turn; we want to get back and retry the lock very quickly
                await Task.Yield();
            }
        }
    }

    /// <inheritdoc />
    [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
    public ValueTask<IDisposable> AcquireLockAsync(string key, Context context, CancellationToken cancellationToken)
    {
        // A fresh handle per acquisition: the returned IDisposable is single-use and
        // not guaranteed thread safe, matching the ILockProviderAsync contract.
        return new ProcessLockAsync().AcquireLockAsync(key, context, cancellationToken);
    }
}
/// <summary>
/// Lock provider that locks on a key per process. The locking mechanism is designed to be able
/// to be requested and released on different threads if needed.
/// </summary>
public class ProcessLockProvider : ILockProvider
{
    /// <summary>
    /// Per-acquisition lock handle; the constructor spins until the slot is claimed,
    /// and <see cref="Dispose"/> releases it.
    /// </summary>
    private class ProcessLock : IDisposable
    {
        // Index of the slot in the shared keyLocks array claimed by this instance.
        private uint hash;
        // True once the constructor has claimed the slot; used to make Dispose idempotent.
        private bool gotLock;

        /// <summary>
        /// Releases the key lock. Safe to call multiple times; only the first call releases.
        /// </summary>
        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
        public void Dispose()
        {
            // If the constructor threw, this object was never handed out, so gotLock is
            // always true on a live instance; the flag exists for multiple-dispose safety.
            if (gotLock)
            {
                gotLock = false;
                // BUG FIX: was a plain array store, which carries no release semantics;
                // use a volatile write so the release is promptly visible to spinners.
                Volatile.Write(ref ProcessLockProviderAsync.keyLocks[hash], 0);
            }
        }

        /// <summary>
        /// Spins until the lock for <paramref name="key"/> is acquired.
        /// </summary>
        /// <param name="key">Key to lock on; hashed to select a slot.</param>
        /// <param name="context">The Polly execution context (unused by this provider).</param>
        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
        public ProcessLock(string key, Context context)
        {
            // BUG FIX: the constructor was named ProcessLockAsync inside class ProcessLock,
            // which does not compile; renamed to match the enclosing class.
            hash = (uint)key.GetHashCode() % (uint)ProcessLockProviderAsync.keyLocks.Length;
            while (Interlocked.CompareExchange(ref ProcessLockProviderAsync.keyLocks[hash], 1, 0) == 1)
            {
                // BUG FIX: was Task.Yield().GetAwaiter().GetResult(), which is a documented
                // no-op when consumed synchronously (YieldAwaitable.GetResult does nothing),
                // so the loop never actually yielded. Thread.Yield gives up the time slice.
                Thread.Yield();
            }
            gotLock = true;
        }
    }

    /// <inheritdoc />
    public IDisposable AcquireLock(string key, Context context)
    {
        // A fresh single-use handle per acquisition; disposing it releases the lock.
        return new ProcessLock(key, context);
    }
}
} |
I've shrunk the int array from the above down to 1024 ints (4K RAM) and performance is basically the same, so we can get away with just 4K RAM overhead...
Gist source code updated to only release the lock if it was acquired, along with a 1024 int array instead of 8192.
@jjxtra The changes with hasLock
look as if they introduce a number of bugs. hasLock
is not utilised in a concurrency-safe way (races can invalidate the intent). hasLock
is scoped across all locks in the keylocks[]
array, invalidating the design of having keylocks[]
.
@reisenberger Lock state is now a flag, it can only move from 0 to 1 and from 1 to 2. Once at 2, InvalidOperationException
is always thrown. Key array is now static. Probably better to pass a key array via dependency injection but this is just a proof of concept. As any ILockProvider*
returned from a lock factory would be a new instance and not considered to be thread safe (i.e. multiple thread entries into an AcquireLock
for the same instance of an ILockProvider
would cause undefined behavior) I think I'm good with how it works now. Multiple entries into AcquireLock
from the same thread would throw an exception.
@reisenberger I went ahead and implemented the factory pattern. Let me know what you think! I went back to a simple bool gotLock
because the factory creates a new instance each time and the contract returned (IDisposable
) does not allow attempting to re-acquire the lock. Also, the returned IDisposable
is not guaranteed to be thread safe, so calling dispose from multiple threads is unsupported.
@jjxtra Again, thanks for everything on the locking!
Some notes just on what turned out different in the main Policy implementation, compared to what discussed previously and what we have in this gist:
- I was wrong in this case about the benefit of using a
struct
for the releasers. The compiler does have a special transform for when ausing
statement directly references astruct
(ieusing (<some-struct>) { }
), to avoid boxing, but here we are returning anIDisposable
from a method due to the lock interfaces, so it gets boxed toIDisposable
first anyway π . So I switched back toclass
in the v0.1.0 basic lock implementation. - There were a couple of places in my early spike which used the non-thread-safe members on
ConcurrentDictionary<,>
incl. outside the lock - switched to thread-safe. - I'm thinking that
AcquireLockAsync
needs to returnIAsyncDisposable
, not justValueTask<IDisposable>
. If the lock-releasing is across network I/O (like to Redis), we want the implementation of that release to be able to be fully async.IAsyncDisposable
usesValueTask
as the return type of its release method, so when we are just wrapping a sync implementation, we still get the benefit ofValueTask
.
Assuming you push on forward to bring the stripe-locking idea into Polly.Contrib.DuplicateRequestCollapser
:
- We probably wouldn't need the async implementation
ProcessLockProviderAsync
? There's nothing async in the work it does (setting aside the need to yield asynchronously given it's async), so we could just use a shallow async-over-sync wrapper. (Just running it async when it doesn't have any need for async work will create extra allocations for the state machines, async continuations etc.) - I dug into the source code around
Task.Yield()
, to be sure I understood whatTask.Yield().GetAwaiter().GetResult()
would do. Looks like it doesn't offer any yielding behaviour if used sync like that,.GetAwaiter()
returns aYieldAwaitable
,GetResult()
on that is a no-op. Looks like the benefit ofTask.Yield()
only comes if it's genuinely used in anasync
context; theawait
then schedules the async continuation back onto one or otherTaskScheduler
, which looks like the mechanism by whichawait Task.Yield()
gets its value - that async continuation has to compete with other async tasks in the scheduler. NB Moot now that we can now drop .Net Standard 1.x (:+1: ), just sharing what I found cos I had dug into this.
Thanks again for everything you are doing on the locking. π
Yep, PR onto Polly.Contrib.DuplicateRequestCollapser.
My recommendation would be a PR to add a new ISyncLockingProvider
, not replace/modify InstanceScopedLockProvider
, leaving both as options. Curating a widely-used OSS project really brings home (if it is not already obvious) that different users have different needs. Some will benefit from a striped-lock provider; some will have no need. Giving users options (provided not confusing), rather than pre-deciding on one option to suit all users, can work well. π
Thanks again for helping drive this forward!
You are welcome!
Observations:
Interlocked.CompareExchange
with an array of 1024 int and hashing on key to get a slot and looping withThread.Yield
, there are significant performance gains on locking per key versus not locking per key (the last two rows of each test). There was no significant difference on locking per key usinglock
, except that memory usage required was 4K versus using .NET objects, which with a 1024 array of object, would require 4K * 3 or 6 bytes, depending on CPU architecture (https://stackoverflow.com/questions/14286421/c-sharp-object-size-overhead). Also reference my baseline test, 696us (single lock) vs 451us (lock per key).lock
with anInterlocked.CompareExchange
andThread.Yield
loop using a single global int yielded significant performance gains vs.lock
on a single object. See my baseline test second to last row and compare it against my second to last test 696us vs. 507 us.lock
, where contention will cause a kernel context switch. On a busy server handling hundreds or thousands of requests per second with high core count, the CPU usage and wait time on the single lock will be significant. Imagine a busy C# dns server using polly with a key collapser policy. A single lock would make it unusable.ConcurrentDictionary
, which is awesome!Questions from above:
uint hashCode = (uint)key.GetHashCode() % 1024;
andwhile (Interlocked.CompareExchange(ref arrayIntLock[hashCode], 1, 0) == 1) { Thread.Yield(); }
complex.lock
ever an issue if there is a need to await inside thelock
? TheInterlocked.CompareExchange
method does not have this issue.My final analysis:
Core counts are increasing rapidly, it seems like AMD has been doubling every year or two. When I make frameworks or architecture decisions, I try to plan in years, and even decades. I think the default behavior of this feature should be equally performant on many cores vs few. I'd be ok using
Interlocked.CompareExchange
on a single global int, but this may break down in a few years especially if 32/64/128 core systems become the norm in the data-center. The lock per key only uses 4K of RAM, minuscule compared to CLR default overhead. I propose that lock per key usingInterlocked.CompareExchange
using an array of 1024 int be the default behavior. Again, this 4K is only allocated if someone creates a key collapser policy.Bottom line, the single lock per process performs linearly more poorly as core counts and requests per second increase.
Source code:
I've pasted in my source using
Interlocked.CompareExchange
instead oflock
calls to see if it makes a difference on your machine. I would be curious to know the results.