Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace Garnet.cluster
internal sealed class ReplicationSyncManager
{
SingleWriterMultiReaderLock syncInProgress;
readonly CancellationTokenSource cts;
CancellationTokenSource cts;
readonly TimeSpan replicaSyncTimeout;
readonly ILogger logger;

Expand All @@ -25,6 +25,8 @@ internal sealed class ReplicationSyncManager

public ClusterProvider ClusterProvider { get; }

SingleWriterMultiReaderLock disposed;

public ReplicationSyncManager(ClusterProvider clusterProvider, ILogger logger = null)
{
GetSessionStore = new ReplicaSyncSessionTaskStore(clusterProvider.storeWrapper, clusterProvider, logger);
Expand All @@ -38,8 +40,11 @@ public ReplicationSyncManager(ClusterProvider clusterProvider, ILogger logger =

public void Dispose()
{
cts.Cancel();
cts.Dispose();
// Return if original value is true, hence already disposed
disposed.WriteLock();
cts?.Cancel();
cts?.Dispose();
cts = null;
syncInProgress.WriteLock();
}

Expand Down Expand Up @@ -313,6 +318,9 @@ async Task TakeStreamingCheckpoint()
// Wait for stream sync to make some progress
await Task.Delay(delay);

// Trigger exception to test reset cts mechanism
ExceptionInjectionHelper.TriggerException(ExceptionInjectionType.Replication_Diskless_Sync_Reset_Cts);

// Check if checkpoint has completed
if (checkpointTask.IsCompleted)
return await checkpointTask;
Expand All @@ -322,17 +330,35 @@ async Task TakeStreamingCheckpoint()

// Throw timeout equals to zero
if (timeout.TotalSeconds <= 0)
{
cts.Cancel();
throw new TimeoutException("Streaming snapshot checkpoint timed out");
}
}
}
catch (Exception ex)
{
logger?.LogError(ex, "{method} faulted", nameof(WaitOrDie));
cts.Cancel();
}

// At this point we failed through a timeout or any other exception
// so try to reset token.
// No race here because the only other cancellation will happen at dispose only
try
{
_ = await checkpointTask;
}
finally
{
var readLock = disposed.TryReadLock();
if (readLock && !cts.TryReset())
{
cts.Dispose();
cts = new();
}

if (readLock)
disposed.ReadUnlock();
}

return (false, default);
}
}
Expand Down
4 changes: 4 additions & 0 deletions libs/common/ExceptionInjectionType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,9 @@ public enum ExceptionInjectionType
/// Replication InProgress during diskless replica attach sync operation
/// </summary>
Replication_InProgress_During_Diskless_Replica_Attach_Sync,
/// <summary>
/// Replication diskless sync reset cts
/// </summary>
Replication_Diskless_Sync_Reset_Cts
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

using System;
using System.Collections.Generic;
#if DEBUG
using Garnet.common;
#endif
using Microsoft.Extensions.Logging;
using NUnit.Framework;
using NUnit.Framework.Legacy;
Expand Down Expand Up @@ -395,5 +398,39 @@ public void ClusterDisklessSyncFailover([Values] bool disableObjects, [Values] b
for (var replica = 1; replica < nodes_count; replica++)
Validate(nOffsets[primary], nOffsets[replica], disableObjects);
}

#if DEBUG
[Test, Order(6)]
[Category("REPLICATION")]
public void ClusterDisklessSyncResetSyncManagerCts()
{
var nodes_count = 2;
var primaryIndex = 0;
var replicaOneIndex = 1;
context.CreateInstances(nodes_count, enableAOF: true, useTLS: useTLS, enableDisklessSync: true, timeout: timeout);
context.CreateConnection(useTLS: useTLS);

_ = context.clusterTestUtils.AddDelSlotsRange(primaryIndex, [(0, 16383)], addslot: true, logger: context.logger);
context.clusterTestUtils.SetConfigEpoch(primaryIndex, primaryIndex + 1, logger: context.logger);
context.clusterTestUtils.SetConfigEpoch(replicaOneIndex, replicaOneIndex + 1, logger: context.logger);

context.clusterTestUtils.Meet(primaryIndex, replicaOneIndex, logger: context.logger);
context.clusterTestUtils.WaitUntilNodeIsKnown(replicaOneIndex, primaryIndex, logger: context.logger);

try
{
ExceptionInjectionHelper.EnableException(ExceptionInjectionType.Replication_Diskless_Sync_Reset_Cts);
var _resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaOneIndex, primaryNodeIndex: primaryIndex, failEx: false, logger: context.logger);
ClassicAssert.AreEqual("Wait for sync task faulted", _resp);
}
finally
{
ExceptionInjectionHelper.DisableException(ExceptionInjectionType.Replication_Diskless_Sync_Reset_Cts);
}

var resp = context.clusterTestUtils.ClusterReplicate(replicaNodeIndex: replicaOneIndex, primaryNodeIndex: primaryIndex, logger: context.logger);
ClassicAssert.AreEqual("OK", resp);
}
#endif
}
}