
Commit 10ab76b

also read lock on flushes
1 parent 168419e commit 10ab76b

File tree

1 file changed: +82 -71 lines changed

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 82 additions & 71 deletions
@@ -2209,83 +2209,94 @@ protected void flushHoldingLock(boolean force, boolean waitIfOngoing, ActionList
             throw new IllegalArgumentException(message);
         }
         final long generation;
-        if (flushLock.tryLock() == false) {
-            // if we can't get the lock right away we block if needed otherwise barf
-            if (waitIfOngoing == false) {
-                logger.trace("detected an in-flight flush, not blocking to wait for it's completion");
-                listener.onResponse(FlushResult.NO_FLUSH);
-                return;
-            }
-            logger.trace("waiting for in-flight flush to finish");
-            flushLock.lock();
-            logger.trace("acquired flush lock after blocking");
-        } else {
-            logger.trace("acquired flush lock immediately");
-        }
 
-        final long startTime = System.nanoTime();
+        // Acquire an engine read lock before the flush lock. If we were not acquiring a read lock here, a concurrent engine reset could
+        // hold the engine write lock and later be blocked waiting for the flush lock (still holding the write lock), while the current
+        // thread could be blocked waiting for the write lock to be released (and therefore never release the flush lock).
+        final var engineReadLock = engineConfig.getEngineResetLock().readLock();
+        engineReadLock.lock();
         try {
-            // Only flush if (1) Lucene has uncommitted docs, or (2) forced by caller, or (3) the
-            // newly created commit points to a different translog generation (can free translog),
-            // or (4) the local checkpoint information in the last commit is stale, which slows down future recoveries.
-            boolean hasUncommittedChanges = hasUncommittedChanges();
-            if (hasUncommittedChanges
-                || force
-                || shouldPeriodicallyFlush()
-                || getProcessedLocalCheckpoint() > Long.parseLong(
-                    lastCommittedSegmentInfos.userData.get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)
-                )) {
-                ensureCanFlush();
-                Translog.Location commitLocation = getTranslogLastWriteLocation();
-                try {
-                    translog.rollGeneration();
-                    logger.trace("starting commit for flush; commitTranslog=true");
-                    long lastFlushTimestamp = relativeTimeInNanosSupplier.getAsLong();
-                    // Pre-emptively recording the upcoming segment generation so that the live version map archive records
-                    // the correct segment generation for doc IDs that go to the archive while a flush is happening. Otherwise,
-                    // if right after committing the IndexWriter new docs get indexed/updated and a refresh moves them to the archive,
-                    // we clear them from the archive once we see that segment generation on the search shards, but those changes
-                    // were not included in the commit since they happened right after it.
-                    preCommitSegmentGeneration.set(lastCommittedSegmentInfos.getGeneration() + 1);
-                    commitIndexWriter(indexWriter, translog);
-                    logger.trace("finished commit for flush");
-                    // we need to refresh in order to clear older version values
-                    refresh("version_table_flush", SearcherScope.INTERNAL, true);
-                    translog.trimUnreferencedReaders();
-                    // Update the translog location for flushListener if (1) the writeLocation has changed during the flush and
-                    // (2) indexWriter has committed all the changes (checks must be done in this order).
-                    // If the indexWriter has uncommitted changes, they will be flushed by the next flush as intended.
-                    final Translog.Location writeLocationAfterFlush = translog.getLastWriteLocation();
-                    if (writeLocationAfterFlush.equals(commitLocation) == false && hasUncommittedChanges() == false) {
-                        assert writeLocationAfterFlush.compareTo(commitLocation) > 0 : writeLocationAfterFlush + " <= " + commitLocation;
-                        commitLocation = writeLocationAfterFlush;
-                    }
-                    // Use the timestamp from when the flush started, but only update it in case of success, so that any exception in
-                    // the above lines would not lead the engine to think that it recently flushed, when it did not.
-                    this.lastFlushTimestamp = lastFlushTimestamp;
-                } catch (AlreadyClosedException e) {
-                    failOnTragicEvent(e);
-                    throw e;
-                } catch (Exception e) {
-                    throw new FlushFailedEngineException(shardId, e);
+            if (flushLock.tryLock() == false) {
+                // if we can't get the lock right away we block if needed otherwise barf
+                if (waitIfOngoing == false) {
+                    logger.trace("detected an in-flight flush, not blocking to wait for it's completion");
+                    listener.onResponse(FlushResult.NO_FLUSH);
+                    return;
                 }
-            refreshLastCommittedSegmentInfos();
-            generation = lastCommittedSegmentInfos.getGeneration();
-            flushListener.afterFlush(generation, commitLocation);
+                logger.trace("waiting for in-flight flush to finish");
+                flushLock.lock();
+                logger.trace("acquired flush lock after blocking");
             } else {
-                generation = lastCommittedSegmentInfos.getGeneration();
+                logger.trace("acquired flush lock immediately");
+            }
+
+            final long startTime = System.nanoTime();
+            try {
+                // Only flush if (1) Lucene has uncommitted docs, or (2) forced by caller, or (3) the
+                // newly created commit points to a different translog generation (can free translog),
+                // or (4) the local checkpoint information in the last commit is stale, which slows down future recoveries.
+                boolean hasUncommittedChanges = hasUncommittedChanges();
+                if (hasUncommittedChanges
+                    || force
+                    || shouldPeriodicallyFlush()
+                    || getProcessedLocalCheckpoint() > Long.parseLong(
+                        lastCommittedSegmentInfos.userData.get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)
+                    )) {
+                    ensureCanFlush();
+                    Translog.Location commitLocation = getTranslogLastWriteLocation();
+                    try {
+                        translog.rollGeneration();
+                        logger.trace("starting commit for flush; commitTranslog=true");
+                        long lastFlushTimestamp = relativeTimeInNanosSupplier.getAsLong();
+                        // Pre-emptively recording the upcoming segment generation so that the live version map archive records
+                        // the correct segment generation for doc IDs that go to the archive while a flush is happening. Otherwise,
+                        // if right after committing the IndexWriter new docs get indexed/updated and a refresh moves them to the archive,
+                        // we clear them from the archive once we see that segment generation on the search shards, but those changes
+                        // were not included in the commit since they happened right after it.
+                        preCommitSegmentGeneration.set(lastCommittedSegmentInfos.getGeneration() + 1);
+                        commitIndexWriter(indexWriter, translog);
+                        logger.trace("finished commit for flush");
+                        // we need to refresh in order to clear older version values
+                        refresh("version_table_flush", SearcherScope.INTERNAL, true);
+                        translog.trimUnreferencedReaders();
+                        // Update the translog location for flushListener if (1) the writeLocation has changed during the flush and
+                        // (2) indexWriter has committed all the changes (checks must be done in this order).
+                        // If the indexWriter has uncommitted changes, they will be flushed by the next flush as intended.
+                        final Translog.Location writeLocationAfterFlush = translog.getLastWriteLocation();
+                        if (writeLocationAfterFlush.equals(commitLocation) == false && hasUncommittedChanges() == false) {
+                            assert writeLocationAfterFlush.compareTo(commitLocation) > 0
+                                : writeLocationAfterFlush + " <= " + commitLocation;
+                            commitLocation = writeLocationAfterFlush;
+                        }
+                        // Use the timestamp from when the flush started, but only update it in case of success, so that any exception in
+                        // the above lines would not lead the engine to think that it recently flushed, when it did not.
+                        this.lastFlushTimestamp = lastFlushTimestamp;
+                    } catch (AlreadyClosedException e) {
+                        failOnTragicEvent(e);
+                        throw e;
+                    } catch (Exception e) {
+                        throw new FlushFailedEngineException(shardId, e);
+                    }
+                    refreshLastCommittedSegmentInfos();
+                    generation = lastCommittedSegmentInfos.getGeneration();
+                    flushListener.afterFlush(generation, commitLocation);
+                } else {
+                    generation = lastCommittedSegmentInfos.getGeneration();
+                }
+            } catch (FlushFailedEngineException ex) {
+                maybeFailEngine("flush", ex);
+                listener.onFailure(ex);
+                return;
+            } catch (Exception e) {
+                listener.onFailure(e);
+                return;
+            } finally {
+                totalFlushTimeExcludingWaitingOnLock.inc(System.nanoTime() - startTime);
+                flushLock.unlock();
+                logger.trace("released flush lock");
             }
-        } catch (FlushFailedEngineException ex) {
-            maybeFailEngine("flush", ex);
-            listener.onFailure(ex);
-            return;
-        } catch (Exception e) {
-            listener.onFailure(e);
-            return;
         } finally {
-            totalFlushTimeExcludingWaitingOnLock.inc(System.nanoTime() - startTime);
-            flushLock.unlock();
-            logger.trace("released flush lock");
+            engineReadLock.unlock();
         }
 
         afterFlush(generation);
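
The new comment in the diff describes a classic lock-ordering deadlock: a reset thread holding the engine write lock can block forever on the flush lock while a flushing thread holds the flush lock and waits on the engine lock. The sketch below is a minimal, self-contained illustration of the ordering rule the commit enforces, using plain java.util.concurrent primitives; the class and field names are hypothetical and not the actual InternalEngine code (there, the engine reset lock comes from engineConfig.getEngineResetLock()). Every path takes the engine reset lock before the flush lock, so the wait-for graph cannot form a cycle.

    import java.util.concurrent.locks.ReentrantLock;
    import java.util.concurrent.locks.ReentrantReadWriteLock;

    // Minimal sketch, not the real implementation: both the flush path and the
    // reset path acquire the engine reset lock FIRST and the flush lock SECOND,
    // so no thread ever holds the flush lock while waiting on the engine lock.
    class LockOrderingSketch {
        private final ReentrantReadWriteLock engineResetLock = new ReentrantReadWriteLock();
        private final ReentrantLock flushLock = new ReentrantLock();

        void flush() {
            engineResetLock.readLock().lock(); // engine lock first (shared mode)
            try {
                flushLock.lock(); // flush lock second
                try {
                    // commit the IndexWriter, roll the translog, etc.
                } finally {
                    flushLock.unlock();
                }
            } finally {
                engineResetLock.readLock().unlock();
            }
        }

        void resetEngine() {
            engineResetLock.writeLock().lock(); // engine lock first (exclusive mode)
            try {
                flushLock.lock(); // flush lock second
                try {
                    // swap in the new engine
                } finally {
                    flushLock.unlock();
                }
            } finally {
                engineResetLock.writeLock().unlock();
            }
        }
    }

Because flushes only take the read side of the ReentrantReadWriteLock, they can still run concurrently with each other; only a reset, which takes the write side, excludes them, and with the consistent engine-lock-then-flush-lock order neither side can end up waiting on the other while holding the lock the other needs.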
