Skip to content

Commit 2c43ea3

Browse files
Ilya Ganelin authored and srowen committed
[SPARK-6492][CORE] SparkContext.stop() can deadlock when DAGSchedulerEventProcessLoop dies
I've added a timeout and retry loop around the SparkContext shutdown code that should fix this deadlock. If a SparkContext shutdown is in progress when another thread comes knocking, it will wait for 10 seconds for the lock, then fall through where the outer loop will re-submit the request. Author: Ilya Ganelin <[email protected]> Closes apache#5277 from ilganeli/SPARK-6492 and squashes the following commits: 8617a7e [Ilya Ganelin] Resolved merge conflict 2fbab66 [Ilya Ganelin] Added MIMA Exclude a0e2c70 [Ilya Ganelin] Deleted stale imports fa28ce7 [Ilya Ganelin] reverted to just having a single stopped 76fc825 [Ilya Ganelin] Updated to use atomic booleans instead of the synchronized vars 6e8a7f7 [Ilya Ganelin] Removing unnecessary null check for now since i'm not fixing stop ordering yet cdf7073 [Ilya Ganelin] [SPARK-6492] Moved stopped=true back to the start of the shutdown sequence so this can be addressed in a separate PR 7fb795b [Ilya Ganelin] Spacing b7a0c5c [Ilya Ganelin] Import ordering df8224f [Ilya Ganelin] Added comment for added lock 343cb94 [Ilya Ganelin] [SPARK-6492] Added timeout/retry logic to fix a deadlock in SparkContext shutdown
1 parent c23ba81 commit 2c43ea3

File tree

2 files changed

+34
-29
lines changed

2 files changed

+34
-29
lines changed

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import java.io._
2323
import java.lang.reflect.Constructor
2424
import java.net.URI
2525
import java.util.{Arrays, Properties, UUID}
26-
import java.util.concurrent.atomic.AtomicInteger
26+
import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}
2727
import java.util.UUID.randomUUID
2828

2929
import scala.collection.{Map, Set}
@@ -95,10 +95,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
9595

9696
val startTime = System.currentTimeMillis()
9797

98-
@volatile private var stopped: Boolean = false
98+
private val stopped: AtomicBoolean = new AtomicBoolean(false)
9999

100100
private def assertNotStopped(): Unit = {
101-
if (stopped) {
101+
if (stopped.get()) {
102102
throw new IllegalStateException("Cannot call methods on a stopped SparkContext")
103103
}
104104
}
@@ -1390,33 +1390,34 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
13901390
addedJars.clear()
13911391
}
13921392

1393-
/** Shut down the SparkContext. */
1393+
// Shut down the SparkContext.
13941394
def stop() {
1395-
SparkContext.SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
1396-
if (!stopped) {
1397-
stopped = true
1398-
postApplicationEnd()
1399-
ui.foreach(_.stop())
1400-
env.metricsSystem.report()
1401-
metadataCleaner.cancel()
1402-
cleaner.foreach(_.stop())
1403-
executorAllocationManager.foreach(_.stop())
1404-
dagScheduler.stop()
1405-
dagScheduler = null
1406-
listenerBus.stop()
1407-
eventLogger.foreach(_.stop())
1408-
env.actorSystem.stop(heartbeatReceiver)
1409-
progressBar.foreach(_.stop())
1410-
taskScheduler = null
1411-
// TODO: Cache.stop()?
1412-
env.stop()
1413-
SparkEnv.set(null)
1414-
logInfo("Successfully stopped SparkContext")
1415-
SparkContext.clearActiveContext()
1416-
} else {
1417-
logInfo("SparkContext already stopped")
1418-
}
1395+
// Use the stopping variable to ensure no contention for the stop scenario.
1396+
// Still track the stopped variable for use elsewhere in the code.
1397+
1398+
if (!stopped.compareAndSet(false, true)) {
1399+
logInfo("SparkContext already stopped.")
1400+
return
14191401
}
1402+
1403+
postApplicationEnd()
1404+
ui.foreach(_.stop())
1405+
env.metricsSystem.report()
1406+
metadataCleaner.cancel()
1407+
cleaner.foreach(_.stop())
1408+
executorAllocationManager.foreach(_.stop())
1409+
dagScheduler.stop()
1410+
dagScheduler = null
1411+
listenerBus.stop()
1412+
eventLogger.foreach(_.stop())
1413+
env.actorSystem.stop(heartbeatReceiver)
1414+
progressBar.foreach(_.stop())
1415+
taskScheduler = null
1416+
// TODO: Cache.stop()?
1417+
env.stop()
1418+
SparkEnv.set(null)
1419+
SparkContext.clearActiveContext()
1420+
logInfo("Successfully stopped SparkContext")
14201421
}
14211422

14221423

@@ -1478,7 +1479,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
14781479
partitions: Seq[Int],
14791480
allowLocal: Boolean,
14801481
resultHandler: (Int, U) => Unit) {
1481-
if (stopped) {
1482+
if (stopped.get()) {
14821483
throw new IllegalStateException("SparkContext has been shutdown")
14831484
}
14841485
val callSite = getCallSite

project/MimaExcludes.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ object MimaExcludes {
6060
) ++ Seq(
6161
// SPARK-6510 Add a Graph#minus method acting as Set#difference
6262
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.minus")
63+
) ++ Seq(
64+
// SPARK-6492 Fix deadlock in SparkContext.stop()
65+
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.SparkContext.org$" +
66+
"apache$spark$SparkContext$$SPARK_CONTEXT_CONSTRUCTOR_LOCK")
6367
)
6468

6569
case v if v.startsWith("1.3") =>

0 commit comments

Comments (0)