Skip to content

Commit 6690c97

Browse files
authored
Log hot threads after cluster cleanup timeout (#122341) (#122461)
In addition to logging the pending cluster tasks after the cluster health request times out during cluster cleanup in REST tests, we should also log the hot threads to help identify any issues that could cause tasks to get stuck. Follow-up to #119186. Relates to #111632, #111431, and #111662.
1 parent a443cdb commit 6690c97

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java

+22-3
Original file line numberDiff line numberDiff line change
@@ -1054,14 +1054,21 @@ private void wipeCluster() throws Exception {
10541054

10551055
private void waitForClusterUpdates() throws Exception {
10561056
logger.info("Waiting for all cluster updates up to this moment to be processed");
1057+
10571058
try {
10581059
assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
10591060
} catch (ResponseException e) {
10601061
if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
1062+
StringBuilder logMessage = new StringBuilder("Timed out waiting for cluster updates to be processed.");
10611063
final var pendingTasks = getPendingClusterStateTasks();
10621064
if (pendingTasks != null) {
1063-
logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
1065+
logMessage.append('\n').append(pendingTasks);
1066+
}
1067+
final var hotThreads = getHotThreads();
1068+
if (hotThreads != null) {
1069+
logMessage.append("\nHot threads: ").append(hotThreads);
10641070
}
1071+
logger.error(logMessage.toString());
10651072
}
10661073
throw e;
10671074
}
@@ -1071,8 +1078,8 @@ private static String getPendingClusterStateTasks() {
10711078
try {
10721079
Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
10731080
List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
1074-
if (false == tasks.isEmpty()) {
1075-
StringBuilder message = new StringBuilder("there are still running tasks:");
1081+
if (tasks.isEmpty() == false) {
1082+
StringBuilder message = new StringBuilder("There are still running tasks:");
10761083
for (Object task : tasks) {
10771084
message.append('\n').append(task.toString());
10781085
}
@@ -1084,6 +1091,18 @@ private static String getPendingClusterStateTasks() {
10841091
return null;
10851092
}
10861093

1094+
/**
 * Fetches a hot-threads report from the cluster for diagnostic logging during cleanup.
 * <p>
 * Sends {@code GET /_nodes/hot_threads?ignore_idle_threads=false&threads=9999} through the admin client and
 * returns the raw response body as a string. If the request fails with an {@link IOException}, the failure is
 * logged at error level and {@code null} is returned, so callers must handle a {@code null} result.
 * NOTE(review): the query parameters presumably aim to include idle threads and lift the per-node thread
 * limit so that nothing is omitted from the report; confirm against the hot threads API documentation.
 *
 * @return the response body of the hot-threads request, or {@code null} if the request threw an {@link IOException}
 */
private String getHotThreads() {
1095+
try {
1096+
Response response = adminClient().performRequest(
1097+
new Request("GET", "/_nodes/hot_threads?ignore_idle_threads=false&threads=9999")
1098+
);
1099+
return EntityUtils.toString(response.getEntity());
1100+
} catch (IOException e) {
1101+
logger.error("Failed to retrieve hot threads in the cluster during cleanup", e);
1102+
}
1103+
// Reached only when the request above threw: the failure is already logged, so signal "no report" with null.
return null;
1104+
}
1105+
10871106
/**
10881107
* This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
10891108
* them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM

0 commit comments

Comments
 (0)