Skip to content

Remove dependency on cluster state API in SpecificMasterNodesIT #127213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -13,248 +13,73 @@
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
import org.elasticsearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.MasterNotDiscoveredException;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
import org.elasticsearch.test.ESIntegTestCase.Scope;

import java.io.IOException;

import static org.elasticsearch.test.NodeRoles.dataOnlyNode;
import static org.elasticsearch.test.NodeRoles.masterNode;
import static org.elasticsearch.test.NodeRoles.nonDataNode;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.nullValue;

@ClusterScope(scope = Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
public class SpecificMasterNodesIT extends ESIntegTestCase {

public void testSimpleOnlyMasterNodeElection() throws IOException {
public void testSimpleOnlyMasterNodeElection() throws Exception {
internalCluster().setBootstrapMasterNodeIndex(0);
logger.info("--> start data node / non master node");
internalCluster().startNode(Settings.builder().put(dataOnlyNode()).put("discovery.initial_state_timeout", "1s"));
try {
assertThat(
clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT)
.setMasterNodeTimeout(TimeValue.timeValueMillis(100))
.get()
.getState()
.nodes()
.getMasterNodeId(),
nullValue()
);
fail("should not be able to find master");
} catch (MasterNotDiscoveredException e) {
// all is well, no master elected
}
awaitMasterNotFound();

logger.info("--> start master node");
final String masterNodeName = internalCluster().startMasterOnlyNode();
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);

awaitMasterNode(internalCluster().getNonMasterNodeName(), masterNodeName);
awaitMasterNode(internalCluster().getMasterName(), masterNodeName);

logger.info("--> stop master node");
Settings masterDataPathSettings = internalCluster().dataPathSettings(internalCluster().getMasterName());
internalCluster().stopCurrentMasterNode();

try {
assertThat(
clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT)
.setMasterNodeTimeout(TimeValue.timeValueMillis(100))
.get()
.getState()
.nodes()
.getMasterNodeId(),
nullValue()
);
fail("should not be able to find master");
} catch (MasterNotDiscoveredException e) {
// all is well, no master elected
}
awaitMasterNotFound();

logger.info("--> start previous master node again");
final String nextMasterEligibleNodeName = internalCluster().startNode(
Settings.builder().put(nonDataNode(masterNode())).put(masterDataPathSettings)
);
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligibleNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligibleNodeName)
);
awaitMasterNode(internalCluster().getNonMasterNodeName(), nextMasterEligibleNodeName);
awaitMasterNode(internalCluster().getMasterName(), nextMasterEligibleNodeName);
}

public void testElectOnlyBetweenMasterNodes() throws Exception {
internalCluster().setBootstrapMasterNodeIndex(0);
logger.info("--> start data node / non master node");
internalCluster().startNode(Settings.builder().put(dataOnlyNode()).put("discovery.initial_state_timeout", "1s"));
try {
assertThat(
clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT)
.setMasterNodeTimeout(TimeValue.timeValueMillis(100))
.get()
.getState()
.nodes()
.getMasterNodeId(),
nullValue()
);
fail("should not be able to find master");
} catch (MasterNotDiscoveredException e) {
// all is well, no master elected
}
awaitMasterNotFound();

logger.info("--> start master node (1)");
final String masterNodeName = internalCluster().startMasterOnlyNode();
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);
awaitMasterNode(internalCluster().getNonMasterNodeName(), masterNodeName);
awaitMasterNode(internalCluster().getMasterName(), masterNodeName);

logger.info("--> start master node (2)");
final String nextMasterEligableNodeName = internalCluster().startMasterOnlyNode();
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(masterNodeName)
);
awaitMasterNode(internalCluster().getNonMasterNodeName(), masterNodeName);
awaitMasterNode(internalCluster().getMasterName(), masterNodeName);

logger.info("--> closing master node (1)");
client().execute(
TransportAddVotingConfigExclusionsAction.TYPE,
new AddVotingConfigExclusionsRequest(TEST_REQUEST_TIMEOUT, masterNodeName)
).get();
// removing the master from the voting configuration immediately triggers the master to step down
assertBusy(() -> {
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligableNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligableNodeName)
);
});
awaitMasterNode(internalCluster().getNonMasterNodeName(), nextMasterEligableNodeName);
awaitMasterNode(internalCluster().getMasterName(), nextMasterEligableNodeName);

internalCluster().stopNode(masterNodeName);
assertThat(
internalCluster().nonMasterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligableNodeName)
);
assertThat(
internalCluster().masterClient()
.admin()
.cluster()
.prepareState(TEST_REQUEST_TIMEOUT)
.get()
.getState()
.nodes()
.getMasterNode()
.getName(),
equalTo(nextMasterEligableNodeName)
);
awaitMasterNode(internalCluster().getNonMasterNodeName(), nextMasterEligableNodeName);
awaitMasterNode(internalCluster().getMasterName(), nextMasterEligableNodeName);
}

public void testAliasFilterValidation() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.Callable;
Expand Down Expand Up @@ -940,6 +941,39 @@ public void waitNoPendingTasksOnAll() throws Exception {
assertNoTimeout(clusterAdmin().prepareHealth(TEST_REQUEST_TIMEOUT).setWaitForEvents(Priority.LANGUID).get());
}

/**
* Waits for the node {@code viaNode} to see {@code masterNodeName} as the master node in the cluster state.
* Note that this does not guarantee that all other nodes in the cluster are on the same cluster state version already.
*
* @param viaNode the node to check the cluster state one
* @param masterNodeName the master node name that we wait for
*/
public void awaitMasterNode(String viaNode, String masterNodeName) {
var listener = ClusterServiceUtils.addTemporaryStateListener(
internalCluster().clusterService(viaNode),
state -> Optional.ofNullable(state.nodes().getMasterNode()).map(m -> m.getName().equals(masterNodeName)).orElse(false),
TEST_REQUEST_TIMEOUT
);
safeAwait(listener, TEST_REQUEST_TIMEOUT);
}

/**
* Waits for a random node in the cluster to not see a master node in the cluster state.
* Note that this does not guarantee that all other nodes in the cluster are on the same cluster state version already.
*/
public void awaitMasterNotFound() {
var viaNode = internalCluster().getRandomNodeName();
// We use a temporary state listener instead of `awaitClusterState` here because the `ClusterStateObserver` doesn't run the
// predicate if the cluster state version didn't change. When a master node leaves the cluster (i.e. what this method is used for),
// the cluster state version is not incremented.
var listener = ClusterServiceUtils.addTemporaryStateListener(
internalCluster().clusterService(viaNode),
state -> state.nodes().getMasterNode() == null,
TEST_REQUEST_TIMEOUT
);
safeAwait(listener, TEST_REQUEST_TIMEOUT);
}

/** Ensures the result counts are as expected, and logs the results if different */
public void assertResultsAndLogOnFailure(long expectedResults, SearchResponse searchResponse) {
final TotalHits totalHits = searchResponse.getHits().getTotalHits();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2033,15 +2033,28 @@ public String getMasterName() {
* in the viaNode parameter. If viaNode isn't specified a random node will be picked to the send the request to.
*/
public String getMasterName(@Nullable String viaNode) {
viaNode = viaNode != null ? viaNode : getRandomNodeName();
if (viaNode == null) {
throw new AssertionError("Unable to get master name, no node found");
}
try {
Client client = viaNode != null ? client(viaNode) : client();
return client.admin().cluster().prepareState(TEST_REQUEST_TIMEOUT).get().getState().nodes().getMasterNode().getName();
ClusterServiceUtils.awaitClusterState(logger, state -> state.nodes().getMasterNode() != null, clusterService(viaNode));
final ClusterState state = client(viaNode).admin().cluster().prepareState(TEST_REQUEST_TIMEOUT).setLocal(true).get().getState();
return state.nodes().getMasterNode().getName();
} catch (Exception e) {
logger.warn("Can't fetch cluster state", e);
throw new RuntimeException("Can't get master node " + e.getMessage(), e);
}
Comment on lines 2040 to 2047
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still worry this reads the state twice and might see no master the second time due to election jitter. We could cheat and remember the master node seen in the state on which we're awaiting (also saves the exception handling):

        final var masterNameListener = new SubscribableListener<String>();
        return safeAwait(ClusterServiceUtils.addTemporaryStateListener(clusterService(viaNode), cs -> {
            Optional.ofNullable(cs.nodes().getMasterNode()).ifPresent(masterNode -> masterNameListener.onResponse(masterNode.getName()));
            return masterNameListener.isDone();
        }).andThen(masterNameListener::addListener));

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about that too, but I figured "election jitter" would likely disrupt the test anyway. Tests should be designed to be deterministic/consistent, but there's only so much outside influence tests can account for (e.g. election jitter or CI blips).

I'm a little hesitant to "cheat" here. I seem to recall someone saying

Having to obtain the cluster state in another way is a feature, not a bug :)

#125195 (comment) 😉

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh yeah I thought I'd said that at some point :) On reflection I think you're right, tests should not be calling this if the master isn't stable.

}

public String getNonMasterNodeName() {
NodeAndClient randomNodeAndClient = getRandomNodeAndClient(new NodeNamePredicate(getMasterName()).negate());
if (randomNodeAndClient != null) {
return randomNodeAndClient.getName();
}
throw new AssertionError("No non-master node found");
}

/**
* @return the name of a random node in a cluster
*/
Expand Down