Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-46387

Only vote for candidate with same config version and term as self

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major - P3
    • 4.4.0-rc0, 4.7.0
    • Affects Version/s: 4.3.3
    • Component/s: Replication
    • None
    • Fully Compatible
    • ALL
    • v4.4
    • Hide
      // Reproduction script: makes a primary run for election while one member of its
      // config still believes it has been removed from the replica set.
      load("jstests/libs/fail_point_util.js");  // For configureFailPoint.
      //
      // Test sending a vote request to a removed node.
      //
      
      // Start out with {n1, n2, n3}
      let rst = new ReplSetTest({nodes: 3});
      rst.startSet();
      rst.initiate();
      
      let primary = rst.getPrimary();
      
      // Save the host of the node that will become removed.
      // It is captured now so a direct connection can be opened to it later.
      let removedHost = rst.nodes[2].host;
      
      // Remove n3 from the config.
      let config = rst.getReplSetConfigFromNode();
      // Object.assign makes a shallow copy, so origConfig initially shares the members array
      // with config. The next line replaces config.members with a new (sliced) array rather
      // than mutating the shared one, so origConfig keeps all three members.
      let origConfig = Object.assign({}, config);
      config.members = config.members.slice(0, 2);
      // The reconfig is issued with a version one higher than the fetched config.
      config.version++;
      assert.commandWorked(primary.adminCommand({replSetReconfig: config}));
      
      // Give plenty of time for config to propagate.
      // NOTE(review): this is a fixed delay; nothing here verifies that n3 has actually
      // received the new config and considers itself removed.
      sleep(5000);
      
      // Block the removed secondary from installing new configs via heartbeat at this point. This is to
      // simulate a case where heartbeats are propagating very slowly for some reason between nodes.
      let removedConn = new Mongo(removedHost);
      // The returned handle is not used again in this script, so the fail point is never
      // explicitly turned off before the set is stopped.
      let fp = configureFailPoint(removedConn, "blockHeartbeatReconfigFinish");
      
      // Reconfig back to the original config: {n1, n2, n3}. n3 will not hear about this yet, though,
      // and still think it is REMOVED.
      // origConfig still carries the pre-removal version, so set it one past the two-node
      // config that was just installed.
      origConfig.version = config.version + 1;
      assert.commandWorked(primary.adminCommand({replSetReconfig: origConfig}));
      
      // Step down the primary and back up again. It should send a vote request to the REMOVED node.
      assert.commandWorked(primary.adminCommand({replSetStepDown: 1, force: true}));
      // Presumably waits out the step-down period requested above before asking the same
      // node to step up again — TODO confirm.
      sleep(2000);
      assert.commandWorked(primary.adminCommand({replSetStepUp: 1}));
      // Return value is discarded; presumably called only to wait until a primary exists.
      rst.getPrimary();
      
      rst.stopSet();
      
      Show
      load( "jstests/libs/fail_point_util.js" ); // For configureFailPoint. // // Test sending a vote request to a removed node. // // Start out with {n1, n2, n3} let rst = new ReplSetTest({nodes: 3}); rst.startSet(); rst.initiate(); let primary = rst.getPrimary(); // Save the host of the node that will become removed. let removedHost = rst.nodes[2].host; // Remove n3 from the config. let config = rst.getReplSetConfigFromNode(); let origConfig = Object .assign({}, config); config.members = config.members.slice(0, 2); config.version++; assert.commandWorked(primary.adminCommand({replSetReconfig: config})); // Give plenty of time for config to propagate. sleep(5000); // Block the removed secondary from installing new configs via heartbeat at this point. This is to // simulate a case where heartbeats are propagating very slowly for some reason between nodes. let removedConn = new Mongo(removedHost); let fp = configureFailPoint(removedConn, "blockHeartbeatReconfigFinish" ); // Reconfig back to the original config: {n1, n2, n3}. n3 will not hear about this yet, though, // and still think it is REMOVED. origConfig.version = config.version + 1; assert.commandWorked(primary.adminCommand({replSetReconfig: origConfig})); // Step down the primary and back up again. It should send a vote request to the REMOVED node. assert.commandWorked(primary.adminCommand({replSetStepDown: 1, force: true })); sleep(2000); assert.commandWorked(primary.adminCommand({replSetStepUp: 1})); rst.getPrimary(); rst.stopSet();
    • Repl 2020-03-23

      If we remove a secondary from a replica set config, it will enter the REMOVED state and record its selfIndex as -1. It is possible for a primary to reconfig to add this secondary back into the config and then run for a new election before the secondary learns that it is no longer REMOVED. In this scenario, the primary can send a vote request to the still-REMOVED secondary, which triggers an invariant failure when the secondary tries to look itself up via the TopologyCoordinator::_selfConfig method: because the node is REMOVED, its selfIndex is -1, which violates the invariant.

            Assignee:
            siyuan.zhou@mongodb.com Siyuan Zhou
            Reporter:
            william.schultz@mongodb.com William Schultz (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: