Core Server / SERVER-47852

Two primaries can satisfy write concern "majority" after data erased on a node

    • Type: Bug
    • Resolution: Works as Designed
    • Priority: Major - P3
    • Affects Version/s: None
    • Component/s: Replication
    • Operating System: ALL
      Attached jstest:
      /*
       * This test demonstrates 2 primaries existing in the same replica set, both of which
       * can satisfy majority write concern.
       *
       * The test simulates the scenario below.
       * Note: 'P' refers to a primary, 'S' to a secondary.
       * 1) [P, S0, S1, S2] // Start a 4 node replica set.
       * 2) Partition A: [P] Partition B: [S0->P, S1, S2] // Create n/w partition A & B.
       * 3) Partition A: [P] Partition B: [P, S1, S2, S3] // Add a new node S3 to Partition B.
       * 4) Partition A: [P, S2] Partition B: [P, S1, S3] // Restart/resync S2 and move back to partition A pool.
       * 5) Partition A: [P, S2, S4] Partition B: [P, S1, S3] // Add a new node S4 to Partition A.
       */
      load('jstests/replsets/rslib.js');
      (function() {
      'use strict';
      
      // Start a 4 node replica set.
      // [P, S0, S1, S2]
      const rst = new ReplSetTest({
          nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
          nodeOptions: {setParameter: {enableAutomaticReconfig: false}},
          useBridge: true
      });
      
      // Disable chaining and prevent automatic elections due to liveness timeouts.
      var config = rst.getReplSetConfig();
      config.settings = config.settings || {};
      config.settings["chainingAllowed"] = false;
      config.settings["electionTimeoutMillis"] = ReplSetTest.kForeverMillis;
      
      rst.startSet();
      rst.initiate(config);
      
      const dbName = jsTest.name();
      const collName = "coll";
      
      let primary1 = rst.getPrimary();
      const primaryDB = primary1.getDB(dbName);
      const primaryColl = primaryDB[collName];
      const secondaries = rst.getSecondaries();
      
      jsTestLog("Do a document write");
      assert.commandWorked(primaryColl.insert({_id: 1, x: 1}, {"writeConcern": {"w": 4}}));
      rst.awaitReplication();
      
      // Create a network partition so that we end up in this state: [P] [S0, S1, S2].
      jsTestLog("Disconnect primary1 from all secondaries");
      primary1.disconnect([secondaries[0], secondaries[1], secondaries[2]]);
      
      jsTestLog("Make secondary0 to be become primary");
      assert.commandWorked(secondaries[0].adminCommand({"replSetStepUp": 1}));
      
      // Now our network topology will be [P] [S0->P, S1, S2].
      jsTestLog("Wait for secondary0 to become master");
      checkLog.contains(secondaries[0], "Transition to primary complete");
      let primary2 = secondaries[0];
      
      jsTestLog("Adding a new voting node to the replica set");
      const node5 = rst.add({
          rsConfig: {priority: 0, votes: 1},
          setParameter: {
              'numInitialSyncAttempts': 1,
              'enableAutomaticReconfig': false,
          }
      });
      
      // Simulate this network topology [P] [P, S1, S2, S3].
      node5.disconnect([primary1]);
      
      // Run a reconfig command on primary2 to add node5.
      config = rst.getReplSetConfigFromNode(1);
      var newConfig = rst.getReplSetConfig();
      config.members = newConfig.members;
      config.version += 1;
      assert.adminCommandWorkedAllowingNetworkError(
          primary2, {replSetReconfig: config, maxTimeMS: ReplSetTest.kDefaultTimeoutMS});
      
      // Make sure new writes are able to propagate to the newly added node.
      jsTestLog("Do a document write on the primary2");
      assert.commandWorked(
          primary2.getDB(dbName)[collName].insert({_id: 2, x: 2}, {"writeConcern": {"w": 4}}));
      
      // Now make sure we get into this state: [P, S2] [P, S1, S3].
      jsTestLog("Disconnect Secondary2 from primary2 and reconnect to primary1");
      secondaries[2].disconnect([secondaries[0], secondaries[1], node5]);
      secondaries[2].reconnect([primary1]);
      
      jsTestLog("Kill and restart Secondary2");
      rst.stop(3, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
      jsTestLog("Restarting the node.");
      var restartNode = rst.start(3, {startClean: true}, true);
      
      jsTestLog("wait for secondary state");
      waitForState(restartNode, ReplSetTest.State.SECONDARY);
      
      jsTestLog("Adding a new voting node to the replica set");
      const node6 = rst.add({
          rsConfig: {priority: 0, votes: 1},
          setParameter: {
              'numInitialSyncAttempts': 1,
              'enableAutomaticReconfig': false,
          }
      });
      
      // Simulate this network topology [P, S2, S4] [P, S1, S3].
      node6.disconnect([secondaries[0], secondaries[1], node5]);
      
      // Run a reconfig command on primary1 to add node6.
      config = rst.getReplSetConfigFromNode(0);
      newConfig = rst.getReplSetConfig();
      // Append only the newly added node (node6) to primary1's member list.
      config.members[4] = newConfig.members[5];
      config.version += 1;
      assert.adminCommandWorkedAllowingNetworkError(
          primary1, {replSetReconfig: config, maxTimeMS: ReplSetTest.kDefaultTimeoutMS});
      
      jsTestLog(
          "Do some document writes to verify we have 2 primaries and both satisfy write concern majority");
      assert.commandWorked(primary1.getDB(dbName)[collName].insert({_id: 3, x: "primary1 Doc"},
                                                                   {"writeConcern": {"w": "majority"}}));
      assert.commandWorked(primary1.getDB(dbName)[collName].insert({_id: 4, x: "primary1 Doc"},
                                                                   {"writeConcern": {"w": 3}}));
      assert.commandWorked(primary1.getDB(dbName)[collName].insert({_id: 5, x: "primary1 Doc"},
                                                                   {"writeConcern": {"w": "majority"}}));
      assert.commandWorked(primary2.getDB(dbName)[collName].insert({_id: 6, x: "primary2 Doc"},
                                                                   {"writeConcern": {"w": "majority"}}));
      
      jsTestLog("Verify our primary1 can be get re-elected.");
      assert.commandWorked(primary1.adminCommand({"replSetStepDown": 1000, "force": true}));
      assert.commandWorked(primary1.adminCommand({replSetFreeze: 0}));
      assert.commandWorked(primary1.adminCommand({"replSetStepUp": 1}));
      
      jsTestLog("Test completed");
      rst.stopSet();
      }());
      

      While working on the initial sync semantics upgrade/downgrade work, I found a scenario that can lead to 2 primaries in a replica set, with both primaries able to satisfy write concern "majority". It looks like a safe reconfig bug.
      Below is the scenario. Assume 'P' indicates a primary, 'S' indicates a secondary, and all the nodes involved in the scenario are voters (votes: 1).
      1) Start a 4 node replica set A, B, C, D ==> [A(P), B(S), C(S), D(S)], write/elect quorum = 3.
      2) Create network partitions X & Y. Partition X: [A(P)] Partition Y: [B(S), C(S), D(S)].
      3) Step up node B. Partition X: [A(P)] Partition Y: [B(P), C(S), D(S)].
      4) Add a new node E to Partition Y using a reconfig cmd. Partition X: [A(P)] Partition Y: [B(P), C(S), D(S), E(S)]; the write/elect quorum is still 3.
      5) Move node D to the partition X pool and have it restart and resync from node A. Partition X: [A(P), D(S)] Partition Y: [B(P), C(S), E(S)].
      6) Add a new node F to Partition X using a reconfig cmd. Partition X: [A(P), D(S), F(S)] Partition Y: [B(P), C(S), E(S)] (see the quorum sketch after this list).
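
      To make the quorum arithmetic explicit, here is a small standalone sketch (not part of the
      attached test; node labels follow the scenario above). Each partition's config lists 5 voters,
      so its majority is 3, and the two write quorums {A, D, F} and {B, C, E} are disjoint:

      // Standalone sketch of the quorum arithmetic in the scenario above.
      // Not part of the attached jstest; node labels follow the description.
      function majority(numVoters) {
          return Math.floor(numVoters / 2) + 1;
      }

      // After step 6, each partition believes in its own 5-member config.
      const configX = ["A", "B", "C", "D", "F"];  // persisted on A's side
      const configY = ["A", "B", "C", "D", "E"];  // persisted on B's side

      // Nodes actually reachable inside each partition.
      const partitionX = ["A", "D", "F"];
      const partitionY = ["B", "C", "E"];

      // The majority of either 5-member config is 3, and each partition holds exactly 3
      // voters of its own config, so both primaries can acknowledge {w: "majority"} writes
      // at the same time.
      assert.eq(majority(configX.length), 3);
      assert.eq(majority(configY.length), 3);
      assert.eq(partitionX.length, 3);
      assert.eq(partitionY.length, 3);

      // The two write quorums are disjoint, so neither majority ever hears about the other.
      assert.eq(partitionX.filter(n => partitionY.includes(n)).length, 0);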

      • To be noted, the prerequisites for a reconfig cmd are that the current config C_i must be
        majority committed, and that all entries committed in the previous config C_(i-1) must also
        be committed in the current config C_i. For node A, its current config C_i is [A, B, C, D]
        (commit quorum = 3), which is majority committed, and all entries committed in the previous
        config C_(i-1) are also committed. The quorum check also passes, since A can contact a
        majority of the nodes in the new config (A, D, F). So node A was able to run the reconfig
        cmd successfully, updating and persisting the new config document from [A, B, C, D] to
        [A, B, C, D, F], whose write/elect quorum is still 3. A simplified sketch of this
        precondition check follows below.
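
      Below is a simplified, hypothetical model of that precondition check. The function and
      parameter names are invented for illustration; this is not the server's implementation, and
      the oplog-commitment condition is left out of the sketch:

      // Hypothetical sketch of the safe-reconfig precondition check described above.
      // Not the server's implementation; all names here are illustrative only.
      function majorityOf(memberCount) {
          return Math.floor(memberCount / 2) + 1;
      }

      // Roughly: the current config C_i must be committed on a majority of its own voters,
      // and the primary must be able to reach a majority of the proposed config C_(i+1)
      // (the quorum check). The oplog-commitment condition is omitted here.
      function reconfigPreconditionsHold(
          currentConfig, newConfig, nodesWithCurrentConfig, reachableNodes) {
          const currentConfigCommitted =
              nodesWithCurrentConfig.length >= majorityOf(currentConfig.length);
          const quorumCheckPasses =
              newConfig.filter(n => reachableNodes.includes(n)).length >= majorityOf(newConfig.length);
          return currentConfigCommitted && quorumCheckPasses;
      }

      // Node A's view at step 6: its config is still the original [A, B, C, D], which all four
      // nodes installed before the partition, and it can reach A, D and F out of the proposed
      // config [A, B, C, D, F].
      assert(reconfigPreconditionsHold(["A", "B", "C", "D"],
                                       ["A", "B", "C", "D", "F"],
                                       ["A", "B", "C", "D"],
                                       ["A", "D", "F"]));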

      So, at the end of this, partition X thinks its config is [A, B, C, D, F], partition Y thinks its config is [A, B, C, D, E], and A is the primary of partition X while B is the primary of partition Y.
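
      For illustration only, the diverged config documents would look roughly like this (host names
      and version numbers are made up; the member lists follow the scenario above):

      // Illustrative only: rough shape of the diverged configs, one per partition.
      const configSeenInPartitionX = {
          _id: "rs",
          version: 3,  // illustrative version number
          members: [
              {_id: 0, host: "A:27017"},  // primary in partition X
              {_id: 1, host: "B:27017"},
              {_id: 2, host: "C:27017"},
              {_id: 3, host: "D:27017"},
              {_id: 5, host: "F:27017"},  // added by A's reconfig
          ]
      };

      const configSeenInPartitionY = {
          _id: "rs",
          version: 3,  // illustrative version number
          members: [
              {_id: 0, host: "A:27017"},
              {_id: 1, host: "B:27017"},  // primary in partition Y
              {_id: 2, host: "C:27017"},
              {_id: 3, host: "D:27017"},
              {_id: 4, host: "E:27017"},  // added by B's reconfig
          ]
      };

      // Each side's member list is a valid, majority-committable config from its own point of
      // view, yet the two write quorums {A, D, F} and {B, C, E} never intersect.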

      Note: This problem can also be reproduced with the initial sync semantics enabled. I have attached a jstest that demonstrates the problem.

            Assignee:
            backlog-server-repl [DO NOT USE] Backlog - Replication Team
            Reporter:
            suganthi.mani@mongodb.com Suganthi Mani
            Votes:
            0
            Watchers:
            11
