Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-44953

Secondaries should restart index builds when a commitIndexBuild oplog entry is processed but no index build is active

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major - P3
    • Fix Version/s: 4.3.3
    • Affects Version/s: None
    • Component/s: None
    • Labels: None
    • Backwards Compatibility: Fully Compatible
    • Operating System: ALL
    • Hide

      Using this failpoint:

      diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
      index 50d5b1a721..f2ee3c6794 100644
      --- a/src/mongo/db/index_builds_coordinator.cpp
      +++ b/src/mongo/db/index_builds_coordinator.cpp
      @@ -67,6 +67,7 @@ using namespace indexbuildentryhelpers;
       MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildFirstDrain);
       MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildSecondDrain);
       MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildDumpsInsertsFromBulk);
      +MONGO_FAIL_POINT_DEFINE(hangBeforeIndexBuildCleanUp);
      
       namespace {
      
      @@ -1344,6 +1345,8 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx,
                                   << replState->buildUUID);
           NamespaceString nss = collection->ns();
      
      +    hangBeforeIndexBuildCleanUp.pauseWhileSet();
      +
           if (status.isOK()) {
               _indexBuildsManager.tearDownIndexBuild(
                   opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn);
      

      This test will fail:

      // Reproduction script for the failure described in this ticket. It needs a server that has the
      // 'hangBeforeIndexBuildCleanUp' failpoint from the patch shown above.
      //
      // Outline:
      //   1. Start a replica set and begin building index {a: 1} on the primary.
      //   2. Kill the index build operation while it is paused right after initialization.
      //   3. Pause the build again just before clean-up and force the primary to step down.
      //   4. Release the failpoints and verify that the original primary and the secondary both
      //      report the '_id_' and 'a_1' indexes. The check on the original primary is the one
      //      marked below as failing.
      (function() {
      "use strict";
      
      // Presumably provides the IndexBuildTest helpers used throughout this script.
      load('jstests/noPassthrough/libs/index_build.js');
      
      // Two data-bearing nodes with default options plus one arbiter.
      const rst = new ReplSetTest({
          nodes: [
              {},
              {},
              {arbiter: true},
          ]
      });
      // NOTE(review): 'nodes' is not referenced again in this script.
      const nodes = rst.startSet();
      rst.initiate();
      
      const primary = rst.getPrimary();
      const testDB = primary.getDB('test');
      const coll = testDB.getCollection('test');
      
      // Seed the collection with a single document before building the index.
      assert.commandWorked(coll.insert({a: 1}));
      
      // Enable the first failpoint and remember the 'count' from the reply; it is used below as the
      // baseline for the 'timesEntered' argument of waitForFailPoint.
      let res = assert.commandWorked(primary.adminCommand(
          {configureFailPoint: 'hangAfterInitializingIndexBuild', mode: 'alwaysOn'}));
      const hangAfterInitFailpointTimesEntered = res.count;
      
      // Same for the failpoint that pauses the build just before clean-up.
      res = assert.commandWorked(
          primary.adminCommand({configureFailPoint: 'hangBeforeIndexBuildCleanUp', mode: 'alwaysOn'}));
      const hangBeforeCleanUpFailpointTimesEntered = res.count;
      
      // Kick off the index build. The returned function is called later to collect the exit code of
      // the shell that issued the build.
      const createIdx = IndexBuildTest.startIndexBuild(primary, coll.getFullName(), {a: 1});
      
      try {
          // Block until the index build has hit the first failpoint one more time than the baseline.
          assert.commandWorked(primary.adminCommand({
              waitForFailPoint: "hangAfterInitializingIndexBuild",
              timesEntered: hangAfterInitFailpointTimesEntered + 1,
              maxTimeMS: kDefaultWaitForFailPointTimeout
          }));
      
      
          // When the index build starts, find its op id. This will be the op id of the client
          // connection, not the thread pool task managed by IndexBuildsCoordinatorMongod.
          const filter = {"desc": {$regex: /conn.*/}};
          const opId = IndexBuildTest.waitForIndexBuildToStart(testDB, coll.getName(), 'a_1', filter);
      
          // Kill the index build.
          assert.commandWorked(testDB.killOp(opId));
      
          // Let the index build continue running and abort.
          assert.commandWorked(
              primary.adminCommand({configureFailPoint: 'hangAfterInitializingIndexBuild', mode: 'off'}));
      
          // Wait for the index build to start cleaning up.
          assert.commandWorked(primary.adminCommand({
              waitForFailPoint: "hangBeforeIndexBuildCleanUp",
              timesEntered: hangBeforeCleanUpFailpointTimesEntered + 1,
              maxTimeMS: kDefaultWaitForFailPointTimeout
          }));
      
          // Step down the primary, preventing the index build from generating an abort oplog entry.
          assert.commandWorked(primary.adminCommand({replSetStepDown: 30, force: true}));
      } finally {
          // Always switch both failpoints off, even if one of the assertions above threw.
          assert.commandWorked(
              primary.adminCommand({configureFailPoint: 'hangAfterInitializingIndexBuild', mode: 'off'}));
          // Let the index build finish cleaning up.
          assert.commandWorked(
              primary.adminCommand({configureFailPoint: 'hangBeforeIndexBuildCleanUp', mode: 'off'}));
      }
      
      // The shell that started the index build is expected to report a non-zero exit code.
      const exitCode = createIdx({checkExitSuccess: false});
      assert.neq(0, exitCode, 'expected shell to exit abnormally due to index build being terminated');
      
      // Wait for the index build to stop.
      IndexBuildTest.waitForIndexBuildToStop(testDB);
      
      // With two phase index builds, a stepdown will not abort the index build, which should complete
      // after the node becomes primary again.
      rst.awaitReplication();
      
      // >>> This line FAILS <<<
      // 'coll' was obtained through the connection to the node that was primary at the start of the
      // test, so this checks that node's index catalog.
      IndexBuildTest.assertIndexes(coll, 2, ['_id_', 'a_1'], [], {includeBuildUUIDs: true});
      
      // Run the same index check against the node returned by rst.getSecondary().
      const secondaryColl = rst.getSecondary().getCollection(coll.getFullName());
      IndexBuildTest.assertIndexes(secondaryColl, 2, ['_id_', 'a_1'], [], {includeBuildUUIDs: true});
      
      rst.stopSet();
      })();
      
      Show
      Using this failpoint: diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 50d5b1a721..f2ee3c6794 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -67,6 +67,7 @@ using namespace indexbuildentryhelpers; MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildFirstDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildSecondDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildDumpsInsertsFromBulk); +MONGO_FAIL_POINT_DEFINE(hangBeforeIndexBuildCleanUp); namespace { @@ -1344,6 +1345,8 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, << replState->buildUUID); NamespaceString nss = collection->ns(); + hangBeforeIndexBuildCleanUp.pauseWhileSet(); + if (status.isOK()) { _indexBuildsManager.tearDownIndexBuild( opCtx, collection, replState->buildUUID, MultiIndexBlock::kNoopOnCleanUpFn); This test will fail: ( function () { "use strict" ; load( 'jstests/noPassthrough/libs/index_build.js' ); const rst = new ReplSetTest({ nodes: [ {}, {}, {arbiter: true }, ] }); const nodes = rst.startSet(); rst.initiate(); const primary = rst.getPrimary(); const testDB = primary.getDB( 'test' ); const coll = testDB.getCollection( 'test' ); assert.commandWorked(coll.insert({a: 1})); let res = assert.commandWorked(primary.adminCommand( {configureFailPoint: 'hangAfterInitializingIndexBuild' , mode: 'alwaysOn' })); const hangAfterInitFailpointTimesEntered = res.count; res = assert.commandWorked( primary.adminCommand({configureFailPoint: 'hangBeforeIndexBuildCleanUp' , mode: 'alwaysOn' })); const hangBeforeCleanUpFailpointTimesEntered = res.count; const createIdx = IndexBuildTest.startIndexBuild(primary, coll.getFullName(), {a: 1}); try { assert.commandWorked(primary.adminCommand({ waitForFailPoint: "hangAfterInitializingIndexBuild" , timesEntered: hangAfterInitFailpointTimesEntered + 1, maxTimeMS: kDefaultWaitForFailPointTimeout })); // When the index build starts, find its 
op id. This will be the op id of the client // connection, not the thread pool task managed by IndexBuildsCoordinatorMongod. const filter = { "desc" : {$regex: /conn.*/}}; const opId = IndexBuildTest.waitForIndexBuildToStart(testDB, coll.getName(), 'a_1' , filter); // Kill the index build. assert.commandWorked(testDB.killOp(opId)); // Let the index build continue running and abort. assert.commandWorked( primary.adminCommand({configureFailPoint: 'hangAfterInitializingIndexBuild' , mode: 'off' })); // Wait for the index build to start cleaning up. assert.commandWorked(primary.adminCommand({ waitForFailPoint: "hangBeforeIndexBuildCleanUp" , timesEntered: hangBeforeCleanUpFailpointTimesEntered + 1, maxTimeMS: kDefaultWaitForFailPointTimeout })); // Step down the primary, preventing the index build from generating an abort oplog entry. assert.commandWorked(primary.adminCommand({replSetStepDown: 30, force: true })); } finally { assert.commandWorked( primary.adminCommand({configureFailPoint: 'hangAfterInitializingIndexBuild' , mode: 'off' })); // Let the index build finish cleaning up. assert.commandWorked( primary.adminCommand({configureFailPoint: 'hangBeforeIndexBuildCleanUp' , mode: 'off' })); } const exitCode = createIdx({checkExitSuccess: false }); assert.neq(0, exitCode, 'expected shell to exit abnormally due to index build being terminated' ); // Wait for the index build to stop. IndexBuildTest.waitForIndexBuildToStop(testDB); // With two phase index builds, a stepdown will not abort the index build, which should complete // after the node becomes primary again. rst.awaitReplication(); // >>> This line FAILS <<< IndexBuildTest.assertIndexes(coll, 2, [ '_id_' , 'a_1' ], [], {includeBuildUUIDs: true }); const secondaryColl = rst.getSecondary().getCollection(coll.getFullName()); IndexBuildTest.assertIndexes(secondaryColl, 2, [ '_id_' , 'a_1' ], [], {includeBuildUUIDs: true }); rst.stopSet(); })();
    • Sprint: Execution Team 2020-01-13, Execution Team 2020-01-27
    • 17

      The sequence is as follows:
      On the primary node:

      See this patch build.

            Assignee:
            louis.williams@mongodb.com Louis Williams
            Reporter:
            louis.williams@mongodb.com Louis Williams
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: