Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-85423

CollectionScan::work() returns NEED_TIME for clustered collections which causes early yielding

    • Type: Icon: Bug Bug
    • Resolution: Won't Fix
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: None
    • Component/s: None
    • None
    • ALL
    • Hide

      Below is a modified version of the test case, stripped down to the bare minimum to reliably reproduce the issue. In the failure case, the test will hang indefinitely.

      // Test that the read concern level 'snapshot' exhibits the correct yielding behavior. That is,
      // operations performed at read concern level snapshot check for interrupt but do not yield locks or
      // storage engine resources.
      // @tags: [
      //   uses_transactions,
      // ]
      import {waitForCurOpByFailPoint} from "jstests/libs/curop_helpers.js";
      
      const dbName = "test";
      // NOTE(review): 'collName' appears unused in this stripped-down repro; the
      // code below refers to 'db.coll' / "coll" directly.
      const collName = "coll";
      
      // Single-node replica set: transactions require a replica set, and one node
      // is enough to reproduce the yielding behavior under test.
      const rst = new ReplSetTest({nodes: 1});
      rst.startSet();
      rst.initiate();
      const db = rst.getPrimary().getDB(dbName);
      const adminDB = db.getSiblingDB("admin");
      const coll = db.coll;
      // Number of documents populateCollection() seeds; also used as the _id of
      // the document inserted mid-execution in testCommand().
      TestData.numDocs = 4;
      
      // Set 'internalQueryExecYieldIterations' to 2 to ensure that commands yield on the second try
      // (i.e. after they have established a snapshot but before they have returned any documents).
      assert.commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 2}));
      
      // Set 'internalQueryExecYieldPeriodMS' to 24 hours to significantly reduce the probability of a
      // situation occurring where the execution threads do not receive enough CPU time and commands
      // yield on timeout (i.e. yield period expiration) instead of on the second try as expected by
      // setting parameter 'internalQueryExecYieldIterations' to 2.
      assert.commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldPeriodMS: 86400000}));
      
      // Asserts that exactly one in-progress operation on the test collection has
      // the given opid, and that a kill has been requested for it (killPending).
      function assertKillPending(opId) {
          const ns = coll.getFullName();
          const matching =
              adminDB.aggregate([{$currentOp: {}}, {$match: {ns: ns, opid: opId}}]).toArray();
          // Capture every op on the namespace up front so a failed assertion
          // reports useful diagnostics.
          const allOpsOnNs =
              tojson(adminDB.aggregate([{$currentOp: {}}, {$match: {ns: ns}}]).toArray());
          assert.eq(matching.length, 1, allOpsOnNs);
          const [op] = matching;
          assert(op.hasOwnProperty("killPending"), tojson(matching));
          assert.eq(true, op.killPending, tojson(matching));
      }
      
      // Drops and re-seeds the test collection with TestData.numDocs documents
      // ({_id: 0..numDocs-1, x: 1, location: [0, 0]}) plus a 2d geo index, using
      // w:"majority" throughout so the data is durably committed before the test
      // opens its snapshot.
      function populateCollection() {
          const majority = {writeConcern: {w: "majority"}};
          db.coll.drop(majority);
          let i = 0;
          while (i < TestData.numDocs) {
              assert.commandWorked(db.coll.insert({_id: i, x: 1, location: [0, 0]}, majority));
              i++;
          }
          assert.commandWorked(db.runCommand({
              createIndexes: "coll",
              indexes: [{key: {location: "2d"}, name: "geo_2d"}],
              writeConcern: {w: "majority"}
          }));
      }
      
      // Core test driver. Runs 'awaitCommandFn' in a parallel shell twice to check
      // two properties of snapshot read concern operations:
      //   1. they hold their locks while paused at the interrupt-check failpoint
      //      (a concurrent drop must block until the command finishes), and
      //   2. they do not observe a document inserted after the snapshot was taken.
      // 'curOpFilter' narrows the $currentOp match used to detect that the command
      // has reached the failpoint. NOTE(review): 'testWriteConflict' and
      // 'setYieldIterations' are never referenced in this stripped-down version;
      // presumably the full test uses them — confirm against the original file.
      function testCommand(awaitCommandFn, curOpFilter, testWriteConflict, setYieldIterations) {
          //
          // Test that the command does not yield locks.
          //
      
          TestData.txnNumber++;
          populateCollection();
      
          // Start a command that hangs before checking for interrupt.
          assert.commandWorked(db.adminCommand(
              {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "alwaysOn"}));
          let awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]);
          // Block until $currentOp shows the command parked on the failpoint; only
          // then is it safe to start the competing drop.
          waitForCurOpByFailPoint(
              db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang", curOpFilter);
      
          // Start a drop. This should block behind the command, since the command does not yield
          // locks.
          let awaitDrop = startParallelShell(function() {
              db.getSiblingDB("test").coll.drop({writeConcern: {w: "majority"}});
          }, rst.ports[0]);
      
          // Remove the hang. The command should complete successfully.
          assert.commandWorked(db.adminCommand(
              {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "off"}));
          awaitCommand();
      
          // Now the drop can complete.
          awaitDrop();
      
          //
          // Test that the command does not read data that is inserted during its execution.
          // 'awaitCommandFn' should fail if it reads the following document:
          //     {_id: <numDocs>, x: 1, new: 1, location: [0, 0]}
          //
      
          TestData.txnNumber++;
          populateCollection();
      
          // Start a command that hangs before checking for interrupt.
          assert.commandWorked(db.adminCommand(
              {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "alwaysOn"}));
          awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]);
          waitForCurOpByFailPoint(
              db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang", curOpFilter);
      
          // Insert data that should not be read by the command. The insert lands
          // while the command is paused at the failpoint, i.e. after its snapshot
          // was established.
          assert.commandWorked(db.coll.insert({_id: TestData.numDocs, x: 1, new: 1, location: [0, 0]},
                                              {writeConcern: {w: "majority"}}));
      
          // Remove the hang. The command should complete successfully.
          assert.commandWorked(db.adminCommand(
              {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "off"}));
          awaitCommand();
      }
      
      // Test delete. Runs a two-statement delete inside a snapshot transaction:
      // the first delete (q: {}) removes one pre-existing document; the second
      // (q: {new: 1}) targets the document testCommand() inserts mid-execution,
      // which must be invisible to the snapshot — hence res.n must stay 1.
      // NOTE(review): only three arguments are passed, so 'setYieldIterations'
      // is undefined inside testCommand() (it is unused there anyway).
      testCommand(function() {
          const session = db.getMongo().startSession({causalConsistency: false});
          const sessionDb = session.getDatabase("test");
          session.startTransaction({readConcern: {level: "snapshot"}});
          const res = assert.commandWorked(sessionDb.runCommand(
              {delete: "coll", deletes: [{q: {}, limit: 1}, {q: {new: 1}, limit: 1}]}));
          assert.commandWorked(session.commitTransaction_forTesting());
          // Only remove one existing doc committed before the transaction.
          assert.eq(res.n, 1, tojson(res));
      }, {op: "remove"}, true);
      
      rst.stopSet();
      
      Show
      Below is a modified version of the test case, stripped down to the bare minimum to reliably reproduce the issue. In the failure case, the test will hang indefinitely. // Test that the read concern level 'snapshot' exhibits the correct yielding behavior. That is, // operations performed at read concern level snapshot check for interrupt but do not yield locks or // storage engine resources. // @tags: [ // uses_transactions, // ] import {waitForCurOpByFailPoint} from "jstests/libs/curop_helpers.js" ; const dbName = "test" ; const collName = "coll" ; const rst = new ReplSetTest({nodes: 1}); rst.startSet(); rst.initiate(); const db = rst.getPrimary().getDB(dbName); const adminDB = db.getSiblingDB( "admin" ); const coll = db.coll; TestData.numDocs = 4; // Set 'internalQueryExecYieldIterations' to 2 to ensure that commands yield on the second try // (i.e. after they have established a snapshot but before they have returned any documents). assert .commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 2})); // Set 'internalQueryExecYieldPeriodMS' to 24 hours to significantly reduce a probability of a // situation occuring where the execution threads do not receive enough CPU time and commands yield // on timeout (i.e. yield period expiration) instead of on the second try as expected by setting // parameter 'internalQueryExecYieldIterations' to 2. 
assert .commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldPeriodMS: 86400000})); function assertKillPending(opId) { const res = adminDB.aggregate([{$currentOp: {}}, {$match: {ns: coll.getFullName(), opid: opId}}]) .toArray(); assert .eq( res.length, 1, tojson( adminDB.aggregate([{$currentOp: {}}, {$match: {ns: coll.getFullName()}}]).toArray())); assert (res[0].hasOwnProperty( "killPending" ), tojson(res)); assert .eq( true , res[0].killPending, tojson(res)); } function populateCollection() { db.coll.drop({writeConcern: {w: "majority" }}); for (let i = 0; i < TestData.numDocs; i++) { assert .commandWorked( db.coll.insert({_id: i, x: 1, location: [0, 0]}, {writeConcern: {w: "majority" }})); } assert .commandWorked(db.runCommand({ createIndexes: "coll" , indexes: [{key: {location: "2d" }, name: "geo_2d" }], writeConcern: {w: "majority" } })); } function testCommand(awaitCommandFn, curOpFilter, testWriteConflict, setYieldIterations) { // // Test that the command does not yield locks. // TestData.txnNumber++; populateCollection(); // Start a command that hangs before checking for interrupt. assert .commandWorked(db.adminCommand( {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang" , mode: "alwaysOn" })); let awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]); waitForCurOpByFailPoint( db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang" , curOpFilter); // Start a drop. This should block behind the command, since the command does not yield // locks. let awaitDrop = startParallelShell(function() { db.getSiblingDB( "test" ).coll.drop({writeConcern: {w: "majority" }}); }, rst.ports[0]); // Remove the hang. The command should complete successfully. assert .commandWorked(db.adminCommand( {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang" , mode: "off" })); awaitCommand(); // Now the drop can complete. awaitDrop(); // // Test that the command does not read data that is inserted during its execution. 
// 'awaitCommandFn' should fail if it reads the following document: // {_id: <numDocs>, x: 1, new : 1, location: [0, 0]} // TestData.txnNumber++; populateCollection(); // Start a command that hangs before checking for interrupt. assert .commandWorked(db.adminCommand( {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang" , mode: "alwaysOn" })); awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]); waitForCurOpByFailPoint( db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang" , curOpFilter); // Insert data that should not be read by the command. assert .commandWorked(db.coll.insert({_id: TestData.numDocs, x: 1, new : 1, location: [0, 0]}, {writeConcern: {w: "majority" }})); // Remove the hang. The command should complete successfully. assert .commandWorked(db.adminCommand( {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang" , mode: "off" })); awaitCommand(); } // Test delete. testCommand(function() { const session = db.getMongo().startSession({causalConsistency: false }); const sessionDb = session.getDatabase( "test" ); session.startTransaction({readConcern: {level: "snapshot" }}); const res = assert .commandWorked(sessionDb.runCommand( {delete: "coll" , deletes: [{q: {}, limit: 1}, {q: { new : 1}, limit: 1}]})); assert .commandWorked(session.commitTransaction_forTesting()); // Only remove one existing doc committed before the transaction. assert .eq(res.n, 1, tojson(res)); }, {op: "remove" }, true ); rst.stopSet();

      The CollectionScan::work() method returns NEED_TIME when it is scanning a clustered collection and it is unable to seekNear() close enough to the target record. As a result, PlanExecutorImpl::_getNextImpl() will continue to the next iteration in the loop, resulting in another check for yielding.

      In the case of jstests/noPassthrough/read_concern_snapshot_yielding.js, the test sets internalQueryExecYieldIterations to 2 and internalQueryExecYieldPeriodMS to 24 hours. The effect of this is that the setInterruptOnlyPlansCheckForInterruptHang failpoint will be entered whenever the loop in PlanExecutorImpl::_getNextImpl() reaches its second iteration. So, if the collection being scanned is clustered — in this case config.transactions — and the scan is unable to reach the interior of the scan boundary, the failpoint will be entered before the first document is made available.

      My changes to make config.transactions clustered in SERVER-80862 result in this test nondeterministically failing, because the failpoint is reached too early in this way. The patch below provides a solution that is likely unfit for production but demonstrates the problem:

      diff --git a/src/mongo/db/exec/collection_scan.cpp b/src/mongo/db/exec/collection_scan.cpp
      index 4e88a1cc4fb..69362d2cc9f 100644
      --- a/src/mongo/db/exec/collection_scan.cpp
      +++ b/src/mongo/db/exec/collection_scan.cpp
      @@ -156,6 +156,7 @@ CollectionScan::CollectionScan(ExpressionContext* expCtx,
       }
      
       PlanStage::StageState CollectionScan::doWork(WorkingSetID* out) {
      +    _needTimeDueToClusteredCollection = false;
           if (_commonStats.isEOF) {
               _priority.reset();
               return PlanStage::IS_EOF;
      @@ -474,6 +475,8 @@ PlanStage::StageState CollectionScan::returnIfMatches(WorkingSetMember* member,
           // mark their position for resuming the tailable scan later on.
           if (beforeStartOfRange(_params, *member)) {
               _workingSet->free(memberID);
      +        LOGV2(8086240, "setting PlanStage::NEED_TIME bc clustered collection didn't seekNear close enough");
      +        _needTimeDueToClusteredCollection = true;
               return PlanStage::NEED_TIME;
           }
      
      diff --git a/src/mongo/db/exec/plan_stage.h b/src/mongo/db/exec/plan_stage.h
      index bdf48637c72..4a2ae06c4a0 100644
      --- a/src/mongo/db/exec/plan_stage.h
      +++ b/src/mongo/db/exec/plan_stage.h
      @@ -363,6 +363,11 @@ public:
               _commonStats.executionTime.emplace(0);
           }
      
      +    bool needTimeDueToClusteredCollection() const {
      +        return _needTimeDueToClusteredCollection;
      +    }
      +    bool _needTimeDueToClusteredCollection{false};
      +
       protected:
           /**
            * Performs one unit of work.  See comment at work() above.
      diff --git a/src/mongo/db/query/plan_executor_impl.cpp b/src/mongo/db/query/plan_executor_impl.cpp
      index 0828f77067c..c68d29388c2 100644
      --- a/src/mongo/db/query/plan_executor_impl.cpp
      +++ b/src/mongo/db/query/plan_executor_impl.cpp
      @@ -415,7 +415,7 @@ PlanExecutor::ExecState PlanExecutorImpl::_getNextImpl(Snapshotted<Document>* ob
                   }
               };
      
      -        if (_yieldPolicy->shouldYieldOrInterrupt(_opCtx)) {
      +        if (!_root->needTimeDueToClusteredCollection() && _yieldPolicy->shouldYieldOrInterrupt(_opCtx)) {
                   uassertStatusOK(_yieldPolicy->yieldOrInterrupt(_opCtx, whileYieldingFn));
               }
      

      With this patch, the failure is no longer reproducible. I suspect that this is a bona fide bug in the query code, since it seems wrong to me that yielding would occur more often because a collection is clustered (note that the issue never occurs when the collection is nonclustered).

            Assignee:
            james.bronsted@mongodb.com James Bronsted
            Reporter:
            james.bronsted@mongodb.com James Bronsted
            Votes:
            0 Vote for this issue
            Watchers:
            7 Start watching this issue

              Created:
              Updated:
              Resolved: