Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-63143

Operation can be interrupted by maxTimeMS timeout while waiting for lock even if _ignoreInterruptsExceptForReplStateChange is set

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 5.3.0
    • Affects Version/s: None
    • Component/s: Replication
    • None
    • Fully Compatible
    • ALL
    • v5.2, v5.0, v4.4, v4.2
    • Hide

      Apply the below git patch and run the given resmoke command after building.

      buildscripts/resmoke.py run --suite=replica_sets jstests/replsets/bf_23831_repro.js
      
      Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml
      diff --git a/jstests/replsets/bf_23831_repro.js b/jstests/replsets/bf_23831_repro.js
      new file mode 100644
      index 00000000000..3757540dc3e
      --- /dev/null
      +++ b/jstests/replsets/bf_23831_repro.js
      @@ -0,0 +1,72 @@
      +load("jstests/libs/fail_point_util.js");
      +
      +const rst = new ReplSetTest({nodes: 2});
      +rst.startSet();
      +rst.initiate();
      +
      +const primary = rst.getPrimary();
      +const secondary = rst.getSecondary();
      +
      +const kNumDocs = 10;
      +const kDbName = "BF23831DB";
      +const kCollName = "testCollection";
      +const kNamespace = kDbName + "." + kCollName;
      +
      +let formatProfileQuery = function(ns, cmdQuery, isQueryOp = false) {
      +    let profileQuery = {op: isQueryOp ? "query" : "command", errCode: {$exists: false}};
      +    if (ns) {
      +        profileQuery["ns"] = ns;
      +    }
      +
      +    for (const field in cmdQuery) {
      +        profileQuery["command." + field] = cmdQuery[field];
      +    }
      +
      +    return profileQuery;
      +};
      +
      +let getNumNodesCmdRanOn = function(rsNodes, {dbName, profileQuery}) {
      +    let numNodesCmdRanOn = 0;
      +    rsNodes.forEach(function(node) {
      +        let profileDB = node.getDB(dbName);
      +        let result = profileDB.system.profile.findOne(profileQuery);
      +        if (result != null) {
      +            assert(profileDB.adminCommand({isMaster: 1}).secondary);
      +            numNodesCmdRanOn += 1;
      +        }
      +    });
      +    return numNodesCmdRanOn;
      +};
      +
      +let assertCmdRanOnExpectedNodes = function(rsNodes, cmdTestCase) {
      +    cmdTestCase.cmdFunc();
      +    assert.eq(getNumNodesCmdRanOn(rsNodes, cmdTestCase), 1);
      +};
      +
      +rst.nodes.forEach(function(node) {
      +    assert(node.getDB(kDbName).setProfilingLevel(2));
      +    assert(node.getDB('admin').setProfilingLevel(2));
      +});
      +
      +const testCollPrimary = primary.getCollection(kNamespace);
      +
      +const bulk = testCollPrimary.initializeUnorderedBulkOp();
      +for (let i = 0; i < kNumDocs; ++i) {
      +    bulk.insert({_id: i, x: i});
      +}
      +assert.commandWorked(bulk.execute());
      +
      +const testCollSecondary = secondary.getCollection(kNamespace);
      +
      +configureFailPoint(secondary, "acquireLockDuringIntrospect");
      +const cursor = testCollSecondary.find({x: {$gte: 0}}).maxTimeMS(3000);
      +
      +const cmdFunc = () => cursor.toArray();
      +
      +const profileQuery =
      +    formatProfileQuery(kNamespace, {find: kCollName, filter: {x: {$gte: 0}}}, true);
      +const dbName = kDbName;
      +
      +assertCmdRanOnExpectedNodes([primary, secondary], {cmdFunc, profileQuery, dbName});
      +
      +rst.stopSet();
      diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
      index f63cffbd86c..e3473d9f765 100644
      --- a/src/mongo/db/introspect.cpp
      +++ b/src/mongo/db/introspect.cpp
      @@ -49,6 +49,10 @@
       
       namespace mongo {
       
      +namespace {
      +MONGO_FAIL_POINT_DEFINE(acquireLockDuringIntrospect);
      +}
      +
       using std::endl;
       using std::string;
       using std::unique_ptr;
      @@ -108,6 +112,25 @@ void profile(OperationContext* opCtx, NetworkOp op) {
           });
       
           try {
      +        auto pf = makePromiseFuture<void>();
      +        if (acquireLockDuringIntrospect.shouldFail()) {
      +            acquireLockDuringIntrospect.execute(
      +                [p = std::move(pf.promise)](const auto& data) mutable {
      +                    std::thread t{[p = std::move(p)]() mutable {
      +                        auto client = getGlobalServiceContext()->makeClient("BF-23831");
      +                        auto opCtx = client->makeOperationContext();
      +                        {
      +                            Lock::ParallelBatchWriterMode pbwm(opCtx->lockState());
      +                            p.emplaceValue();
      +                            std::this_thread::sleep_for(std::chrono::milliseconds(5000));
      +                        }
      +                    }};
      +                    t.detach();
      +                });
      +        } else {
      +            pf.promise.emplaceValue();
      +        }
      +        pf.future.get();
               const auto dbProfilingNS = NamespaceString(dbName, "system.profile");
               AutoGetCollection autoColl(opCtx, dbProfilingNS, MODE_IX);
               Database* const db = autoColl.getDb();
      
      Show
      Apply the below git patch and run the given resmoke command after building. buildscripts/resmoke.py run --suite=replica_sets jstests/replsets/bf_23831_repro.js Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml diff --git a/jstests/replsets/bf_23831_repro.js b/jstests/replsets/bf_23831_repro.js new file mode 100644 index 00000000000..3757540dc3e --- /dev/ null +++ b/jstests/replsets/bf_23831_repro.js @@ -0,0 +1,72 @@ +load( "jstests/libs/fail_point_util.js" ); + + const rst = new ReplSetTest({nodes: 2}); +rst.startSet(); +rst.initiate(); + + const primary = rst.getPrimary(); + const secondary = rst.getSecondary(); + + const kNumDocs = 10; + const kDbName = "BF23831DB" ; + const kCollName = "testCollection" ; + const kNamespace = kDbName + "." + kCollName; + +let formatProfileQuery = function(ns, cmdQuery, isQueryOp = false ) { + let profileQuery = {op: isQueryOp ? "query" : "command" , errCode: {$exists: false }}; + if (ns) { + profileQuery[ "ns" ] = ns; + } + + for ( const field in cmdQuery) { + profileQuery[ "command." + field] = cmdQuery[field]; + } + + return profileQuery; +}; + +let getNumNodesCmdRanOn = function(rsNodes, {dbName, profileQuery}) { + let numNodesCmdRanOn = 0; + rsNodes.forEach(function(node) { + let profileDB = node.getDB(dbName); + let result = profileDB.system.profile.findOne(profileQuery); + if (result != null ) { + assert (profileDB.adminCommand({isMaster: 1}).secondary); + numNodesCmdRanOn += 1; + } + }); + return numNodesCmdRanOn; +}; + +let assertCmdRanOnExpectedNodes = function(rsNodes, cmdTestCase) { + cmdTestCase.cmdFunc(); + assert .eq(getNumNodesCmdRanOn(rsNodes, cmdTestCase), 1); +}; + +rst.nodes.forEach(function(node) { + assert (node.getDB(kDbName).setProfilingLevel(2)); + assert (node.getDB( 'admin' ).setProfilingLevel(2)); +}); + + const testCollPrimary = primary.getCollection(kNamespace); + + const bulk = testCollPrimary.initializeUnorderedBulkOp(); + for (let i = 0; i < kNumDocs; ++i) { + bulk.insert({_id: i, x: i}); +} + assert .commandWorked(bulk.execute()); + + const testCollSecondary = secondary.getCollection(kNamespace); + +configureFailPoint(secondary, "acquireLockDuringIntrospect" ); + const cursor = testCollSecondary.find({x: {$gte: 0}}).maxTimeMS(3000); + + const cmdFunc = () => cursor.toArray(); + + const profileQuery = + formatProfileQuery(kNamespace, {find: kCollName, filter: {x: {$gte: 0}}}, true ); + const dbName = kDbName; + +assertCmdRanOnExpectedNodes([primary, secondary], {cmdFunc, profileQuery, dbName}); + +rst.stopSet(); diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp index f63cffbd86c..e3473d9f765 100644 --- a/src/mongo/db/introspect.cpp +++ b/src/mongo/db/introspect.cpp @@ -49,6 +49,10 @@ namespace mongo { +namespace { +MONGO_FAIL_POINT_DEFINE(acquireLockDuringIntrospect); +} + using std::endl; using std::string; using std::unique_ptr; @@ -108,6 +112,25 @@ void profile(OperationContext* opCtx, NetworkOp op) { }); try { + auto pf = makePromiseFuture<void>(); + if (acquireLockDuringIntrospect.shouldFail()) { + acquireLockDuringIntrospect.execute( + [p = std::move(pf.promise)]( const auto& data) mutable { + std::thread t{[p = std::move(p)]() mutable { + auto client = getGlobalServiceContext()->makeClient( "BF-23831" ); + auto opCtx = client->makeOperationContext(); + { + Lock::ParallelBatchWriterMode pbwm(opCtx->lockState()); + p.emplaceValue(); + std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + } + }}; + t.detach(); + }); + } else { + pf.promise.emplaceValue(); + } + pf. future .get(); const auto dbProfilingNS = NamespaceString(dbName, "system.profile" ); AutoGetCollection autoColl(opCtx, dbProfilingNS, MODE_IX); Database* const db = autoColl.getDb();
    • Replication 2022-02-07, Repl 2022-02-21
    • 0

      It is possible for an operation to fail with "operation exceeded time limit" while waiting for a lock if maxTimeMS is set even if _ignoreInterruptsExceptForReplStateChange was also set and there was no step up or step down. This can cause operation profiling to fail on a secondary if the parallel batch writer lock was held while profiling began.

      See also SERVER-59226 for context.

            Assignee:
            matthew.russotto@mongodb.com Matthew Russotto
            Reporter:
            brett.nawrocki@mongodb.com Brett Nawrocki
            Votes:
            0 Vote for this issue
            Watchers:
            7 Start watching this issue

              Created:
              Updated:
              Resolved: