Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-31887

clusterTime advanced on primary without anything being written to oplog

    • Type: Icon: Bug Bug
    • Resolution: Works as Designed
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: 3.6.0-rc2
    • Component/s: Sharding
    • None
    • ALL
    • Hide
      python buildscripts/resmoke.py --suites=no_server repro_server31887.js --storageEngine=wiredTiger
      
      repro_server31887.js
      (function() {
          "use strict";
      
          const rst = new ReplSetTest({nodes: 1, nodeOptions: {setParameter: {writePeriodicNoops: 0}}});
          rst.startSet();
          rst.initiate();
      
          const primary = rst.getPrimary();
          const session = primary.startSession({causalConsistency: true});
          const db = session.getDatabase("test");
      
          function logClusterTime(prefix, res) {
              jsTest.log(
                  prefix + ": " +
                  tojson({operationTime: res.operationTime, clusterTime: res.$clusterTime.clusterTime}));
          }
      
          let res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
          logClusterTime("After doing insert", res);
          assert.eq(1, res.n, "insert should have succeeded");
      
          res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
          logClusterTime("After triggering duplicate key error", res);
          assert(res.hasOwnProperty("writeErrors"), "insert should have failed");
          assert.eq(1, res.writeErrors.length, "insert should have failed");
          assert.eq(ErrorCodes.DuplicateKey,
                    res.writeErrors[0].code,
                    "insert should have failed due to duplicate key error");
      
          res = assert.commandFailed(db.runCommand({fakeCommand: 1}));
          logClusterTime("After running a bad command", res);
      
          // This "find" command hangs because the server returned a clusterTime for which there is no
          // corresponding entry in the oplog. This means we must wait until some other client (e.g. the
          // periodic no-op writer) comes in and writes an entry to the oplog.
          assert.commandWorked(db.runCommand({find: "mycoll", maxTimeMS: 10 * 1000}));
      
          session.endSession();
          rst.stopSet();
      })();
      
      Output
      [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 ----
      [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 After doing insert: {
      [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"operationTime" : Timestamp(1510209006, 7),
      [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"clusterTime" : Timestamp(1510209006, 7)
      [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 }
      [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
      ...
      [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
      [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 After triggering duplicate key error: {
      [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"operationTime" : Timestamp(1510209006, 7),
      [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"clusterTime" : Timestamp(1510209006, 8)
      [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 }
      [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 ----
      ...
      [js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 ----
      [js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 After running a bad command: {
      [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"operationTime" : Timestamp(1510209006, 8),
      [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"clusterTime" : Timestamp(1510209006, 8)
      [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 }
      [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 ----
      ...
      [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 2017-11-09T01:30:16.655-0500 E QUERY    [thread1] Error: command failed: {
      [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"ok" : 0,
      [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"errmsg" : "operation exceeded time limit",
      [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"code" : 50,
      [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"codeName" : "ExceededTimeLimit",
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 	"$clusterTime" : {
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"clusterTime" : Timestamp(1510209006, 8),
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"signature" : {
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"keyId" : NumberLong(0)
      [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		}
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 	}
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 } : undefined :
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 _getErrorWithCode@src/mongo/shell/utils.js:25:13
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 doassert@src/mongo/shell/assert.js:16:14
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 assert.commandWorked@src/mongo/shell/assert.js:403:5
      [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 @repro_server31887.js:36:1
      [js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 @repro_server31887.js:1:2
      [js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 failed to load: repro_server31887.js
      
      Show
      python buildscripts/resmoke.py --suites=no_server repro_server31887.js --storageEngine=wiredTiger repro_server31887.js ( function () { "use strict" ; const rst = new ReplSetTest({nodes: 1, nodeOptions: {setParameter: {writePeriodicNoops: 0}}}); rst.startSet(); rst.initiate(); const primary = rst.getPrimary(); const session = primary.startSession({causalConsistency: true }); const db = session.getDatabase( "test" ); function logClusterTime(prefix, res) { jsTest.log( prefix + ": " + tojson({operationTime: res.operationTime, clusterTime: res.$clusterTime.clusterTime})); } let res = assert.commandWorked(db.runCommand({insert: "mycoll" , documents: [{_id: 0}]})); logClusterTime( "After doing insert" , res); assert.eq(1, res.n, "insert should have succeeded" ); res = assert.commandWorked(db.runCommand({insert: "mycoll" , documents: [{_id: 0}]})); logClusterTime( "After triggering duplicate key error" , res); assert(res.hasOwnProperty( "writeErrors" ), "insert should have failed" ); assert.eq(1, res.writeErrors.length, "insert should have failed" ); assert.eq(ErrorCodes.DuplicateKey, res.writeErrors[0].code, "insert should have failed due to duplicate key error" ); res = assert.commandFailed(db.runCommand({fakeCommand: 1})); logClusterTime( "After running a bad command" , res); // This "find" command hangs because the server returned a clusterTime for which there is no // corresponding entry in the oplog. This means we must wait until some other client (e.g. the // periodic no-op writer) comes in and writes an entry to the oplog. assert.commandWorked(db.runCommand({find: "mycoll" , maxTimeMS: 10 * 1000})); session.endSession(); rst.stopSet(); })(); Output [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 ---- [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 After doing insert: { [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 "operationTime" : Timestamp(1510209006, 7), [js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 "clusterTime" : Timestamp(1510209006, 7) [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 } [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ---- ... [js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ---- [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 After triggering duplicate key error: { [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 "operationTime" : Timestamp(1510209006, 7), [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 "clusterTime" : Timestamp(1510209006, 8) [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 } [js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 ---- ... [js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 ---- [js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 After running a bad command: { [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 "operationTime" : Timestamp(1510209006, 8), [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 "clusterTime" : Timestamp(1510209006, 8) [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 } [js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 ---- ... [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 2017-11-09T01:30:16.655-0500 E QUERY [thread1] Error: command failed: { [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 "ok" : 0, [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 "errmsg" : "operation exceeded time limit", [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 "code" : 50, [js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 "codeName" : "ExceededTimeLimit", [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 "$clusterTime" : { [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 "clusterTime" : Timestamp(1510209006, 8), [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 "signature" : { [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 "hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="), [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 "keyId" : NumberLong(0) [js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 } [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 } [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 } : undefined : [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 _getErrorWithCode@src/mongo/shell/utils.js:25:13 [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 doassert@src/mongo/shell/assert.js:16:14 [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 assert.commandWorked@src/mongo/shell/assert.js:403:5 [js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 @repro_server31887.js:36:1 [js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 @repro_server31887.js:1:2 [js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 failed to load: repro_server31887.js
    • Sharding 2019-07-01
    • 60

      Command errors such as duplicate key errors still cause the clusterTime to be advanced; however, no oplog entry is written for those failed operations. We then use the current clusterTime as the operationTime in the server's error responses for other error cases (as part of the changes from SERVER-31306), leading a client to wait until a time no operation actually happened at.

      The behavior of advancing the clusterTime prior to RecoveryUnit::Change::commit() (e.g. for cases where the operation fails on WiredTiger transaction commit) may have other negative effects on causal consistency that I haven't thought about. The impact of this bug insofar as I have observed failures in Evergreen is that we no longer have a guarantee of making forward progress when waiting for afterClusterTime unless the periodic no-op writer is running.

            Assignee:
            misha.tyulenev@mongodb.com Misha Tyulenev (Inactive)
            Reporter:
            max.hirschhorn@mongodb.com Max Hirschhorn
            Votes:
            0 Vote for this issue
            Watchers:
            20 Start watching this issue

              Created:
              Updated:
              Resolved: