diff --git a/jstests/sharding/retry_find_on_start_transaction.js b/jstests/sharding/retry_find_on_start_transaction.js new file mode 100644 index 00000000000..36993ce2089 --- /dev/null +++ b/jstests/sharding/retry_find_on_start_transaction.js @@ -0,0 +1,92 @@ +import {configureFailPoint} from "jstests/libs/fail_point_util.js"; +import {Thread} from "jstests/libs/parallelTester.js"; + +(() => { + const st = new ShardingTest({shards: 2, rs: {nodes: 3}}); + + // Put one db on one shard, and one on the other + assert.commandWorked(st.s.adminCommand({enableSharding: "test", primaryShard: st.shard0.name})); + assert.commandWorked( + st.s.adminCommand({enableSharding: "test2", primaryShard: st.shard1.name})); + + // Create both collections + st.s.getDB("test").foo.insert({x: 1}); + st.s.getDB("test2").bar.insert({x: -50}); + + // Set hang failpoints on shard0 primary + let primary = st.rs0.getPrimary(); + let hangfp = configureFailPoint(primary, 'hangInFind'); + let stepfp = configureFailPoint(primary, 'stepdownHangBeforeRSTLEnqueue'); + + const lsid = UUID(); + + // Run a find that should target the primary of shard0 and force it to hang + function runFind(host, lsid) { + let mongosConn = new Mongo(host); + let f = mongosConn.getDB("test").runCommand({ + find: "foo", + lsid: {id: eval(lsid)}, + txnNumber: NumberLong(1), + startTransaction: true, + stmtId: NumberInt(0), + autocommit: false, + $readPreference: {mode: "primary"}, + }); + } + + let thread1 = new Thread(runFind, st.s.host, tojson(lsid)); + thread1.start(); + + // While the find is being run (aka hanging), step down the primary running the find + hangfp.wait(); + function runstepdown(host) { + let p = new Mongo(host); + return assert.commandWorked(p.getDB("admin").adminCommand( + {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); + } + + let thread2 = new Thread(runstepdown, primary.host); + thread2.start(); + + // Turn off the failpoints once we know both are active, and wait for both threads to 
finish. + // The stepdown will kill the find, and the mongos will retry the find on the new primary for + // shard0. + stepfp.wait(); + hangfp.off(); + stepfp.off(); + + thread2.join(); + thread1.join(); + + // Do a write on both shards so that we'll do a 2PC and write a prepare oplog entry on both + // shards + assert.commandWorked(st.s.getDB("test").runCommand({ + insert: "foo", + documents: [{x: 2}], + lsid: {id: eval(lsid)}, + txnNumber: NumberLong(1), + stmtId: NumberInt(0), + autocommit: false, + })); + assert.commandWorked(st.s.getDB("test2").runCommand({ + insert: "bar", + documents: [{x: -100}], + lsid: {id: eval(lsid)}, + txnNumber: NumberLong(1), + stmtId: NumberInt(0), + autocommit: false, + })); + + // If we didn't clean up and reset the transaction state on the old primary for shard0, this + // should fail because when the secondary replicates the prepare oplog entry, it would error if + // the secondary's TransactionParticipant state was aborted. + assert.commandWorked(st.s.getDB("admin").runCommand({ + commitTransaction: 1, + lsid: {id: eval(lsid)}, + txnNumber: NumberLong(1), + stmtId: NumberInt(0), + autocommit: false, + })); + + st.stop(); +})(); diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp index 73ea6a08b0c..103b4a44550 100644 --- a/src/mongo/db/commands/find_cmd.cpp +++ b/src/mongo/db/commands/find_cmd.cpp @@ -153,6 +153,8 @@ namespace mongo { namespace { MONGO_FAIL_POINT_DEFINE(allowExternalReadsForReverseOplogScanRule); +MONGO_FAIL_POINT_DEFINE(hangInFind); + const auto kTermField = "term"_sd; @@ -469,7 +471,9 @@ public: CommandHelpers::handleMarkKillOnClientDisconnect(opCtx); const BSONObj& cmdObj = _request.body; - + std::cout << "xxx pausing in hang" << std::endl; + hangInFind.pauseWhileSet(); + std::cout << "xxx finished pausing" << std::endl; // Parse the command BSON to a FindCommandRequest. Pass in the parsedNss in case cmdObj // does not have a UUID. 
const bool isOplogNss = (_ns == NamespaceString::kRsOplogNamespace); diff --git a/src/mongo/s/async_requests_sender.cpp b/src/mongo/s/async_requests_sender.cpp index 74fe6b2dbd7..42a1816ab8d 100644 --- a/src/mongo/s/async_requests_sender.cpp +++ b/src/mongo/s/async_requests_sender.cpp @@ -323,11 +323,11 @@ auto AsyncRequestsSender::RemoteData::handleResponse(RemoteCommandOnAnyCallbackA if (rcr.response.target) { shard->updateReplSetMonitor(*rcr.response.target, status); } - - bool isStartingTransaction = _cmdObj.getField("startTransaction").booleanSafe(); + std::cout << "xxx there was an error, will retry? " << rcr.request.cmdObj << std::endl; if (!_ars->_stopRetrying && shard->isRetriableError(status.code(), _ars->_retryPolicy) && - _retryCount < kMaxNumFailedHostRetryAttempts && !isStartingTransaction) { + _retryCount < kMaxNumFailedHostRetryAttempts) { + std::cout << "xxx in retry" << std::endl; LOGV2_DEBUG( 4615637,