From 1b9dcb316f21741cc27e2b49b981ea8b4c6ac0cd Mon Sep 17 00:00:00 2001 From: Jordi Serra Torrens Date: Tue, 7 May 2024 10:52:33 +0000 Subject: [PATCH] Repro SERVER-50143 --- jstests/sharding/repro-server-50143.js | 96 ++++++++++++++++++++++++++ src/mongo/db/commands/txn_cmds.cpp | 7 ++ 2 files changed, 103 insertions(+) create mode 100644 jstests/sharding/repro-server-50143.js diff --git a/jstests/sharding/repro-server-50143.js b/jstests/sharding/repro-server-50143.js new file mode 100644 index 00000000000..09d876f77c0 --- /dev/null +++ b/jstests/sharding/repro-server-50143.js @@ -0,0 +1,96 @@ +import {configureFailPoint} from "jstests/libs/fail_point_util.js"; +import {moveOutSessionChunks, removeShard} from "jstests/sharding/libs/remove_shard_util.js"; + +let st = ShardingTest({shards: 3}); + +let dbName = "test"; +let collName = "foo"; +let coll = st.getDB(dbName)[collName]; + +// Initial placement (collection sharded on {x: 1}, split at x = 0): +// - shard0: [-inf, 0) +// - shard1: [0, +inf) (shard1 is also the primary shard of the database) +// - shard2: nothing +assert.commandWorked( + st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard1.shardName})); +assert.commandWorked(st.s.adminCommand({shardCollection: coll.getFullName(), key: {x: 1}})); +assert.commandWorked(st.s.adminCommand({split: coll.getFullName(), middle: {x: 0}})); +assert.commandWorked( + st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: -1}, to: st.shard0.shardName})); +assert.commandWorked( + st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: 1}, to: st.shard1.shardName})); + +// Insert two documents, one on each chunk (x = -1 and x = 1). +assert.commandWorked(coll.insertMany([{x: -1, y: 0}, {x: 1, y: 0}])); + +// Set failpoint on shard1 so that it hangs when committing the transaction (after prepare). Also +// set a failpoint so that commitTransaction fails with a network error once we let it continue. 
+let fpHangCommitTransactionOnShard1 = + configureFailPoint(st.rs1.getPrimary(), "hangBeforeCommitingTxn"); +let fpFailCommitTransactionOnShard1 = configureFailPoint( + st.rs1.getPrimary(), "transactionParticipantFailWithNetworkErrorBeforeCommitTransaction"); + +// Run a transaction that targets shard0 and shard1. Make it so shard0 will be nominated as the +// TransactionCoordinator. Start the 2PC commit but make it hang on shard1, after it has prepared. +let awaitTxn = startParallelShell(() => { + // Start transaction. First make a write to shard0 so that it will be nominated as the + // TransactionCoordinator. + let session = db.getMongo().startSession(); + session.startTransaction(); + let sessionColl = session.getDatabase("test")["foo"]; + assert.commandWorked(sessionColl.updateOne({x: -1}, {$set: {y: 1}})); + + // Write to shard1 as well, so this transaction becomes a distributed transaction that will + // require 2PC. + assert.commandWorked(sessionColl.updateOne({x: 1}, {$set: {y: 1}})); + + // Start the commit. + assert.commandWorked(session.commitTransaction_forTesting()); +}, st.s.port); + +jsTest.log("--DEBUG-- Waiting for fpHangCommitTransactionOnShard1 to be hit"); +fpHangCommitTransactionOnShard1.wait(); +jsTest.log("--DEBUG-- fpHangCommitTransactionOnShard1 hit"); + +// Remove shard0 while the transaction is blocked in commit on shard1. +// To do so, first donate its chunk to shard2 (moveOutSessionChunks presumably moves its session chunks too). +assert.commandWorked( + st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: -1}, to: st.shard2.shardName})); +moveOutSessionChunks(st, st.shard0.shardName, st.shard2.shardName); +removeShard(st.s, st.shard0.shardName); +jsTest.log("--DEBUG-- Removed shard0"); + +// Decommission the removed shard by stopping its replica set (not for restart). +st.rs0.stopSet(undefined, false /* forRestart */); +jsTest.log("--DEBUG-- Stopped shard0 replica set"); + +// Unblock the transaction participant. The ongoing commit will fail with the injected network +// error. Unset the failpoints so new commit attempts by the TransactionCoordinator will succeed. 
+fpHangCommitTransactionOnShard1.off(); +sleep(1000); +fpFailCommitTransactionOnShard1.off(); + +jsTest.log("--DEBUG-- Checking that transaction stays prepared forever"); + +// Wait until $currentOp reports no prepared transaction on shard1; with the bug it stays prepared forever. +assert.soon(() => { + return st.s.getDB("admin") + .aggregate([ + {$currentOp: {}}, + { + $match: { + shard: st.shard1.shardName, + "transaction.timePreparedMicros": {$exists: true} + } + } + ]) + .itcount() === 0; +}); + +// Note: Because of the bug described in SERVER-50143, execution will never reach here. +jsTest.log("--DEBUG-- Transaction no longer prepared"); + +// Without the bug, other operations are now able to write to the previously prepared documents. +assert.commandWorked(coll.updateOne({x: 1}, {$set: {y: 2}})); + +st.stop(); diff --git a/src/mongo/db/commands/txn_cmds.cpp b/src/mongo/db/commands/txn_cmds.cpp index eb60dad9b10..55eac83554b 100644 --- a/src/mongo/db/commands/txn_cmds.cpp +++ b/src/mongo/db/commands/txn_cmds.cpp @@ -73,6 +73,7 @@ namespace { MONGO_FAIL_POINT_DEFINE(participantReturnNetworkErrorForAbortAfterExecutingAbortLogic); MONGO_FAIL_POINT_DEFINE(participantReturnNetworkErrorForCommitAfterExecutingCommitLogic); +MONGO_FAIL_POINT_DEFINE(transactionParticipantFailWithNetworkErrorBeforeCommitTransaction); MONGO_FAIL_POINT_DEFINE(hangBeforeCommitingTxn); MONGO_FAIL_POINT_DEFINE(hangBeforeAbortingTxn); // TODO SERVER-39704: Remove this fail point once the router can safely retry within a transaction @@ -161,6 +162,12 @@ public: CurOpFailpointHelpers::waitWhileFailPointEnabled( &hangBeforeCommitingTxn, opCtx, "hangBeforeCommitingTxn"); + if (MONGO_unlikely(transactionParticipantFailWithNetworkErrorBeforeCommitTransaction + .shouldFail())) { + uasserted(ErrorCodes::HostUnreachable, + "returning network error because failpoint is on"); + } + auto optionalCommitTimestamp = request().getCommitTimestamp(); if (optionalCommitTimestamp) { // commitPreparedTransaction will throw if the transaction is not prepared. -- 2.34.1