Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-70852

Recommit SERVER-66716 on v6.0

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 6.0.3
    • Affects Version/s: None
    • Component/s: Sharding
    • None
    • Fully Compatible
    • ALL
    • Hide

      Reproduction test (smaller part of https://github.com/10gen/mongo/blob/v6.0/jstests/sharding/update_shard_key_doc_moves_shards.js)
      Test fails at least once every 20 runs (use --repeatTests 20 -j4)

       /**
        * Tests that changing the shard key value of a document using update and findAndModify works
        * correctly when the doc will change shards.
        * @tags: [
        *   uses_multi_shard_transaction,
        *   uses_transactions,
        * ]
        */
      
       (function() {
       'use strict';
      
       load("jstests/sharding/libs/sharded_transactions_helpers.js");
       load("jstests/sharding/libs/update_shard_key_helpers.js");
      
       const st = new ShardingTest({
           mongos: 1,
           shards: \{rs0: {nodes: 3}, rs1: \{nodes: 3}},
           rsOptions:
               \{setParameter: {maxTransactionLockRequestTimeoutMillis: ReplSetTest.kDefaultTimeoutMS}}
       });
      
       const kDbName = 'db';
       const mongos = st.s0;
       const shard0 = st.shard0.shardName;
       const shard1 = st.shard1.shardName;
       const ns = kDbName + '.foo';
      
       const updateDocumentShardKeyUsingTransactionApiEnabled =
           isUpdateDocumentShardKeyUsingTransactionApiEnabled(st.s);
      
       enableCoordinateCommitReturnImmediatelyAfterPersistingDecision(st);
       assert.commandWorked(mongos.adminCommand(\{enableSharding: kDbName}));
       st.ensurePrimaryShard(kDbName, shard0);
      
       const changeShardKeyOptions = [
           [false, false, false],
           [true, false, false],
           [true, true, false],
           [false, true, false],
           [false, false, true],
           [true, false, true],
           [false, true, true],
           [true, true, true]
       ];
      
       //
       //  Tests for op-style updates.
       //
      
       changeShardKeyOptions.forEach(function(updateConfig) {
           let runInTxn, isFindAndModify, upsert;
           [runInTxn, isFindAndModify, upsert] = [updateConfig[0], updateConfig[1], updateConfig[2]];
      
           jsTestLog("Testing changing the shard key using op style update and " +
                     (isFindAndModify ? "findAndModify command " : "update command ") +
                     (runInTxn ? "in transaction " : "as retryable write"));
      
           let session = st.s.startSession(\{retryWrites: runInTxn ? false : true});
           let sessionDB = session.getDatabase(kDbName);
           
           assertCanUpdatePrimitiveShardKey(st,
                                            kDbName,
                                            ns,
                                            session,
                                            sessionDB,
                                            runInTxn,
                                            isFindAndModify,
                                            [\{"x": 300}, \{"x": 4}],
                                            [\{"$set": {"x": 30}}, \{"$set": {"x": 600}}],
                                            upsert);
           assertCanUpdateDottedPath(st,
                                     kDbName,
                                     ns,
                                     session,
                                     sessionDB,
                                     runInTxn,
                                     isFindAndModify,
                                     [\{"x.a": 300}, \{"x.a": 4}],
                                     [\{"$set": {"x": {"a": 30}}}, \{"$set": {"x": {"a": 600}}}],
                                     upsert);
           assertCanUpdatePartialShardKey(st,
                                          kDbName,
                                          ns,
                                          session,
                                          sessionDB,
                                          runInTxn,
                                          isFindAndModify,
                                          [\{"x": 300, "y": 80}, \{"x": 4, "y": 3}],
                                          [\{"$set": {"x": 30}}, \{"$set": {"x": 600}}],
                                          upsert);
      
           assertCanUnsetSKField(st,
                                 kDbName,
                                 ns,
                                 session,
                                 sessionDB,
                                 runInTxn,
                                 isFindAndModify,
                                 \{"x": 300},
                                 \{"$unset": {"x": 1}},
                                 upsert);
       });
       st.stop();
       })();
      
      
      Show
      Reproduction test (smaller part of https://github.com/10gen/mongo/blob/v6.0/jstests/sharding/update_shard_key_doc_moves_shards.js ) Test fails at least once every 20 runs (use --repeatTests 20 -j4)  /**   * Tests that changing the shard key value of a document using update and findAndModify works   * correctly when the doc will change shards.   * @tags: [   *   uses_multi_shard_transaction,   *   uses_transactions,   * ]   */  (function() {  'use strict';  load("jstests/sharding/libs/sharded_transactions_helpers.js");  load("jstests/sharding/libs/update_shard_key_helpers.js");  const st = new ShardingTest({      mongos: 1,      shards: \{rs0: {nodes: 3}, rs1: \{nodes: 3}},      rsOptions:          \{setParameter: {maxTransactionLockRequestTimeoutMillis: ReplSetTest.kDefaultTimeoutMS}}  });  const kDbName = 'db';  const mongos = st.s0;  const shard0 = st.shard0.shardName;  const shard1 = st.shard1.shardName;  const ns = kDbName + '.foo';  const updateDocumentShardKeyUsingTransactionApiEnabled =      isUpdateDocumentShardKeyUsingTransactionApiEnabled(st.s);  enableCoordinateCommitReturnImmediatelyAfterPersistingDecision(st);  assert.commandWorked(mongos.adminCommand(\{enableSharding: kDbName}));  st.ensurePrimaryShard(kDbName, shard0);  const changeShardKeyOptions = [      [false, false, false],      [true, false, false],      [true, true, false],      [false, true, false],      [false, false, true],      [true, false, true],      [false, true, true],      [true, true, true]  ];  //  //  Tests for op-style updates.  //  changeShardKeyOptions.forEach(function(updateConfig) {      let runInTxn, isFindAndModify, upsert;      [runInTxn, isFindAndModify, upsert] = [updateConfig[0], updateConfig[1], updateConfig[2]];      jsTestLog("Testing changing the shard key using op style update and " +                (isFindAndModify ? "findAndModify command " : "update command ") +                (runInTxn ? "in transaction " : "as retryable write"));      let session = st.s.startSession(\{retryWrites: runInTxn ? false : true});      let sessionDB = session.getDatabase(kDbName);            assertCanUpdatePrimitiveShardKey(st,                                       kDbName,                                       ns,                                       session,                                       sessionDB,                                       runInTxn,                                       isFindAndModify,                                       [\{"x": 300}, \{"x": 4}],                                       [\{"$set": {"x": 30}}, \{"$set": {"x": 600}}],                                       upsert);      assertCanUpdateDottedPath(st,                                kDbName,                                ns,                                session,                                sessionDB,                                runInTxn,                                isFindAndModify,                                [\{"x.a": 300}, \{"x.a": 4}],                                [\{"$set": {"x": {"a": 30}}}, \{"$set": {"x": {"a": 600}}}],                                upsert);      assertCanUpdatePartialShardKey(st,                                     kDbName,                                     ns,                                     session,                                     sessionDB,                                     runInTxn,                                     isFindAndModify,                                     [\{"x": 300, "y": 80}, \{"x": 4, "y": 3}],                                     [\{"$set": {"x": 30}}, \{"$set": {"x": 600}}],                                     upsert);      assertCanUnsetSKField(st,                            kDbName,                            ns,                            session,                            sessionDB,                            runInTxn,                            isFindAndModify,                            \{"x": 300},                            \{"$unset": {"x": 1}},                            upsert);  });  st.stop();  })();
    • Sharding EMEA 2022-10-31
    • 150

      Summary

      The changes from 7b45064 as part of backporting SERVER-65816 to the 6.0 branch reintroduced the early return which the changes from 3faed4f as part of SERVER-66716 had already removed. Re-committing SERVER-66716 should solve the issue.

      Detailed explanation

      The failing part of https://github.com/mongodb/mongo/blob/v6.0/jstests/sharding/update_shard_key_doc_moves_shards.js involves inserting a few documents, splitting them into chunks and moving one of the chunks to a different shard. Then the shard key of a document is updated with a value such that it must be moved back to the original shard (eg. {x: 300} is updated to {x: 30} and the split doc is {x:100}, so {x:300} was first moved to shard_1 but after the update it is moved back to shard_0). The moving involves deleting the old doc ({x:300}) from shard_1 and inserting a new doc ({x:30}) on shard_0. Both these documents have the same id "_id" : ObjectId("6349e3c6d8028aa1c48abe4a")).

       497532 [js_test:test2] c20797| \{"t":{"$date":"2022-10-23T13:50:19.436+00:00"},"s":"D2", "c":"ASIO","id":4630601, "ctx":"ReplCoordExtern-1","msg":"Request acquired a connection","attr":\{"requestId":255,"target":"ip-10-122-3-17:20798"}}
       497556 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.440+00:00"},"s":"W",  "c":"COMMAND",  "id":23802,   "ctx":"conn74","msg":"Plan executor error during findAndModify","attr":\{"error":{"code":283,"codeName":"WouldChangeOwningShard","errmsg":"This update would cause the doc to change owning shards","preImage":{"_id":{"$oid":"63554695d899a1e359adecfa"},"x":\{"a":300,"y":1},"a":3},"postImage":\{"_id":{"$oid":"63554695d899a1e359adecfa"},"x":\{"a":30},"a":3},"shouldUpsert":false},"stats":\{"stage":"UPDATE","nReturned":0,"executionTimeMillisEstimate":0,"works":1,"advanced":0,"needTime":0,"needYield":0,"saveState":0,"restoreState":0,"failed":true,"isEOF":0,"nMatched":0,"nWouldModify":0,"nWouldUpsert":0,"inputStage":{"stage":"FETCH","nReturned":1,"executionTimeMillisEstimate":0,"works":1,"advanced":1,"needTime":0,"needYield":0,"saveState":1,"restoreState":0,"isEOF":0,"docsExamined":1,"alreadyHasObj":0,"inputStage":{"stage":        "IXSCAN","nReturned":1,"executionTimeMillisEstimate":0,"works":1,"advanced":1,"needTime":0,"needYield":0,"saveState":1,"restoreState":0,"isEOF":0,"keyPattern":{"x.a":1},"indexName":"x.a_1","isMultiKey":false,"multiKeyPaths":\{"x.a":[]        },"isUnique":false,"isSparse":false,"isPartial":false,"indexVersion":2,"direction":"forward","indexBounds":\{"x.a":["[300.0, 300.0]"]},"keysExamined":1,"seeks":1,"dupsTested":0,"dupsDropped":0}}},"cmd":\{"findAndModify":"foo","query":{"x.a":300},"update":\{"$set":{"x":{"a":30}}},"upsert":false,"new":false,"stmtId":0,"runtimeConstants":\{"localNow":{"$date":"2022-10-23T13:50:19.439Z"},"clusterTime":\{"$timestamp":{"t":1666533019,"i":11}}}}}}
       497557 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.440+00:00"},"s":"D3", "c":"TXN","id":22880,   "ctx":"conn12","msg":"Marking shard as read-only participant","attr":\{"sessionId":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"shardId":"test2-rs1"}}
       497558 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.441+00:00"},"s":"D3", "c":"TXN","id":22881,   "ctx":"conn12","msg":"Marking shard has having done a write","attr":\{"sessionId":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"shardId":"test2-rs1"}}
       497559 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.441+00:00"},"s":"D3", "c":"TXN","id":22882,   "ctx":"conn12","msg":"Choosing shard as recovery shard","attr":\{"sessionId":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"shardId":"test2-rs1"}}
       497561 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.441+00:00"},"s":"I",  "c":"SHARDING", "id":1234,"ctx":"conn12","msg":"INSERT DETAILS: ","attr":\{"jsonInsert":"{\"insert\":\"foo\",\"bypassDocumentValidation\":false,\"ordered\":true,\"documents\":[{\"_id\":{\"$oid\":\"63554695d899a1e359adecfa\"},\"x\":\{\"a\":{\"$numberDouble\":\"30\"}},\"a\":\{\"$numberDouble\":\"3\"}}]}"}}
       497562 [js_test:test2] d20790| \{"t":{"$date":"2022-10-23T13:50:19.441+00:00"},"s":"D2", "c":"MIGRATE",  "id":23897,"ctx":"MoveChunk","msg":"Marking range deletion task on donor as ready for processing","attr":\{"migrationId":{"uuid":{"$uuid":"1e94e158-e52c-4ba9-9fe4-c521d78771e7"}}}}
       497563 [js_test:test2] d20790| \{"t":{"$date":"2022-10-23T13:50:19.441+00:00"},"s":"D4", "c":"TXN","id":23984,   "ctx":"conn83","msg":"New transaction started","attr":\{"txnNumberAndRetryCounter":{"txnNumber":4,"txnRetryCounter":0},"lsid":\{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"apiParameters":{}}}
       497564 [js_test:test2] d20790| \{"t":{"$date":"2022-10-23T13:50:19.442+00:00"},"s":"I",  "c":"TXN","id":51802,   "ctx":"conn83","msg":"transaction","attr":\{"parameters":{"lsid":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"autocommit":false,"readConcern":\{"afterClusterTime":{"$timestamp":{"t":1666533019,"i":11}},"provenance":"clientSupplied"}},"readTimestamp":"Timestamp(0, 0)","ninserted":0,"terminationCause":"aborted","timeActiveMicros":256,"timeInactiveMicros":11,"numYields":0,"locks":\{"FeatureCompatibilityVersion":{"acquireCount":{"r":1,"w":1}},"Replication        StateTransition":\{"acquireCount":{"w":4}},"Global":\{"acquireCount":{"r":1,"w":1}},"Database":\{"acquireCount":{"w":1}},"Collection":\{"acquireCount":{"w":1}},"Mutex":\{"acquireCount":{"r":3}}},"storage":{},"wasPrepared":false,"durationM        illis":0}}
       497565 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.442+00:00"},"s":"D3", "c":"TXN","id":22881,   "ctx":"conn12","msg":"Marking shard has having done a write","attr":\{"sessionId":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"shardId":"test2-rs0"}}
       497566 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.442+00:00"},"s":"D1", "c":"ASSERT","id":23074,"ctx":"conn12","msg":"User assertion","attr":\{"error":"DuplicateKey{ keyPattern: { _id: 1 }, keyValue: \{ _id: ObjectId('63554695d899a1e359adecfa') }, foundValue: 3 }: During insert stage of updating a shard key :: caused by :: E11000 duplicate key error collection: db.foo index: _id_ dup key: \{ _id: ObjectId('63554695d899a1e359adecfa') } found value: RecordId(3)","file":"src/mongo/s/commands/document_shard_key_update_util.cpp","line":96}}
       497567 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.442+00:00"},"s":"D3", "c":"TXN","id":22897,   "ctx":"conn12","msg":"Implicitly aborting transaction on all participant shards","attr":\{"sessionId":{"id":{"$uuid":"50c9b        d02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"numParticipantShards":2,"error":"DuplicateKey\{ keyPattern: { _id: 1 }, keyValue: \{ _id: ObjectId('63554695d899a1e359adecfa') }, foundValue: 3 }: findAndModify :: caused by :: Failed to update document's shard key field. There is either an orphan for this document or _id for this collection is not globally unique. :: caused by :: During insert stage of updating a shard key :: caused by :: E11000 duplicate key error collection: db.foo index: _id_ dup key: \{ _id: ObjectId('63554695d899a1e359adecfa') } found value: RecordId(3)"}}
       497568 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.442+00:00"},"s":"D3", "c":"TXN","id":20508,"ctx":"conn74","msg":"Received abortTransaction","attr":\{"txnNumberAndRetryCounter":{"txnNumber":4,"txnRetryCounter":0},"sessionId":\{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}}}}
       497569 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"I",  "c":"TXN","id":51802,"ctx":"conn74","msg":"transaction","attr":\{"parameters":{"lsid":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"autocommit":false,"readConcern":\{"afterClusterTime":{"$timestamp":{"t":1666533019,"i":11}},"provenance":"clientSupplied"}},"readTimestamp":"Timestamp(0, 0)","terminationCause":"aborted","timeActiveMicros":1023,"timeInactiveMicros":2004,"numYields":0,"locks":\{"FeatureCompatibilityVersion":{"acquireCount":{"r":1,"w":1}},"ReplicationStateTransi        tion":\{"acquireCount":{"w":4}},"Global":\{"acquireCount":{"r":1,"w":1}},"Database":\{"acquireCount":{"w":1}},"Collection":\{"acquireCount":{"w":1}},"Mutex":\{"acquireCount":{"r":11}}},"storage":{},"wasPrepared":false,"durationMillis":3}}
       497570 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"D3", "c":"TXN","id":22447,   "ctx":"Sharding-Fixed-2","msg":"Two-phase commit completed","attr":\{"sessionId":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumberAndRetryCounter":\{"txnNumber":4,"txnRetryCounter":0},"status":"TransactionCoordinatorCanceled: Transaction exceeded deadline or newer transaction started"}}
       497571 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"D3", "c":"TXN","id":22440,   "ctx":"Sharding-Fixed-2","msg":"Removing coordinator from in-memory catalog","attr":\{"sessionId":{"uuid":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"}},"txnNumberAndRetryCounter":\{"txnNumber":4,"txnRetryCounter":0}}}
       497572 [js_test:test2] d20793| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"D3", "c":"TXN","id":22441,"ctx":"Sharding-Fixed-2","msg":"Signaling last active coordinator removed"}
       497573 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"I",  "c":"TXN","id":51805,   "ctx":"conn12","msg":"transaction","attr":\{"parameters":{"lsid":{"id":{"$uuid":"50c9bd02-a30e-4cb0-b84a-08092c30f77f"},"uid":\{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":4,"txnRetryCounter":0,"autocommit":false,"readConcern":\{"afterClusterTime":{"$timestamp":{"t":1666533019,"i":11}}}},"numParticipants"        :2,"terminationCause":"aborted","abortCause":"DuplicateKey","timeActiveMicros":3783,"timeInactiveMicros":0,"durationMillis":3}}
       497574 [js_test:test2] s20799| \{"t":{"$date":"2022-10-23T13:50:19.443+00:00"},"s":"D1", "c":"ASSERT",   "id":23074,   "ctx":"conn12","msg":"User assertion","attr":\{"error":"DuplicateKey{ keyPattern: { _id: 1 }, keyValue: \{ _id: ObjectId('63554695d899a1e359adecfa') }, foundValue: 3 }: findAndModify :: caused by :: Failed to update document's shard key field. There is either an orphan for this document or _id for this collection is not globally unique. :: caused by :: Du        ring insert stage of updating a shard key :: caused by :: E11000 duplicate key error collection: db.foo index: _id_ dup key: \{ _id: ObjectId('63554695d899a1e359adecfa') } found value: RecordId(3)","file":"src/mongo/util/future_impl.h","line":1087}}
      

      It seems that there is a lack of synchronization (i.e. ensure the old doc is deleted before inserting a new doc) that leads to an error while inserting the new doc ({x:30}) on shard_0 since a document of the same _id still exists on shard_1. A sleep of 1000ms just before inserting the new doc alleviated this problem as shown below:

      diff --git a/src/mongo/s/commands/document_shard_key_update_util.cpp b/src/mongo/s/commands/document_shard_key_update_util.cpp
      index 2bff33fa82a..497783dadba 100644
      --- a/src/mongo/s/commands/document_shard_key_update_util.cpp
      +++ b/src/mongo/s/commands/document_shard_key_update_util.cpp
      @@ -83,9 +83,12 @@ bool executeOperationsAsPartOfShardKeyUpdate(OperationContext* opCtx,
               hangBeforeInsertOnUpdateShardKey.pauseWhileSet(opCtx);
           }
      
      +    sleepmillis(1000);
           auto insertOpMsg = OpMsgRequest::fromDBAndBody(db, insertCmdObj);
           auto insertRequest = BatchedCommandRequest::parseInsert(insertOpMsg);
      
      

            Assignee:
            pierlauro.sciarelli@mongodb.com Pierlauro Sciarelli
            Reporter:
            nandini.bhartiya@mongodb.com Nandini Bhartiya
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: