Core Server / SERVER-74734

After renaming the "config.shards" collection, attempting to rename it back can cause a deadlock

    • Sharding EMEA
    • Fully Compatible
    • ALL
    • Sharding EMEA 2023-09-04
    • 2

      Spinoff from SERVER-74679

      I tried reproducing this, and then, when trying to fix it by renaming the collection back (which I think succeeded, even though config.shards no longer exists), I may have won a race condition.

      I still couldn't do anything on the cluster, so I then tried to rename the collection on the csrs (functioning as both csrs and shard), which succeeded.

      Now I'm in a deadlock with myself.
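      For context, here is a minimal sketch of how the first two renames were issued through the mongos. This is reconstructed from the logs below rather than copied from the actual session (the mongos is on port 27017 per the log metadata), so the exact invocations are an assumption:

      // Sketch only, reconstructed from the logs: renames issued from mongosh via the mongos (port 27017).
      // 1. Rename config.shards away (the original repro step).
      db.getSiblingDB("config").shards.renameCollection("foo2");
      // 2. Attempt to rename it back.
      db.getSiblingDB("config").foo2.renameCollection("shards");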

      Timeline (all logs from the csrs-but-also-shard):
      First rename from "config.shards" -> "config.foo2". As far as I can tell, this completes:

      {"t":{"$date":"2023-03-08T13:18:46.716+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"ShardingDDLCoordinator-3","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:18:46.716+11:00-6407f0860b0873873c5224e5","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:18:46.716Z"},"what":"renameCollection.start","ns":"config.shards","details":{"source":"config.shards","destination":"config.foo2"}}}}
      ...
      {"t":{"$date":"2023-03-08T13:18:46.814+11:00"},"s":"I",  "c":"SHARDING", "id":5515106, "ctx":"RenameCollectionParticipantService-0","msg":"Collection locally renamed, waiting for CRUD to be unblocked","attr":{"fromNs":"config.shards","toNs":"config.foo2"}}
      {"t":{"$date":"2023-03-08T13:18:46.837+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"conn44","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:18:46.836+11:00-6407f0860b0873873c5224ff","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:18:46.836Z"},"what":"renameCollection.metadata","ns":"admin.shards to config.foo2 : dropped target collection.","details":{}}}}
      ...
      {"t":{"$date":"2023-03-08T13:18:46.919+11:00"},"s":"I",  "c":"SHARDING", "id":5460504, "ctx":"ShardingDDLCoordinator-1","msg":"Collection renamed","attr":{"namespace":"config.shards"}}
      {"t":{"$date":"2023-03-08T13:18:46.920+11:00"},"s":"I",  "c":"SHARDING", "id":5565601, "ctx":"ShardingDDLCoordinator-2","msg":"Releasing sharding DDL coordinator","attr":{"coordinatorId":{"namespace":"config.shards","operationType":"renameCollection"}}}
      {"t":{"$date":"2023-03-08T13:18:46.932+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-2","msg":"Released DDL lock","attr":{"resource":"config.foo2","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:18:46.933+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-2","msg":"Released DDL lock","attr":{"resource":"config.shards","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:18:46.933+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-2","msg":"Released DDL lock","attr":{"resource":"config","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:18:46.934+11:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn69","msg":"Slow query","attr":{"type":"command","ns":"config.$cmd","appName":"MongoDB Shell","command":{"_shardsvrRenameCollection":"shards","to":"config.foo2","dropTarget":false,"stayTemp":false,"allowEncryptedCollectionRename":true,"writeConcern":{"w":"majority","wtimeout":60000},"lsid":{"id":{"$uuid":"c37d658d-be88-4ffc-a4ea-d5ec08523f4a"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"$replData":1,"$clusterTime":{"clusterTime":{"$timestamp":{"t":1678241921,"i":7}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1678241921,"i":7}},"$topologyTime":{"$timestamp":{"t":1678168305,"i":2}},"$client":{"application":{"name":"MongoDB Shell"},"driver":{"name":"MongoDB Internal Client","version":"7.0.0-alpha-538-g7cec1b7"},"os":{"type":"Darwin","name":"Mac OS X","architecture":"arm64","version":"21.6.0"},"mongos":{"host":"M-K4F3614CKP:27017","client":"127.0.0.1:52330","version":"7.0.0-alpha-538-g7cec1b7"}},"mayBypassWriteBlocking":false,"$db":"config"},"numYields":0,"reslen":515,"locks":{"Mutex":{"acquireCount":{"r":1}}},"writeConcern":{"w":"majority","wtimeout":60000,"provenance":"clientSupplied"},"remote":"127.0.0.1:62699","protocol":"op_msg","durationMillis":243}}
      

      Renaming it back, "config.foo2" -> "config.shards", still via the mongos (which I think did not succeed):

      {"t":{"$date":"2023-03-08T13:19:14.864+11:00"},"s":"I",  "c":"SHARDING", "id":5390510, "ctx":"conn69","msg":"Constructing new sharding DDL coordinator","attr":{"coordinatorDoc":{"_id":{"namespace":"config.foo2","operationType":"renameCollection"},"recoveredFromDisk":false,"forwardableOpMetadata":{"mayBypassWriteBlocking":false}}}}
      {"t":{"$date":"2023-03-08T13:19:14.865+11:00"},"s":"I",  "c":"SHARDING", "id":6855301, "ctx":"ShardingDDLCoordinator-1","msg":"Acquired DDL lock","attr":{"resource":"config","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:14.866+11:00"},"s":"I",  "c":"SHARDING", "id":6855301, "ctx":"ShardingDDLCoordinator-1","msg":"Acquired DDL lock","attr":{"resource":"config.foo2","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:14.869+11:00"},"s":"I",  "c":"SHARDING", "id":6855301, "ctx":"ShardingDDLCoordinator-2","msg":"Acquired DDL lock","attr":{"resource":"config.shards","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:14.884+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"ShardingDDLCoordinator-3","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:19:14.884+11:00-6407f0a20b0873873c5225ae","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:19:14.884Z"},"what":"renameCollection.start","ns":"config.foo2","details":{"source":"config.foo2","destination":"config.shards"}}}}
      {"t":{"$date":"2023-03-08T13:19:14.912+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"conn44","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:19:14.911+11:00-6407f0a20b0873873c5225b8","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:19:14.911Z"},"what":"renameCollection.metadata","ns":"admin.foo2 to config.shards : dropped target collection.","details":{}}}}
      {"t":{"$date":"2023-03-08T13:19:14.937+11:00"},"s":"I",  "c":"SH_REFR",  "id":4619902, "ctx":"CatalogCache-19","msg":"Collection has found to be unsharded after refresh","attr":{"namespace":"config.shards","durationMillis":0}}
      {"t":{"$date":"2023-03-08T13:19:14.938+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"ShardingDDLCoordinator-2","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:19:14.938+11:00-6407f0a20b0873873c5225c1","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:19:14.938Z"},"what":"renameCollection.end","ns":"config.foo2","details":{"source":"config.foo2","destination":"config.shards"}}}}
      {"t":{"$date":"2023-03-08T13:19:14.944+11:00"},"s":"I",  "c":"SHARDING", "id":5460504, "ctx":"ShardingDDLCoordinator-2","msg":"Collection renamed","attr":{"namespace":"config.foo2"}}
      {"t":{"$date":"2023-03-08T13:19:14.945+11:00"},"s":"I",  "c":"SHARDING", "id":5565601, "ctx":"ShardingDDLCoordinator-1","msg":"Releasing sharding DDL coordinator","attr":{"coordinatorId":{"namespace":"config.foo2","operationType":"renameCollection"}}}
      {"t":{"$date":"2023-03-08T13:19:14.955+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-1","msg":"Released DDL lock","attr":{"resource":"config.shards","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:14.956+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-1","msg":"Released DDL lock","attr":{"resource":"config.foo2","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:14.957+11:00"},"s":"I",  "c":"SHARDING", "id":6855302, "ctx":"ShardingDDLCoordinator-1","msg":"Released DDL lock","attr":{"resource":"config","reason":"renameCollection"}}
      {"t":{"$date":"2023-03-08T13:19:22.512+11:00"},"s":"W",  "c":"SHARDING", "id":21852,   "ctx":"Balancer","msg":"Unable to enforce zone range policy for collection","attr":{"namespace":"test.foo","error":{"code":70,"codeName":"ShardNotFound","errmsg":"The primary shard for database test does not exist :: caused by :: Shard shard2 not found"}}}
      

      Attempt to redo the "config.foo2" -> "config.shards" rename, this time directly on the csrs-but-also-shard.
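      This was a plain renameCollection issued against the csrs node itself. A sketch, assuming mongosh connected straight to port 27019 as in the connection log below (the actual invocation may have used the shell helper instead):

      // Sketch only: rename issued directly on the csrs-but-also-shard (port 27019), bypassing the mongos.
      db.getSiblingDB("admin").runCommand({
        renameCollection: "config.foo2",
        to: "config.shards",
        dropTarget: false
      });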

      {"t":{"$date":"2023-03-08T13:21:56.822+11:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"127.0.0.1:63567","uuid":{"uuid":{"$uuid":"2709bddd-0ce5-49e1-9450-4313e13849fa"}},"connectionId":112,"connectionCount":14}}
      {"t":{"$date":"2023-03-08T13:21:56.822+11:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn112","msg":"client metadata","attr":{"remote":"127.0.0.1:63567","client":"conn112","doc":{"application":{"name":"MongoDB Shell"},"driver":{"name":"MongoDB Internal Client","version":"7.0.0-alpha-538-g7cec1b7"},"os":{"type":"Darwin","name":"Mac OS X","architecture":"arm64","version":"21.6.0"}}}}
      ...
      {"t":{"$date":"2023-03-08T13:22:15.248+11:00"},"s":"I",  "c":"COMMAND",  "id":20400,   "ctx":"conn112","msg":"renameCollectionForCommand","attr":{"sourceNamespace":"config.foo2","targetNamespace":"config.shards","dropTarget":"no"}}
      {"t":{"$date":"2023-03-08T13:22:15.248+11:00"},"s":"I",  "c":"STORAGE",  "id":20319,   "ctx":"conn112","msg":"renameCollection","attr":{"uuid":{"uuid":{"$uuid":"870d0320-10ba-4fb4-976c-6242115c9423"}},"fromName":"config.foo2","toName":"config.shards"}}
      ...
      {"t":{"$date":"2023-03-08T13:22:58.293+11:00"},"s":"I",  "c":"SHARDING", "id":22080,   "ctx":"ShardingDDLCoordinator-4","msg":"About to log metadata event","attr":{"namespace":"changelog","event":{"_id":"M-K4F3614CKP:27019-2023-03-08T13:22:58.292+11:00-6407f1820b0873873c522a29","server":"M-K4F3614CKP:27019","shard":"config","clientAddr":"","time":{"$date":"2023-03-08T02:22:58.292Z"},"what":"renameCollection.start","ns":"config.shards","details":{"source":"config.shards","destination":"config.foo2"}}}}
      {"t":{"$date":"2023-03-08T13:22:58.317+11:00"},"s":"E",  "c":"ASSERT",   "id":4457000, "ctx":"RenameCollectionParticipantService-2","msg":"Tripwire assertion","attr":{"error":{"code":7032368,"codeName":"Location7032368","errmsg":"Trying to acquire a  critical section blocking writes for namespace 'config.shards' and reason '{ command: \"rename\", from: \"config.shards\", to: \"config.foo2\" }' but it is already taken by another operation with different reason '{ command: \"rename\", from: \"config.foo2\", to: \"config.shards\" }'"},"location":"{fileName:\"src/mongo/db/s/sharding_recovery_service.cpp\", line:200, functionName:\"acquireRecoverableCriticalSectionBlockWrites\"}"}}
      (there is a long BACKTRACE here)
      

      This looks like my first rename (from: "config.shards", to: "config.foo2") being blocked by my second rename (from: "config.foo2", to: "config.shards"). However, from the logs it looks like the first rename ran to completion, so it's not clear why it is still in progress here.
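      One way to dig further would be to look at the persisted sharding DDL state directly on the csrs-but-also-shard. This is a sketch only: the two namespaces below are the internal collections that 6.0/7.0-era servers use to persist DDL coordinator documents and recoverable critical sections, so the names are an assumption and may change between versions.

      // Sketch only: inspect persisted sharding DDL state on the csrs-but-also-shard.
      // Collection names are assumed for this server version (internal, unstable namespaces).
      const cfg = db.getSiblingDB("config");
      // DDL coordinator documents that have not yet been cleaned up:
      cfg.getCollection("system.sharding_ddl_coordinators").find();
      // Recoverable critical sections still held; the 'reason' field should show
      // which rename still owns the critical section on config.shards:
      cfg.getCollection("collection_critical_sections").find();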

            Assignee: Antonio Fuschetto (antonio.fuschetto@mongodb.com)
            Reporter: Joanna Cheng (joanna.cheng@mongodb.com)
            Votes: 0
            Watchers: 9
