Currently, setFCV to 4.4 on a shard iterates over each collection and, for each one, builds a vector of its orphaned ranges. It then inserts a range deletion task for each orphaned range one at a time.
On clusters whose collections have a huge number of chunks, inserting the range deletion tasks takes a long time (e.g., ~15 minutes per shard for a collection with 100k chunks).
We should see whether grouping the range deletion task inserts into batches, so that each insert command writes many task documents instead of one, speeds this up. A rough sketch of what that could look like is below.
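For illustration only (this is not the actual upgrade code), the following sketch groups the orphaned ranges into fixed-size batches and issues one insert command per batch against config.rangeDeletions instead of one insert per range. makeRangeDeletionTaskDoc() is a placeholder helper that serializes one range deletion task document for an orphaned range; the real code path and insert API may differ.

// Illustrative sketch only -- not the actual FCV upgrade code.
// makeRangeDeletionTaskDoc() below is a placeholder helper, not a real function.
#include "mongo/db/dbdirectclient.h"
#include "mongo/db/namespace_string.h"
#include "mongo/s/catalog/type_chunk.h"  // ChunkRange

namespace mongo {

void insertRangeDeletionTasksInBatches(OperationContext* opCtx,
                                       const NamespaceString& nss,
                                       const UUID& collUuid,
                                       const std::vector<ChunkRange>& orphanedRanges) {
    // Arbitrary batch size to experiment with; it should keep each insert
    // command comfortably under the 16MB BSON limit.
    constexpr size_t kBatchSize = 1000;

    DBDirectClient client(opCtx);
    std::vector<BSONObj> batch;
    batch.reserve(kBatchSize);

    auto flushBatch = [&] {
        if (!batch.empty()) {
            // One insert for the whole batch rather than one insert per range.
            client.insert(NamespaceString::kRangeDeletionNamespace.ns(), batch);
            batch.clear();
        }
    };

    for (const auto& range : orphanedRanges) {
        // Placeholder: build the config.rangeDeletions document for this range.
        batch.push_back(makeRangeDeletionTaskDoc(nss, collUuid, range));
        if (batch.size() >= kBatchSize) {
            flushBatch();
        }
    }
    flushBatch();
}

}  // namespace mongo

The timing example at the bottom of this description can be used to compare the batched and unbatched versions.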
This script creates a cluster with 100k chunks:
(function() {
'use strict';

// Skip checking orphans because it takes a long time when there are so many chunks. Also we don't
// actually insert any data so there can't be orphans.
TestData.skipCheckOrphans = true;

const dbName = "db1";
const collName = "foo";
const ns = dbName + "." + collName;

const st = new ShardingTest({
    shards: 2,
    config: 1,
    other: {
        // How to run this test with a specific binary version.
        // mongosOptions: {binVersion: "4.4"},
        // shardOptions: {binVersion: "4.4"},
        // configOptions: {binVersion: "4.4"},
    }
});

jsTest.log("Set FCV to 4.2 since we want to test upgrading the FCV to 4.4");
assert.commandWorked(st.s.adminCommand({ setFeatureCompatibilityVersion: "4.2" }));

jsTest.log("Create a database.");
// enableSharding creates the databases.
assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
st.ensurePrimaryShard(dbName, st.shard0.shardName);

jsTest.log("Shard a collection with a huge number of initial chunks");
const NUM_CHUNKS = 100000;
assert.commandWorked(st.s.adminCommand({
    shardCollection: ns,
    key: {x: "hashed"},
    numInitialChunks: NUM_CHUNKS
}));
assert.gt(st.s.getDB("config").chunks.count(), NUM_CHUNKS - 1);

jsTest.log("Set FCV to 4.4");
assert.commandWorked(st.s.adminCommand({ setFeatureCompatibilityVersion: "4.4" }));

st.stop();
})();
For this script to work, we'll have to disable the limit on numInitialChunks in the server:
diff --git a/src/mongo/db/s/config/configsvr_shard_collection_command.cpp b/src/mongo/db/s/config/configsvr_shard_collection_command.cpp
index 68af077751..9f37b7da94 100644
--- a/src/mongo/db/s/config/configsvr_shard_collection_command.cpp
+++ b/src/mongo/db/s/config/configsvr_shard_collection_command.cpp
@@ -107,7 +107,7 @@ void validateAndDeduceFullRequestOptions(OperationContext* opCtx,
     // Cannot have more than 8192 initial chunks per shard. Setting a maximum of 1,000,000
     // chunks in total to limit the amount of memory this command consumes so there is less
     // danger of an OOM error.
-    const int maxNumInitialChunksForShards = numShards * 8192;
+    /*const int maxNumInitialChunksForShards = numShards * 8192;
     const int maxNumInitialChunksTotal = 1000 * 1000;  // Arbitrary limit to memory consumption
     int numChunks = request->getNumInitialChunks();
     uassert(ErrorCodes::InvalidOptions,
@@ -116,7 +116,7 @@ void validateAndDeduceFullRequestOptions(OperationContext* opCtx,
                           << maxNumInitialChunksTotal,
             numChunks >= 0 && numChunks <= maxNumInitialChunksForShards &&
                 numChunks <= maxNumInitialChunksTotal);
-
+    */
     // Retrieve the collection metadata in order to verify that it is legal to shard this
     // collection.
     BSONObj res;
We also want to distribute the chunks round-robin between the shards so that each shard has many unowned ranges:
diff --git a/src/mongo/db/s/config/initial_split_policy.cpp b/src/mongo/db/s/config/initial_split_policy.cpp
index 1af9ceb57f..dfb054f4fa 100644
--- a/src/mongo/db/s/config/initial_split_policy.cpp
+++ b/src/mongo/db/s/config/initial_split_policy.cpp
@@ -193,7 +193,7 @@ InitialSplitPolicy::ShardCollectionConfig InitialSplitPolicy::generateShardColle
         // shards, and we need to be sure that at least one chunk is placed on the primary shard
         const ShardId shardId = (i == 0 && finalSplitPoints.size() + 1 < allShardIds.size())
             ? databasePrimaryShardId
-            : allShardIds[(i / numContiguousChunksPerShard) % allShardIds.size()];
+            : allShardIds[(i / 1 /*numContiguousChunksPerShard*/) % allShardIds.size()];

         appendChunk(nss, min, max, &version, validAfter, shardId, &chunks);
     }
Here's an example of timing how long a section of code takes:
// Requires "mongo/util/timer.h" for Timer and "mongo/logv2/log.h" for LOGV2.
void myFunc() {
    Timer t;

    // do something time-consuming

    auto micros = t.micros();
    LOGV2(123456, "How long it took", "micros"_attr = micros);
}
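To measure the effect for this ticket, the same pattern could be wrapped around the section of the FCV upgrade that persists the range deletion tasks. In the sketch below, submitRangeDeletionTasks() is a stand-in name for whatever code actually performs the inserts, not a real function.

// Sketch only: time the range deletion task insertion during the FCV 4.4
// upgrade. submitRangeDeletionTasks() is a placeholder, not a real function.
{
    Timer t;

    submitRangeDeletionTasks(opCtx, nss, orphanedRanges);  // placeholder call

    LOGV2(123457,
          "Inserted range deletion tasks for collection",
          "namespace"_attr = nss.ns(),
          "numRanges"_attr = orphanedRanges.size(),
          "durationMicros"_attr = t.micros());
}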