Loading...

Type: Improvement
Resolution: Unresolved
Priority: Major - P3
Fix Version/s: None
Affects Version/s: None
Component/s: Query Planning
Labels:

Assigned Teams:

Query Optimization
Confidence Status:
None
Work Order:
3

Aha! Reference:
None
Tracking Level:
None
Risk Status:
None
Exec Notes:
None
Goal Name:
None
Goal Link:
None

Hi,

I have a time-series collection where I'm keeping the prices of cryptocurrencies for 30 days with the second granularity. Around 2000 cryptocurrency ticker data have been inserted into the time-series collection every one or two seconds. For some reason, I require to retrieve the distinct symbols. I have more than 2 Billion records in the time-series collection.

When I run the following query, it didn't complete in a few minutes and I had to stop it.

db.cryptoTickerBinance.distinct("symbol")

Available indexes on the collection are listed below:

[
   {
      "v": 2,
      "key": {
         "symbol": 1,
         "time": 1
      },
      "name": "symbol_1_time_1"
   },
   {
      "v": 2,
      "key": {
         "time": 1
      },
      "name": "time_1"
   },
   {
      "v": 2,
      "key": {
         "symbol": 1,
         "time": -1
      },
      "name": "sym2"
   },
   {
      "v": 2,
      "key": {
         "time": -1
      },
      "name": "time_2"
   }
]

However, when I use $group staging it works fast as expected since it's using DISTINCT_SCAN step.

db.cryptoTickerBinance.aggregate([
  {
    '$group': {
      '_id': '$symbol'
    }
  }
]).explain('executionStats')

{ explainVersion: '1',
  stages: 
   [ { '$cursor': 
        { queryPlanner: 
           { namespace: 'exchange.system.buckets.cryptoTickerBinance',
             indexFilterSet: false,
             parsedQuery: {},
             queryHash: 'A855245D',
             planCacheKey: 'A855245D',
             maxIndexedOrSolutionsReached: false,
             maxIndexedAndSolutionsReached: false,
             maxScansToExplodeReached: false,
             winningPlan: 
              { stage: 'PROJECTION_COVERED',
                transformBy: { meta: 1, _id: 0 },
                inputStage: 
                 { stage: 'DISTINCT_SCAN',
                   keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
                   indexName: 'symbol_1_time_1',
                   isMultiKey: false,
                   multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
                   isUnique: false,
                   isSparse: false,
                   isPartial: false,
                   indexVersion: 2,
                   direction: 'forward',
                   indexBounds: 
                    { meta: [ '[MinKey, MaxKey]' ],
                      'control.min.time': [ '[MinKey, MaxKey]' ],
                      'control.max.time': [ '[MinKey, MaxKey]' ] } } },
             rejectedPlans: [] },
          executionStats: 
           { executionSuccess: true,
             nReturned: 2080,
             executionTimeMillis: 52,
             totalKeysExamined: 2080,
             totalDocsExamined: 0,
             executionStages: 
              { stage: 'PROJECTION_COVERED',
                nReturned: 2080,
                executionTimeMillisEstimate: 49,
                works: 2081,
                advanced: 2080,
                needTime: 0,
                needYield: 0,
                saveState: 4,
                restoreState: 4,
                isEOF: 1,
                transformBy: { meta: 1, _id: 0 },
                inputStage: 
                 { stage: 'DISTINCT_SCAN',
                   nReturned: 2080,
                   executionTimeMillisEstimate: 49,
                   works: 2081,
                   advanced: 2080,
                   needTime: 0,
                   needYield: 0,
                   saveState: 4,
                   restoreState: 4,
                   isEOF: 1,
                   keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
                   indexName: 'symbol_1_time_1',
                   isMultiKey: false,
                   multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
                   isUnique: false,
                   isSparse: false,
                   isPartial: false,
                   indexVersion: 2,
                   direction: 'forward',
                   indexBounds: 
                    { meta: [ '[MinKey, MaxKey]' ],
                      'control.min.time': [ '[MinKey, MaxKey]' ],
                      'control.max.time': [ '[MinKey, MaxKey]' ] },
                   keysExamined: 2080 } } } },
       nReturned: 2080,
       executionTimeMillisEstimate: 49 },
     { '$groupByDistinctScan': { newRoot: { _id: '$meta' } },
       nReturned: 2080,
       executionTimeMillisEstimate: 49 } ],
  serverInfo: 
   { host: 'atlas-3uvq7m-shard-00-02.iubjy.mongodb.net',
     port: 27017,
     version: '6.0.0',
     gitVersion: 'e61bf27c2f6a83fed36e5a13c008a32d563babe2' },
  serverParameters: 
   { internalQueryFacetBufferSizeBytes: 104857600,
     internalQueryFacetMaxOutputDocSizeBytes: 104857600,
     internalLookupStageIntermediateDocumentMaxSizeBytes: 104857600,
     internalDocumentSourceGroupMaxMemoryBytes: 104857600,
     internalQueryMaxBlockingSortMemoryUsageBytes: 104857600,
     internalQueryProhibitBlockingMergeOnMongoS: 0,
     internalQueryMaxAddToSetBytes: 104857600,
     internalDocumentSourceSetWindowFieldsMaxMemoryBytes: 104857600 },
  command: 
   { aggregate: 'system.buckets.cryptoTickerBinance',
     pipeline: 
      [ { '$_internalUnpackBucket': 
           { timeField: 'time',
             metaField: 'symbol',
             bucketMaxSpanSeconds: 3600,
             assumeNoMixedSchemaData: true } },
        { '$group': { _id: '$symbol' } } ],
     cursor: {},
     collation: { locale: 'simple' } },
  ok: 1,
  '$clusterTime': 
   { clusterTime: Timestamp({ t: 1659631070, i: 2089 }),
     signature: 
      { hash: Binary(Buffer.from("d16b946bcb75e6999bd12f47bf93c85c1bab16ea", "hex"), 0),
        keyId: 7079845501138373000 } },
  operationTime: Timestamp({ t: 1659631070, i: 2089 }) }

How the distinct() query can be optimized on time-series collections? It may be linked to another ticket (https://jira.mongodb.org/browse/SERVER-14227).

depends on

SERVER-37715 Use DISTINCT_SCAN for $unwind-$group pipelines

Backlog

related to

SERVER-87741 Make distinct command on views use DISTINCT_SCAN

Backlog

Details

Description

Attachments

Issue Links

Activity

People

Dates