Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-68567

distinct command on the Time-Series collection doesn't utilize the existing index

    • Query Optimization

      Hi,

      I have a time-series collection where I'm keeping the prices of cryptocurrencies for 30 days with the second granularity. Around 2000 cryptocurrency ticker data have been inserted into the time-series collection every one or two seconds. For some reason, I require to retrieve the distinct symbols. I have more than 2 Billion records in the time-series collection. 

      When I run the following query, it didn't complete in a few minutes and I had to stop it.

       

      db.cryptoTickerBinance.distinct("symbol") 

       

      Available indexes on the collection are listed below:

       

      [
         {
            "v": 2,
            "key": {
               "symbol": 1,
               "time": 1
            },
            "name": "symbol_1_time_1"
         },
         {
            "v": 2,
            "key": {
               "time": 1
            },
            "name": "time_1"
         },
         {
            "v": 2,
            "key": {
               "symbol": 1,
               "time": -1
            },
            "name": "sym2"
         },
         {
            "v": 2,
            "key": {
               "time": -1
            },
            "name": "time_2"
         }
      ] 

      However, when I use $group staging it works fast as expected since it's using DISTINCT_SCAN step. 

       

      db.cryptoTickerBinance.aggregate([
        {
          '$group': {
            '_id': '$symbol'
          }
        }
      ]).explain('executionStats') 

       

       

      { explainVersion: '1',
        stages: 
         [ { '$cursor': 
              { queryPlanner: 
                 { namespace: 'exchange.system.buckets.cryptoTickerBinance',
                   indexFilterSet: false,
                   parsedQuery: {},
                   queryHash: 'A855245D',
                   planCacheKey: 'A855245D',
                   maxIndexedOrSolutionsReached: false,
                   maxIndexedAndSolutionsReached: false,
                   maxScansToExplodeReached: false,
                   winningPlan: 
                    { stage: 'PROJECTION_COVERED',
                      transformBy: { meta: 1, _id: 0 },
                      inputStage: 
                       { stage: 'DISTINCT_SCAN',
                         keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
                         indexName: 'symbol_1_time_1',
                         isMultiKey: false,
                         multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
                         isUnique: false,
                         isSparse: false,
                         isPartial: false,
                         indexVersion: 2,
                         direction: 'forward',
                         indexBounds: 
                          { meta: [ '[MinKey, MaxKey]' ],
                            'control.min.time': [ '[MinKey, MaxKey]' ],
                            'control.max.time': [ '[MinKey, MaxKey]' ] } } },
                   rejectedPlans: [] },
                executionStats: 
                 { executionSuccess: true,
                   nReturned: 2080,
                   executionTimeMillis: 52,
                   totalKeysExamined: 2080,
                   totalDocsExamined: 0,
                   executionStages: 
                    { stage: 'PROJECTION_COVERED',
                      nReturned: 2080,
                      executionTimeMillisEstimate: 49,
                      works: 2081,
                      advanced: 2080,
                      needTime: 0,
                      needYield: 0,
                      saveState: 4,
                      restoreState: 4,
                      isEOF: 1,
                      transformBy: { meta: 1, _id: 0 },
                      inputStage: 
                       { stage: 'DISTINCT_SCAN',
                         nReturned: 2080,
                         executionTimeMillisEstimate: 49,
                         works: 2081,
                         advanced: 2080,
                         needTime: 0,
                         needYield: 0,
                         saveState: 4,
                         restoreState: 4,
                         isEOF: 1,
                         keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
                         indexName: 'symbol_1_time_1',
                         isMultiKey: false,
                         multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
                         isUnique: false,
                         isSparse: false,
                         isPartial: false,
                         indexVersion: 2,
                         direction: 'forward',
                         indexBounds: 
                          { meta: [ '[MinKey, MaxKey]' ],
                            'control.min.time': [ '[MinKey, MaxKey]' ],
                            'control.max.time': [ '[MinKey, MaxKey]' ] },
                         keysExamined: 2080 } } } },
             nReturned: 2080,
             executionTimeMillisEstimate: 49 },
           { '$groupByDistinctScan': { newRoot: { _id: '$meta' } },
             nReturned: 2080,
             executionTimeMillisEstimate: 49 } ],
        serverInfo: 
         { host: 'atlas-3uvq7m-shard-00-02.iubjy.mongodb.net',
           port: 27017,
           version: '6.0.0',
           gitVersion: 'e61bf27c2f6a83fed36e5a13c008a32d563babe2' },
        serverParameters: 
         { internalQueryFacetBufferSizeBytes: 104857600,
           internalQueryFacetMaxOutputDocSizeBytes: 104857600,
           internalLookupStageIntermediateDocumentMaxSizeBytes: 104857600,
           internalDocumentSourceGroupMaxMemoryBytes: 104857600,
           internalQueryMaxBlockingSortMemoryUsageBytes: 104857600,
           internalQueryProhibitBlockingMergeOnMongoS: 0,
           internalQueryMaxAddToSetBytes: 104857600,
           internalDocumentSourceSetWindowFieldsMaxMemoryBytes: 104857600 },
        command: 
         { aggregate: 'system.buckets.cryptoTickerBinance',
           pipeline: 
            [ { '$_internalUnpackBucket': 
                 { timeField: 'time',
                   metaField: 'symbol',
                   bucketMaxSpanSeconds: 3600,
                   assumeNoMixedSchemaData: true } },
              { '$group': { _id: '$symbol' } } ],
           cursor: {},
           collation: { locale: 'simple' } },
        ok: 1,
        '$clusterTime': 
         { clusterTime: Timestamp({ t: 1659631070, i: 2089 }),
           signature: 
            { hash: Binary(Buffer.from("d16b946bcb75e6999bd12f47bf93c85c1bab16ea", "hex"), 0),
              keyId: 7079845501138373000 } },
        operationTime: Timestamp({ t: 1659631070, i: 2089 }) } 

      How the distinct() query can be optimized on time-series collections? It may be linked to another ticket (https://jira.mongodb.org/browse/SERVER-14227).

       

       

       

       

       

       

       

       

       

            Assignee:
            backlog-query-optimization [DO NOT USE] Backlog - Query Optimization
            Reporter:
            fuat.sungur@mongodb.com Fuat Sungur
            Votes:
            0 Vote for this issue
            Watchers:
            9 Start watching this issue

              Created:
              Updated: