Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-41171

MongoDB $lookup using "$expr" is slow compared to simple $lookup

    • Type: Icon: Improvement Improvement
    • Resolution: Unresolved
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: 4.0.8
    • Component/s: Aggregation Framework
    • None
    • Query Execution

      $lookup with let+pipeline performs slower than simple lookup.

      In the MongoDB shell...

      To build the data set:

      db.User.deleteMany({});
      db.Order.deleteMany({});
      db.User.ensureIndex({anni: 1});
      db.Order.ensureIndex({_userId: 1});for (var va2 = 0; va2 < 1000; va2++) {
              var myUserA = {};
              myUserA._id = new ObjectId();
              myUserA.nome = "DanieleA";
              myUserA.anni = va2;    var myUserB = {};
              myUserB._id = new ObjectId();
              myUserB.nome = "DanieleB";
              myUserB.anni = va2;
          
          db.User.insertOne(myUserA);
          db.User.insertOne(myUserB);    var myOrders = [];
          for (var va = 0; va < 100; va++) {
              var myOrder = {};
                  myOrder.prezzo = Math.random() * 1 * 100;
                  myOrder._userId = myUserB._id;
                  myOrders.push (myOrder);
          }
          db.Order.insertMany (myOrders);
      };
      

      To test performance:

      for (var tries = 0; tries < 10; tries++) {
        startDate = new Date();db.getCollection('User').aggregate([
            {
              $lookup:
              {
                from: "Order",
                localField: "_id",
                foreignField: "_userId",
                as: "users"
              }
            },
            { $count: "totalCount" }
        ]);print("Without pipeline " + (new Date() - startDate))
      
        // Lookup with pipeline
        startDate = new Date();db.getCollection('User').aggregate([
            {
              $lookup: {
                from: "Order",
                let: {
                  idUtente: "$_id"
                },
                pipeline: [
                  {
                    $match: 
                    {
                      $expr: {
                        $eq: ["$_userId", "$$idUtente"]
                      }
                    }
                  }
                ],
                as: "Orders"
              }
            },
            { $count: "totalCount" }
        ]);print("With pipeline " + (new Date() - startDate))
      }
      

      Output (a local reproduction):

      Without pipeline 259
      With pipeline 338
      Without pipeline 234
      With pipeline 342
      Without pipeline 235
      With pipeline 341
      Without pipeline 231
      With pipeline 368
      Without pipeline 262
      With pipeline 375
      Without pipeline 270
      With pipeline 414
      Without pipeline 281
      With pipeline 395
      Without pipeline 266
      With pipeline 406
      Without pipeline 285
      With pipeline 384
      Without pipeline 262
      With pipeline 383
      

      Is this difference expected and/or can it be improved?

      original description

      Hi - i'm experiencing a slow performance issue  while using $lookup with an embedded "pipeline". There is no slow performance issue for $lookup that is not using "pipeline".

      // SEED THE COLLECTIONS
      // 1) Create collections
      // 2) Create index

      db.User.deleteMany({});
      db.Order.deleteMany({});db.User.ensureIndex({anni: 1});
      db.Order.ensureIndex({_userId: 1});for (var va2 = 0; va2 < 1000; va2++) {
          var myUserA = {};}
              myUserA._id = new ObjectId();
              myUserA.nome = "DanieleA";
              myUserA.anni = va2;    var myUserB = {};
              myUserB._id = new ObjectId();
              myUserB.nome = "DanieleB";
              myUserB.anni = va2;
          
          db.User.insertOne(myUserA);
          db.User.insertOne(myUserB);    var myOrders = [];
          for (var va = 0; va < 100; va++) {
              var myOrder = {};
                  myOrder.prezzo = Math.random() * 1 * 100;
                  myOrder._userId = myUserB._id;
                  myOrders.push (myOrder);
          }
          db.Order.insertMany (myOrders);
      }
      

      // QUERY with $lookup is fast (0.06ms)

      db.getCollection('User').aggregate([
          {
              $match: {
                  anni: {$gt: 50}
              }
          },
          {
              $project: {
                  anni: 1
              }
          },
          {
            $lookup:
            {
              from: "User",
              localField: "_id",
              foreignField: "_userId",
              as: "users"
            }
          },
          { $count: "totalCount" }
      ], {explain: false});
      

      // QUERY with $lookup and pipeline is slow (0.54ms)

      db.getCollection('User').aggregate([
          {
              $match: {
                  anni: {$gt: 50}
              }
          },
          {
              $project: {
                  anni: 1
              }
          },
          {
            $lookup: {
              from: "Order",
              let: {
                idUtente: "$_id"
              },
              pipeline: [
                {
                  $match: 
                  {
                    $expr: {
                      $eq: ["$_userId", "$$idUtente"]
                    }
                  }
                }
              ],
              as: "Orders"
            }
          },
          { $count: "totalCount" }
      ], {explain: false});
      

       // QUERY with $lookup and pipeline but without $count is fast (0.03ms)

      db.getCollection('User').aggregate([
          {
              $match: {
                  anni: {$gt: 50}
              }
          },
          {
              $project: {
                  anni: 1
              }
          },
          {
            $lookup: {
              from: "Order",
              let: {
                idUtente: "$_id"
              },
              pipeline: [
                {
                  $match: 
                  {
                    $expr: {
                      $eq: ["$_userId", "$$idUtente"]
                    }
                  }
                }
              ],
              as: "Orders"
            }
          },
      ], {explain: false});
      

      Replacing $count with $sort make the query still slow while using "lookup with pipeline".
      No slow performance in the case of "lookup" with no-pipeline.

      Basically this problem make the $lookup impossibile to be used for scenario where the lookup must use 2 fields to compute the join. In fact, in this scenario you must use lookup with "pipeline". But the poor performance described make the query really slow the not suitable for production.

       

            Assignee:
            backlog-query-execution [DO NOT USE] Backlog - Query Execution
            Reporter:
            daniele1981 Daniele Tassone
            Votes:
            8 Vote for this issue
            Watchers:
            27 Start watching this issue

              Created:
              Updated: