填写记录中缺失的日期

Fill missing dates in records

我有一个 collection 的 ProductViews:

{
    productId: "5b8c0f3204a10228b00a1745",
    createdAt: "2018-09-07T17:18:40.759Z"
}

我有一个获取特定产品每日浏览量的查询:

// Daily view counts for a single product.
ProductView.aggregate([
    // Keep only the views of the requested product.
    { $match: { productId } },
    // The first 10 characters of createdAt are the "YYYY-MM-DD" part of the timestamp string.
    { $project: { day: { $substr: ["$createdAt", 0, 10] } } },
    {
        // One bucket per day; count is the number of views in that bucket.
        $group: {
            _id: "$day",
            count: { $sum: 1 },
            // NOTE(review): time is not carried into the final $project below.
            time: { $avg: "$createdAt" },
        }
    },
    // Oldest day first.
    { $sort: { _id: 1 } },
    {
        // Rename the group fields to the public result shape.
        $project: {
            date: '$_id',
            views: '$count',
        },
    },
]).exec((err, result) => ...)

当前给出:

[
    { date: '2018-09-01', views: 1 },
    { date: '2018-09-02', views: 3 },
    { date: '2018-09-04', views: 2 },
    { date: '2018-09-05', views: 5 },
    // ...
]

问题:

问题是,对于浏览量为 0 的日期,此聚合不会返回 { date: '2018-09-03', views: 0 } 这样的记录。这会导致数据显示不正确:[![在此处输入图像描述][1]][1]

结果应如下所示:

[
    { date: '2018-09-01', views: 1 },
    { date: '2018-09-02', views: 3 },
    { date: '2018-09-03', views: 0 }, <=
    { date: '2018-09-04', views: 2 },
    { date: '2018-09-05', views: 5 },
    // ...
]

P.S.: 如果能传入开始和结束日期,并根据这个范围输出结果,那就完美了。

[1]: https://i.stack.imgur.com/uHPBs.png

您需要一些额外的步骤来返回默认值。首先,您需要使用 $group 并将 _id 设置为 null,以将所有结果收集到一个文档中。然后,您可以使用 $map,并将日期数组作为输入。在该 $map 内部,您可以使用 $indexOfArray 来检查该日期是否存在于当前结果集中。如果存在(索引 != -1),则返回对应的值;否则返回 views 为 0 的默认子文档。最后,您可以使用 $unwind 还原为文档列表,并使用 $replaceRoot 将嵌套的 stats 提升到顶层。

// Daily view counts for one product, with zero-view days filled in from a fixed list of days.
ProductView.aggregate([
    { $match: { productId: '5b8c0f3204a10228b00a1745' } },
    // The first 10 characters of createdAt are the "YYYY-MM-DD" part of the timestamp string.
    { $project: { day: { $substr: ["$createdAt", 0, 10] } } },
    {
        // One bucket per day; count is the number of views in that bucket.
        $group: {
            _id: "$day",
            count: { $sum: 1 },
            // NOTE(review): time is not carried into the $project below.
            time: { $avg: "$createdAt" },
        }
    },
    { $sort: { _id: 1 } },
    {
        $project: {
            date: '$_id',
            views: '$count',
        },
    },
    {
        // Collect every per-day result into a single document so the next stage
        // can look days up inside one array.
        $group: {
            _id: null,
            stats: { $push: "$$ROOT" }
        }
    },
    {
        $project: {
            stats: {
                $map: {
                    // Days to report on; hard-coded here.
                    input: [ "2018-09-01", "2018-09-02", "2018-09-03", "2018-09-04", "2018-09-05" ],
                    as: "date",
                    in: {
                        $let: {
                            // Position of this day among the collected results, -1 when absent.
                            vars: { dateIndex: { "$indexOfArray": [ "$stats._id", "$$date" ] } },
                            in: { 
                                $cond: {
                                    if: { $ne: [ "$$dateIndex", -1 ] },
                                    // Day has views: reuse the existing entry.
                                    then: { $arrayElemAt: [ "$stats", "$$dateIndex" ] },
                                    // Day has no views: emit a default entry with views set to 0.
                                    else: { _id: "$$date", date: "$$date", views: 0 }
                                } 
                            }
                        }
                    }
                }
            }
        }
    },
    {
        // Back to one document per day.
        $unwind: "$stats"
    },
    {
        // Promote the nested stats entry to the top level.
        $replaceRoot: {
            newRoot: "$stats"
        }
    }
]).exec((err, result) => ...)

您可以使用简单循环在应用程序逻辑中生成静态日期列表。我相信这在 MongoDB 中也是可能的(使用 $range),但它可能会使此聚合管道复杂化。请告诉我您是否接受这种做法,或者您是否想尝试在 MongoDB 中生成该日期数组。

结合使用一些 JavaScript 和 aggregation 技巧。

您需要先找到所提供日期范围内的日期。

/**
 * Lists every calendar day from startDate to stopDate, both ends included.
 *
 * Relies on `moment` being in scope.
 *
 * @param {*} startDate - First day of the range; anything `moment()` accepts.
 * @param {*} stopDate - Last day of the range; anything `moment()` accepts.
 * @returns {string[]} Days formatted as 'YYYY-MM-DD', in ascending order.
 *   Empty when startDate is after stopDate.
 */
function getDates(startDate, stopDate) {
  // Parse the upper bound once, under its own name, instead of redeclaring
  // the stopDate parameter.
  const lastDay = moment(stopDate)
  const days = []
  for (
    let day = moment(startDate);
    // Compare the underlying timestamps explicitly rather than through
    // implicit object-to-number coercion.
    day.valueOf() <= lastDay.valueOf();
    // Step on a fresh copy so the value already formatted is never touched.
    day = moment(day).add(1, 'days')
  ) {
    days.push(moment(day).format('YYYY-MM-DD'))
  }
  return days
}

const dummyArray = getDates('2018-09-01', '2018-09-05')
// dummyArray is now [ "2018-09-01", "2018-09-02", "2018-09-03", "2018-09-04", "2018-09-05" ]

现在,您可以使用以下聚合找到数据库中不可用的日期。

// Daily view counts for one product over the days listed in dummyArray.
// Days without any view are reported with views: 0.
// NOTE(review): when $match selects no documents at all, the $group on
// _id: null emits nothing, so the result is empty rather than all zeros.
db.collection.aggregate([
  { "$match": { productId } },
  // One bucket per "YYYY-MM-DD" prefix of createdAt.
  { "$group": {
    "_id": { "$substr": ["$createdAt", 0, 10] },
    "count": { "$sum": 1 }
  }},
  { "$project": { "_id": 0, "date": "$_id", "views": "$count" }},
  // Gather the per-day results into one array so each requested day can be looked up.
  { "$group": { "_id": null, "data": { "$push": "$$ROOT" }}},
  { "$project": {
    "data": {
      "$map": {
        "input": dummyArray,
        "as": "day",
        "in": {
          "$let": {
            // Position of the day among the counted days, -1 when it has no views.
            "vars": { "idx": { "$indexOfArray": ["$data.date", "$$day"] } },
            "in": {
              "date": "$$day",
              // Carry over the real count; a 1/0 presence flag would lose it.
              // The -1 guard matters: $arrayElemAt treats -1 as "last element".
              "views": {
                "$cond": [
                  { "$eq": ["$$idx", -1] },
                  0,
                  { "$arrayElemAt": ["$data.views", "$$idx"] }
                ]
              }
            }
          }
        }
      }
    }
  }},
  // Back to one { date, views } document per day, in dummyArray order.
  { "$unwind": "$data" },
  { "$replaceRoot": { "newRoot": "$data" } }
])

输出将是

[
    { date: '2018-09-01', views: 1 },
    { date: '2018-09-02', views: 3 },
    { date: '2018-09-03', views: 0 },
    { date: '2018-09-04', views: 2 },
    { date: '2018-09-05', views: 5 }
]

您的问题与 2014 年的一个帖子(post)相似。

post 中提供的所有答案都是有效的,如果您可以在应用程序代码中生成缺少的天数,事情会简单得多。

由于您要求 mongodb 解决方案并且自 2014 年以来发生了很多变化,我创建了一个新的聚合管道,您可以将其与 3.6 版本一起使用。

// Daily view counts between the first and last viewed day, generating the
// missing days inside MongoDB (3.6+) instead of in application code.
ProductView.aggregate([
    // Convert the string date into a date type for date calculations.
    // This stage can be dropped if createdAt is stored as a date in the collection.
    {"$addFields":{"createdAt":{"$dateFromString":{"dateString":"$createdAt"}}}},
    // Strip the time part so every document carries the start of its day.
    {"$project":{
        "day":{"$dateFromParts":{"year":{"$year":"$createdAt"},"month":{"$month":"$createdAt"},"day":{"$dayOfMonth":"$createdAt"}}}
    }},
    // Generate two sets of data: the count per day, and the unique days plus
    // the first and last day expressed in seconds since the Unix epoch.
    {"$facet":{
        "daycounts":[{"$group":{"_id":"$day","count":{"$sum":1}}}],
        "maxmindays":[
          {"$group":{
             "_id":null,
             "days":{"$addToSet":"$day"},
             // new Date(0) is exactly the epoch. A non-ISO string such as
             // "1-1-1970" is parsed in the client's local time zone, which would
             // shift these offsets and break the day matching further down.
             "minday":{"$min":{"$divide":[{"$subtract":["$day",new Date(0)]},1000]}},
             "maxday":{"$max":{"$divide":[{"$subtract":["$day",new Date(0)]},1000]}}
           }}
        ]
    }},
    {"$project":{
        "data":{
          "$let":{
            "vars":{"maxminday":{"$arrayElemAt":["$maxmindays",0]}},
            "in":{
              // $range iterates from the first to the last day, one day (in seconds)
              // at a time; the end bound is pushed one step out so the last day is included.
              "$map":{
                "input":{"$range":["$$maxminday.minday",{"$add": ["$$maxminday.maxday", 60*60*24]},60*60*24]},
                "as":"r",
                "in": {
                  // Convert the seconds offset back to a date.
                  "$let":{
                    "vars":{"current":{"$add": [new Date(0), {"$multiply":["$$r", 1000 ]}]}},
                    "in":{
                      // If the day exists in the collection, look its count up in the
                      // daycounts facet; otherwise report zero views.
                      "$cond":[
                        {"$in":["$$current","$$maxminday.days"]},
                        {
                          "date":{"$substr":["$$current",0,10]},
                          "views":{"$let":{"vars":{"daycount":{"$arrayElemAt":["$daycounts",{"$indexOfArray":["$daycounts._id","$$current"]}]}},"in":"$$daycount.count"}}
                        },
                        {"date":{"$substr":["$$current",0,10]},"views":0}
                      ]
                    }
                  }
                }
              }
            }
          }
        }
    }},
    // Flatten the array of per-day results.
    {"$unwind":"$data"},
    // Promote each result to the top level.
    {"$replaceRoot":{newRoot:"$data"}}
])

如果只缺少一两天,并且要处理的文档数量很少,我建议您在客户端补上缺失的日期。

也就是说,以下管道仅适用于 MongoDB 4.0+,但稍加努力,我们可以让它在 3.6 中运行。

// Pipeline (MongoDB 4.0+): views per day between the first and last viewed day,
// newest day first, with views: 0 for days that have no documents.
[
    // Collapse the input into one document holding, for every view, the
    // start of its day as epoch milliseconds.
    {
        $group: {
            _id: null,
            dates: {
                $push: {
                    $let: {
                        vars: {
                            date: {
                                $dateToParts: {
                                    date: {
                                        $toDate: "$createdAt"
                                    }
                                }
                            }
                        },
                        in: {
                            $toDouble: {
                                $dateFromParts: {
                                    year: "$$date.year",
                                    month: "$$date.month",
                                    day: "$$date.day"
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    // First and last day, in seconds, as bounds for $range below.
    {
        $addFields: {
            startDate: {
                $divide: [
                    {
                        $min: "$dates"
                    },
                    1000
                ]
            },
            endDate: {
                $divide: [
                    {
                        "$max": "$dates"
                    },
                    1000
                ]
            }
        }
    },
    // Tag every entry with how much it contributes to the day's total:
    // 1 for a real view, 0 for a day added only to fill a gap. Counting every
    // entry as 1 would report one view on days that had none.
    {
        $addFields: {
            dates: {
                $concatArrays: [
                    {
                        $map: {
                            input: "$dates",
                            in: {
                                day: { $toDate: "$$this" },
                                views: 1
                            }
                        }
                    },
                    {
                        $map: {
                            input: {
                                // Every day in the range that has no view at all.
                                // endDate is excluded by $range, which is fine
                                // because it always comes from a real view.
                                $setDifference: [
                                    {
                                        $map: {
                                            input: {
                                                $range: [
                                                    {
                                                        $toDouble: "$startDate"
                                                    },
                                                    {
                                                        $toDouble: "$endDate"
                                                    },
                                                    24*60*60
                                                ]
                                            },
                                            in: {
                                                $multiply: [
                                                    "$$this",
                                                    1000
                                                ]
                                            }
                                        }
                                    },
                                    "$dates"
                                ]
                            },
                            in: {
                                day: { $toDate: "$$this" },
                                views: 0
                            }
                        }
                    }
                ]
            }
        }
    },
    {
        "$unwind": "$dates"
    },
    // Sum the contributions per day.
    {
        "$group": {
            _id: "$dates.day",
            views: {
                $sum: "$dates.views"
            }
        }
    },
    {
        "$sort": {
            _id: -1
        }
    }
]

从 Mongo 5.1 开始,这是新的 $densify 聚合运算符的完美用例:

// { date: ISODate("2018-09-01"), views: 1 }
// { date: ISODate("2018-09-02"), views: 3 }
// { date: ISODate("2018-09-04"), views: 2 }
// { date: ISODate("2018-09-05"), views: 5 }
db.collection.aggregate([
  // Create a document for every day missing from the `date` sequence,
  // across the full range of dates present in the documents.
  { $densify: {
    field: "date",
    range: { step: 1, unit: "day", bounds: "full" }
  }},
  // Documents added by $densify have no `views`; set it to 0 for those.
  { $set: { views: { $cond: [ { $not: ["$views"] }, 0, "$views" ] } } }
])
// { date: ISODate("2018-09-01"), views: 1 }
// { date: ISODate("2018-09-02"), views: 3 }
// { date: ISODate("2018-09-03"), views: 0 } <=
// { date: ISODate("2018-09-04"), views: 2 }
// { date: ISODate("2018-09-05"), views: 5 }

这个:

  • 通过在文档序列中创建新文档来使文档致密化 ($densify),补上 field(在我们的例子中 field: "date")缺失的那些值:
    • 我们的致密化步长是 1 天:range: { step: 1, unit: "day", ... }
    • 并且我们在文档定义的日期范围内进行致密化:bounds: "full"
  • 最后,仅对致密化阶段新增的文档将 views 设置 ($set) 为 0 ({ views: { $cond: [ { $not: ["$views"] }, 0, "$views" ] } })

当然,要让它适用于您的具体示例,您可以使用 $dateToString ($dateToString: { format: "%Y-%m-%d", date: "$date" }) 从日期切换到字符串,再使用 $dateFromString ($dateFromString: { dateString: "$date" }) 切换回日期。

关于你的 P.S。 (传入开始和结束日期以根据此范围输出结果),您可以将 bounds: "full" 替换为 bounds: [ISODate("2018-08-25"), ISODate("2018-09-07")]