mongoDB学习笔记五

来源:互联网 发布:软件开发包括哪些 编辑:程序博客网 时间:2024/06/05 05:50

第六章 聚合


第四章  

 一 count

返回集合中的文档数量

> db.users.count()

5

 

二 distinct

找出给定键的所有不同的值

>db.runCommand({"distinct":"users","key":"username"})

{

       "waitedMS" : NumberLong(0),

       "values" : [

                "andy",

                "ann",

                "mike",

                "retacn yue"

       ],

       "stats" : {

                "n" : 4,

                "nscanned" : 4,

                "nscannedObjects" :0,

                "timems" : 88,

                "planSummary" :"DISTINCT_SCAN { username: 1.0 }"

       },

       "ok" : 1

}

 

三 group

 

在shell中运行

 

#查询所有数据

> db.prices.find()

{ "_id" :ObjectId("5872e51a2e29db2d68661bd3"), "time" :"10/03/2016 01:05:02", "price" : 4.23, "day" :"2016/10/03" }

{ "_id" : ObjectId("5872e51a2e29db2d68661bd4"),"time" : "10/04/2016 11:05:02", "price" : 4.27,"day" : "2016/10/04" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd5"), "time" :"10/03/2016 05:05:02", "price" : 4.1, "day" :"2016/10/03" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd6"), "time" :"10/06/2016 05:05:02", "price" : 4.3, "day" :"2016/10/06" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd7"), "time" :"10/04/2016 08:05:02", "price" : 4.01, "day" :"2016/10/04" }

 

 

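#分组查询(注意:group 命令的 key 应写成文档形式,如 "key":{"day":true};下面写成字符串 "key":"day" 时并没有按 day 分组,因此返回结果中 "keys" 为 1,只有一个分组)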

> db.runCommand({"group":{

... "ns":"prices",

... "key":"day",

..."initial":{"time":""},

... "$reduce":function(doc,prev){

...      if(doc.time>prev.time){

...           prev.price=doc.price;

...         prev.time=doc.time;

... }}}})

{

       "waitedMS" : NumberLong(0),

       "retval" : [

                {

                        "time" :"10/06/2016 05:05:02",

                        "price" : 4.3

                }

       ],

       "count" : NumberLong(5),

       "keys" : NumberLong(1),

       "ok" : 1

}

 

#添加分组条件

> db.runCommand({"group":{

 "ns":"prices",

"key":"day",

"initial":{"time":""},

"$reduce":function(doc,prev){

     if(doc.time>prev.time){ 

        prev.price=doc.price;  

        prev.time=doc.time;

 }

},

"condition":{"day":{"$gt":"2016/09/30"}}}})

 

 

{

       "waitedMS" : NumberLong(0),

       "retval" : [

                {

                        "time" :"10/06/2016 05:05:02",

                        "price" : 4.3

                }

       ],

       "count" : NumberLong(5),

       "keys" : NumberLong(1),

        "ok" : 1

}

 

 

使用python进行分组查询,示例代码如下:

from pymongo import MongoClient
import statsmodels.api as sm
import json
import pandas as pd
import time
# NOTE: this import rebinds the name `time` to datetime.time ...
from datetime import datetime, date, time, timezone, timedelta
import datetime as dt
# ... and this one rebinds `time` back to the standard `time` module.
import time

# Create the database connection and select the "test" database.
client = MongoClient()
db = client.test

# Insert the sample records (kept commented out, as in the original).
# db.prices.insert({"day":"2016/10/03","time":"10/03/2016 01:05:02","price":4.23})
# db.prices.insert({"day":"2016/10/04","time":"10/04/2016 11:05:02","price":4.27})
# db.prices.insert({"day":"2016/10/03","time":"10/03/2016 05:05:02","price":4.10})
# db.prices.insert({"day":"2016/10/06","time":"10/06/2016 05:05:02","price":4.30})
# db.prices.insert({"day":"2016/10/04","time":"10/04/2016 08:05:02","price":4.01})

# Group query.
# JavaScript reduce function: for each document, keep the price and time of
# the document whose "time" string compares greatest within its group.
func = """
        function(doc, prev){
            if (doc.time > prev.time){
                prev.price = doc.price;
                prev.time = doc.time;
            }
        }
"""

# Prototype of the function being called:
# def group(self,
#           key,            key(s) to group by
#           condition,      filter condition applied to the grouping
#           initial,        initial value of the accumulator handed to reduce
#           reduce,         every record is passed to reduce as its first argument
#           finalize=None,
#           **kwargs):
#
# NOTE(review): Collection.group() was deprecated in PyMongo 3.5 and removed in
# PyMongo 4.0 -- confirm the installed driver version before running this.
gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func)
print(gp)

 

查询结果如下

[{'day': '2016/10/03', 'price': 4.1,'time': '10/03/2016 05:05:02'},

{'day': '2016/10/04', 'price': 4.27,'time': '10/04/2016 11:05:02'},

{'day': '2016/10/06', 'price': 4.3, 'time':'10/06/2016 05:05:02'}]

 

使用完成器

 

#添加博客数据

>db.blog.posts.insert({"title":"A blogpost2","content":"...","author":{"name":"andy","email":"andy@sina.com"},"comments":[{"name":"tan","email":"tan

@sina.com","content":"good","votes":2}],"tags":{"java":5,"c++":2},"day":"2017/01/10"})

>db.blog.posts.insert({"title":"A blog post3","content":"...","author":{"name":"ruby","email":"ruby@sina.com"},"comments":[{"name":"san","email":"san

@sina.com","content":"good","votes":2}],"tags":{"c":6,"winter":2,"nosql":15},"day":"2017/01/11"})

WriteResult({ "nInserted" : 1 })

 

#查询所有数据

> db.blog.posts.find()

{ "_id" :ObjectId("586f21115da1e3bac2c373fe"), "title" : "Ablog post", "content" : "...", "author" : {"name" : "retacn yue", "email" : "zhenhuayue@

sina.com" }, "comments" : [{ "name" : "yue", "email" :"zhenhuayue@126.com", "content" : "good","votes" : 11 }, { "name" : "three","email" : "zhenh

uayue@hotmail.com","content" : "better", "votes" : 8 }, {"name" : "zhenhua", "email" :"zhenhuayue@qq.com", "content" : "best","votes" : 11 } ], "t

ags" : { "nosql" : 4,"python" : 10, "slendding" : 2 }, "day" :"2017/01/09" }

{ "_id" :ObjectId("5873036164a560b0e9430160"), "title" : "Ablog post2", "content" : "...", "author" : {"name" : "andy", "email" : "andy@sina.com"}

, "comments" : [ {"name" : "tan", "email" :"tan@sina.com", "content" : "good","votes" : 2 } ], "tags" : { "java" : 5,"c++" : 2 }, "day" : "2017/01

/10" }

{ "_id" :ObjectId("587303ed64a560b0e9430161"), "title" : "Ablog post3", "content" : "...", "author" : {"name" : "ruby", "email" : "ruby@sina.com"}

, "comments" : [ { "name": "san", "email" : "san@sina.com","content" : "good", "votes" : 2 } ],"tags" : { "c" : 6, "winter" : 2,"nosql" : 15 }, "d

ay" : "2017/01/11" }

 

 

#分组

> db.runCommand({"group":{

... "ns":"blog.posts",

..."key":{"tags":true},

..."initial":{"tags":{}},

... "$reduce":function(doc,prev){

...        for(i in doc.tags){

...             if(doc.tags[i] in prev.tags){

...                 prev.tags[doc.tags[i]]++;

...             }else{

...                prev.tags[doc.tags[i]]=1;

...             }

...        }

... }

... }})

#查询结果:

{

       "waitedMS" : NumberLong(0),

       "retval" : [

                {

                        "tags" : {

                                "2" :1,

                                "4" :1,

                                "10": 1

                        }

                },

                {

                        "tags" : {

                                "2" :1,

                                "5" :1

                        }

                },

                {

                       "tags" : {

                                "2" :1,

                                "6" :1,

                                "15": 1

                        }

                }

       ],

       "count" : NumberLong(3),

       "keys" : NumberLong(3),

       "ok" : 1

}

 

 

#使用完成器(finalizer)精减后的分组(注意:下面 finalize 函数中的 mostPopulat 是 mostPopular 的笔误,mostPopular 因此始终为 0,结果中的 tag 实际上是最后遍历到的键)

> db.runCommand({"group":{

... "ns":"blog.posts",

..."key":{"tags":true},

..."initial":{"tags":{}},

... "$reduce":function(doc,prev){

...        for(i in doc.tags){

...             if(doc.tags[i] in prev.tags){

...                 prev.tags[doc.tags[i]]++;

...             }else{

...                prev.tags[doc.tags[i]]=1;

...             }

...        }

... },

... "finalize":function(prev){

... var mostPopular=0;

... for(i in prev.tags){

... if(prev.tags[i]>mostPopular){

... prev.tag=i;

... mostPopulat=prev.tags[i];

... }

... }

... delete prev.tags

... }

... }})

{

       "waitedMS" : NumberLong(0),

       "retval" : [

                {

                        "tag" :"10"

                },

                {

                        "tag" :"5"

                },

                {

                        "tag" :"15"

                }

       ],

       "count" : NumberLong(3),

       "keys" : NumberLong(3),

       "ok" : 1

}

 

Python代码实现,

from pymongo import MongoClient
import statsmodels.api as sm
import json
import pandas as pd
import time
# NOTE: this import rebinds the name `time` to datetime.time ...
from datetime import datetime, date, time, timezone, timedelta
import datetime as dt
# ... and this one rebinds `time` back to the standard `time` module.
import time

# Create the database connection and select the "test" database.
client = MongoClient()
db = client.test

# JavaScript reduce function for the (commented-out) prices query below:
# keeps the price and time of the document with the greatest "time" string.
func = """
        function(doc, prev){
            if (doc.time > prev.time){
                prev.price = doc.price;
                prev.time = doc.time;
            }
        }
"""

# JavaScript reduce function for the tag grouping: for every entry i of
# doc.tags it uses the entry's value (doc.tags[i]) as a key in prev.tags,
# incrementing it if present and setting it to 1 otherwise.
func_none = """function(doc,prev){
        for(i in doc.tags){
            if(doc.tags[i] in prev.tags){
                prev.tags[doc.tags[i]]++;
            }else{
               prev.tags[doc.tags[i]]=1;
            }
        }
}"""

# JavaScript finalizer: stores one key of prev.tags in prev.tag and then
# removes prev.tags from the result.
# NOTE(review): "mostPopulat" looks like a typo for "mostPopular"; as written,
# mostPopular stays 0, so prev.tag ends up as the last key visited with a
# positive count. The string is left unchanged so the script still produces
# the output recorded in the article.
func_finalize = """function(prev){
   var mostPopular=0;
   for(i in prev.tags){
      if(prev.tags[i]>mostPopular){
         prev.tag=i;
         mostPopulat=prev.tags[i];
      }
   }
   delete prev.tags
}"""

# Prototype of the function being called:
# def group(self,
#           key,            key(s) to group by
#           condition,      filter condition applied to the grouping
#           initial,        initial value of the accumulator handed to reduce
#           reduce,         every record is passed to reduce as its first argument
#           finalize=None,  finalizer that trims the data sent from the database to the user
#           **kwargs):
#
# NOTE(review): Collection.group() was deprecated in PyMongo 3.5 and removed in
# PyMongo 4.0 -- confirm the installed driver version before running this.
# gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func)

# Grouping without a finalizer.
gp_none = db.blog.posts.group({"tags": True}, {}, {"tags": {}}, func_none)
print("gp_none", gp_none)

# Grouping trimmed down by the finalizer.
gp_finalizer = db.blog.posts.group({"tags": True}, {}, {"tags": {}}, func_none, finalize=func_finalize)
print("gp_finalizer", gp_finalizer)

 

 

运行结果如下:

gp_none [{'tags': {'10': 1.0, '4': 1.0,'2': 1.0}}, {'tags': {'5': 1.0, '2': 1.0}}, {'tags': {'6': 1.0, '15': 1.0, '2':1.0}}]

gp_finalizer [{'tag': '10'}, {'tag': '5'},{'tag': '15'}]

 

 

将函数作为键使用

#将函数作为分组的键来使用,示例代码如下:

db.posts.group({"ns":"blog.posts","$keyf":function(x){return x.columnName.toLowerCase();}

...

})

 

四 mapreduce

可以并行化到多个服务器的聚合方法,运行速度较慢,不适合作实时的数据处理

 

1 找出集合中的所有键

 

#emit返回要处理的值,this为当前映射文档

>map=function(){

for(var key in this){

     emit(key,{count:1});

}

};

 

 

#key是emit返回的值,emits对应键的{count:1}文档

> reduce=function(key,emits){

... total=0;

... for(var i in emits){

...    total+=emits[i].count;

... }

... return {"count":total};

... }

 

#方式一

>mr=db.runCommand({"mapreduce":"blog.posts","map":map,"reduce":reduce,"out":{inline:1}})

{

       "results" : [

                {

                        "_id" :"_id",

                        "value" : {

                               "count" : 3

                        }

                },

                {

                        "_id" :"author",

                        "value" : {

                               "count" : 3

                        }

                },

                {

                        "_id" :"comments",

                        "value" : {

                               "count" : 3

                        }

                },

                {

                        "_id" :"content",

                        "value" : {

                               "count" : 3

                        }

                },

                {

                        "_id" :"day",

                        "value" : {

                               "count" : 3

                        }

                },

                {

                        "_id" :"tags",

                        "value" : {

                                "count" : 3

                        }

                },

                {

                        "_id" :"title",

                        "value" : {

                               "count" : 3

                        }

                }

       ],

       "timeMillis" : 442,  #操作所需时间

       "counts" : {

                "input" : 3,   #发送到map函数的文档个数

                "emit" : 21,   #emit被调用的次数

                "reduce" : 7   #reduce调用次数,

                "output" : 7  #结果集中的文档数量

       },

       "ok" : 1

}

 

 

#方式二

>db.users.mapReduce(map,reduce,{out:{inline:1}})

{

       "results" : [

                {

                        "_id" :"_id",

                        "value" : {

                               "count" : 5

                        }

                },

                {

                        "_id" :"age",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"emails",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"languages",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"location",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"relationships",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"sex",

                        "value" : {

                               "count" : 4

                        }

                },

                {

                        "_id" :"state",

                        "value" : {

                                "count" : 4

                        }

                },

                {

                        "_id" :"username",

                        "value" : {

                               "count" : 5

                        }

                }

       ],

       "timeMillis" : 51,

       "counts" : {

                "input" : 5,

                "emit" : 38,

                "reduce" : 9,

                "output" : 9

       },

       "ok" : 1

}

2 网页分类

3 mongodb与mapReduce

 

mapReduce的可选键(键名区分大小写)

finalize  处理过程的最后一步

keeptemp 临时结果集是否保存

out 结果集合名(上文示例中使用 {inline:1} 直接返回结果)

query 发往map前的过滤条件

sort 发往map前排序

limit 发往map前文档数量上限

scope js变量

verbose 详细日志

 

0 0
原创粉丝点击