mongoDB学习笔记五

来源：互联网发布：软件开发包括哪些编辑：程序博客网时间：2024/06/05 05:50

第六章聚合

第四章

一 count

返回集合中的文档数量

> db.users.count()

二 distinct

查询所有给定键的所有不同的值

>db.runCommand({"distinct":"users","key":"username"})

{

"waitedMS" : NumberLong(0),

"values" : [

"andy",

"ann",

"mike",

"retacn yue"

"stats" : {

"n" : 4,

"nscanned" : 4,

"nscannedObjects" :0,

"timems" : 88,

"planSummary" :"DISTINCT_SCAN { username: 1.0 }"

"ok" : 1

}

三 group

在shell中运行

#查询所有数据

> db.prices.find()

{ "_id" :ObjectId("5872e51a2e29db2d68661bd3"), "time" :"10/03/2016 01:05:02", "price" : 4.23, "day" :"2016/10/03" }

{ "_id" : ObjectId("5872e51a2e29db2d68661bd4"),"time" : "10/04/2016 11:05:02", "price" : 4.27,"day" : "2016/10/04" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd5"), "time" :"10/03/2016 05:05:02", "price" : 4.1, "day" :"2016/10/03" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd6"), "time" :"10/06/2016 05:05:02", "price" : 4.3, "day" :"2016/10/06" }

{ "_id" :ObjectId("5872e51a2e29db2d68661bd7"), "time" :"10/04/2016 08:05:02", "price" : 4.01, "day" :"2016/10/04" }

#分组查询

> db.runCommand({"group":{

... "ns":"prices",

... "key":"day",

..."initial":{"time":""},

... "$reduce":function(doc,prev){

... if(doc.time>prev.time){

... prev.price=doc.price;

... prev.time=doc.time;

... }}}})

{

"waitedMS" : NumberLong(0),

"retval" : [

{

"time" :"10/06/2016 05:05:02",

"price" : 4.3

}

"count" : NumberLong(5),

"keys" : NumberLong(1),

"ok" : 1

}

#添加分组条件

> db.runCommand({"group":{

"ns":"prices",

"key":"day",

"initial":{"time":""},

"$reduce":function(doc,prev){

if(doc.time>prev.time){

prev.price=doc.price;

prev.time=doc.time;

}

"condition":{"day":{"$gt":"2016/09/30"}}}})

{

"waitedMS" : NumberLong(0),

"retval" : [

{

"time" :"10/06/2016 05:05:02",

"price" : 4.3

}

"count" : NumberLong(5),

"keys" : NumberLong(1),

"ok" : 1

}

使用python进行分组查询,示例代码如下:

from pymongo import MongoClientimport statsmodels.api as smimport jsonimport pandas as pdimport timefrom datetime import datetime, date, time, timezone, timedeltaimport datetime as dtimport time# 创建数据库连接client = MongoClient()db = client.test# 添加记录# db.prices.insert({"day":"2016/10/03","time":"10/03/2016 01:05:02","price":4.23})# db.prices.insert({"day":"2016/10/04","time":"10/04/2016 11:05:02","price":4.27})# db.prices.insert({"day":"2016/10/03","time":"10/03/2016 05:05:02","price":4.10})# db.prices.insert({"day":"2016/10/06","time":"10/06/2016 05:05:02","price":4.30})# db.prices.insert({"day":"2016/10/04","time":"10/04/2016 08:05:02","price":4.01})# 分组查询func = """        function(doc, prev){            if (doc.time > prev.time){                prev.price = doc.price;                prev.time = doc.time;            }        }"""# 函数原型# def group(self,#           key, 进行分组的键#           condition, 设置分组条件#           initial, 指定reduce中参数#           reduce,  每一条记录都作为reduce函数的第一个参数执行#           finalize=None,#           **kwargs):

gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func)print(gp)

查询结果如下

[{'day': '2016/10/03', 'price': 4.1,'time': '10/03/2016 05:05:02'},

{'day': '2016/10/04', 'price': 4.27,'time': '10/04/2016 11:05:02'},

{'day': '2016/10/06', 'price': 4.3, 'time':'10/06/2016 05:05:02'}]

使用完成器

#添加博客数据

>db.blog.posts.insert({"title":"A blogpost2","content":"...","author":{"name":"andy","email":"andy@sina.com"},"comments":[{"name":"tan","email":"tan

@sina.com","content":"good","votes":2}],"tags":{"java":5,"c++":2},"day":"2017/01/10"})

>db.blog.posts.insert({"title":"A blog post3","content":"...","author":{"name":"ruby","email":"ruby@sina.com"},"comments":[{"name":"san","email":"san

@sina.com","content":"good","votes":2}],"tags":{"c":6,"winter":2,"nosql":15},"day":"2017/01/11"})

WriteResult({ "nInserted" : 1 })

#查询所有数据

> db.blog.posts.find()

{ "_id" :ObjectId("586f21115da1e3bac2c373fe"), "title" : "Ablog post", "content" : "...", "author" : {"name" : "retacn yue", "email" : "zhenhuayue@

sina.com" }, "comments" : [{ "name" : "yue", "email" :"zhenhuayue@126.com", "content" : "good","votes" : 11 }, { "name" : "three","email" : "zhenh

uayue@hotmail.com","content" : "better", "votes" : 8 }, {"name" : "zhenhua", "email" :"zhenhuayue@qq.com", "content" : "best","votes" : 11 } ], "t

ags" : { "nosql" : 4,"python" : 10, "slendding" : 2 }, "day" :"2017/01/09" }

{ "_id" :ObjectId("5873036164a560b0e9430160"), "title" : "Ablog post2", "content" : "...", "author" : {"name" : "andy", "email" : "andy@sina.com"}

, "comments" : [ {"name" : "tan", "email" :"tan@sina.com", "content" : "good","votes" : 2 } ], "tags" : { "java" : 5,"c++" : 2 }, "day" : "2017/01

/10" }

{ "_id" :ObjectId("587303ed64a560b0e9430161"), "title" : "Ablog post3", "content" : "...", "author" : {"name" : "ruby", "email" : "ruby@sina.com"}

, "comments" : [ { "name": "san", "email" : "san@sina.com","content" : "good", "votes" : 2 } ],"tags" : { "c" : 6, "winter" : 2,"nosql" : 15 }, "d

ay" : "2017/01/11" }

#分组

> db.runCommand({"group":{

... "ns":"blog.posts",

..."key":{"tags":true},

..."initial":{"tags":{}},

... "$reduce":function(doc,prev){

... for(i in doc.tags){

... if(doc.tags[i] in prev.tags){

... prev.tags[doc.tags[i]]++;

... }else{

... prev.tags[doc.tags[i]]=1;

... }

... }})

#查询结果:

{

"waitedMS" : NumberLong(0),

"retval" : [

{

"tags" : {

"2" :1,

"4" :1,

"10": 1

}

{

"tags" : {

"2" :1,

"5" :1

}

{

"tags" : {

"2" :1,

"6" :1,

"15": 1

}

"count" : NumberLong(3),

"keys" : NumberLong(3),

"ok" : 1

}

#使用完成器(finalizer)精减后的分组

> db.runCommand({"group":{

... "ns":"blog.posts",

..."key":{"tags":true},

..."initial":{"tags":{}},

... "$reduce":function(doc,prev){

... for(i in doc.tags){

... if(doc.tags[i] in prev.tags){

... prev.tags[doc.tags[i]]++;

... }else{

... prev.tags[doc.tags[i]]=1;

... }

... },

... "finalize":function(prev){

... var mostPopular=0;

... for(i in prev.tags){

... if(prev.tags[i]>mostPopular){

... prev.tag=i;

... mostPopulat=prev.tags[i];

... }

... delete prev.tags

... }

... }})

{

"waitedMS" : NumberLong(0),

"retval" : [

{

"tag" :"10"

{

"tag" :"5"

{

"tag" :"15"

}

"count" : NumberLong(3),

"keys" : NumberLong(3),

"ok" : 1

}

Python代码实现,

from pymongo import MongoClientimport statsmodels.api as smimport jsonimport pandas as pdimport timefrom datetime import datetime, date, time, timezone, timedeltaimport datetime as dtimport time# 创建数据库连接client = MongoClient()db = client.testfunc = """        function(doc, prev){            if (doc.time > prev.time){                prev.price = doc.price;                prev.time = doc.time;            }        }"""func_none = """function(doc,prev){        for(i in doc.tags){            if(doc.tags[i] in prev.tags){                prev.tags[doc.tags[i]]++;            }else{               prev.tags[doc.tags[i]]=1;            }        }}"""func_finalize = """function(prev){   var mostPopular=0;   for(i in prev.tags){      if(prev.tags[i]>mostPopular){         prev.tag=i;         mostPopulat=prev.tags[i];      }   }   delete prev.tags}"""# 函数原型# def group(self,#           key, 进行分组的键#           condition, 设置分组条件#           initial, 指定reduce中参数#           reduce,  每一条记录都作为reduce函数的第一个参数执行#           finalize=None, 完成器精减数据库到用户的数据#           **kwargs):# gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func)# 分组gp_none = db.blog.posts.group({"tags": True}, {},{"tags": {}}, func_none )print("gp_none", gp_none)# 使用完成器(finalizer)精减后的分组gp_finalizer = db.blog.posts.group({"tags": True},{}, {"tags": {}}, func_none, finalize= func_finalize)print("gp_finalizer", gp_finalizer)

运行结果如下:

gp_none [{'tags': {'10': 1.0, '4': 1.0,'2': 1.0}}, {'tags': {'5': 1.0, '2': 1.0}}, {'tags': {'6': 1.0, '15': 1.0, '2':1.0}}]

gp_finalizer [{'tag': '10'}, {'tag': '5'},{'tag': '15'}]

将函数作为键使用

#将函数作为分级的键来使用,示例代码如下:

Db.posts.group({“ns”:”blog.posts”,”$keyf”:function(x){returnx.columnName.toLowerCase();}

...

})

四 mapreduce

可以并行化到多个服务器的聚合方法,运行速度较慢,不适合作实时的数据处理

1 找出集合中的所有键

#emit返回要处理的值,this为当前映射文档

>map=function(){

for(var key inthis){

emit(key,{count:1});

}

};

#key是emit返回的值,emits对应键的{count:1}文档

> reduce=function(key,emits){

... total=0;

... for(var i in emits){

... total+=emits[i].count;

... }

... return {"count":total};

... }

#方式一

>mr=db.runCommand({"mapreduce":"blog.posts","map":map,"reduce":reduce,"out":{inline:1}})

{

"results" : [

{

"_id" :"_id",

"value" : {

"count" : 3

}

{

"_id" :"author",

"value" : {

"count" : 3

}

{

"_id" :"comments",

"value" : {

"count" : 3

}

{

"_id" :"content",

"value" : {

"count" : 3

}

{

"_id" :"day",

"value" : {

"count" : 3

}

{

"_id" :"tags",

"value" : {

"count" : 3

}

{

"_id" :"title",

"value" : {

"count" : 3

}

"timeMillis" : 442, #操作所需时间

"counts" : {

"input" : 3, #发送到map函数的文档个数

"emit" : 21, #emit被调用的次数

"reduce" : 7 #reduce调用次数,

"output" : 7 #结果集中的文档数量

"ok" : 1

}

#方式二

>db.users.mapReduce(map,reduce,{out:{inline:1}})

{

"results" : [

{

"_id" :"_id",

"value" : {

"count" : 5

}

{

"_id" :"age",

"value" : {

"count" : 4

}

{

"_id" :"emails",

"value" : {

"count" : 4

}

{

"_id" :"languages",

"value" : {

"count" : 4

}

{

"_id" :"location",

"value" : {

"count" : 4

}

{

"_id" :"relationships",

"value" : {

"count" : 4

}

{

"_id" :"sex",

"value" : {

"count" : 4

}

{

"_id" :"state",

"value" : {

"count" : 4

}

{

"_id" :"username",

"value" : {

"count" : 5

}

"timeMillis" : 51,

"counts" : {

"input" : 5,

"emit" : 38,

"reduce" : 9,

"output" : 9

"ok" : 1

}

2 网页分类

3 mongodb与mapReduce

Mapreduce的可选键

Finalize 处理过程的最后一步

Keeptemp 临时结果集是否保存

Output 结果集合名

Query 发往map前的过滤条件

Sort 发往map前腓序

Limit 发往map前文档数量上限

Scope js变量

Verbose 详细日志

0 0