Rpackage【dplyr】

来源:互联网 发布:mac os x 10.13 镜像 编辑:程序博客网 时间:2024/05/16 10:00

原文地址:https://zhuanlan.zhihu.com/p/25784721

使用举例

# 安装并载入dplyr包
install.packages("dplyr")
library(dplyr)
# 创建两个数据集order和price
> set.seed(1)
> ordernum <- sample(1:20, 5)
> itemprice <- sample(100:500, 5)
> order <- data.frame(itemid=1:5, ordernum, class=rep(1:2,2:3))
> order
  itemid ordernum class
1      1        6     1
2      2        8     1
3      3       11     2
4      4       16     2
5      5        4     2
> price <- data.frame(itemid = 2:6, itemprice)
> price
  itemid itemprice
1      2       460
2      3       477
3      4       363
4      5       350
5      6       124

# filter()--返回满足条件的观测

> # filter()
> filter(order, ordernum >= 10)
  itemid ordernum class
1      3       11     2
2      4       16     2
> filter(price, itemid %in% c(3,5))
  itemid itemprice
1      3       477
2      5       350

# select()--选择所需变量

> # select()
> select(order, itemid, ordernum)
  itemid ordernum
1      1        6
2      2        8
3      3       11
4      4       16
5      5        4
> select(price, -itemid)
  itemprice
1       460
2       477
3       363
4       350
5       124

# arrange()--对观测进行排序

> # arrange()
> arrange(order, ordernum)
  itemid ordernum class
1      5        4     2
2      1        6     1
3      2        8     1
4      3       11     2
5      4       16     2
> arrange(price, -itemprice)
  itemid itemprice
1      3       477
2      2       460
3      4       363
4      5       350
5      6       124

# mutate()--添加新变量

> # mutate()
> mutate(price, cost=0.7*itemprice, profit=itemprice-cost)
  itemid itemprice  cost profit
1      2       460 322.0  138.0
2      3       477 333.9  143.1
3      4       363 254.1  108.9
4      5       350 245.0  105.0
5      6       124  86.8   37.2

# rename()--重命名变量

> # rename()
> rename(price, id=itemid)
  id itemprice
1  2       460
2  3       477
3  4       363
4  5       350
5  6       124

# summarise()--数据汇总

> # summarise()
> mygroup <- group_by(order, class)
> summarise(mygroup, count=n(), total=sum(ordernum))
# A tibble: 2 × 3
  class count total
  <int> <int> <int>
1     1     2    14
2     2     3    31

# %>%--管道操作(将上个输出作为下个输入)

> # %>%
> order %>% group_by(class) %>%
+   summarise(count=n(), total=sum(ordernum))
# A tibble: 2 × 3
  class count total
  <int> <int> <int>
1     1     2    14
2     2     3    31

# join()--连接数据集

> # join()
> inner_join(order, price, by="itemid")
  itemid ordernum class itemprice
1      2        8     1       460
2      3       11     2       477
3      4       16     2       363
4      5        4     2       350
> left_join(order, price, by="itemid")
  itemid ordernum class itemprice
1      1        6     1        NA
2      2        8     1       460
3      3       11     2       477
4      4       16     2       363
5      5        4     2       350
> right_join(order, price, by="itemid")
  itemid ordernum class itemprice
1      2        8     1       460
2      3       11     2       477
3      4       16     2       363
4      5        4     2       350
5      6       NA    NA       124
> full_join(order, price, by="itemid")
  itemid ordernum class itemprice
1      1        6     1        NA
2      2        8     1       460
3      3       11     2       477
4      4       16     2       363
5      5        4     2       350
6      6       NA    NA       124
> semi_join(order, price, by="itemid")
  itemid ordernum class
1      2        8     1
2      3       11     2
3      4       16     2
4      5        4     2
> anti_join(order, price, by="itemid")
  itemid ordernum class
1      1        6     1

# src_mysql()--连接MySQL数据库

> # src_mysql()
> library(RMySQL)
> library(DBI)
> src_mysql("mydb",user="root",password="****") %>% tbl(from = "sample")
Source:   query [?? x 2]
Database: mysql 5.7.17 [root@localhost:/mydb]
  itemid class
   <int> <int>
1      1     1
2      2     1
3      3     1
4      4     2
5      5     2
原创粉丝点击