R语言中str_extract_all函数
来源:互联网 发布:java excel预览插件 编辑:程序博客网 时间:2024/06/05 17:08
str_extract_all 是 stringr 包中的一个函数,在做数据清洗时很有用。它的用途是按照自己指定的规则(正则表达式),从字符串中提取出所有匹配的内容,并以列表形式返回。具体用法如下:
> library(stringr)
> x<-"abacdef12g"
> str_extract_all(x,"[f0-9]")
[[1]]
[1] "f" "1" "2"
> str_extract_all(x,"[f0-9]{1,3}")
[[1]]
[1] "f12"
> str_extract_all(x,"[f0-9]{1,2}")
[[1]]
[1] "f1" "2"
附上一些平时写的代码
# Product page-view analysis.
#
# Pulls page visits from the `tracker` database, extracts a product ID from
# each visited URL with stringr::str_extract_all(), joins the per-day view
# counts with product attributes from the `pms` database, writes the daily
# counts to the `analyse` database and draws a bar chart of views per product.

library(ggplot2)
library(RMySQL)
library(stringr)
library(sqldf)
library(plyr)

# Open a connection to one schema on the analysis MySQL server.
#
# NOTE(review): the credentials are hard-coded exactly as in the original
# script. They should be moved out of source control (e.g. Sys.getenv()) and
# rotated, since they have been published.
#
# dbname: name of the schema to connect to.
# Returns an open DBI connection; the caller must dbDisconnect() it.
connect_db <- function(dbname) {
  dbConnect(
    MySQL(),
    dbname = dbname,
    username = "gaoyang922",
    password = "gaoyang922@123456!",
    host = "10.10.109.62",
    port = 1333
  )
}

# Run one SELECT against `dbname` and return every row as a data frame.
#
# The connection is always closed on exit, even when the query fails, and
# the session is switched to utf8 first because the queries and results
# contain Chinese text.
#
# dbname: schema to connect to.
# sql:    the SELECT statement to run.
fetch_all <- function(dbname, sql) {
  conn <- connect_db(dbname)
  on.exit(dbDisconnect(conn), add = TRUE)
  dbClearResult(dbSendQuery(conn, "SET NAMES utf8"))
  dbGetQuery(conn, sql)
}

# Extract the product ID from a single page URL.
#
# x: one URL string.
# Returns the first run of one or two digits found in `x` as a number when
# the URL contains "productId"; returns the sentinel 9999 otherwise. The
# result is NA when the URL contains "productId" but no digit at all.
#
# NOTE(review): "[0-9]{1,2}" keeps at most two digits and takes the first
# digits anywhere in the URL, not necessarily those after "productId" --
# confirm that product IDs never exceed 99 and that no digit precedes them.
f <- function(x) {
  if (grepl("productId", x)) {
    result <- as.numeric(unlist(str_extract_all(x, "[0-9]{1,2}"))[1])
  } else {
    result <- 9999
  }
  result
}

# ---- Page visits ------------------------------------------------------------

rawdata_vi <- fetch_all(
  "tracker",
  paste(
    "SELECT key_table,left(insert_time,8) as insert_date,label,sessionid,",
    "stay_time,site,page_url FROM tracker.hbase_visit",
    "where insert_time is not null",
    "and page_url like 'https://item.zhong.com%' "
  )
)
dim(rawdata_vi)
head(rawdata_vi)
nrow(rawdata_vi)

# rawdata_vi$prodID=as.numeric(unlist(str_extract_all(rawdata_vi$page_url,"[0-9]{1,2}")))

# vapply() guarantees a numeric vector even when the query returns no rows.
rawdata_vi$prodID <- vapply(
  rawdata_vi$page_url, f, numeric(1), USE.NAMES = FALSE
)
rawdata_vi_prod <- subset(rawdata_vi, prodID != 9999)
head(rawdata_vi_prod)
dim(rawdata_vi_prod)

# ---- Product attributes -----------------------------------------------------
# Above: the product ID of every URL has been extracted.
# Below: fetch the attributes of every product.

rawdata_pd <- fetch_all(
  "pms",
  paste(
    "select a.*,b.BizCategoryName from pms.pms_product a",
    "left join pms.pms_biz_category b",
    "on a.ProductType = b.ID "
  )
)
head(rawdata_pd)

# ---- Daily summary ----------------------------------------------------------

# Columns 2 and 8 of rawdata_vi_prod are insert_date and prodID.
prod_sumy1 <- rawdata_vi_prod[, c(2, 8)]
prod_sumy2 <- plyr::count(prod_sumy1, c("insert_date", "prodID"))
prod_sumy3 <- plyr::count(prod_sumy1, "prodID")
head(prod_sumy2)

prod1 <- merge(
  prod_sumy2, rawdata_pd[, c(1, 4, 5, 31)],
  by.x = "prodID", by.y = "ID", all.x = TRUE
)
head(prod1)

# Newest day first, and within a day the most viewed product first.
daily_prod_rank <- prod1[
  with(prod1, order(-as.numeric(insert_date), -freq)),
]
names(daily_prod_rank)[c(1, 2, 3)] <- c("ProdId", "InsertDate", "PVCnt")
head(daily_prod_rank)
daily_prod_rank$InsertDate <- as.character(daily_prod_rank$InsertDate)

# ---- Write to the database --------------------------------------------------

conn <- connect_db("analyse")
dbWriteTable(conn, "daily_prod_rank_raw", prod_sumy2)
# The original script left this connection open.
dbDisconnect(conn)

# ---- Product relations: 1 = follow, 2 = purchase, 3 = reserve, 4 = like -----

rawdata_RT <- fetch_all(
  "pms",
  paste(
    "SELECT productID,RelationType,(case when RelationType=1 then '关注'",
    "when RelationType=2 then '购买' when RelationType=4 then '点赞'",
    "else '预约' end)as RT_desc ,count(*) as pd_cnt",
    "FROM pms.pms_user_relation",
    "where CreateTime between '2015-08-18' and '2015-09-05'",
    "group by ProductId,RelationType "
  )
)
head(rawdata_RT)

# ---- Views per product ------------------------------------------------------

table(rawdata_vi$prodID)

# Order the products by how often they occur so the bars come out sorted.
prodID_ggplot <- rawdata_vi$prodID
prodID_ggplot <- reorder(prodID_ggplot, prodID_ggplot, length)
rawdata_vi$prodID_ggplot <- prodID_ggplot
ggplot(subset(rawdata_vi, prodID != 9999), aes(x = prodID_ggplot)) +
  geom_bar()

# ---- Earlier draft, kept commented out as in the original -------------------
#
# library(ggplot2)
# library(RMySQL)
# library(stringr)
# library(dplyr)
# conn <- dbConnect(MySQL(), dbname = "tracker", username="zhoumeixu204", password="zhoumeixu204@123456!",host="10.10.109.62",port=1333)
# query<-dbSendQuery(conn, "SELECT key_table,left(insert_time,8) as
# insert_date,label,sessionid,stay_time,site,page_url FROM tracker.hbase_visit
# where insert_time is not null ")
#
#
# query_1<-dbSendQuery(conn,"
# select * from tracker.hbase_visitor where insert_time
# is not NULL and city is not NUll and country='china'")
# rawdata_vi <- fetch(query,n=-1)
# hbase_visitor<-fetch(query_1,n=-1)
# dbDisconnect(conn)
# dim(rawdata_vi)
# head(rawdata_vi)
# f<-function(x){
# if(grepl("productId",x)){
# result<-as.numeric(unlist(str_extract_all(x,"[0-9]{1,2}"))[1])
#
# }
# else{
# result<-9999
# }
# result
# }
# rawdata_vi$prodID =sapply(rawdata_vi$page_url,f)
#
# rawdata_vi<-subset(rawdata_vi,prodID!=9999)
# table(prodID)
#
#
# prodID_ggplot<-rawdata_vi$prodID;prodID_ggplot<-reorder(prodID_ggplot,prodID_ggplot,length)
# rawdata_vi$prodID_ggplot<-prodID_ggplot
# site_ggplot<-rawdata_vi$site;site_ggplot<-reorder(site_ggplot,site_ggplot,length)
# rawdata_vi$site_ggplot<-site_ggplot
#
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot))+geom_bar(aes(fill=prodID_ggplot))
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")+facet_wrap(~insert_date,ncol=1)
# str(rawdata_vi)
# rawdata_vi_to_mysql<-data.frame(rawdata_vi$key_table,rawdata_vi$insert_date,rawdata_vi$label,rawdata_vi$sessionid,rawdata_vi$stay_time,rawdata_vi$site,rawdata_vi$page_url,rawdata_vi$prodID)
# conn <- dbConnect(MySQL(), dbname = "analyse_dev", username="root", password="Pa123456!",host="202.69.27.239",port=8443)
# rawdata_vi_to_mysql<-subset(rawdata_vi_to_mysql,rawdata_vi.prodID !=9999)
# dbWriteTable(conn, "rawdata_vi_to_mysql", rawdata_vi_to_mysql)
# dbDisconnect(conn)
#
#
0 0
- R语言中str_extract_all函数
- R语言中mean函数
- R语言数据挖掘中常用函数
- R语言中seq函数的用法
- r语言中生成序列的函数
- R语言中函数定义与调用
- r语言中时间函数处理
- R语言中apply函数家族
- R语言中round()函数的使用
- R语言中apply函数使用
- R语言中之分布函数
- R语言中常用函数手册
- R语言常用函数
- R语言常用函数
- R语言常用函数
- R语言函数
- R语言函数
- R语言常用函数
- js 滚动加载
- Android开发模板------RecyclerView的使用
- 一篇关于通知的文章
- Deep Learning(深度学习)学习笔记整理系列之(四)
- Java指定编码格式读写文件
- R语言中str_extract_all函数
- 12 Design and Deployment Techniques RAC设计和部署技术
- Linux网络编程---UDP洪水攻击
- 项目部署的时候 卡在 项目部署log
- android的多媒体扫描
- 浅谈java异常[Exception]
- 如何实现iOS图书动画-第2部分(上)
- Deep Learning(深度学习)学习笔记整理系列之(五)
- python sorted函数以及operator.itemgetter函数