R语言中str_extract_all函数

来源:互联网 发布:java excel预览插件 编辑:程序博客网 时间:2024/06/05 17:08

str_extract_all 是 stringr 包中的一个函数,在做数据清洗时很有用。它按照给定的正则表达式,从字符串中提取出所有匹配的内容,并以列表形式返回(每个输入字符串对应列表中的一个元素)。具体用法如下:

> library(stringr)
> x<-"abacdef12g"

> str_extract_all(x,"[f0-9]")
[[1]]
[1] "f" "1" "2"

> str_extract_all(x,"[f0-9]{1,3}")
[[1]]
[1] "f12"

> str_extract_all(x,"[f0-9]{1,2}")
[[1]]
[1] "f1" "2" 

附上一些平时写的代码

# Product page-view analysis.
#
# 1. Load visit records for product pages and derive a product ID per URL.
# 2. Load product attributes and join them onto daily page-view counts.
# 3. Write the daily counts to the `analyse` database.
# 4. Load user/product relation counts and plot page views per product.
#
# Credentials are read from the environment instead of being hard-coded:
#   MYSQL_USER      (optional, defaults to the original account name)
#   MYSQL_PASSWORD  (required)

library(ggplot2)
library(RMySQL)
library(stringr)
library(sqldf)
library(plyr)

# Sentinel stored in `prodID` for URLs that carry no "productId" marker.
NO_PRODUCT_ID <- 9999

# Open a connection to database `dbname` on the MySQL server used by this
# script. Stops with an error when MYSQL_PASSWORD is not set.
connect_mysql <- function(dbname) {
  password <- Sys.getenv("MYSQL_PASSWORD")
  if (!nzchar(password)) {
    stop("Environment variable MYSQL_PASSWORD is not set.", call. = FALSE)
  }
  dbConnect(
    MySQL(),
    dbname = dbname,
    username = Sys.getenv("MYSQL_USER", "gaoyang922"),
    password = password,
    host = "10.10.109.62",
    port = 1333
  )
}

# Run `sql` against `dbname` and return every row as a data frame.
# The connection is always closed on exit, even if the query fails.
# `set_names_utf8 = TRUE` issues "SET NAMES utf8" before the query.
fetch_all <- function(dbname, sql, set_names_utf8 = FALSE) {
  conn <- connect_mysql(dbname)
  on.exit(dbDisconnect(conn), add = TRUE)
  if (set_names_utf8) {
    dbClearResult(dbSendQuery(conn, "SET NAMES utf8"))
  }
  res <- dbSendQuery(conn, sql)
  rows <- fetch(res, n = -1)
  dbClearResult(res)
  rows
}

# Derive a product ID from a single page URL.
#
# Returns the first run of one or two digits found anywhere in `x` (as a
# number) when `x` contains "productId", otherwise NO_PRODUCT_ID. The result
# is NA when "productId" is present but the URL holds no digit.
# NOTE(review): the pattern caps the ID at two digits and is not anchored to
# the "productId" parameter, so longer IDs are truncated and earlier digits
# in the URL win — confirm this matches the real URL format.
f <- function(x) {
  if (grepl("productId", x)) {
    result <- as.numeric(unlist(str_extract_all(x, "[0-9]{1,2}"))[1])
  } else {
    result <- NO_PRODUCT_ID
  }
  result
}

# ---- Visits: one row per product-page hit ----
rawdata_vi <- fetch_all(
  "tracker",
  paste(
    "SELECT key_table,left(insert_time,8) as insert_date,label,sessionid,",
    "stay_time,site,page_url FROM tracker.hbase_visit",
    "where insert_time is not null",
    "and page_url like 'https://item.zhong.com%' "
  ),
  set_names_utf8 = TRUE
)
dim(rawdata_vi)
head(rawdata_vi)
nrow(rawdata_vi)

# vapply keeps the column numeric even when the query returns no rows.
rawdata_vi$prodID <- vapply(
  rawdata_vi$page_url, f, numeric(1), USE.NAMES = FALSE
)
rawdata_vi_prod <- subset(rawdata_vi, prodID != NO_PRODUCT_ID)
head(rawdata_vi_prod)
dim(rawdata_vi_prod)

# ---- Products: attributes of every product ----
# The product ID of each URL is known at this point; now fetch the
# attributes of each product.
rawdata_pd <- fetch_all(
  "pms",
  paste(
    "select a.*,b.BizCategoryName from pms.pms_product a",
    "left join pms.pms_biz_category b on a.ProductType = b.ID "
  ),
  set_names_utf8 = TRUE
)
head(rawdata_pd)

# ---- Daily summary ----
prod_sumy1 <- rawdata_vi_prod[, c("insert_date", "prodID")]
prod_sumy2 <- plyr::count(prod_sumy1, c("insert_date", "prodID"))
prod_sumy3 <- plyr::count(prod_sumy1, "prodID")
head(prod_sumy2)

# NOTE(review): pms_product columns are picked by position (1, 4, 5, 31); the
# selection must contain the "ID" column used as the join key — confirm the
# positions against the table schema.
prod1 <- merge(
  prod_sumy2,
  rawdata_pd[, c(1, 4, 5, 31)],
  by.x = "prodID",
  by.y = "ID",
  all.x = TRUE
)
head(prod1)

# Newest date first, then most-viewed product first.
daily_prod_rank <- prod1[
  with(prod1, order(-as.numeric(insert_date), -freq)),
]
names(daily_prod_rank)[c(1, 2, 3)] <- c("ProdId", "InsertDate", "PVCnt")
head(daily_prod_rank)
daily_prod_rank$InsertDate <- as.character(daily_prod_rank$InsertDate)

# ---- Export to the database ----
# NOTE(review): the raw counts (prod_sumy2) are written, not daily_prod_rank;
# kept as in the original — confirm this is intended.
conn <- connect_mysql("analyse")
dbWriteTable(conn, "daily_prod_rank_raw", prod_sumy2)
dbDisconnect(conn)

# ---- User/product relations: 1 = follow, 2 = purchase, 3 = reservation,
# ---- 4 = like ----
rawdata_RT <- fetch_all(
  "pms",
  paste(
    "SELECT productID,RelationType,",
    "(case when RelationType=1 then '关注'",
    "when RelationType=2 then '购买'",
    "when RelationType=4 then '点赞'",
    "else '预约' end) as RT_desc,",
    "count(*) as pd_cnt FROM pms.pms_user_relation",
    "where CreateTime between '2015-08-18' and '2015-09-05'",
    "group by ProductId,RelationType "
  )
)
head(rawdata_RT)

# ---- Page views per product, bars ordered by frequency ----
table(rawdata_vi$prodID)
prodID_ggplot <- rawdata_vi$prodID
prodID_ggplot <- reorder(prodID_ggplot, prodID_ggplot, length)
rawdata_vi$prodID_ggplot <- prodID_ggplot
ggplot(subset(rawdata_vi, prodID != NO_PRODUCT_ID), aes(x = prodID_ggplot)) +
  geom_bar()

# ---- Earlier variant of this script, kept commented out for reference ----
# (passwords redacted)
#
# library(ggplot2)
# library(RMySQL)
# library(stringr)
# library(dplyr)
# conn <- dbConnect(MySQL(), dbname = "tracker", username="zhoumeixu204", password="<redacted>",host="10.10.109.62",port=1333)
# query<-dbSendQuery(conn, "SELECT key_table,left(insert_time,8) as
#                    insert_date,label,sessionid,stay_time,site,page_url FROM tracker.hbase_visit
#                    where insert_time is not null  ")
#
#
# query_1<-dbSendQuery(conn,"
#                      select * from  tracker.hbase_visitor   where  insert_time
#                      is not   NULL  and  city is not NUll   and country='china'")
# rawdata_vi <- fetch(query,n=-1)
# hbase_visitor<-fetch(query_1,n=-1)
# dbDisconnect(conn)
# dim(rawdata_vi)
# head(rawdata_vi)
# f<-function(x){
#   if(grepl("productId",x)){
#     result<-as.numeric(unlist(str_extract_all(x,"[0-9]{1,2}"))[1])
#
#   }
#   else{
#     result<-9999
#   }
#   result
# }
# rawdata_vi$prodID =sapply(rawdata_vi$page_url,f)
#
# rawdata_vi<-subset(rawdata_vi,prodID!=9999)
# table(prodID)
#
#
# prodID_ggplot<-rawdata_vi$prodID;prodID_ggplot<-reorder(prodID_ggplot,prodID_ggplot,length)
# rawdata_vi$prodID_ggplot<-prodID_ggplot
# site_ggplot<-rawdata_vi$site;site_ggplot<-reorder(site_ggplot,site_ggplot,length)
# rawdata_vi$site_ggplot<-site_ggplot
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot))+geom_bar(aes(fill=prodID_ggplot))
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")+facet_wrap(~insert_date,ncol=1)
# str(rawdata_vi)
# rawdata_vi_to_mysql<-data.frame(rawdata_vi$key_table,rawdata_vi$insert_date,rawdata_vi$label,rawdata_vi$sessionid,rawdata_vi$stay_time,rawdata_vi$site,rawdata_vi$page_url,rawdata_vi$prodID)
# conn <- dbConnect(MySQL(), dbname = "analyse_dev", username="root", password="<redacted>",host="202.69.27.239",port=8443)
# rawdata_vi_to_mysql<-subset(rawdata_vi_to_mysql,rawdata_vi.prodID !=9999)
# dbWriteTable(conn, "rawdata_vi_to_mysql", rawdata_vi_to_mysql)
# dbDisconnect(conn)


0 0
原创粉丝点击