Learning R---stringr

来源：互联网发布：关键词seo 编辑：程序博客网时间：2024/05/18 18:19
看了下 stringr 包的文档，最常用的应该是正则匹配那一部分。之后如果在实际工作中遇到需要处理的情况，再补充一些 case。
# stringr study notes ----
#
# A walk through the main stringr helpers, one section per family of
# functions. Each statement is meant to be run interactively; expected
# output is shown in trailing comments where the original notes recorded it.

library(stringr)

# Case conversion ----
## Arguments: `string` is the input to convert; `locale` is the language
## whose casing rules are applied (defaults to English, "en").
dog <- "The quick brown dog"
str_to_upper(dog) # "THE QUICK BROWN DOG"
str_to_lower(dog) # "the quick brown dog"
str_to_title(dog) # "The Quick Brown Dog"

# Inverting matches ----
numbers <- "1 and 2 and 4 and 456"
# Start/end positions of every run of digits.
num_loc <- str_locate_all(numbers, "[0-9]+")[[1]]
str_sub(numbers, num_loc[, "start"], num_loc[, "end"])
# "1"   "2"   "4"   "456"
# Start/end positions of everything that is NOT a run of digits.
text_loc <- invert_match(num_loc)
str_sub(numbers, text_loc[, "start"], text_loc[, "end"])
# ""      " and " " and " " and " ""

# Controlling matching behaviour ----
## Modifiers applied to `pattern`; the default is a regular expression.
## fixed    match a literal string
## coll     locale-aware matching (see the package docs for examples)
## regex    regular expression
## boundary split on a boundary type: character, word, sentence
## See the package documentation for the full set of options.
pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(strings, pattern) # TRUE  TRUE
str_detect(strings, fixed(pattern)) # literal match: FALSE  TRUE
str_detect(strings, coll(pattern)) # FALSE  TRUE

# coll() is useful for locale-aware case-insensitive matching
i <- c("I", "\u0130", "i")
i
str_detect(i, fixed("i", ignore_case = TRUE))
str_detect(i, coll("i", ignore_case = TRUE))
# The locale has to be given explicitly; not needed in practice so far.
str_detect(i, coll("i", ignore_case = TRUE, locale = "tr"))

# Word boundaries
words <- c("These are some words.")
str_count(words, boundary("word")) # number of words
str_split(words, " ")[[1]] # split on single spaces
str_split(words, boundary("word"))[[1]]

# Regular expression variations
str_extract_all("The Cat in the Hat", "[a-z]+") # lower-case a-z only
str_extract_all(
  "The Cat in the Hat",
  regex("[a-z]+", ignore_case = TRUE)
) # case-insensitive
str_extract_all("a\nb\nc", "^.")
str_extract_all("a\nb\nc", regex("^.", multiline = TRUE))
str_extract_all("a\nb\nc", "a.")
str_extract_all("a\nb\nc", regex("a.", dotall = TRUE))

# Joining strings ----
## Much like paste(); not covered in detail.
str_c("Letter: ", letters)
str_c("Letter", letters, sep = ": ")
str_c(letters, " is for", "...")
str_c(letters[-26], " comes before ", letters[-1])
str_c(letters, collapse = "")
str_c(letters, collapse = ", ")

# Specifying the character encoding ----
### Typically needed when a database connection on Windows returns
### mis-encoded text.
x <- rawToChar(as.raw(177))
x
str_conv(x, "ISO-8859-2") # Polish "a with ogonek"
str_conv(x, "ISO-8859-1") # Plus-minus

# Counting matches ----
fruit <- c("apple", "banana", "pear", "pineapple")
str_count(fruit, "a") # 1 3 1 1
str_count(fruit, "p") # 2 0 1 3
str_count(fruit, "e") # 1 0 1 2
# Vectorised over both arguments: element i of `fruit` is counted against
# element i of the pattern vector (it does not try the patterns in turn).
str_count(fruit, c("a", "b", "p", "p")) # 1 1 1 3
# In a regex, "." matches any single character, including "." itself.
str_count(c("a.", "...", ".a.a"), ".") # 2 3 4
str_count(c("a.", "...", ".a.a"), fixed(".")) # literal "." only: 1 3 2

# Missing inputs give missing outputs
str_c(c("a", NA, "b"), "-d")
# Use str_replace_na to display literal NAs:
str_c(str_replace_na(c("a", NA, "b")), "-d")

# Detecting a pattern ----
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
str_detect(fruit, "^a") # starts with "a"
str_detect(fruit, "a$") # ends with "a"
str_detect(fruit, "b") # contains "b"
str_detect(fruit, "[aeiou]") # [] means "any one of these characters"
# Also vectorised over pattern
str_detect("aecfg", letters)

# Duplicating strings ----
fruit <- c("apple", "pear", "banana")
str_dup(fruit, 2)
str_dup(fruit, 1:3)
str_c("ba", str_dup("na", 0:5))

# Extracting matches ----
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
# "\\d" matches one digit (the first backslash escapes the second inside the
# R string); for ASCII input this is equivalent to [0-9].
str_extract(shopping_list, "\\d")
# "+" means "one or more": the first unbroken run of lower-case letters.
str_extract(shopping_list, "[a-z]+")
# Between 1 and 4 lower-case letters.
str_extract(shopping_list, "[a-z]{1,4}")
# "\\b" matches a word boundary, so this finds a whole word of length 1-4.
str_extract(shopping_list, "\\b[a-z]{1,4}\\b")

# Extract all matches
str_extract_all(shopping_list, "[a-z]+")
str_extract_all(shopping_list, "\\b[a-z]+\\b")
str_extract_all(shopping_list, "\\d")

# Simplify results into a character matrix
str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
str_extract_all(shopping_list, "\\d", simplify = TRUE)

# Extract all words
str_extract_all("This is, suprisingly, a sentence.", boundary("word"))

# String interpolation ----
## Works like a format-string facility; not used much so far.
## paste0() can achieve the same, but this may be handier when building SQL.
## NOTE(review): recent stringr releases mark str_interp() as superseded by
## str_glue() - confirm against the installed version before relying on it.

# Using values from the environment, and some formats
user_name <- "smbache"
amount <- 6.656
account <- 1337
str_interp("User ${user_name} (account $[08d]{account}) has $$[.2f]{amount}.")

# Nested brace pairs work inside expressions too, and any braces can be
# placed outside the expressions.
str_interp("Works with } nested { braces too: $[.2f]{{{2 + 2}*{amount}}}")

# Values can also come from a list
str_interp(
  "One value, ${value1}, and then another, ${value2*2}.",
  list(value1 = 10, value2 = 20)
)

# Or a data frame
str_interp(
  "Values are $[.2f]{max(Sepal.Width)} and $[.2f]{min(Sepal.Width)}.",
  iris
)

# String length ----
# Similar to nchar(); not covered in detail.
str_length(c("a", "ab", "abc"))

# Locating matches ----
## Arguments: `string` is the input; `pattern` is a regular expression.
## Return values:
## str_locate     integer matrix; column 1 is the start of the first match,
##                column 2 is its end
## str_locate_all list of such matrices, one per input string
fruit <- c("apple", "banana", "pear", "pineapple")
str_locate(fruit, "a")
#       start end
# [1,]     1   1
# [2,]     2   2
# [3,]     3   3
# [4,]     5   5
str_locate_all(fruit, "a")
# [[1]]
#         start end
# [1,]     1   1
#
# [[2]]
#         start end
# [1,]     2   2
# [2,]     4   4
# [3,]     6   6
#
# [[3]]
#         start end
# [1,]     3   3
#
# [[4]]
#         start end
# [1,]     5   5
numbers <- "1 and 2 and 4 and 456"
num_loc <- str_locate_all(numbers, "[0-9]+")[[1]]
#       start end
# [1,]     1   1
# [2,]     7   7
# [3,]    13  13
# [4,]    19  21

# Sorting ----
## str_order
str_order(letters, decreasing = TRUE) # sort permutation, descending
str_sort(letters) # the sorted strings themselves

# Padding ----
## Arguments
## string the input string
## width  length of the result after padding
## side   where to pad: left, right, both
## pad    padding character, a space by default
str_pad("hadley", width = 10, side = "both", pad = "*") # "**hadley**"

# Replacing matches ----
## Arguments
## string      the input string
## pattern     regular expression
## replacement replacement text
str_replace("abccba", "[ab]", "-") # "-bccba"  first match only
str_replace_all("abccba", "[ab]", "-") # "--cc--" every match

# Splitting ----
## Argument `n` is the maximum number of pieces to return.
str_split(string = "baacaad", pattern = "aa", n = 3) # "b" "c" "d"
str_split(string = "baacaad", pattern = "aa", n = 2) # "b"    "caad"

# Extracting and replacing substrings by position ----
hw <- "Hadley Wickham"
str_sub(hw, 1, 6) # "Hadley"
str_sub(hw, 1, 6) <- "Wr"
hw # "Wr Wickham"

# Subsetting, or returning match positions ----
fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(fruit, "a")
str_which(fruit, "a")

# Trimming whitespace ----
## Signature: str_trim(string, side = c("both", "left", "right"))
## The original note called str_trim() on an undefined `string`; a concrete
## input is used here so the section actually runs.
padded <- "  hello world  "
str_trim(padded) # "hello world"
str_trim(padded, side = "left") # "hello world  "
str_trim(padded, side = "right") # "  hello world"

# Truncating (masking the tail of) a string ----
## This is truncation rather than encryption: the string is cut down to
## `width` characters and the removed part is replaced by `ellipsis`.
## Arguments
## width    total width of the result, including the ellipsis
## side     which part is removed: "right", "left" or "center"
## ellipsis text inserted in place of the removed part
x <- "15190869703"
str_trunc(string = x, width = 10, side = "right", ellipsis = "****")
# "151908****"
阅读全文
0 0