stringr包介绍

来源：互联网发布：linux dd 备份u盘编辑：程序博客网时间：2024/05/18 18:43

1. case用法：

str_to_upper(string, locale = "")
str_to_lower(string, locale = "")
str_to_title(string, locale = "")

string为要处理的字符串；locale为大小写转换时所依据的语言环境（如 "en" 表示英语，"tr" 表示土耳其语），不同语言环境下的转换规则可能不同。

> dog <- "The quick brown dog"> str_to_upper(dog)[1] "THE QUICK BROWN DOG"> str_to_lower(dog)[1] "the quick brown dog"> str_to_title(dog)[1] "The Quick Brown Dog"> str_to_upper("i", "en") # english[1] "I"> str_to_upper("i", "tr") # Turkish[1] "İ"

2. str_c的用法：str_c(..., sep = "", collapse = NULL)

... 为一个或多个字符串向量；sep为插入到各输入向量对应元素之间的分隔字符串；collapse不为NULL时，以它为分隔符把结果向量合并为单个字符串（默认为NULL，即不合并）> str_c("Letter", letters, sep = ": ") [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"[11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o"[16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t"[21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y"[26] "Letter: z"

> str_c(letters, collapse = "")[1] "abcdefghijklmnopqrstuvwxyz"> str_c(letters, collapse = ",")[1] "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z"> str_c(c("a", NA, "B"), "-d")[1] "a-d" NA    "B-d"> str_c(str_replace_na(c("a", NA, "b")), "-d")

[1] "a-d"  "NA-d" "b-d"

3. str_count的用法：

str_count(string, pattern = "")

string为字符串；pattern为寻找模式。

> fruit <- c("apple", "banana", "pear", "pineapple")> str_count(fruit, "a")[1] 1 3 1 1> str_count(fruit, "p")[1] 2 0 1 3> str_count(fruit, c("a","b","p","p")) # 对应每一个查找[1] 1 1 1 3> str_count(c("a.", "...", ".a.a"), ".") # 此处. 为正则表达式[1] 2 3 4> str_count(c("a.", "...", ".a.a"), fixed(".")) #fixed(".")为只查找.号，也可用"\\."[1] 1 3 2

4. str_detect的用法：

str_detect(string, pattern)

string与pattern如3.

> str_detect(fruit, "a") # 检测是否有a[1] TRUE TRUE TRUE TRUE> str_detect(fruit, "^a") # 检测字符串是否以a开头[1]  TRUE FALSE FALSE FALSE> str_detect(fruit, "a$")  # 检测字符串是否以a结尾[1] FALSE  TRUE FALSE FALSE

5. str_extract/str_extract_all的用法：

str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)

string,pattern如上；simplify：FALSE（默认）时返回由字符串向量组成的列表，TRUE时返回字符串矩阵。

> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")> str_extract(shopping_list, "\\d") # \\d+ 更好一些 [1] "4" NA  NA  "2"

> str_extract(shopping_list, "[a-z]+")[1] "apples" "bag"    "bag"    "milk"

> str_extract(shopping_list, "[a-z]{1,4}")[1] "appl" "bag"  "bag"  "milk"> str_extract(shopping_list, "\\b[a-z]{1,4}\\b") # \\b 为边界 [1] NA     "bag"  "bag"  "milk"> str_extract_all(shopping_list, "[a-z]+") #由此看出str_extract与str_extract_all的不同[[1]][1] "apples" "x"     [[2]][1] "bag"   "of"    "flour"[[3]][1] "bag"   "of"    "sugar"[[4]][1] "milk" "x"   > str_extract_all(shopping_list, "\\b[a-z]+\\b")[[1]][1] "apples"[[2]][1] "bag"   "of"    "flour"[[3]][1] "bag"   "of"    "sugar"[[4]][1] "milk"

> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE) # 生成字符串矩阵     [,1]     [,2] [,3]   [1,] "apples" ""   ""     [2,] "bag"    "of" "flour"[3,] "bag"    "of" "sugar"[4,] "milk"   ""   ""     > str_extract_all("This is, suprisingly, a sentence.", boundary("word"))# 以单词为边界[[1]][1] "This"        "is"          "suprisingly" "a"          [5] "sentence"

6. str_match的用法：

str_match(string, pattern)

string与pattern用法如上。

> strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",+              "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",+              "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",+              "Home: 543.355.3679")> strings [1] " 219 733 8965"                 "329-293-8753 "                 [3] "banana"                        "595 794 7569"                  [5] "387 287 6718"                  "apple"                         [7] "233.398.9187 "                 "482 952 3315"                  [9] "239 923 8115 and 842 566 4692" "Work: 579-499-7527"           [11] "$1000"                         "Home: 543.355.3679"           > phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" # 正则表达式的用法详见《正则表达式必知必会》> str_extract(strings, phone) [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718" [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"[11] NA             "543.355.3679"> str_match(strings, phone)      [,1]           [,2]  [,3]  [,4]   [1,] "219 733 8965" "219" "733" "8965" [2,] "329-293-8753" "329" "293" "8753" [3,] NA             NA    NA    NA     [4,] "595 794 7569" "595" "794" "7569" [5,] "387 287 6718" "387" "287" "6718" [6,] NA             NA    NA    NA     [7,] "233.398.9187" "233" "398" "9187" [8,] "482 952 3315" "482" "952" "3315" [9,] "239 923 8115" "239" "923" "8115"[10,] "579-499-7527" "579" "499" "7527"[11,] NA             NA    NA    NA    [12,] "543.355.3679" "543" "355" "3679"

7. str_pad的用法：

str_pad(string, width, side = c("left", "right", "both"), pad = " ")

string为字符串；width为填充后字符串的最小宽度；side为填充字符添加的方向（左侧、右侧或两侧）；pad为用于填充的单个字符，默认为空格。

> rbind(+     str_pad("hadley", 30, "left"),+     str_pad("hadley", 30, "right"),+     str_pad("hadley", 30, "both")+ )     [,1]                            [1,] "                        hadley"[2,] "hadley                        "[3,] "            hadley            "

> rbind(+     str_pad("hadley", 30, "left", pad = "."),+     str_pad("hadley", 30, "right", pad = "."),+     str_pad("hadley", 30, "both", pad = ".")+ )     [,1]                            [1,] "........................hadley"[2,] "hadley........................"[3,] "............hadley............"

8. str_replace的用法：

str_replace(string, pattern, replacement)

string为字符串；pattern为要被替换的内容，常为正则表达式；replacement为用来替换匹配内容的字符串。

> fruits <- c("one apple", "two pears", "three bananas")> str_replace(fruits, "[aeiou]", "-")[1] "-ne apple"     "tw- pears"     "thr-e bananas"> str_replace_all(fruits, "[aeiou]", "-")[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

> str_replace(fruits, "([aeiou])", "")[1] "ne apple"     "tw pears"     "thre bananas"

> str_replace_all(str_c(fruits, collapse = "---"), c("one" = 1, "two" = 2, "three" = 3))##如果要对同一个字符串同时应用多组“模式-替换”，可以给pattern传入一个命名向量：名称为模式，值为对应的替换内容 [1] "1 apple---2 pears---3 bananas"

9. str_split的用法：

str_split(string, pattern, n = Inf, simplify = FALSE)

str_split_fixed(string, pattern, n)

string为字符串；pattern为分离模式；n为分割为多少块；simplify：FALSE的时候返回字符串向量列表，为TRUE的时候返回字符串矩阵。

10. str_sub的用法：

str_sub(string, start = 1L, end = -1L)

string为字符串；start和end分别为要提取的子串的起始和结束字符位置（负数表示从字符串末尾倒数）。

11. str_subset的用法：

str_subset(string, pattern)

string与pattern用法如上。

> fruit <- c("apple", "banana", "pear", "pinapple")> str_subset(fruit, "a")[1] "apple"    "banana"   "pear"     "pinapple"> str_subset(fruit, "^a")[1] "apple"> str_detect(fruit, "^a") # 对比str_detect与str_subset的用法[1]  TRUE FALSE FALSE FALSE> str_subset(fruit, "a$")[1] "banana"> str_subset(fruit, "b")[1] "banana"> str_subset(fruit, "[aeiou]")[1] "apple"    "banana"   "pear"     "pinapple"

> hw <- "Hadley Wickham"> str_sub(hw, 1, 6)[1] "Hadley"> str_sub(hw, end = 6)[1] "Hadley"> str_sub(hw, 8, 14)[1] "Wickham"> str_sub(hw, c(1,8), c(6,14))[1] "Hadley"  "Wickham"> str_sub(hw, -1)[1] "m"> str_sub(hw, -7)[1] "Wickham"

> x <- "BBCDEF"> str_sub(x, 1, 1)[1] "B"> str_sub(x, 1, 1) <- "A"> x[1] "ABCDEF"> str_sub(x, -1, -1) <- "K"> x[1] "ABCDEK"> str_sub(x, -2, -2) <- "GHIJ"; x[1] "ABCDGHIJK"> str_sub(x, 2, -2) <- ""; x[1] "AK"

> fruits <- c(+     "apples and oranges and pears and bananas",+     "pineapples and mangos and guavas"+ )> fruits[1] "apples and oranges and pears and bananas" "pineapples and mangos and guavas"        > str_split(fruits, "and")[[1]][1] "apples "   " oranges " " pears "   " bananas" [[2]][1] "pineapples " " mangos "    " guavas"    > str_split(fruits, "and", simplify = TRUE)     [,1]          [,2]        [,3]      [,4]      [1,] "apples "     " oranges " " pears " " bananas"[2,] "pineapples " " mangos "  " guavas" ""

> str_split(fruits, "and", n=3)[[1]][1] "apples "            " oranges "          " pears and bananas"[[2]][1] "pineapples " " mangos "    " guavas"

> str_split(fruits, "and", n=5)[[1]][1] "apples "   " oranges " " pears "   " bananas" [[2]][1] "pineapples " " mangos "    " guavas"    > str_split_fixed(fruits, "and", 3)     [,1]          [,2]        [,3]                [1,] "apples "     " oranges " " pears and bananas"[2,] "pineapples " " mangos "  " guavas"           > str_split_fixed(fruits, "and", 4) # n大于分割的字符串时，多余的用空字符串表示     [,1]          [,2]        [,3]      [,4]      [1,] "apples "     " oranges " " pears " " bananas"[2,] "pineapples " " mangos "  " guavas" ""

> str_subset(c("a", "b", NA), ".") # 自动去掉缺失值[1] "a" "b"

12. word的用法：

word(string, start = 1L, end = start, sep =fixed(" "))
string为字符串；start和end分别为要提取的第一个和最后一个单词的位置（负数表示从末尾倒数）；sep为单词之间的分隔符。

> sentences <- c("Jane saw a cat", "Jane sat down")> sentences[1] "Jane saw a cat" "Jane sat down" > word(sentences, 1)[1] "Jane" "Jane"> word(sentences, 2)[1] "saw" "sat"> word(sentences, -1)[1] "cat"  "down"> word(sentences, 2, -1)[1] "saw a cat" "sat down" > word(sentences[1], 1:3, -1)[1] "Jane saw a cat" "saw a cat"      "a cat"         > word(sentences[1], 1, 1:4)[1] "Jane"           "Jane saw"       "Jane saw a"     "Jane saw a cat"> str <- 'abc.def..123.4568.999'> word(str, 1, sep = fixed('..'))#提取分隔后的第一个[1] "abc.def"> word(str, 2, sep = fixed('..')) #提取分隔后的第二个[1] "123.4568.999"> word(str, 1, -1, sep = fixed('..'))[1] "abc.def..123.4568.999"

0 0