数据清洗与收集week3

来源：互联网发布：怎样删除mac照片编辑：程序博客网时间：2024/04/28 19:01

3.1 subsetting and sort

Subsetting - quick review

set.seed(13435)X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))X <- X[sample(1:5),]; X$var2[c(1,3)] = NAX

  var1 var2 var31    2   NA   154    1   10   112    3   NA   123    5    6   145    4    9   13

Subsetting - quick review

X[,1]

[1] 2 1 3 5 4

X[,"var1"]

[1] 2 1 3 5 4

X[1:2,"var2"]

[1] NA 10

Logicals ands and ors

X[(X$var1 <= 3 & X$var3 > 11),]

  var1 var2 var31    2   NA   152    3   NA   12

X[(X$var1 <= 3 | X$var3 > 15),]

  var1 var2 var31    2   NA   154    1   10   112    3   NA   12

Dealing with missing values

X[which(X$var2 > 8),]

  var1 var2 var34    1   10   115    4    9   13

Sorting

sort(X$var1)

[1] 1 2 3 4 5

sort(X$var1,decreasing=TRUE)

[1] 5 4 3 2 1

sort(X$var2,na.last=TRUE)

[1]  6  9 10 NA NA

Ordering ##order(..., na.last = TRUE, decreasing = FALSE)

X[order(X$var1),]

  var1 var2 var34    1   10   111    2   NA   152    3   NA   125    4    9   133    5    6   14

Ordering

X[order(X$var1,X$var3),]

  var1 var2 var34    1   10   111    2   NA   152    3   NA   125    4    9   133    5    6   14

Ordering with plyr

library(plyr)arrange(X,var1)

  var1 var2 var31    1   10   112    2   NA   153    3   NA   124    4    9   135    5    6   14

arrange(X,desc(var1))

  var1 var2 var31    5    6   142    4    9   133    3   NA   124    2   NA   155    1   10   11

# sort mtcars data by cylinder and displacementmtcars[with(mtcars, order(cyl, disp)), ]# Same result using arrange: no need to use with(), as the context is implicit# NOTE: plyr functions do NOT preserve row.namesarrange(mtcars, cyl, disp)# Let's keep the row.names in this examplemyCars = cbind(vehicle=row.names(mtcars), mtcars)arrange(myCars, cyl, disp)# Sort with displacement in descending orderarrange(myCars, cyl, desc(disp))

Notes and further resources

R programming in the Data Science Track
Andrew Jaffe's lecture notes http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf

3.2 summarizing Data

Getting the data from the web

if(!file.exists("./data")){dir.create("./data")}fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"download.file(fileUrl,destfile="./data/restaurants.csv",method="curl")restData <- read.csv("./data/restaurants.csv")

再瞄一下

如head(restData,3) tail(restData,3) summary(restData) str()

Quantiles of quantitative variables

quantile(restData$councilDistrict,na.rm=TRUE)

  0%  25%  50%  75% 100%    1    2    9   11   14

quantile(restData$councilDistrict,probs=c(0.5,0.75,0.9))

50% 75% 90%   9  11  12

Make table

table(restData$zipCode,useNA="ifany")

-21226  21201  21202  21205  21206  21207  21208  21209  21210  21211  21212  21213  21214  21215      1    136    201     27     30      4      1      8     23     41     28     31     17     54  21216  21217  21218  21220  21222  21223  21224  21225  21226  21227  21229  21230  21231  21234     10     32     69      1      7     56    199     19     18      4     13    156    127      7  21237  21239  21251  21287      1      3      2      1

table(restData$councilDistrict,restData$zipCode)

Check for missing values

sum(is.na(restData$councilDistrict))

[1] 0

any(is.na(restData$councilDistrict))

[1] FALSE

all(restData$zipCode > 0)

[1] FALSE

Row and column sums

colSums(is.na(restData))

           name         zipCode    neighborhood councilDistrict  policeDistrict      Location.1               0               0               0               0               0               0

all(colSums(is.na(restData))==0)

[1] TRUE

Values with specific characteristics

table(restData$zipCode %in% c("21212"))

FALSE  TRUE  1299    28

table(restData$zipCode %in% c("21212","21213"))

FALSE  TRUE  1268    59

Values with specific characteristics

restData[restData$zipCode %in% c("21212","21213"),]

Cross tabs

data(UCBAdmissions)DF = as.data.frame(UCBAdmissions)summary(DF)

      Admit       Gender   Dept       Freq     Admitted:12   Male  :12   A:4   Min.   :  8   Rejected:12   Female:12   B:4   1st Qu.: 80                             C:4   Median :170                             D:4   Mean   :189                             E:4   3rd Qu.:302                             F:4   Max.   :512

Cross tabs

xt <- xtabs(Freq ~ Gender + Admit,data=DF)xt

        AdmitGender   Admitted Rejected  Male       1198     1493  Female      557     1278

Flat tables

warpbreaks$replicate <- rep(1:9, len = 54)xt = xtabs(breaks ~.,data=warpbreaks)xt

Flat tables

ftable(xt)

             replicate  1  2  3  4  5  6  7  8  9wool tension                                     A    L                 26 30 54 25 70 52 51 26 67     M                 18 21 29 17 12 18 35 30 36     H                 36 21 24 18 10 43 28 15 26B    L                 27 14 29 19 29 31 41 20 44     M                 42 26 19 16 39 28 21 39 29     H                 20 21 24 17 13 15 15 16 28

Size of a data set

fakeData = rnorm(1e5)object.size(fakeData)

800040 bytes

print(object.size(fakeData),units="Mb")

0.8 Mb

3.3 创建新变量

Why create new variables?
Often the raw data won't have a value you are looking for
You will need to transform the data to get the values you would like
Usually you will add those values to the data frames you are working with
Common variables to createMissingness indicators
"Cutting up" quantitative variables
Applying transforms
Getting the data from the web
if(!file.exists("./data")){dir.create("./data")}fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"download.file(fileUrl,destfile="./data/restaurants.csv",method="curl")restData <- read.csv("./data/restaurants.csv")
Creating sequences
Sometimes you need an index for your data set
s1 <- seq(1,10,by=2) ; s1
[1] 1 3 5 7 9
s2 <- seq(1,10,length=3); s2
[1]  1.0  5.5 10.0
x <- c(1,3,8,25,100); seq(along = x)
[1] 1 2 3 4 5
Subsetting variables
restData$nearMe = restData$neighborhood %in% c("Roland Park", "Homeland")table(restData$nearMe)
FALSE  TRUE  1314    13 
Creating binary variables
restData$zipWrong = ifelse(restData$zipCode < 0, TRUE, FALSE)table(restData$zipWrong,restData$zipCode < 0)
        FALSE TRUE  FALSE  1326    0  TRUE      0    1
Creating categorical variables
restData$zipGroups = cut(restData$zipCode,breaks=quantile(restData$zipCode))table(restData$zipGroups)
(-2.123e+04,2.12e+04]  (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]                   337                   375                   282                   332 
table(restData$zipGroups,restData$zipCode)
                        -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213  (-2.123e+04,2.12e+04]      0   136   201     0     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]       0     0     0    27    30     4     1     8    23    41    28    31  (2.122e+04,2.123e+04]      0     0     0     0     0     0     0     0     0     0     0     0  (2.123e+04,2.129e+04]      0     0     0     0     0     0     0     0     0     0     0     0                        21214 21215 21216 21217 21218 21220 21222 21223 21224 21225 21226 21227  (-2.123e+04,2.12e+04]     0     0     0     0     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]     17    54    10    32    69     0     0     0     0     0     0     0  (2.122e+04,2.123e+04]     0     0     0     0     0     1     7    56   199    19     0     0  (2.123e+04,2.129e+04]     0     0     0     0     0     0     0     0     0     0    18     4                        21229 21230 21231 21234 21237 21239 21251 21287  (-2.123e+04,2.12e+04]     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]      0     0     0     0     0     0     0     0  (2.122e+04,2.123e+04]     0     0     0     0     0     0     0     0  (2.123e+04,2.129e+04]    13   156   127     7     1     3     2     1
Easier cutting
library(Hmisc)restData$zipGroups = cut2(restData$zipCode,g=4)table(restData$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 
Creating factor variables
restData$zcf <- factor(restData$zipCode)restData$zcf[1:10]
 [1] 21206 21231 21224 21211 21223 21218 21205 21211 21205 2123132 Levels: -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 ... 21287
class(restData$zcf)
[1] "factor"

Levels of factor variables
yesno <- sample(c("yes","no"),size=10,replace=TRUE)yesnofac = factor(yesno,levels=c("yes","no"))relevel(yesnofac,ref="no")###如果不用relevel这个函数的话，则为levels:yes,no
 [1] yes yes yes yes no  yes yes yes no  no Levels: no yes
as.numeric(yesnofac)
 [1] 1 1 1 1 2 1 1 1 2 2
Cutting produces factor variables
library(Hmisc)restData$zipGroups = cut2(restData$zipCode,g=4)table(restData$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 

Using the mutate function
library(Hmisc); library(plyr)restData2 = mutate(restData,zipGroups=cut2(zipCode,g=4))table(restData2$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 

Common transforms
abs(x) absolute value
sqrt(x) square root
ceiling(x) ceiling(3.475) is 4
floor(x) floor(3.475) is 3
round(x,digits=n) round(3.475,digits=2) is 3.48
signif(x,digits=n) signif(3.475,digits=2) is 3.5
cos(x), sin(x) etc.
log(x) natural logarithm
log2(x), log10(x) other common logs
exp(x) exponentiating x
http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf http://statmethods.net/management/functions.html
Notes and further reading
A tutorial from the developer of plyr - http://plyr.had.co.nz/09-user/
Andrew Jaffe's R notes http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf
A nice lecture on categorical and factor variables http://www.stat.berkeley.edu/classes/s133/factors.html
3.4 reshape the data
Start with reshaping
library(reshape2)head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carbMazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
Melting data frames
mtcars$carname <- rownames(mtcars)carMelt <- melt(mtcars,id=c("carname","gear","cyl"),measure.vars=c("mpg","hp"))head(carMelt,n=3)
        carname gear cyl variable value1     Mazda RX4    4   6      mpg  21.02 Mazda RX4 Wag    4   6      mpg  21.03    Datsun 710    4   4      mpg  22.8
tail(carMelt,n=3)
         carname gear cyl variable value62  Ferrari Dino    5   6       hp   17563 Maserati Bora    5   8       hp   33564    Volvo 142E    4   4       hp   109
http://www.statmethods.net/management/reshape.html
Casting data frames
cylData <- dcast(carMelt, cyl ~ variable)cylData
  cyl mpg hp1   4  11 112   6   7  73   8  14 14
cylData <- dcast(carMelt, cyl ~ variable,mean)cylData
  cyl   mpg     hp1   4 26.66  82.642   6 19.74 122.293   8 15.10 209.21
http://www.statmethods.net/management/reshape.html
Averaging values
head(InsectSprays)
  count spray1    10     A2     7     A3    20     A4    14     A5    14     A6    12     A
tapply(InsectSprays$count,InsectSprays$spray,sum)
  A   B   C   D   E   F 174 184  25  59  42 200 
http://www.r-bloggers.com/a-quick-primer-on-split-apply-combine-problems/
Another way - split
spIns =  split(InsectSprays$count,InsectSprays$spray)spIns
$A [1] 10  7 20 14 14 12 10 23 17 20 14 13$B [1] 11 17 21 11 16 14 17 17 19 21  7 13$C [1] 0 1 7 2 3 1 2 1 3 0 1 4$D [1]  3  5 12  6  4  3  5  5  5  5  2  4$E [1] 3 5 3 5 3 6 1 1 3 2 6 4$F [1] 11  9 15 22 15 16 13 10 26 26 24 13
Another way - apply
sprCount = lapply(spIns,sum)sprCount
$A[1] 174$B[1] 184$C[1] 25$D[1] 59$E[1] 42$F[1] 200
Another way - combine
unlist(sprCount)##也就是说用完lapply再用一个unlist简直爽翻了
  A   B   C   D   E   F 174 184  25  59  42 200 
sapply(spIns,sum)
  A   B   C   D   E   F 174 184  25  59  42 200 
Another way - plyr package
ddply(InsectSprays,.(spray),summarize,sum=sum(count))
  spray sum1     A 1742     B 1843     C  254     D  595     E  426     F 200
Creating a new variable
spraySums <- ddply(InsectSprays,.(spray),summarize,sum=ave(count,FUN=sum))dim(spraySums)
[1] 72  2
head(spraySums)
  spray sum1     A 1742     A 1743     A 1744     A 1745     A 1746     A 174
More information
A tutorial from the developer of plyr - http://plyr.had.co.nz/09-user/
A nice reshape tutorial http://www.slideshare.net/jeffreybreen/reshaping-data-in-r
A good plyr primer - http://www.r-bloggers.com/a-quick-primer-on-split-apply-combine-problems/
See also the functionsacast - for casting as multi-dimensional arrays
arrange - for faster reordering without using order() commands
mutate - adding new variables

3.5merging Data

Peer review data
if(!file.exists("./data")){dir.create("./data")}fileUrl1 = "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"fileUrl2 = "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"download.file(fileUrl1,destfile="./data/reviews.csv",method="curl")download.file(fileUrl2,destfile="./data/solutions.csv",method="curl")reviews = read.csv("./data/reviews.csv"); solutions <- read.csv("./data/solutions.csv")head(reviews,2)
  id solution_id reviewer_id      start       stop time_left accept1  1           3          27 1304095698 1304095758      1754      12  2           4          22 1304095188 1304095206      2306      1
head(solutions,2)
  id problem_id subject_id      start       stop time_left answer1  1        156         29 1304095119 1304095169      2343      B2  2        269         25 1304095119 1304095183      2329      C
Merging data - merge()
Merges data frames
Important parameters: x,y,by,by.x,by.y,all
names(reviews)
[1] "id"          "solution_id" "reviewer_id" "start"       "stop"        "time_left"  [7] "accept"     
names(solutions)
[1] "id"         "problem_id" "subject_id" "start"      "stop"       "time_left"  "answer"    
Merging data - merge()
mergedData = merge(reviews,solutions,by.x="solution_id",by.y="id",all=TRUE)head(mergedData)
  solution_id id reviewer_id    start.x     stop.x time_left.x accept problem_id subject_id1           1  4          26 1304095267 1304095423        2089      1        156         292           2  6          29 1304095471 1304095513        1999      1        269         253           3  1          27 1304095698 1304095758        1754      1         34         224           4  2          22 1304095188 1304095206        2306      1         19         235           5  3          28 1304095276 1304095320        2192      1        605         266           6 16          22 1304095303 1304095471        2041      1        384         27     start.y     stop.y time_left.y answer1 1304095119 1304095169        2343      B2 1304095119 1304095183        2329      C3 1304095127 1304095146        2366      C4 1304095127 1304095150        2362      D5 1304095127 1304095167        2345      A6 1304095131 1304095270        2242      C
Default - merge all common column names
intersect(names(solutions),names(reviews))
[1] "id"        "start"     "stop"      "time_left"
mergedData2 = merge(reviews,solutions,all=TRUE)head(mergedData2)
  id      start       stop time_left solution_id reviewer_id accept problem_id subject_id answer1  1 1304095119 1304095169      2343          NA          NA     NA        156         29      B2  1 1304095698 1304095758      1754           3          27      1         NA         NA   <NA>3  2 1304095119 1304095183      2329          NA          NA     NA        269         25      C4  2 1304095188 1304095206      2306           4          22      1         NA         NA   <NA>5  3 1304095127 1304095146      2366          NA          NA     NA         34         22      C6  3 1304095276 1304095320      2192           5          28      1         NA         NA   <NA>
Using join in the plyr package
Faster, but less full featured - defaults to left join, see help file for more
df1 = data.frame(id=sample(1:10),x=rnorm(10))df2 = data.frame(id=sample(1:10),y=rnorm(10))arrange(join(df1,df2),id)
   id       x       y1   1  0.2514  0.22862   2  0.1048  0.83953   3 -0.1230 -1.11654   4  1.5057 -0.11215   5 -0.2505  1.21246   6  0.4699 -1.60387   7  0.4627 -0.80608   8 -1.2629 -1.28489   9 -0.9258 -0.827610 10  2.8065  0.5794
If you have multiple data frames
df1 = data.frame(id=sample(1:10),x=rnorm(10))df2 = data.frame(id=sample(1:10),y=rnorm(10))df3 = data.frame(id=sample(1:10),z=rnorm(10))dfList = list(df1,df2,df3)join_all(dfList)
   id        x        y        z1   6  0.39093 -0.16670  0.565232   1 -1.90467  0.43811 -0.374493   7 -1.48798 -0.85497 -0.692094  10 -2.59440  0.39591 -0.361345   3 -0.08539  0.08053  1.012476   4 -1.63165 -0.13158  0.219277   5 -0.50594  0.24256 -0.440038   9 -0.85062 -2.08066 -0.969509   2 -0.63767 -0.10069  0.0900210  8  1.20439  1.29138 -0.88586
More on merging data
The quick R data merging page - http://www.statmethods.net/management/merging.html
plyr information - http://plyr.had.co.nz/
Types of joins - http://en.wikipedia.org/wiki/Join_(SQL)

0 0