数据清洗与收集week3

来源:互联网 发布:怎样删除mac照片 编辑:程序博客网 时间:2024/04/28 19:01

3.1 subsetting and sort

Subsetting - quick review

set.seed(13435)X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))X <- X[sample(1:5),]; X$var2[c(1,3)] = NAX
  var1 var2 var31    2   NA   154    1   10   112    3   NA   123    5    6   145    4    9   13

Subsetting - quick review

X[,1]
[1] 2 1 3 5 4
X[,"var1"]
[1] 2 1 3 5 4
X[1:2,"var2"]
[1] NA 10

Logicals ands and ors

X[(X$var1 <= 3 & X$var3 > 11),]
  var1 var2 var31    2   NA   152    3   NA   12
X[(X$var1 <= 3 | X$var3 > 15),]
  var1 var2 var31    2   NA   154    1   10   112    3   NA   12

Dealing with missing values

X[which(X$var2 > 8),]
  var1 var2 var34    1   10   115    4    9   13

Sorting

sort(X$var1)
[1] 1 2 3 4 5
sort(X$var1,decreasing=TRUE)
[1] 5 4 3 2 1
sort(X$var2,na.last=TRUE)
[1]  6  9 10 NA NA

Ordering ##order(..., na.last = TRUE, decreasing = FALSE)

X[order(X$var1),]
  var1 var2 var34    1   10   111    2   NA   152    3   NA   125    4    9   133    5    6   14

Ordering

X[order(X$var1,X$var3),]
  var1 var2 var34    1   10   111    2   NA   152    3   NA   125    4    9   133    5    6   14

Ordering with plyr

library(plyr)arrange(X,var1)
  var1 var2 var31    1   10   112    2   NA   153    3   NA   124    4    9   135    5    6   14
arrange(X,desc(var1))
  var1 var2 var31    5    6   142    4    9   133    3   NA   124    2   NA   155    1   10   11
# sort mtcars data by cylinder and displacementmtcars[with(mtcars, order(cyl, disp)), ]# Same result using arrange: no need to use with(), as the context is implicit# NOTE: plyr functions do NOT preserve row.namesarrange(mtcars, cyl, disp)# Let's keep the row.names in this examplemyCars = cbind(vehicle=row.names(mtcars), mtcars)arrange(myCars, cyl, disp)# Sort with displacement in descending orderarrange(myCars, cyl, desc(disp))

Notes and further resources

  • R programming in the Data Science Track
  • Andrew Jaffe's lecture notes http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf

3.2 summarizing Data

Getting the data from the web

if(!file.exists("./data")){dir.create("./data")}fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"download.file(fileUrl,destfile="./data/restaurants.csv",method="curl")restData <- read.csv("./data/restaurants.csv")
再瞄一下
如head(restData,3)   tail(restData,3)   summary(restData)   str()

Quantiles of quantitative variables

quantile(restData$councilDistrict,na.rm=TRUE)
  0%  25%  50%  75% 100%    1    2    9   11   14 
quantile(restData$councilDistrict,probs=c(0.5,0.75,0.9))
50% 75% 90%   9  11  12 

Make table

table(restData$zipCode,useNA="ifany")
-21226  21201  21202  21205  21206  21207  21208  21209  21210  21211  21212  21213  21214  21215      1    136    201     27     30      4      1      8     23     41     28     31     17     54  21216  21217  21218  21220  21222  21223  21224  21225  21226  21227  21229  21230  21231  21234     10     32     69      1      7     56    199     19     18      4     13    156    127      7  21237  21239  21251  21287      1      3      2      1 
table(restData$councilDistrict,restData$zipCode)

Check for missing values

sum(is.na(restData$councilDistrict))
[1] 0
any(is.na(restData$councilDistrict))
[1] FALSE
all(restData$zipCode > 0)
[1] FALSE

Row and column sums

colSums(is.na(restData))
           name         zipCode    neighborhood councilDistrict  policeDistrict      Location.1               0               0               0               0               0               0 
all(colSums(is.na(restData))==0)
[1] TRUE

Values with specific characteristics

table(restData$zipCode %in% c("21212"))
FALSE  TRUE  1299    28 
table(restData$zipCode %in% c("21212","21213"))
FALSE  TRUE  1268    59 

Values with specific characteristics

restData[restData$zipCode %in% c("21212","21213"),]

Cross tabs

data(UCBAdmissions)DF = as.data.frame(UCBAdmissions)summary(DF)
      Admit       Gender   Dept       Freq     Admitted:12   Male  :12   A:4   Min.   :  8   Rejected:12   Female:12   B:4   1st Qu.: 80                             C:4   Median :170                             D:4   Mean   :189                             E:4   3rd Qu.:302                             F:4   Max.   :512  

Cross tabs

xt <- xtabs(Freq ~ Gender + Admit,data=DF)xt
        AdmitGender   Admitted Rejected  Male       1198     1493  Female      557     1278

Flat tables

warpbreaks$replicate <- rep(1:9, len = 54)xt = xtabs(breaks ~.,data=warpbreaks)xt

Flat tables

ftable(xt)
             replicate  1  2  3  4  5  6  7  8  9wool tension                                     A    L                 26 30 54 25 70 52 51 26 67     M                 18 21 29 17 12 18 35 30 36     H                 36 21 24 18 10 43 28 15 26B    L                 27 14 29 19 29 31 41 20 44     M                 42 26 19 16 39 28 21 39 29     H                 20 21 24 17 13 15 15 16 28

Size of a data set

fakeData = rnorm(1e5)object.size(fakeData)
800040 bytes
print(object.size(fakeData),units="Mb")
0.8 Mb

3.3 创建新变量

Why create new variables?

  • Often the raw data won't have a value you are looking for
  • You will need to transform the data to get the values you would like
  • Usually you will add those values to the data frames you are working with
  • Common variables to create
    • Missingness indicators
    • "Cutting up" quantitative variables
    • Applying transforms

Getting the data from the web

if(!file.exists("./data")){dir.create("./data")}fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"download.file(fileUrl,destfile="./data/restaurants.csv",method="curl")restData <- read.csv("./data/restaurants.csv")

Creating sequences

Sometimes you need an index for your data set

s1 <- seq(1,10,by=2) ; s1
[1] 1 3 5 7 9
s2 <- seq(1,10,length=3); s2
[1]  1.0  5.5 10.0
x <- c(1,3,8,25,100); seq(along = x)
[1] 1 2 3 4 5

Subsetting variables

restData$nearMe = restData$neighborhood %in% c("Roland Park", "Homeland")table(restData$nearMe)
FALSE  TRUE  1314    13 

Creating binary variables

restData$zipWrong = ifelse(restData$zipCode < 0, TRUE, FALSE)table(restData$zipWrong,restData$zipCode < 0)
        FALSE TRUE  FALSE  1326    0  TRUE      0    1

Creating categorical variables

restData$zipGroups = cut(restData$zipCode,breaks=quantile(restData$zipCode))table(restData$zipGroups)
(-2.123e+04,2.12e+04]  (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]                   337                   375                   282                   332 
table(restData$zipGroups,restData$zipCode)
                        -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213  (-2.123e+04,2.12e+04]      0   136   201     0     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]       0     0     0    27    30     4     1     8    23    41    28    31  (2.122e+04,2.123e+04]      0     0     0     0     0     0     0     0     0     0     0     0  (2.123e+04,2.129e+04]      0     0     0     0     0     0     0     0     0     0     0     0                        21214 21215 21216 21217 21218 21220 21222 21223 21224 21225 21226 21227  (-2.123e+04,2.12e+04]     0     0     0     0     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]     17    54    10    32    69     0     0     0     0     0     0     0  (2.122e+04,2.123e+04]     0     0     0     0     0     1     7    56   199    19     0     0  (2.123e+04,2.129e+04]     0     0     0     0     0     0     0     0     0     0    18     4                        21229 21230 21231 21234 21237 21239 21251 21287  (-2.123e+04,2.12e+04]     0     0     0     0     0     0     0     0  (2.12e+04,2.122e+04]      0     0     0     0     0     0     0     0  (2.122e+04,2.123e+04]     0     0     0     0     0     0     0     0  (2.123e+04,2.129e+04]    13   156   127     7     1     3     2     1

Easier cutting

library(Hmisc)restData$zipGroups = cut2(restData$zipCode,g=4)table(restData$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 

Creating factor variables

restData$zcf <- factor(restData$zipCode)restData$zcf[1:10]
 [1] 21206 21231 21224 21211 21223 21218 21205 21211 21205 2123132 Levels: -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 ... 21287
class(restData$zcf)
[1] "factor"

Levels of factor variables

yesno <- sample(c("yes","no"),size=10,replace=TRUE)yesnofac = factor(yesno,levels=c("yes","no"))relevel(yesnofac,ref="no")###如果不用relevel这个函数的话,则为levels:yes,no
 [1] yes yes yes yes no  yes yes yes no  no Levels: no yes
as.numeric(yesnofac)
 [1] 1 1 1 1 2 1 1 1 2 2

Cutting produces factor variables

library(Hmisc)restData$zipGroups = cut2(restData$zipCode,g=4)table(restData$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 

Using the mutate function

library(Hmisc); library(plyr)restData2 = mutate(restData,zipGroups=cut2(zipCode,g=4))table(restData2$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]            338            375            300            314 

Common transforms

  • abs(x) absolute value
  • sqrt(x) square root
  • ceiling(x) ceiling(3.475) is 4
  • floor(x) floor(3.475) is 3
  • round(x,digits=n) round(3.475,digits=2) is 3.48
  • signif(x,digits=n) signif(3.475,digits=2) is 3.5
  • cos(x), sin(x) etc.
  • log(x) natural logarithm
  • log2(x)log10(x) other common logs
  • exp(x) exponentiating x

http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf http://statmethods.net/management/functions.html


Notes and further reading

  • A tutorial from the developer of plyr - http://plyr.had.co.nz/09-user/
  • Andrew Jaffe's R notes http://www.biostat.jhsph.edu/~ajaffe/lec_winterR/Lecture%202.pdf
  • A nice lecture on categorical and factor variables http://www.stat.berkeley.edu/classes/s133/factors.html
3.4 reshape the data

Start with reshaping

library(reshape2)head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carbMazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Melting data frames

mtcars$carname <- rownames(mtcars)carMelt <- melt(mtcars,id=c("carname","gear","cyl"),measure.vars=c("mpg","hp"))head(carMelt,n=3)
        carname gear cyl variable value1     Mazda RX4    4   6      mpg  21.02 Mazda RX4 Wag    4   6      mpg  21.03    Datsun 710    4   4      mpg  22.8
tail(carMelt,n=3)
         carname gear cyl variable value62  Ferrari Dino    5   6       hp   17563 Maserati Bora    5   8       hp   33564    Volvo 142E    4   4       hp   109

http://www.statmethods.net/management/reshape.html

Casting data frames

cylData <- dcast(carMelt, cyl ~ variable)cylData
  cyl mpg hp1   4  11 112   6   7  73   8  14 14
cylData <- dcast(carMelt, cyl ~ variable,mean)cylData
  cyl   mpg     hp1   4 26.66  82.642   6 19.74 122.293   8 15.10 209.21

http://www.statmethods.net/management/reshape.html

Averaging values

head(InsectSprays)
  count spray1    10     A2     7     A3    20     A4    14     A5    14     A6    12     A
tapply(InsectSprays$count,InsectSprays$spray,sum)
  A   B   C   D   E   F 174 184  25  59  42 200 

http://www.r-bloggers.com/a-quick-primer-on-split-apply-combine-problems/

Another way - split

spIns =  split(InsectSprays$count,InsectSprays$spray)spIns
$A [1] 10  7 20 14 14 12 10 23 17 20 14 13$B [1] 11 17 21 11 16 14 17 17 19 21  7 13$C [1] 0 1 7 2 3 1 2 1 3 0 1 4$D [1]  3  5 12  6  4  3  5  5  5  5  2  4$E [1] 3 5 3 5 3 6 1 1 3 2 6 4$F [1] 11  9 15 22 15 16 13 10 26 26 24 13

Another way - apply

sprCount = lapply(spIns,sum)sprCount
$A[1] 174$B[1] 184$C[1] 25$D[1] 59$E[1] 42$F[1] 200

Another way - combine

unlist(sprCount)##也就是说用完lapply再用一个unlist简直爽翻了
  A   B   C   D   E   F 174 184  25  59  42 200 
sapply(spIns,sum)
  A   B   C   D   E   F 174 184  25  59  42 200 

Another way - plyr package

ddply(InsectSprays,.(spray),summarize,sum=sum(count))
  spray sum1     A 1742     B 1843     C  254     D  595     E  426     F 200

Creating a new variable

spraySums <- ddply(InsectSprays,.(spray),summarize,sum=ave(count,FUN=sum))dim(spraySums)
[1] 72  2
head(spraySums)
  spray sum1     A 1742     A 1743     A 1744     A 1745     A 1746     A 174

More information

  • A tutorial from the developer of plyr - http://plyr.had.co.nz/09-user/
  • A nice reshape tutorial http://www.slideshare.net/jeffreybreen/reshaping-data-in-r
  • A good plyr primer - http://www.r-bloggers.com/a-quick-primer-on-split-apply-combine-problems/
  • See also the functions
    • acast - for casting as multi-dimensional arrays
    • arrange - for faster reordering without using order() commands
    • mutate - adding new variables

3.5merging Data

Peer review data

if(!file.exists("./data")){dir.create("./data")}fileUrl1 = "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"fileUrl2 = "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"download.file(fileUrl1,destfile="./data/reviews.csv",method="curl")download.file(fileUrl2,destfile="./data/solutions.csv",method="curl")reviews = read.csv("./data/reviews.csv"); solutions <- read.csv("./data/solutions.csv")head(reviews,2)
  id solution_id reviewer_id      start       stop time_left accept1  1           3          27 1304095698 1304095758      1754      12  2           4          22 1304095188 1304095206      2306      1
head(solutions,2)
  id problem_id subject_id      start       stop time_left answer1  1        156         29 1304095119 1304095169      2343      B2  2        269         25 1304095119 1304095183      2329      C

Merging data - merge()

  • Merges data frames
  • Important parameters: x,y,by,by.x,by.y,all
names(reviews)
[1] "id"          "solution_id" "reviewer_id" "start"       "stop"        "time_left"  [7] "accept"     
names(solutions)
[1] "id"         "problem_id" "subject_id" "start"      "stop"       "time_left"  "answer"    

Merging data - merge()

mergedData = merge(reviews,solutions,by.x="solution_id",by.y="id",all=TRUE)head(mergedData)
  solution_id id reviewer_id    start.x     stop.x time_left.x accept problem_id subject_id1           1  4          26 1304095267 1304095423        2089      1        156         292           2  6          29 1304095471 1304095513        1999      1        269         253           3  1          27 1304095698 1304095758        1754      1         34         224           4  2          22 1304095188 1304095206        2306      1         19         235           5  3          28 1304095276 1304095320        2192      1        605         266           6 16          22 1304095303 1304095471        2041      1        384         27     start.y     stop.y time_left.y answer1 1304095119 1304095169        2343      B2 1304095119 1304095183        2329      C3 1304095127 1304095146        2366      C4 1304095127 1304095150        2362      D5 1304095127 1304095167        2345      A6 1304095131 1304095270        2242      C

Default - merge all common column names

intersect(names(solutions),names(reviews))
[1] "id"        "start"     "stop"      "time_left"
mergedData2 = merge(reviews,solutions,all=TRUE)head(mergedData2)
  id      start       stop time_left solution_id reviewer_id accept problem_id subject_id answer1  1 1304095119 1304095169      2343          NA          NA     NA        156         29      B2  1 1304095698 1304095758      1754           3          27      1         NA         NA   <NA>3  2 1304095119 1304095183      2329          NA          NA     NA        269         25      C4  2 1304095188 1304095206      2306           4          22      1         NA         NA   <NA>5  3 1304095127 1304095146      2366          NA          NA     NA         34         22      C6  3 1304095276 1304095320      2192           5          28      1         NA         NA   <NA>

Using join in the plyr package

Faster, but less full featured - defaults to left join, see help file for more

df1 = data.frame(id=sample(1:10),x=rnorm(10))df2 = data.frame(id=sample(1:10),y=rnorm(10))arrange(join(df1,df2),id)
   id       x       y1   1  0.2514  0.22862   2  0.1048  0.83953   3 -0.1230 -1.11654   4  1.5057 -0.11215   5 -0.2505  1.21246   6  0.4699 -1.60387   7  0.4627 -0.80608   8 -1.2629 -1.28489   9 -0.9258 -0.827610 10  2.8065  0.5794

If you have multiple data frames

df1 = data.frame(id=sample(1:10),x=rnorm(10))df2 = data.frame(id=sample(1:10),y=rnorm(10))df3 = data.frame(id=sample(1:10),z=rnorm(10))dfList = list(df1,df2,df3)join_all(dfList)
   id        x        y        z1   6  0.39093 -0.16670  0.565232   1 -1.90467  0.43811 -0.374493   7 -1.48798 -0.85497 -0.692094  10 -2.59440  0.39591 -0.361345   3 -0.08539  0.08053  1.012476   4 -1.63165 -0.13158  0.219277   5 -0.50594  0.24256 -0.440038   9 -0.85062 -2.08066 -0.969509   2 -0.63767 -0.10069  0.0900210  8  1.20439  1.29138 -0.88586

More on merging data

  • The quick R data merging page - http://www.statmethods.net/management/merging.html
  • plyr information - http://plyr.had.co.nz/
  • Types of joins - http://en.wikipedia.org/wiki/Join_(SQL)


0 0
原创粉丝点击
热门问题 老师的惩罚 人脸识别 我在镇武司摸鱼那些年 重生之率土为王 我在大康的咸鱼生活 盘龙之生命进化 天生仙种 凡人之先天五行 春回大明朝 姑娘不必设防,我是瞎子 乐视电视没声音怎么办 老公疑心病很重怎么办啊 被安装了尿道锁怎么办 狼青小狗腿罗圈怎么办 备孕期间有霉菌怎么办 虫子进皮肤里了怎么办 生完孩子肚子越来越大怎么办 怀孕8个月肚子小怎么办 孕晚期胎儿不长怎么办 肚子上肉特别多怎么办 奶堵了有硬块怎么办 便秘5天肚子胀怎么办 上火大便拉不出来怎么办 大便拉不出来肚子痛怎么办 戒奶奶涨有硬块怎么办 忌奶的时候涨奶怎么办 娃儿隔奶,,奶涨怎么办 狗肚子很大很鼓怎么办 注册不了的二建怎么办 专升本没考过怎么办 警察乱拘留人该怎么办 玩英雄联盟太卡怎么办 一方离婚证丢了怎么办 遭遇呼死你软件怎么办 开车遇见碰瓷的怎么办 遇到碰瓷老人的怎么办 睿强遥控锁坏了怎么办 地暖分水器坏了怎么办 京东赠品无货怎么办 图书馆的书丢了怎么办 三次临牌用完了怎么办 m3u8文件只有10k怎么办 寿县到淮南高铁怎么办 冰箱停电肉臭了怎么办 新买冰箱有异味怎么办 海康用户被锁定怎么办 大华dss录像没了怎么办 燃气软管超过2米怎么办 中央空调进了水怎么办 车门被划了一道怎么办 镜前灯没有留线怎么办