数据预处理demo

来源:互联网 发布:东莞plc编程培训机构 编辑:程序博客网 时间:2024/05/19 20:39

R笔记:

# Data preprocessing demo: turn the loosely formatted text file
# "Data_Hw2.txt" into a typed data.frame with the columns
# Gender, Age and weight.

# step(1): Reading data ----
# readLines() is used because the rows in the data file are not uniformly
# formatted.
txt <- readLines("Data_Hw2.txt")
txt

# step(2): Selecting lines containing data ----
# Lines that start with "//" are dropped.
I <- grepl("^//", txt)
I
dat <- txt[!I]
dat

# step(3): Split lines into separate fields ----
# The outer parentheses print the result of the assignment.
(fieldList <- strsplit(dat, split = ";"))
str(fieldList)

# step(4): Standardize rows ----

# Standardize the fields of a single record.
#
# x: character vector holding the fields of one line (one element of the
#    list returned by strsplit()).
#
# Returns a character vector of length 3:
#   [1] the first field that contains a letter (NA if no field does),
#   [2] the second field converted to numeric, rounded, and stored as text,
#   [3] the third field when the record has exactly three fields, else NA.
# Records with at most one field come back as three empty strings.
assignFields <- function(x) {
  n_fields <- length(x)
  out <- character(3)
  if (n_fields > 1) {
    # Use the field containing letters as the first output column.
    # which(...)[1] is NA when nothing matches, and indexing with NA yields
    # NA_character_, so a record without letters no longer raises
    # "replacement has length zero"; with several matches the first is used.
    has_letters <- grepl("[[:alpha:]]", x)
    out[1] <- x[which(has_letters)[1]]
    out[2] <- round(as.numeric(x[2]))
    # Only records with exactly three fields carry the third value over.
    out[3] <- if (n_fields == 3) x[3] else NA_character_
  }
  out
}

# lapply() applies the function to every element of the list.
standardFields <- lapply(fieldList, assignFields)
standardFields

# step(5): Transform a list to data.frame ----
# unlist() produces a vector containing all atomic components of the list;
# it is copied row by row into a matrix, which is then coerced into a
# data.frame.
M <- matrix(
  unlist(standardFields),
  nrow = length(standardFields),
  byrow = TRUE
)
colnames(M) <- c("Gender", "Age", "weight")
M
# Keep the first four records. seq_len(min(...)) avoids a
# "subscript out of bounds" error when fewer than four rows were parsed,
# and drop = FALSE keeps M a matrix even if a single row remains.
M <- M[seq_len(min(4L, nrow(M))), , drop = FALSE]
M
# stringsAsFactors = FALSE prevents R from turning the columns into factors.
deltons <- as.data.frame(M, stringsAsFactors = FALSE)
deltons

# step(6): Normalize and coerce to correct types ----
str(deltons)
# Gender values starting with "m" (case-insensitive) become "man";
# every other value becomes "woman".
J <- grepl("^m", deltons$Gender, ignore.case = TRUE)
J
deltons$Gender <- ifelse(J, "man", "woman")
# Replace "," with "." in weight before the numeric conversion.
deltons$weight <- gsub(",", ".", deltons$weight)
deltons$Age <- as.integer(deltons$Age)
deltons$weight <- as.numeric(deltons$weight)
deltons
str(deltons)
原创粉丝点击