# (728x90: ad-banner size marker left over from the source web page)
# 04. R programming: data preprocessing (hflights data set) in R


library(hflights)
# Work on a copy named h001 so the packaged `hflights` object is left as is.
h001 <- hflights

# Inspect the raw structure and the per-column summaries.
str(h001)
summary(h001)

# Remove columns 1, 20 and 21 by position.
# NOTE(review): presumably Year, CancellationCode and Diverted -- confirm
# with names(hflights) before relying on these indices.
h001[c(1, 20, 21)] <- NULL

# Convert the categorical columns to factors in a single pass.
factor_cols <- c("UniqueCarrier", "TailNum", "Origin", "Dest", "Cancelled")
h001[factor_cols] <- lapply(h001[factor_cols], as.factor)

# plyr provides count(), used below to tabulate the values of one column.
library(plyr)

# Frequency table per column; the trailing number is the count of distinct
# values recorded by the original author.
count(h001, vars = "UniqueCarrier")  # 15
count(h001, vars = "FlightNum")      # 3240
count(h001, vars = "TailNum")        # 2820
count(h001, vars = "Origin")         # 2
count(h001, vars = "Dest")           # 116

# Remove columns 7 and 8 by position.
# NOTE(review): presumably FlightNum and TailNum (the two columns with
# thousands of distinct values) -- confirm with names(h001).
h001 <- h001[, -c(7, 8)]

# Re-inspect the reduced data frame.
str(h001)
summary(h001)


# Rename: Column Labels
#
# Replace the 16 column names with generic labels: x001 ... x015 for the
# first fifteen columns and y001 for the last one.
names(h001) <- c(sprintf("x%03d", 1:15), "y001")

str(h001)
summary(h001)


# Recoding
#
# Relabel the levels of the factor columns x006, x011 and x012 as
# 0, 1, ..., (number of levels - 1).

# plyr is not used by the statements in this section; the call is kept so
# the script's set of attached packages is unchanged.
library(plyr)

# Derive the label range from the data instead of hard-coding 0:14, 0:1 and
# 0:115, so the recoding keeps working if the number of levels changes.
for (col in c("x006", "x011", "x012")) {
  levels(h001[[col]]) <- seq_len(nlevels(h001[[col]])) - 1L
}

str(h001)
summary(h001)


# Imputation: Median (Removing Missing Values Causes Removing y001 Values)
#
# Preparation step: turn y001, x006, x011 and x012 into numeric columns.
# These columns were made factors earlier in the script; as.numeric() on a
# factor yields its 1-based level codes, so subtracting 1 gives codes that
# start at 0.
code_cols <- c("y001", "x006", "x011", "x012")
h001[code_cols] <- lapply(h001[code_cols], function(v) as.numeric(v) - 1)

str(h001)
summary(h001)

library(stringr)

# Coerce a vector to numeric and replace missing values with the median.
#
# Args:
#   x: A numeric vector, or a character vector / factor of number strings
#      that may contain "," as a thousands separator (e.g. "1,000").
#
# Returns:
#   A plain numeric (double) vector of the same length as `x`, without
#   attributes, in which every NA has been replaced by the median of the
#   non-missing values. If all values are missing the NAs are left in place.
#   Strings that cannot be parsed as numbers become NA (with the usual
#   coercion warning) and are therefore imputed as well.
f <- function(x) {
  if (is.numeric(x)) {
    # Already numeric: skip the text round trip, which is slow on long
    # columns and can lose precision when a double is printed and re-parsed.
    x <- as.numeric(x)
  } else {
    # Strip thousands separators before parsing. A fixed-string gsub() from
    # base R is enough, so this function has no package dependency.
    x <- as.numeric(gsub(",", "", as.character(x), fixed = TRUE))
  }

  x[is.na(x)] <- median(x, na.rm = TRUE)

  x
}

# Impute every column with f() and round the result to whole numbers.
#
# lapply() processes the data frame one column at a time, so the frame is
# never coerced to a matrix (which is what apply() does to a data frame and
# which would turn every column into character if a single one were not
# numeric). data.frame() rebuilds the frame with the same column names and
# default 1..n row names.
h001 <- data.frame(lapply(h001, f))

# Median imputation can introduce .5 values; round everything to 0 decimals.
h001 <- round(h001, 0)

str(h001)

# Turn the coded columns x006, x011, x012 and y001 back into factors now
# that imputation and rounding are done.
coded_cols <- c("x006", "x011", "x012", "y001")
h001[coded_cols] <- lapply(h001[coded_cols], as.factor)

str(h001)
summary(h001)


# Writing Data: Training & Test
#
# Show the working directory; the relative file name below is resolved
# against it.
getwd()
# Example output from the original session:
# [1] "C:/Users/acorn/Documents"

# Write the full pre-processed data frame with write.table() defaults.
write.table(x = h001, file = "Data 2b Training & Test.txt")


# Writing Data: Prediction
#
# Build the prediction set: the pre-processed data without the y001 column,
# reduced to a random sample of rows, written to a text file.

# Drop y001 by name rather than by position so the step does not depend on
# the column order.
h002 <- h001[setdiff(names(h001), "y001")]

# Number of rows to draw (value used by the original script), capped at the
# number of rows available so the sampling cannot fail on smaller data.
n_pred <- min(22387, nrow(h002))

# Fix the RNG seed so that re-running the script writes the same sample.
set.seed(1234)
h002 <- h002[sample.int(nrow(h002), size = n_pred), ]

# Preview only the first rows instead of printing the entire sample.
head(h002)

str(h002)

# Show the working directory; the relative file name below is resolved
# against it.
getwd()
# Example output from the original session:
# [1] "C:/Users/acorn/Documents"

write.table(h002, "Data 2b Prediction.txt")

 

 

# + Recent posts (navigation text from the source web page, not R code)