library(hflights)
# Take a working copy of the hflights data frame and inspect it.
h001 <- hflights
str(h001)
summary(h001)

# First pruning pass: remove the columns at positions 1, 20 and 21.
h001 <- h001[, -c(1, 20, 21)]

# Treat the identifier-like columns and the Cancelled flag as categorical.
h001[c("UniqueCarrier", "TailNum", "Origin", "Dest", "Cancelled")] <- lapply(
  h001[c("UniqueCarrier", "TailNum", "Origin", "Dest", "Cancelled")],
  as.factor
)

# Frequency table per column. The trailing figures are the original author's
# notes (apparently the number of distinct values each table reported).
library(plyr)
count(h001, vars = "UniqueCarrier")  # 15
count(h001, vars = "FlightNum")      # 3240
count(h001, vars = "TailNum")        # 2820
count(h001, vars = "Origin")         # 2
count(h001, vars = "Dest")           # 116

# Second pruning pass: remove positions 7 and 8 of the reduced frame
# (presumably FlightNum and TailNum, the high-cardinality columns counted
# above -- confirm with names(h001)).
h001 <- h001[, -(7:8)]
str(h001)
summary(h001)
# Rename: Column Labels
# Replace the 16 column names with anonymous labels: x001 .. x015 for the
# first fifteen columns and y001 for the last one.
h001 <- setNames(h001, c(sprintf("x%03d", 1:15), "y001"))
str(h001)
summary(h001)
# Recoding
library(plyr)  # already attached earlier in the script; harmless no-op here
# Relabel the levels of the three factor columns as 0, 1, 2, ...
# The number of labels is taken from the factor itself instead of being
# hard-coded (previously 0:14, 0:1 and 0:115), so the relabelling keeps
# working if the number of distinct values in the data changes.
levels(h001$x006) <- seq_len(nlevels(h001$x006)) - 1L
levels(h001$x011) <- seq_len(nlevels(h001$x011)) - 1L
levels(h001$x012) <- seq_len(nlevels(h001$x012)) - 1L
str(h001)
summary(h001)
# Imputation: Median (dropping rows with missing values would also drop
# y001 values, so gaps are filled instead)
# Preparation: turn y001, x006, x011 and x012 into plain numbers.
# as.numeric() on a factor returns its 1-based level index, hence the "- 1"
# to obtain 0-based codes.
h001[c("y001", "x006", "x011", "x012")] <- lapply(
  h001[c("y001", "x006", "x011", "x012")],
  function(column) as.numeric(column) - 1
)
str(h001)
summary(h001)
library(stringr)
# Coerce a vector to numeric and fill its gaps with the median.
#
# Args:
#   x: A vector (numeric, character or factor). Commas are stripped before
#      conversion so that strings such as "1,234" parse as numbers.
#
# Returns:
#   A numeric vector of the same length as `x` in which every NA -- whether
#   missing on input or produced by a failed conversion -- is replaced by the
#   median of the non-missing values. When no value is non-missing the median
#   is itself NA and the vector comes back unchanged.
f <- function(x) {
  # Base gsub() with fixed = TRUE removes the literal commas; like the stringr
  # call it replaces, it coerces its input to character first.
  x <- as.numeric(gsub(",", "", x, fixed = TRUE))
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}
# Impute every column with f(). lapply() hands f() each column as its own
# vector, whereas apply() would first coerce the whole data frame to a matrix.
# Wrapping the resulting list in data.frame() rebuilds the table with the same
# column names and default 1..n row names.
h001 <- data.frame(lapply(h001, f))
# Keep whole numbers only (an imputed median can be fractional).
h001 <- round(h001, 0)
str(h001)
# Turn the coded columns x006, x011, x012 and y001 back into factors.
h001[c("x006", "x011", "x012", "y001")] <- lapply(
  h001[c("x006", "x011", "x012", "y001")],
  as.factor
)
str(h001)
summary(h001)
# Writing Data: Training & Test
# Show the working directory, i.e. where the relative file names below land.
getwd()
# [1] "C:/Users/acorn/Documents"   (output recorded on the author's machine)
write.table(h001, file = "Data 2b Training & Test.txt")
# Writing Data: Prediction
# Prediction set: 22387 rows drawn at random (without replacement, no seed
# set) from h001, keeping every column except y001.
h002 <- h001[sample(nrow(h001), size = 22387), setdiff(names(h001), "y001")]
h002
str(h002)
getwd()
# [1] "C:/Users/acorn/Documents"   (output recorded on the author's machine)
write.table(h002, file = "Data 2b Prediction.txt")
# 'R programming' 카테고리의 다른 글
# 03.구글시트 연동해서 R studio로 파일 불러오기 (0) | 2017.05.29 |
# ---|---|
# 02.R 언어 기본문법(집합, matrix, data frame) (0) | 2017.04.02 |
# 01.R 언어란? (0) | 2017.03.18 |