8.14 Data Template

library(janitor)      # Cleanup: clean_names().
library(rattle)       # Dataset: weatherAUS.
library(magrittr)     # Pipelines: %<>%.
library(randomForest) # Imputation: na.roughfix().
library(dplyr)        # Wrangling: rename_all().


# Refer to the dataset by name so that only this one string has to change
# when the template is pointed at different data.
dsname <- "weatherAUS"

# Look the object up by its name and record its number of rows.
ds   <- get(dsname)
nobs <- dim(ds)[1L]

# Show the dimensions of the dataset.
dim(ds)

# Remember the original column names before they are cleaned.
vnames <- names(ds)

# Clean the column names of the dataset.
ds <- clean_names(ds, numerals = "right")

# Index the original names by their cleaned counterparts.
vnames <- setNames(vnames, names(ds))

# Show the cleaned column names.
names(ds)

# Name the target column, then list every column name once with the target
# moved to the end (the remaining names end up in reverse column order).
target <- "rain_tomorrow"
vars   <- rev(unique(c(target, names(ds))))

# Pass the levels of every factor column through normVarNames().
# vapply() always yields a logical vector (even when ds has no columns),
# so which() is never handed the empty list that sapply() can return.
for (v in which(vapply(ds, is.factor, logical(1L)))) {
  levels(ds[[v]]) %<>% normVarNames()
}

# Columns with special roles: the risk variable and the identifiers.
risk <- "risk_mm"
id   <- c("date", "location")

# Everything listed here is removed from vars below.
ignore <- c(risk, id)

# Drop the ignored columns, then separate the inputs from the target.
vars   <- vars %>% setdiff(ignore)
inputs <- vars %>% setdiff(target)

# Model formula: the target on the left-hand side, `.` on the right.
form <- formula(sprintf("%s ~ .", target))

# Optional imputation of missing values with na.roughfix(); left disabled.
# ds[vars] %<>% na.roughfix()

# Proportions of the observations; the first two set the sizes of tr and tu.
SPLIT <- c(0.7, 0.15, 0.15)

# Randomly draw the row indices for tr, then draw tu from the rows not in tr.
tr <- sample(nobs, SPLIT[1] * nobs)
tu <- sample(setdiff(seq_len(nobs), tr), SPLIT[2] * nobs)

# Every row index that is in neither tr nor tu goes to te.
te <- setdiff(seq_len(nobs), c(tr, tu))

# Extract the target values for each of the tr, tu and te index sets.
#
# Base `[[` indexing is used rather than pull(target): pull() evaluates its
# argument with the column names as a data mask, so a dataset that contains
# a column literally named "target" (or "risk") would have that column
# returned instead of the column the variable names.
# `[[` returns NULL for a missing column, so check for it explicitly to keep
# failing loudly when the named column does not exist.
stopifnot(target %in% names(ds))
target.tr <- ds[[target]][tr]
target.tu <- ds[[target]][tu]
target.te <- ds[[target]][te]

# The same extraction for the risk variable, when one has been identified.
if (!is.null(risk)) {
  stopifnot(risk %in% names(ds))
  risk.tr <- ds[[risk]][tr]
  risk.tu <- ds[[risk]][tu]
  risk.te <- ds[[risk]][te]
}


Your donation will support ongoing availability and give you access to the PDF version of this book. Desktop Survival Guides include Data Science, GNU/Linux, and MLHub. Books available on Amazon include Data Mining with Rattle and Essentials of Data Science. Popular open source software includes rattle, wajig, and mlhub. Hosted by Togaware, a pioneer of free and open source software since 1984. Copyright © 1995-2022 Graham.Williams@togaware.com Creative Commons Attribution-ShareAlike 4.0