使用 R 语言 sampling 包的 strata() 函数进行分层抽样，并将抽样得到的训练集和测试集索引分别保存为 train.csv 和 test.csv。
# Stratified train/test split.
#
# Reads "data.csv", draws the same number of rows from every stratum of the
# `task` column with sampling::strata() (simple random sampling without
# replacement), and writes the sampled rows to "train.csv" and the remaining
# rows to "test.csv". Roughly 4/5 of all rows end up in the training set.
#
# The former `rm(list = ls(all = T))` call was removed on purpose: wiping the
# caller's workspace is a destructive side effect and nothing below needs it.
#
# NOTE(review): no seed is set, so every run draws a different split; call
# set.seed() before running this script if a reproducible split is needed.

# load sampling package
library(sampling)

# read in data index
task.idx <- read.csv("data.csv")

# fail early with a clear message instead of deep inside strata()
stopifnot(
  "task" %in% names(task.idx),
  nrow(task.idx) > 0,
  !anyNA(task.idx$task)
)

# select train and test data
train_fraction <- 4 / 5
stratum_counts <- table(task.idx$task)
# derive the number of strata from the data instead of hard-coding 3
n_strata <- length(unique(task.idx$task))
# equal allocation: every stratum contributes n rows to the training set
n <- round(train_fraction * nrow(task.idx) / n_strata)

if (n < 1) {
  stop("Too few rows to draw a training sample from every stratum.",
       call. = FALSE)
}
if (n > min(stratum_counts)) {
  stop("Per-stratum sample size (", n, ") exceeds the smallest stratum (",
       min(stratum_counts), " rows).", call. = FALSE)
}

sub_train <- strata(
  task.idx,
  stratanames = "task",
  size = rep(n, n_strata),
  method = "srswor"
)

# ID_unit holds the row positions of the sampled units in task.idx
train_rows <- sub_train$ID_unit
# positive indexing via setdiff() stays correct even for an empty selection
# (x[-integer(0), ] would return zero rows); drop = FALSE keeps a data frame
# when data.csv has a single column
test_rows <- setdiff(seq_len(nrow(task.idx)), train_rows)
data_train <- task.idx[train_rows, , drop = FALSE]
data_test <- task.idx[test_rows, , drop = FALSE]

# save the index of train and test data
write.csv(data_train, file = "train.csv", quote = FALSE, row.names = FALSE)
write.csv(data_test, file = "test.csv", quote = FALSE, row.names = FALSE)