I'm trying to read in a lot of data as efficiently as possible; the data are in about 1,400 CSV files spread across 6 ZIP files. The CSV files are all similar timeseries data with the same columns, except that some have an extra column ("yield_c" in this example). I need to import everything and would prefer not to unzip the ZIP files to disk. I've had success with two approaches, but both have problems; call them A and B. Approach A runs much faster than B, but it adds each CSV's header as a new data row and (likely as a result) reads every column as character. Approach B processes the full dataset correctly but is abominably slow.
Any tips on coding this for better efficiency and a correct data pull? I think I am missing something simple about how fread processes the cmd input, but I can't figure out how to make approach A work without it pulling in the header rows every time.
Here is a sample dataset of "only" 240 CSV files across 6 ZIP files, along with my two import functions, datprocessorA() and datprocessorB(). Currently (for me) approach A takes a few seconds and approach B takes about 2.5 minutes.
Apologies for any formatting or etiquette issues; I don't post here often, as I can usually find a solution by searching. Likewise, apologies if this is a dupe, but I've scoured the forums and found plenty of related issues, none covering this particular case.
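For reference, my working mental model of the cmd mechanics (part of what I may be getting wrong, so treat this as an assumption, not documentation): fread shells out, captures stdout, and parses whatever comes back as a single table. The paths below come from the reprex that follows, so they only exist after running it:

fread(cmd = "unzip -p temp/zip1.zip file1.csv")  # one member's stdout, parsed as one table
fread(cmd = "unzip -p temp/zip1.zip")            # every member concatenated into one stream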
library(data.table)
library(magrittr)  # for the %>% pipe used in datprocessorB()
# Create sample CSV files and zip them in a folder
tempath <- "temp"
dir.create(file.path(tempath), showWarnings = FALSE)
setwd(tempath)
dfs <- list()
num_dfs <- 240
for (i in 1:num_dfs) {
  data <- data.frame(date = seq(as.Date("1980-10-01"), as.Date("2020-09-01"), by = "month"),
                     Subset = paste0("Subset", i),
                     yield_a = sample(0:20, 480, replace = TRUE),
                     yield_b = sample(0:5, 480, replace = TRUE))
  df_name <- paste0("df", i)
  dfs[[df_name]] <- data
}
# randomly (ish) add "yield_c" column to a subset of dataframes
seqz <- sort(unique(c(1, sample(seq(1, num_dfs, by = 5), size = num_dfs %/% 3, replace = TRUE))))
for (j in seqz) {
  dfs[[j]]$yield_c <- sample(1:10, 480, replace = TRUE)
}
# write all to csv, zip up, delete csvs
for (k in seq_along(dfs)) {
  file_name <- paste0("file", k, ".csv")                    # create a unique filename
  write.csv(dfs[[k]], file = file_name, row.names = FALSE)  # save to CSV
}
zipcount <- 6
for (l in 1:zipcount) {
  files_l <- paste0("file", seq(num_dfs / zipcount * (l - 1) + 1, num_dfs / zipcount * l), ".csv")
  zip(paste0("zip", l, ".zip"), files_l)
}
file.remove(list.files(pattern = "\\.csv$", full.names = TRUE))
# import
# return to parent directory to more closely imitate my scenario
setwd("..")
#get path for all zip files
zips <- list.files(path = tempath, full.names = TRUE)
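# An illustration of why approach A misbehaves (assuming the command-line `unzip`
# that fread's cmd relies on is available): `unzip -p` dumps every member of the
# archive to stdout as one concatenated stream, so each CSV after the first leaves
# a stray header line in the data, and once text like "date" appears among the
# values, fread falls back to character for every column.
streamed <- readLines(pipe(paste("unzip -p", shQuote(zips[1]))))
sum(startsWith(streamed, "\"date\""))  # one header line per member CSV, not one total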
# first function to import all csvs into R as a table
datprocessorA <- function(zipdir) {
  # one shell call per archive: `unzip -p` streams every member CSV to stdout, concatenated
  connz <- paste("unzip -p", zipdir)
  datz <- rbindlist(lapply(connz, function(x) fread(cmd = x, sep = ",", header = TRUE,
                                                    stringsAsFactors = FALSE, fill = Inf)),
                    use.names = TRUE, fill = TRUE)
  datz
}
# alternative function to import all csvs into R as a table
datprocessorB <- function(zipdir) {
  # one shell call per member file: list the archive contents, then stream each CSV individually
  datz <- lapply(setNames(nm = unzip(zipdir, list = TRUE)$Name),
                 function(fn) fread(cmd = paste("unzip -p -a", zipdir, fn), header = TRUE,
                                    stringsAsFactors = FALSE, fill = Inf)) %>%
    rbindlist(use.names = TRUE, fill = TRUE)
  datz
}
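# For comparison, a middle-ground sketch (a hypothetical datprocessorC, not one of
# my two approaches): extract each archive once to a throwaway temp dir with
# utils::unzip(), fread() the member files so every header is handled per-file,
# then clean up. It does touch disk, which I'd prefer to avoid, but only temporarily.
datprocessorC <- function(zipdir) {
  exdir <- tempfile("unz")
  files <- unzip(zipdir, exdir = exdir)  # one extraction pass per archive
  datz <- rbindlist(lapply(files, fread), use.names = TRUE, fill = TRUE)
  unlink(exdir, recursive = TRUE)
  datz
}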
# for loop to process import function; choose datprocessorA or datprocessorB
funx <- datprocessorA
begin <- Sys.time()
for (i in seq_along(zips)) {
  zipdir <- zips[i]
  if (i == 1) {
    import <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import <- rbind(import, dat2, fill = TRUE)  # grows import copy-by-copy; tolerable for 6 zips
    rm(dat2)
  }
}
end <- Sys.time()
print(paste0("Approach A length: ", round(difftime(end, begin, units = "secs")), " seconds"))
funx <- datprocessorB
begin <- Sys.time()
for (i in seq_along(zips)) {
  zipdir <- zips[i]
  if (i == 1) {
    import <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import <- rbind(import, dat2, fill = TRUE)
    rm(dat2)
  }
}
end <- Sys.time()
print(paste0("Approach B length: ", round(difftime(end, begin, units = "mins"), 1), " minutes"))