
I have this code where I use six functions for a Monte Carlo simulation and then calculate the 95th percentile of the resulting probability distribution for each row of a data frame, using replicate(). My actual data frame has 11,628 rows. I used system.time() to estimate how long it would take on just 5 rows. The result is about 50 seconds for 5 rows, so it follows that the full 11,628 rows would take 32+ hours. Is there a faster, more efficient way to run this code?

Here is the code that builds my data frame final.distrib.

install.packages("partitions")
library(partitions)

# Identify building stock distribution in percentage
# Get 6 elements that sum up to 1
z <- compositions(n = 20, m = 6, include.zero = FALSE)

# Convert partition to matrix
z <- as.matrix.partition(z)

# Transpose matrix
z <- t(z)

# Multiply by 0.05 so that elements take any value from 0 to 1 by incremental step of 0.05
z <- z * 0.05
distrib.percent <- data.frame(z)
colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

# Identify household count without rounding off
# Multiply the percentages of distribution with the total number of households (971 households)
distrib.count <- data.frame(z*971)
colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

# Round off values, making sure each row sums up to 971
smart.round <- function(x) {
  y <- floor(x)                                         # round everything down first
  indices <- tail(order(x - y), round(sum(x)) - sum(y)) # entries with the largest fractional parts
  y[indices] <- y[indices] + 1                          # bump those up so the row total is preserved
  y
}

final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round)))
colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

And here is the code with six functions:

install.packages("triangle")
library(triangle)

function.LWA <- function() {
  n_sim <- 10000
  LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29) 
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) 
  LWA.loss <- rtriangle(n_sim, a = 0.0890, b = 0.6655, c = 0.3212)
  LWA.cost*LWA.area*LWA.loss
}

function.LWB <- function() {
  n_sim <- 10000
  LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038)
  LWB.cost*LWB.area*LWB.loss
}

function.SCA <- function() {
  n_sim <- 10000
  SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136)
  SCA.cost*SCA.area*SCA.loss
}

function.SCB <- function() {
  n_sim <- 10000
  SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791)
  SCB.cost*SCB.area*SCB.loss
}

function.RCA <- function() {
  n_sim <- 10000
  RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537)
  RCA.cost*RCA.area*RCA.loss
}

function.RCB <- function() {
  n_sim <- 10000
  RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519)
  RCB.cost*RCB.area*RCB.loss
}

# Function to calculate the direct loss and return the 95th percentile
calculate_percentile <- function(row) {
  set.seed(1) # for comparability across scenarios
  direct.loss <- (rowSums(replicate(n=row["LWA"], function.LWA()))) +
                 (rowSums(replicate(n=row["LWB"], function.LWB()))) +
                 (rowSums(replicate(n=row["SCA"], function.SCA()))) +
                 (rowSums(replicate(n=row["SCB"], function.SCB()))) +
                 (rowSums(replicate(n=row["RCA"], function.RCA()))) +
                 (rowSums(replicate(n=row["RCB"], function.RCB())))
  quantile(direct.loss, probs = 0.95) # Specify 95th percentile
}

# Apply the function to each row of the data frame (containing house count) and store the results
loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")

Thank you.

  • Have you tried running a timing test for replicate() vs. a simple for loop? Commented Sep 14, 2024 at 17:15
  • If you run a profiler (e.g. Rprof()) on your code, you'll see that most of the time is being spent in rlnorm(). That makes sense: you are generating many millions of lognormals in your simulation. Just running the first 5 rows calls it more than 4,800 times, and each of those calls generates 100,000 values. That's nearly half a billion values in total (a minimal profiling sketch follows these comments). Commented Sep 14, 2024 at 18:11
  • Do you even have enough memory? Commented Sep 15, 2024 at 16:35
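Following up on the profiling comment above, a minimal sketch of how to confirm where the time goes, using the question's own objects and a hypothetical output file name, could be:

# profile the percentile calculation on just a couple of rows
Rprof("mc_profile.out")
test <- apply(final.distrib[1:2, ], 1, calculate_percentile)
Rprof(NULL)
# time spent per function; rlnorm() is expected to dominate self.time
head(summaryRprof("mc_profile.out")$by.self)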

1 Answer


A couple of improvements, along with parallel processing, can get it down to about an hour on 16 cores.

First, instead of using replicate, draw the total number of cost/loss/area samples needed for each row in one go and put them into a matrix before taking row sums.
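To make that concrete, here is a minimal sketch for one building type (LWA), assuming the triangle package is loaded, n_sim = 10000 as in the question, and a hypothetical per-row house count n_rep; replicate(n_rep, function.LWA()) followed by rowSums collapses into one batch of draws reshaped into an n_sim-by-n_rep matrix:

n_rep <- 50  # hypothetical LWA house count taken from one row of final.distrib
n <- n_sim * n_rep
cost <- rtriangle(n, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29)
loss <- rtriangle(n, a = 0.0890, b = 0.6655, c = 0.3212)
x <- rlnorm(round(n * 1.1), meanlog = 3.61, sdlog = 0.82)  # generous oversample, just for this sketch
area <- x[x >= 5 & x <= 200][1:n]
# each column is one simulated house; rowSums gives the per-simulation total for this type
LWA.total <- rowSums(matrix(cost * area * loss, nrow = n_sim, ncol = n_rep))

Doing the same for the other five types and adding the six resulting vectors reproduces the original direct.loss vector with far fewer function calls.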

Second, you are generating far more lognormal samples than the truncation requires. A couple of quick calculations show that oversampling by only about 3% is enough to virtually guarantee sufficient samples between 5 and 200:

# probability that a sample falls within [5, 200]
(p <- diff(plnorm(c(5, 200), 3.61, 0.82)))
#> [1] 0.9728997
# minimum number of in-range samples needed for any single building type in any row
min_n <- min(final.distrib)*1e4
# upper bound probability of not generating enough samples within [5, 200]
# over the entire simulation
-expm1(pbinom(min_n, 1.03*min_n, p, FALSE, TRUE)*6*nrow(final.distrib))
#> [1] 1.550331e-13

The full code, timed below on the first 150 rows:

n_sim <- 1e4
# rows = (min, max, mode), columns = (cost, loss), slices = the six building types
abc <- array(c(0.9*8407.29, 1.1*8407.29, 8407.29,
               0.0890, 0.6655, 0.3212,
               0.9*10328.43, 1.1*10328.43, 10328.43,
               0.0423, 0.5572, 0.2038,
               0.9*12055.57, 1.1*12055.57, 12055.57,
               0.0453, 0.5665, 0.2136,
               0.9*17935.86, 1.1*17935.86, 17935.86,
               0.0645, 0.6361, 0.2791,
               0.9*19363.57, 1.1*19363.57, 19363.57,
               0.0263, 0.5009, 0.1537,
               0.9*25182.71, 1.1*25182.71, 25182.71,
               0.0583, 0.6078, 0.2519), c(3, 2, 6))

rtri <- function(n, a, b, c) {
  # vectorised inverse-CDF sampler for the triangular distribution (min a, max b, mode c)
  fc <- (c - a)/(b - a)
  U <- runif(n)
  blna <- U < fc
  r <- numeric(n)
  r[blna] <- a + sqrt(U[blna]*(b - a)*(c - a))           # left branch of the CDF
  r[!blna] <- b - sqrt((1 - U[!blna])*(b - a)*(b - c))   # right branch
  r
}

f.random.area <- function(n_rep) {
  # oversample the lognormal by ~3%, keep values within [5, 200],
  # and return an n_sim x n_rep matrix of floor areas
  n <- n_sim*n_rep
  x <- rlnorm(n*1.029, 3.61, 0.82)
  matrix(x[abs(x - 102.5) < 97.5][1:n], n_sim, n_rep)  # abs(x - 102.5) < 97.5 is 5 < x < 200
}

f <- function(i, n_rep) {
  # per-simulation total loss across the n_rep houses of building type i
  n <- n_sim*n_rep
  cost <- rtri(n, abc[1, 1, i], abc[2, 1, i], abc[3, 1, i])
  loss <- rtri(n, abc[1, 2, i], abc[2, 2, i], abc[3, 2, i])
  rowsums(cost*loss*f.random.area(n_rep))
}

loss.quantile <- function(n_rep, p = 0.95) {
  # n_rep is one row of final.distrib; sum the six building types, then take the 95th percentile
  quantile(rowsums(mapply(\(i) f(i, n_rep[i]), seq_along(n_rep))), probs = p)
}

library(parallel)

cl <- makeCluster(detectCores() - 1) # 15 cores
clusterExport(cl, c("f", "loss.quantile", "f.random.area", "abc", "n_sim", "rtri"))
invisible(clusterEvalQ(cl, library(Rfast))) # for `rowsums`
# loss.PEISVII.95p <- unlist(parLapply(cl, asplit(final.distrib, 1), loss.quantile)) # full calculation
# time 150 rows
system.time(loss.PEISVII.95p <- unlist(
  parLapply(cl, asplit(final.distrib[1:150,], 1), loss.quantile)
))
#>    user  system elapsed 
#>    0.03    0.00   46.66
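Once the full calculation is done, release the workers:

stopCluster(cl)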