
I have this code where I use six functions for a Monte Carlo simulation and then calculate the 95th percentile of the resulting probability distribution for each row of a data frame, using replicate(). My actual data frame has 11,628 rows. I used system.time() to estimate how long it would take on just 5 rows. The result is about 50 seconds for 5 rows, so it follows that the full 11,628 rows would take 32+ hours. Is there a faster, more efficient way to run this code?

Here is the code that builds my data frame final.distrib.

install.packages("partitions")
library(partitions)

# Identify building stock distribution in percentage
# Get 6 elements that sum up to 1
z <- compositions(n = 20, m = 6, include.zero = FALSE)

# Convert partition to matrix
z <- as.matrix.partition(z)

# Transpose matrix
z <- t(z)

# Multiply by 0.05 so that elements take any value from 0 to 1 by incremental step of 0.05
z <- z * 0.05
distrib.percent <- data.frame(z)
colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

# Identify household count without rounding off
# Multiply the percentages of distribution with the total number of households (971 households)
distrib.count <- data.frame(z*971)
colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

# Round off values, making sure each row sums up to 971
smart.round <- function(x) {
  y <- floor(x)                                         # round everything down first
  indices <- tail(order(x - y), round(sum(x)) - sum(y)) # entries with the largest fractional parts
  y[indices] <- y[indices] + 1                          # bump those up so the row total is preserved
  y
}

final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round)))
colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

And here is the code with six functions:

install.packages("triangle")
library(triangle)

function.LWA <- function() {
  n_sim <- 10000
  LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29) 
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) 
  LWA.loss <- rtriangle(n_sim, a = 0.0890, b = 0.6655, c = 0.3212)
  LWA.cost*LWA.area*LWA.loss
}

function.LWB <- function() {
  n_sim <- 10000
  LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038)
  LWB.cost*LWB.area*LWB.loss
}

function.SCA <- function() {
  n_sim <- 10000
  SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136)
  SCA.cost*SCA.area*SCA.loss
}

function.SCB <- function() {
  n_sim <- 10000
  SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791)
  SCB.cost*SCB.area*SCB.loss
}

function.RCA <- function() {
  n_sim <- 10000
  RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537)
  RCA.cost*RCA.area*RCA.loss
}

function.RCB <- function() {
  n_sim <- 10000
  RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519)
  RCB.cost*RCB.area*RCB.loss
}

# Function to calculate the direct loss and return the 95th percentile
calculate_percentile <- function(row) {
  set.seed(1) # for comparability across scenarios
  direct.loss <- (rowSums(replicate(n=row["LWA"], function.LWA()))) +
                 (rowSums(replicate(n=row["LWB"], function.LWB()))) +
                 (rowSums(replicate(n=row["SCA"], function.SCA()))) +
                 (rowSums(replicate(n=row["SCB"], function.SCB()))) +
                 (rowSums(replicate(n=row["RCA"], function.RCA()))) +
                 (rowSums(replicate(n=row["RCB"], function.RCB())))
  quantile(direct.loss, probs = 0.95) # Specify 95th percentile
}

# Apply the function to each row of the data frame (containing house count) and store the results
loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")

Thank you.

  • Have you tried running a timing test for replicate() vs. a simple for loop? Commented Sep 14, 2024 at 17:15
  • If you run a profiler (e.g. Rprof()) on your code, you'll see that most of the time is being spent in rlnorm(). That makes sense: you are generating many millions of lognormals in your simulation. Just running the first 5 rows calls it more than 4,800 times, and each of those calls generates 100,000 values. That's nearly half a billion values in total (a minimal profiling sketch follows these comments). Commented Sep 14, 2024 at 18:11
  • Do you even have enough memory? Commented Sep 15, 2024 at 16:35
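Following up on the profiling comment above, a minimal sketch of how to confirm where the time goes, using the question's own objects and a hypothetical output file name, could be:

# profile the percentile calculation on just a couple of rows
Rprof("mc_profile.out")
test <- apply(final.distrib[1:2, ], 1, calculate_percentile)
Rprof(NULL)
# time spent per function; rlnorm() is expected to dominate self.time
head(summaryRprof("mc_profile.out")$by.self)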

1 Answer


A couple of improvements, along with parallel processing, can get it down to about an hour on 16 cores.

First, instead of using replicate, draw the total number of cost/loss/area samples needed for each row in one go and put them into a matrix before taking row sums.
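To make that concrete, here is a minimal sketch for one building type (LWA), assuming the triangle package is loaded, n_sim = 10000 as in the question, and a hypothetical per-row house count n_rep; replicate(n_rep, function.LWA()) followed by rowSums collapses into one batch of draws reshaped into an n_sim-by-n_rep matrix:

n_rep <- 50  # hypothetical LWA house count taken from one row of final.distrib
n <- n_sim * n_rep
cost <- rtriangle(n, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29)
loss <- rtriangle(n, a = 0.0890, b = 0.6655, c = 0.3212)
x <- rlnorm(round(n * 1.1), meanlog = 3.61, sdlog = 0.82)  # generous oversample, just for this sketch
area <- x[x >= 5 & x <= 200][1:n]
# each column is one simulated house; rowSums gives the per-simulation total for this type
LWA.total <- rowSums(matrix(cost * area * loss, nrow = n_sim, ncol = n_rep))

Doing the same for the other five types and adding the six resulting vectors reproduces the original direct.loss vector with far fewer function calls.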

Second, you are generating far more lognormal samples than the truncation requires. A couple of quick calculations show that oversampling by only about 3% is enough to virtually guarantee sufficient samples between 5 and 200:

# probability that a sample falls within [5, 200]
(p <- diff(plnorm(c(5, 200), 3.61, 0.82)))
#> [1] 0.9728997
# minimum number of in-range samples needed for any single building type in any row
min_n <- min(final.distrib)*1e4
# upper bound probability of not generating enough samples within [5, 200]
# over the entire simulation
-expm1(pbinom(min_n, 1.03*min_n, p, FALSE, TRUE)*6*nrow(final.distrib))
#> [1] 1.550331e-13

The full code, timed below on the first 150 rows:

n_sim <- 1e4
# rows = (min, max, mode), columns = (cost, loss), slices = the six building types
abc <- array(c(0.9*8407.29, 1.1*8407.29, 8407.29,
               0.0890, 0.6655, 0.3212,
               0.9*10328.43, 1.1*10328.43, 10328.43,
               0.0423, 0.5572, 0.2038,
               0.9*12055.57, 1.1*12055.57, 12055.57,
               0.0453, 0.5665, 0.2136,
               0.9*17935.86, 1.1*17935.86, 17935.86,
               0.0645, 0.6361, 0.2791,
               0.9*19363.57, 1.1*19363.57, 19363.57,
               0.0263, 0.5009, 0.1537,
               0.9*25182.71, 1.1*25182.71, 25182.71,
               0.0583, 0.6078, 0.2519), c(3, 2, 6))

rtri <- function(n, a, b, c) {
  # vectorised inverse-CDF sampler for the triangular distribution (min a, max b, mode c)
  fc <- (c - a)/(b - a)
  U <- runif(n)
  blna <- U < fc
  r <- numeric(n)
  r[blna] <- a + sqrt(U[blna]*(b - a)*(c - a))           # left branch of the CDF
  r[!blna] <- b - sqrt((1 - U[!blna])*(b - a)*(b - c))   # right branch
  r
}

f.random.area <- function(n_rep) {
  # oversample the lognormal by ~3%, keep values within [5, 200],
  # and return an n_sim x n_rep matrix of floor areas
  n <- n_sim*n_rep
  x <- rlnorm(n*1.029, 3.61, 0.82)
  matrix(x[abs(x - 102.5) < 97.5][1:n], n_sim, n_rep)  # abs(x - 102.5) < 97.5 is 5 < x < 200
}

f <- function(i, n_rep) {
  # per-simulation total loss across the n_rep houses of building type i
  n <- n_sim*n_rep
  cost <- rtri(n, abc[1, 1, i], abc[2, 1, i], abc[3, 1, i])
  loss <- rtri(n, abc[1, 2, i], abc[2, 2, i], abc[3, 2, i])
  rowsums(cost*loss*f.random.area(n_rep))
}

loss.quantile <- function(n_rep, p = 0.95) {
  # n_rep is one row of final.distrib; sum the six building types, then take the 95th percentile
  quantile(rowsums(mapply(\(i) f(i, n_rep[i]), seq_along(n_rep))), probs = p)
}

library(parallel)

cl <- makeCluster(detectCores() - 1) # 15 cores
clusterExport(cl, c("f", "loss.quantile", "f.random.area", "abc", "n_sim", "rtri"))
invisible(clusterEvalQ(cl, library(Rfast))) # for `rowsums`
# loss.PEISVII.95p <- unlist(parLapply(cl, asplit(final.distrib, 1), loss.quantile)) # full calculation
# time 150 rows
system.time(loss.PEISVII.95p <- unlist(
  parLapply(cl, asplit(final.distrib[1:150,], 1), loss.quantile)
))
#>    user  system elapsed 
#>    0.03    0.00   46.66
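Once the full calculation is done, release the workers:

stopCluster(cl)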