I have code that uses six functions for a Monte Carlo simulation and then calculates the 95th percentile of the resulting probability distribution for each row of a data frame, using replicate(). My actual data frame has 11,628 rows. I used system.time() on just the first 5 rows to estimate how long the full run would take. The result was about 50 seconds for 5 rows, i.e. 10 seconds per row, so 11,628 rows works out to roughly 116,280 seconds, or more than 32 hours. Is there a faster, more efficient way to run this code?
Here is how I build my data frame final.distrib:
install.packages("partitions")
library(partitions)
# Identify building stock distribution in percentage
# Get 6 elements that sum up to 1
z <- compositions(n = 20, m = 6, include.zero = FALSE)
# Convert partition to matrix
z <- as.matrix.partition(z)
# Transpose matrix
z <- t(z)
# Multiply by 0.05 so that elements are multiples of 0.05 and each row sums to 1
z <- z * 0.05
distrib.percent <- data.frame(z)
colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
# Identify household counts without rounding
# Multiply the distribution percentages by the total number of households (971)
distrib.count <- data.frame(z*971)
colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
# Round off values, making sure each row still sums to 971
smart.round <- function(x) {
  y <- floor(x)
  indices <- tail(order(x - y), round(sum(x)) - sum(y))
  y[indices] <- y[indices] + 1
  y
}
final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round)))
colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
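Each row of final.distrib should still sum to 971 households after rounding, which can be verified with:
# Sanity check: the smart rounding must preserve the 971-household total in every row
all(rowSums(final.distrib) == 971)  # should be TRUE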
And here are the six simulation functions:
install.packages("triangle")
library(triangle)
function.LWA <- function() {
  n_sim <- 10000
  LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  LWA.loss <- rtriangle(n_sim, a = 0.0890, b = 0.6655, c = 0.3212)
  LWA.cost*LWA.area*LWA.loss
}
function.LWB <- function() {
  n_sim <- 10000
  LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038)
  LWB.cost*LWB.area*LWB.loss
}
function.SCA <- function() {
  n_sim <- 10000
  SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136)
  SCA.cost*SCA.area*SCA.loss
}
function.SCB <- function() {
  n_sim <- 10000
  SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791)
  SCB.cost*SCB.area*SCB.loss
}
function.RCA <- function() {
  n_sim <- 10000
  RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537)
  RCA.cost*RCA.area*RCA.loss
}
function.RCB <- function() {
  n_sim <- 10000
  RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
  RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519)
  RCB.cost*RCB.area*RCB.loss
}
# Function to calculate the direct loss and return the 95th percentile
calculate_percentile <- function(row) {
  set.seed(1) # for comparability across scenarios
  direct.loss <- rowSums(replicate(n = row["LWA"], function.LWA())) +
    rowSums(replicate(n = row["LWB"], function.LWB())) +
    rowSums(replicate(n = row["SCA"], function.SCA())) +
    rowSums(replicate(n = row["SCB"], function.SCB())) +
    rowSums(replicate(n = row["RCA"], function.RCA())) +
    rowSums(replicate(n = row["RCB"], function.RCB()))
  quantile(direct.loss, probs = 0.95) # Specify 95th percentile
}
# Apply the function to each row of the data frame (containing house count) and store the results
loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")
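For reference, the 50-second estimate quoted at the top came from timing the first 5 rows along these lines (the exact call is my reconstruction, not verbatim from my script):
# Time the workflow on the first 5 rows only
system.time(
  apply(head(final.distrib, 5), 1, calculate_percentile)
)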
Thank you.
replicate() vs. a simple for loop?

If you run the profiler (Rprof()) on your code, you'll see that most of the time is being spent in rlnorm(). That makes sense: you are generating many millions of lognormal values in this simulation. Just running the first 5 rows calls rlnorm() more than 4,800 times, and each of those calls generates 100,000 values. That's nearly half a billion values you are generating.
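To see this yourself, a minimal profiling sketch on the same 5-row subset used for the timing could look like this (prof.file is just a temporary file name chosen here):

Rprof(prof.file <- tempfile())
apply(head(final.distrib, 5), 1, calculate_percentile)
Rprof(NULL)
summaryRprof(prof.file)$by.self  # rlnorm() should dominate the self-time

The expensive step is that every one of the six functions draws 100,000 lognormal areas only to keep the first 10,000 that fall between 5 and 200. A cheaper alternative that targets the same truncated distribution is to sample through the inverse CDF; the sketch below reuses the meanlog, sdlog and bounds from the question:

# Draw n_sim areas directly from the lognormal truncated to [5, 200]
n_sim <- 10000
p_lo <- plnorm(5, meanlog = 3.61, sdlog = 0.82)    # CDF at the lower bound
p_hi <- plnorm(200, meanlog = 3.61, sdlog = 0.82)  # CDF at the upper bound
u <- runif(n_sim, min = p_lo, max = p_hi)          # uniforms restricted to that probability band
area <- qlnorm(u, meanlog = 3.61, sdlog = 0.82)    # invert the CDF: all values lie in [5, 200]

Each call then generates 10,000 values instead of 100,000, which removes most of the rlnorm() cost while leaving the distribution of the simulated areas unchanged.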