Using Run-Length Encoding and Generating Sums

Question

I have the following run-length encoding data.

# Example input: one row per run, with `lengths` holding the run length and
# `values` holding the run's value (some of which are NA).
df1 <- structure(list(lengths = c(2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L), values = c(10, 9, NA, 5, 4, 3, NA, 2, NA, 1, 0, NA, 0)), row.names = c(NA, -13L), class = "data.frame")
df1
# > df1
#    lengths values
# 1        2     10
# 2        3      9
# 3        2     NA
# 4        1      5
# 5        1      4
# 6        1      3
# 7        1     NA
# 8        1      2
# 9        2     NA
# 10       1      1
# 11       1      0
# 12       3     NA
# 13       1      0

Using a particular threshold (0.01), I create a new variable in this data frame.

# Flag rows whose value is at or below the 0.01 threshold. The comparison
# itself already yields TRUE / FALSE / NA, so no ifelse() wrapper is needed.
df1$Below_Threshold <- df1$values <= 0.01
df1
# > df1
#    lengths values Below_Threshold
# 1        2     10           FALSE
# 2        3      9           FALSE
# 3        2     NA              NA
# 4        1      5           FALSE
# 5        1      4           FALSE
# 6        1      3           FALSE
# 7        1     NA              NA
# 8        1      2           FALSE
# 9        2     NA              NA
# 10       1      1           FALSE
# 11       1      0            TRUE
# 12       3     NA              NA
# 13       1      0            TRUE

I now want to perform run-length encoding on this new variable, but instead of simply returning the number of occurrences, I want to return the sum of the lengths column from the first data frame. The result should look like the sum column in the df2 data frame in the following chunk of code.

# Desired result: one row per run of `Below_Threshold`, where `sum` is the
# total of `df1$lengths` over the rows belonging to that run.
df2 <- structure(list(values = c(FALSE, NA, FALSE, NA, FALSE, NA, FALSE, TRUE, NA, TRUE), sum = c(5, 2, 3, 1, 1, 2, 1, 1, 3, 1)), class = "data.frame", row.names = c(NA, -10L))
df2
# > df2
#    values sum
# 1   FALSE   5
# 2      NA   2
# 3   FALSE   3
# 4      NA   1
# 5   FALSE   1
# 6      NA   2
# 7   FALSE   1
# 8    TRUE   1
# 9      NA   3
# 10   TRUE   1

Is there a nice, efficient way of achieving this result? base R solutions are preferred but all are welcome.

Onyambu · Accepted Answer · 2024-04-11 02:28:25Z

# Give consecutive rows with the same threshold flag a shared id (`grp`),
# then report the flag and the total of `lengths` for each id.
# NOTE(review): consecutive_id() presumably needs dplyr >= 1.1.0 - confirm.
df1 %>%
  group_by(grp = consecutive_id(values <= 0.01)) %>%
  summarise(
    values = first(values) <= 0.01,
    sum = sum(lengths)
  )

# A tibble: 10 × 3
     grp values   sum
   <int> <lgl>  <int>
 1     1 FALSE      5
 2     2 NA         2
 3     3 FALSE      3
 4     4 NA         1
 5     5 FALSE      1
 6     6 NA         2
 7     7 FALSE      1
 8     8 TRUE       1
 9     9 NA         3
10    10 TRUE       1

If that feels repetitive, use:

# Variant that evaluates the threshold comparison only once: overwrite
# `values` with the flag first, then group on runs of that flag.
df1 %>%
  mutate(values = values <= 0.01) %>%
  group_by(grp = consecutive_id(values)) %>%
  summarise(
    values = first(values),
    sum = sum(lengths)
  )

# A tibble: 10 × 3
     grp values   sum
   <int> <lgl>  <int>
 1     1 FALSE      5
 2     2 NA         2
 3     3 FALSE      3
 4     4 NA         1
 5     5 FALSE      1
 6     6 NA         2
 7     7 FALSE      1
 8     8 TRUE       1
 9     9 NA         3
10    10 TRUE       1

Edward · Accepted Answer · 2024-04-11 02:23:37Z

3

# Run-length encode the flag column; every run becomes one output row.
# Note that rle() starts a new run at every NA, so adjacent NAs are reported
# as separate runs.
rle1 <- rle(df1$Below_Threshold)

# z: index of the last row of each run; y: index of the first row of each run.
z <- cumsum(rle1$lengths)
y <- c(1, (z + 1))[-(length(z) + 1)]

df2 <- data.frame(
  values = rle1$values,
  # seq_along() + vapply() (rather than 1:length() + sapply()) so that an
  # input with zero runs yields an empty column instead of an error. The
  # FUN.VALUE template takes its type from `lengths`, so integer input still
  # gives integer sums.
  sum = vapply(
    seq_along(z),
    function(i) sum(df1$lengths[y[i]:z[i]]),
    FUN.VALUE = sum(df1$lengths[0L])
  )
)

df2
   values sum
1   FALSE   5
2      NA   2
3   FALSE   3
4      NA   1
5   FALSE   1
6      NA   2
7   FALSE   1
8    TRUE   1
9      NA   3
10   TRUE   1

answered Apr 11, 2024 at 2:23

Edward

22.2k3 gold badges18 silver badges37 bronze badges

Comments

Ronak Shah · Accepted Answer · 2024-04-11 05:37:06Z

3

We can achieve this in base R using the following:

Use rle to compute the run-length encoding and expand it across the rows of the data frame to create a group column. Then use aggregate to collapse the rows per group, summing the lengths column.

# Encode runs of the flag, tag every row of df1 with its run number, then
# total `lengths` per run number and attach each run's flag value.
rl <- rle(df1$Below_Threshold)
df2 <- transform(
  df1,
  group = rep(seq_along(rl$values), times = rl$lengths)
)
cbind(
  values = rl$values,
  aggregate(lengths ~ group, data = df2, FUN = sum)
)

#   values group lengths
#1   FALSE     1       5
#2      NA     2       2
#3   FALSE     3       3
#4      NA     4       1
#5   FALSE     5       1
#6      NA     6       2
#7   FALSE     7       1
#8    TRUE     8       1
#9      NA     9       3
#10   TRUE    10       1

answered Apr 11, 2024 at 5:37

Ronak Shah

391k20 gold badges173 silver badges237 bronze badges

Comments

jblood94 · Accepted Answer · 2024-04-11 20:13:06Z

# Fully vectorised: take the running total of df1$lengths at the last row of
# each run, then difference those totals to get the per-run sums.
# local() keeps the helper variables out of the workspace and avoids the
# ambiguity between the rle component `lengths` and the column df1$lengths.
local({
  runs <- rle(df1$Below_Threshold)
  run_ends <- cumsum(runs$lengths)
  totals_at_run_ends <- cumsum(df1$lengths)[run_ends]
  data.frame(
    values = runs$values,
    sum = diff(c(0L, totals_at_run_ends))
  )
})
#>    values sum
#> 1   FALSE   5
#> 2      NA   2
#> 3   FALSE   3
#> 4      NA   1
#> 5   FALSE   1
#> 6      NA   2
#> 7   FALSE   1
#> 8    TRUE   1
#> 9      NA   3
#> 10   TRUE   1

Benchmarking.

Functions

library(dplyr)

# Onyambu's dplyr approach, wrapped for benchmarking.
#
# df1: data frame with numeric `values` and `lengths` columns.
# Returns a tibble with one row per run of the flag `values <= 0.01`, holding
# the run id (`grp`), the flag (`values`) and the total of `lengths` (`sum`).
# NOTE(review): consecutive_id() appears to keep adjacent NAs in one run,
# whereas rle() (used by f2-f4) splits them - confirm before comparing results
# on such input.
f1 <- function(df1) {
  by_run <- group_by(df1, grp = consecutive_id(values <= 0.01))
  summarise(by_run, values = first(values) <= 0.01, sum = sum(lengths))
}

# Edward's approach: run-length encode `Below_Threshold`, derive the first and
# last row index of every run, and sum `lengths` over each index range.
#
# df1: data frame with a `lengths` column and a `Below_Threshold` column.
# Returns a data frame with one row per run: `values` (the run's flag) and
# `sum` (total of `lengths` over the run). rle() starts a new run at every NA,
# so adjacent NAs are reported as separate runs.
f2 <- function(df1) {
  rle1 <- rle(df1$Below_Threshold)

  z <- cumsum(rle1$lengths)             # last row index of each run
  y <- c(1, (z + 1))[-(length(z) + 1)]  # first row index of each run

  data.frame(
    values = rle1$values,
    # seq_along() + vapply() instead of 1:length() + sapply(): with zero runs,
    # 1:0 would iterate over c(1, 0) and fail. The FUN.VALUE template takes its
    # type from `lengths`, so integer input still produces integer sums.
    sum = vapply(
      seq_along(z),
      function(i) sum(df1$lengths[y[i]:z[i]]),
      FUN.VALUE = sum(df1$lengths[0L])
    )
  )
}

# Ronak Shah's approach: tag every row with its run number, then aggregate.
#
# df1: data frame with a `lengths` column and a `Below_Threshold` column.
# Returns a data frame with columns `values` (the run's flag), `group` (run
# number) and `lengths` (total of `lengths` over the run).
f3 <- function(df1) {
  runs <- rle(df1$Below_Threshold)
  tagged <- transform(
    df1,
    group = rep(seq_along(runs$values), times = runs$lengths)
  )
  per_run <- aggregate(lengths ~ group, data = tagged, FUN = sum)
  cbind(values = runs$values, per_run)
}

# jblood94's vectorised approach: read the running total of `lengths` at the
# last row of each run and difference those totals to get per-run sums.
#
# df1: data frame with a `lengths` column and a `Below_Threshold` column.
# Returns a data frame with one row per run: `values` (the run's flag) and
# `sum` (total of `lengths` over the run).
f4 <- function(df1) {
  runs <- rle(df1$Below_Threshold)
  run_ends <- cumsum(runs$lengths)
  totals_at_run_ends <- cumsum(df1$lengths)[run_ends]
  data.frame(
    values = runs$values,
    sum = diff(c(0L, totals_at_run_ends))
  )
}

Timings:

# Time the four approaches on the example data.
# check = FALSE: the results are not identical across approaches (f1 returns a
# tibble with an extra `grp` column, f3 names its columns `group`/`lengths`),
# so bench::mark()'s result-equality check has to be turned off.
bench::mark(
  Onyambu = f1(df1),
  Edward = f2(df1),
  RonakShah = f3(df1),
  jblood94 = f4(df1),
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 Onyambu      2.44ms   2.73ms      344.    2.68MB     15.6
#> 2 Edward      139.1µs  156.5µs     6029.    4.12MB     18.9
#> 3 RonakShah   650.7µs  730.3µs     1321.  515.07KB     16.9
#> 4 jblood94    125.6µs    143µs     6675.        0B     21.2

Collectives™ on Stack Overflow

Using Run-Length Encoding and Generating Sums

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related