2

I have the following run-length encoding data.

# Example run-length encoding: `lengths` holds each run's length and `values`
# the value of that run (NA marks runs with a missing value).
df1 <- data.frame(
  lengths = c(2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L),
  values = c(10, 9, NA, 5, 4, 3, NA, 2, NA, 1, 0, NA, 0)
)
df1
# > df1
#    lengths values
# 1        2     10
# 2        3      9
# 3        2     NA
# 4        1      5
# 5        1      4
# 6        1      3
# 7        1     NA
# 8        1      2
# 9        2     NA
# 10       1      1
# 11       1      0
# 12       3     NA
# 13       1      0

Using a particular threshold (0.01), I create a new variable in this data frame.

# Flag the runs whose value is at or below the 0.01 threshold. The comparison
# already yields TRUE / FALSE / NA, so no `ifelse()` wrapper is needed.
df1$Below_Threshold <- df1$values <= 0.01
df1
# > df1
#    lengths values Below_Threshold
# 1        2     10           FALSE
# 2        3      9           FALSE
# 3        2     NA              NA
# 4        1      5           FALSE
# 5        1      4           FALSE
# 6        1      3           FALSE
# 7        1     NA              NA
# 8        1      2           FALSE
# 9        2     NA              NA
# 10       1      1           FALSE
# 11       1      0            TRUE
# 12       3     NA              NA
# 13       1      0            TRUE

I now want to perform run-length encoding on this new variable, but instead of simply returning the number of occurrences, I want to return the sum of the lengths column from the first data frame. The result should look like the sum column in the df2 data frame in the following chunk of code.

# Desired result: one row per run of `Below_Threshold`, with `sum` holding the
# total of the original `lengths` inside that run.
df2 <- data.frame(
  values = c(FALSE, NA, FALSE, NA, FALSE, NA, FALSE, TRUE, NA, TRUE),
  sum = c(5, 2, 3, 1, 1, 2, 1, 1, 3, 1)
)
df2
# > df2
#    values sum
# 1   FALSE   5
# 2      NA   2
# 3   FALSE   3
# 4      NA   1
# 5   FALSE   1
# 6      NA   2
# 7   FALSE   1
# 8    TRUE   1
# 9      NA   3
# 10   TRUE   1

Is there a nice, efficient way of achieving this result? Base R solutions are preferred, but all are welcome.

4 Answers 4

4
# Number each run of consecutive rows that share the same threshold result
# (runs of NA included), then total `lengths` within every run.
df1 %>%
  group_by(grp = consecutive_id(values <= 0.01)) %>%
  summarise(
    values = first(values) <= 0.01,
    sum = sum(lengths),
    .groups = "drop"
  )

# A tibble: 10 × 3
     grp values   sum
   <int> <lgl>  <int>
 1     1 FALSE      5
 2     2 NA         2
 3     3 FALSE      3
 4     4 NA         1
 5     5 FALSE      1
 6     6 NA         2
 7     7 FALSE      1
 8     8 TRUE       1
 9     9 NA         3
10    10 TRUE       1

If that feels repetitive, use:

# Compute the threshold flag once, number each run of identical flags
# (runs of NA included), and total `lengths` per run.
df1 %>%
  mutate(values = values <= 0.01) %>%
  group_by(grp = consecutive_id(values)) %>%
  summarise(
    values = first(values),
    sum = sum(lengths),
    .groups = "drop"
  )

# A tibble: 10 × 3
     grp values   sum
   <int> <lgl>  <int>
 1     1 FALSE      5
 2     2 NA         2
 3     3 FALSE      3
 4     4 NA         1
 5     5 FALSE      1
 6     6 NA         2
 7     7 FALSE      1
 8     8 TRUE       1
 9     9 NA         3
10    10 TRUE       1
Sign up to request clarification or add additional context in comments.

Comments

3
# Run-length encode the threshold flag.
rle1 <- rle(df1$Below_Threshold)

# z: row index where each run ends; y: row index where each run starts.
z <- cumsum(rle1$lengths)
y <- z - rle1$lengths + 1L

df2 <- data.frame(
  values = rle1$values,
  # seq_along() (not 1:length(z)) stays correct when there are no runs, and
  # vapply() guarantees a vector result. The zero-length sum serves as the
  # template so the result keeps whatever type sum() returns for `lengths`.
  sum    = vapply(
    seq_along(z),
    function(i) sum(df1$lengths[y[i]:z[i]]),
    FUN.VALUE = sum(df1$lengths[0L])
  )
)

df2
   values sum
1   FALSE   5
2      NA   2
3   FALSE   3
4      NA   1
5   FALSE   1
6      NA   2
7   FALSE   1
8    TRUE   1
9      NA   3
10   TRUE   1

Comments

3

We can achieve this in base R as follows:

Use rle to create a run-length encoding and expand it across the whole data frame to create a group column. Then use aggregate to collapse the output per group, taking the sum of the lengths values.

# Encode the runs of the threshold flag and tag every row with its run number.
rl <- rle(df1$Below_Threshold)
df2 <- df1
df2$group <- rep(seq_along(rl$lengths), times = rl$lengths)
# Total `lengths` per run and place each run's flag value alongside.
cbind(values = rl$values, aggregate(lengths ~ group, data = df2, FUN = sum))

#   values group lengths
#1   FALSE     1       5
#2      NA     2       2
#3   FALSE     3       3
#4      NA     4       1
#5   FALSE     5       1
#6      NA     6       2
#7   FALSE     7       1
#8    TRUE     8       1
#9      NA     9       3
#10   TRUE    10       1

Comments

1
# Sample the running total of `df1$lengths` at the last row of each run;
# successive differences of those samples are the per-run sums.
# Inside with(), `lengths` and `values` refer to the rle components.
with(rle(df1$Below_Threshold), {
  run_ends <- cumsum(lengths)
  data.frame(values, sum = diff(c(0L, cumsum(df1$lengths)[run_ends])))
})
#>    values sum
#> 1   FALSE   5
#> 2      NA   2
#> 3   FALSE   3
#> 4      NA   1
#> 5   FALSE   1
#> 6      NA   2
#> 7   FALSE   1
#> 8    TRUE   1
#> 9      NA   3
#> 10   TRUE   1

Benchmarking.

Functions

library(dplyr)

# dplyr approach: sum `lengths` over each run of consecutive rows that share
# the same threshold result (TRUE, FALSE or NA).
#
# df1:       data frame with a numeric `values` column and a `lengths` column.
# threshold: cut-off compared against `values` with `<=`; defaults to 0.01,
#            the value used throughout this example.
#
# Returns a tibble with one row per run: `grp` (run number), `values`
# (the run's threshold result) and `sum` (total of `lengths` in the run).
f1 <- function(df1, threshold = 0.01) {
  df1 %>%
    # `.env$threshold` pins the lookup to the function argument even if `df1`
    # happens to contain a column called `threshold`.
    group_by(grp = consecutive_id(values <= .env$threshold)) %>%
    summarise(
      values = first(values) <= .env$threshold,
      sum = sum(lengths),
      .groups = "drop"
    )
}

# Base R approach: run-length encode `Below_Threshold`, then sum the slice of
# `lengths` that belongs to each run.
#
# df1: data frame with a `Below_Threshold` column and a `lengths` column.
#
# Returns a data frame with one row per run: `values` (the run's flag) and
# `sum` (total of `lengths` in the run). A zero-row input gives a zero-row
# result.
f2 <- function(df1) {
  rle1 <- rle(df1$Below_Threshold)

  # Row index where each run ends and where it starts.
  run_end <- cumsum(rle1$lengths)
  run_start <- run_end - rle1$lengths + 1L

  data.frame(
    values = rle1$values,
    # seq_along() (not 1:length()) stays correct when there are no runs, and
    # vapply() guarantees a vector result. The zero-length sum serves as the
    # template so the result keeps whatever type sum() returns for `lengths`.
    sum    = vapply(
      seq_along(run_end),
      function(i) sum(df1$lengths[run_start[i]:run_end[i]]),
      FUN.VALUE = sum(df1$lengths[0L])
    )
  )
}

# Base R approach using aggregate(): tag every row with the number of the run
# it belongs to, then total `lengths` per run.
#
# df1: data frame with a `Below_Threshold` column and a `lengths` column.
#
# Returns a data frame with one row per run: `values` (the run's flag),
# `group` (run number) and `lengths` (total of `lengths` in the run).
f3 <- function(df1) {
  runs <- rle(df1$Below_Threshold)
  tagged <- df1
  tagged$group <- rep(seq_along(runs$lengths), times = runs$lengths)
  totals <- aggregate(lengths ~ group, data = tagged, FUN = sum)
  cbind(values = runs$values, totals)
}

# Vectorised base R approach: sample the running total of `lengths` at the
# last row of each run; successive differences of those samples are the
# per-run sums.
#
# df1: data frame with a `Below_Threshold` column and a `lengths` column.
#
# Returns a data frame with one row per run: `values` (the run's flag) and
# `sum` (total of `lengths` in the run).
f4 <- function(df1) {
  runs <- rle(df1$Below_Threshold)
  run_ends <- cumsum(runs$lengths)
  totals_at_ends <- cumsum(df1$lengths)[run_ends]
  data.frame(values = runs$values, sum = diff(c(0L, totals_at_ends)))
}

Timings:

# Time the four candidate implementations on the example data.
bench::mark(
  Onyambu = f1(df1),
  Edward = f2(df1),
  RonakShah = f3(df1),
  jblood94 = f4(df1),
  # The candidates return differently shaped results (f1 yields an extra
  # `grp` column, f3 yields `group`/`lengths` columns), so bench's
  # result-equality check is switched off.
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 Onyambu      2.44ms   2.73ms      344.    2.68MB     15.6
#> 2 Edward      139.1µs  156.5µs     6029.    4.12MB     18.9
#> 3 RonakShah   650.7µs  730.3µs     1321.  515.07KB     16.9
#> 4 jblood94    125.6µs    143µs     6675.        0B     21.2

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.