4

This question is about how to generate frequency transition tables from longitudinal data in the long format using R base functions or commonly used packages such as dplyr. Consider the longitudinal data

# Toy longitudinal data in long format: each of the four subjects contributes
# a "Start" row followed by an "End" row.
id <- rep(c(1, 2, 3, 4), each = 2)
period <- rep(c("Start", "End"), times = 4)
state <- c(
  "C", "A", # id 1
  "A", "A", # id 2
  "B", "A", # id 3
  "C", "A"  # id 4
)
df <- data.frame(id = id, state = state, period = period)
df

  id state period
1  1      C  Start
2  1      A    End
3  2      A  Start
4  2      A    End
5  3      B  Start
6  3      A    End
7  4      C  Start
8  4      A    End

and the expected output

    transition freq
1     A to A    1
2     A to B    0
3     A to C    0
4     B to B    0
5     B to A    1
6     B to C    0
7     C to C    0
8     C to A    2
9     C to B    0

I can generate the above output using the function statetable.msm in the msm package. However, I would like to know if this could be generated by base functions in R or other packages such as dplyr. Help is much appreciated!

7
  • Is it always 2 rows per ID? Commented Nov 19, 2024 at 13:14
  • 1
    @zx8754 It's possible to have many rows per ID, but in this particular case it's 2 rows per ID. Thanks! Commented Nov 19, 2024 at 13:17
  • "start A end B" same as "start B end A" ? Commented Nov 19, 2024 at 13:24
  • @zx8754 Not same. Commented Nov 19, 2024 at 13:25
  • If there are many rows per ID, how do you know which Start matches with which End? Commented Nov 19, 2024 at 13:29

6 Answers 6

6

A solution entirely within base R could be something like:

# Collapse each id's states (in row order) into a single "X to Y" label.
transitions_by_id <- vapply(
  split(df$state, df$id),
  paste,
  character(1),
  collapse = " to "
)

# Every ordered pair of observed states, sorted, so that transitions which
# never occur still show up in the table with a count of 0.
observed_states <- unique(df$state)
all_transitions <- sort(
  c(outer(observed_states, observed_states, FUN = paste, sep = " to "))
)

# Tabulate against the full set of levels and return a two-column data frame.
as.data.frame(
  table(transition = factor(transitions_by_id, levels = all_transitions)),
  responseName = "freq"
)
#>   transition freq
#> 1     A to A    1
#> 2     A to B    0
#> 3     A to C    0
#> 4     B to A    1
#> 5     B to B    0
#> 6     B to C    0
#> 7     C to A    2
#> 8     C to B    0
#> 9     C to C    0
Sign up to request clarification or add additional context in comments.

Comments

3

You can try

# Sorted distinct states define the full Start x End grid of combinations.
states <- sort(unique(df$state))
state_grid <- expand.grid(state.Start = states, state.End = states)

# Long -> wide: one row per id with columns state.Start and state.End.
wide <- reshape(df, idvar = "id", timevar = "period", direction = "wide")

# Outer join on the shared state columns; combinations that no id exhibits
# are kept with id = NA.
all_pairs <- merge(state_grid, wide, all = TRUE)

# Gather the ids of each combination into a list column. na.pass keeps the
# rows whose id is NA instead of dropping them.
ids_by_pair <- aggregate(id ~ ., all_pairs, list, na.action = na.pass)

# lengths() alone would count a lone NA as 1, so multiply by !is.na(id) to
# turn the count of unobserved combinations into 0.
transform(ids_by_pair, freq = lengths(id) * !is.na(id))

gives

  state.Start state.End   id freq
1           A         A    2    1
2           B         A    3    1
3           C         A 1, 4    2
4           A         B   NA    0
5           B         B   NA    0
6           C         B   NA    0
7           A         C   NA    0
8           B         C   NA    0
9           C         C   NA    0

Comments

3

You could build all permutations (with repetition) of the states and merge them with a table of each id's first and last state pasted together (that is how I interpreted your comment).

# Console prompts removed so the snippet can be pasted and run as-is; the
# printed result is shown below as `#>` comments.
data.frame(
  # All ordered pairs of states (permutations with repetition), each pasted
  # into an "X to Y" label. `repetition` is spelled out in full rather than
  # relying on partial argument matching.
  Var1 = RcppAlgos::permuteGeneral(sort(unique(df$state)), 2, repetition = TRUE,
                                   FUN = paste, collapse = ' to ') |>
    do.call(what = 'rbind')
) |>
  merge(
    # Observed transitions: first and last state of each id, tabulated.
    split(df$state, df$id) |> lapply(\(x) toString(x[c(1, length(x))])) |>
      sub(', ', ' to ', x = _) |> table() |> as.data.frame(),
    all = TRUE
  ) |>
  # Pairs that never occur come out of the outer join as NA; count them as 0.
  transform(Freq = replace(Freq, is.na(Freq), 0)) |>
  setNames(c('transition', 'freq'))
#>   transition freq
#> 1     A to A    1
#> 2     A to B    0
#> 3     A to C    0
#> 4     B to A    1
#> 5     B to B    0
#> 6     B to C    0
#> 7     C to A    2
#> 8     C to B    0
#> 9     C to C    0

Comments

2

With tidyverse, using left_join on the crossing states

library(dplyr)
library(tidyr)

# Every ordered (Start, End) pair of observed states.
all_pairs <- crossing(Start = df$state, End = df$state)

# One row per id with its Start and End state side by side.
wide <- pivot_wider(df, names_from = period, values_from = state)

# `id * 0 + n()` yields the group size for observed pairs and NA for pairs
# that matched no id; distinct() then collapses the repeated rows per pair.
all_pairs %>%
  left_join(wide) %>%
  reframe(freq = id * 0 + n(), .by = c(Start, End)) %>%
  distinct()

output

# A tibble: 9 × 3
  Start End    freq
  <chr> <chr> <dbl>
1 A     A         1
2 A     B        NA
3 A     C        NA
4 B     A         1
5 B     B        NA
6 B     C        NA
7 C     A         2
8 C     B        NA
9 C     C        NA

or, to get the complete desired format

# Join every ordered (Start, End) pair of observed states to the wide data
# (one row per id), then count ids per pair.
left_join(crossing(Start = df$state, End = df$state),
          pivot_wider(df, names_from = period, values_from = state)) %>%
  # `id * 0 + n()` is the group size for observed pairs and NA for pairs that
  # matched no id; the NA is then replaced by 0.
  mutate(freq = id * 0 + n(),
         freq = replace(freq, is.na(freq), 0), .by = c(Start, End)) %>%
  # paste() is already vectorised, so no element-wise map is needed.
  mutate(transition = paste(Start, "to", End), .before = 1) %>%
  select(-c(Start, End, id)) %>%
  # Observed pairs appear once per id; keep a single row per transition.
  distinct()
Joining with `by = join_by(Start, End)`
# A tibble: 9 × 2
  transition  freq
  <chr>      <dbl>
1 A to A         1
2 A to B         0
3 A to C         0
4 B to A         1
5 B to B         0
6 B to C         0
7 C to A         2
8 C to B         0
9 C to C         0

Comments

1

An alternative is to store the data in a matrix with one row per unique id and one column per unique period, i.e. of dimension length(unique(id)) x length(unique(period)):

# Distinct ids and periods become the row and column labels of the matrix.
ids <- unique(df$id)
periods <- unique(df$period)

# Start from an all-NA matrix with named dimensions ...
mat <- matrix(
  NA,
  nrow = length(ids),
  ncol = length(periods),
  dimnames = list(id = ids, period = periods)
)

# ... and fill it via a two-column character index matrix, which addresses
# cells by their (row name, column name) pair.
mat[cbind(df$id, df$period)] <- df$state
#mat
#   period
#id  Start End
#  1 "C"   "A"
#  2 "A"   "A"
#  3 "B"   "A"
#  4 "C"   "A"

And then, tabulate:

# Use the same level set for both periods so the cross-tabulation is square
# and includes state pairs that never occur.
state_levels <- unique(df$state)
start_state <- factor(mat[, "Start"], levels = state_levels)
end_state <- factor(mat[, "End"], levels = state_levels)

# deparse.level = 0 keeps the table dimensions unnamed, so the data frame
# columns come out as Var1 / Var2 / Freq.
as.data.frame(table(start_state, end_state, deparse.level = 0))
#  Var1 Var2 Freq
#1    C    C    0
#2    A    C    0
#3    B    C    0
#4    C    A    2
#5    A    A    1
#6    B    A    1
#7    C    B    0
#8    A    B    0
#9    B    B    0

and format as needed afterwards.

Comments

1

A (detailed) solution with data.table :)

Code:

library(data.table)
library(stringr)

# define the example
# define the example
dt_example <- data.table(
  id = c(1, 1, 2, 2, 3, 3, 4, 4),
  state = c("C", "A", "A", "A", "B", "A", "C", "A"),
  period = rep(c("Start", "End"), 4)
)

# define the target output structure: every ordered pair of observed states,
# derived from the data instead of hard-coding "A", "B", "C"
states <- sort(unique(dt_example$state))
dt_target <- data.table(
  Start.Point = rep(states, each = length(states)),
  End.Point = rep(states, times = length(states))
)
dt_target[, Path := str_c(Start.Point, " -> ", End.Point)]

# work the data: pair each id's Start state with its End state
dt_agregate <- merge(
  dt_example[period == "Start", .(id, Start.Point = state)],
  dt_example[period == "End", .(id, End.Point = state)],
  by = "id",
  allow.cartesian = TRUE
)
dt_agregate[, Path := str_c(Start.Point, " -> ", End.Point)]

# attach the aggregation results; all = TRUE keeps paths that never occur
dt_target <- merge(
  dt_target[, .(Path)],
  dt_agregate[, .(freq = .N), by = .(Path)],
  by = "Path",
  all = TRUE
)

# replace NA by 0 (in place) for the paths that never occur
setnafill(dt_target, fill = 0, cols = "freq")
dt_target

Output:

Key: <Path>
     Path  freq
   <char> <int>
1: A -> A     1
2: A -> B     0
3: A -> C     0
4: B -> A     1
5: B -> B     0
6: B -> C     0
7: C -> A     2
8: C -> B     0
9: C -> C     0

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.