72

I'm trying to create separate data.frame objects based on levels of a factor. So if I have:

# Example data: 25 rows of random x and y values, with the five-level factor g
# cycling through A..E down the rows.
df <- data.frame(
  x = rnorm(25),
  y = rnorm(25),
  g = factor(rep(LETTERS[1:5], times = 5))
)

How can I split df into separate data.frames for each level of g, each containing the corresponding x and y values? I can get most of the way there using split(df, df$g), but I'd like each level of the factor to have its own data.frame.

What's the best way to do this?

0

5 Answers 5

111

I think that split does exactly what you want.

Notice that X is a list of data frames, as seen by str:

X <- split(df, df$g)
str(X)

If you want individual objects named after the groups in g, you can assign the elements of X (the list returned by split) to objects of those names, though this seems like extra work when you can simply index the data frames in the list that split creates.

# Drop the grouping column g from every piece; it is constant within a piece
# and no longer needed. Iterating over X itself (rather than over
# seq_along(X)) keeps the group names on the result, and selecting x and y by
# name does not depend on the column order of df.
Y <- lapply(X, function(piece) piece[, c("x", "y")])

# Assign the data frames in the list Y to individual objects, looking each
# one up by its group name instead of by position.
A <- Y[["A"]]
B <- Y[["B"]]
C <- Y[["C"]]
D <- Y[["D"]]
E <- Y[["E"]]

# Or use lapply with assign to create every object in one pass.
# The object names are taken from the split list X rather than a hard-coded
# vector, so this keeps working if the levels of g change. invisible()
# suppresses the list of assigned values that lapply would otherwise print.
invisible(lapply(seq_along(Y), function(i) {
  assign(names(X)[i], Y[[i]], envir = .GlobalEnv)
}))

Edit: Even better than using lapply to assign to the global environment, use list2env:

# Name the pieces after the groups in X (instead of a hard-coded vector), then
# copy every list element into the global environment under that name.
names(Y) <- names(X)
# list2env returns the environment; invisible() keeps it from being printed.
invisible(list2env(Y, envir = .GlobalEnv))
A
Sign up to request clarification or add additional context in comments.

Comments

23

Since dplyr 0.8.0, we can also use group_split, which behaves similarly to base::split.

library(dplyr)
df %>% group_split(g)

#[[1]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1 -1.21  -1.45  A    
#2  0.506  1.10  A    
#3 -0.477 -1.17  A    
#4 -0.110  1.45  A    
#5  0.134 -0.969 A    

#[[2]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1  0.277  0.575 B    
#2 -0.575 -0.476 B    
#3 -0.998 -2.18  B    
#4 -0.511 -1.07  B    
#5 -0.491 -1.11  B  
#....

It also takes a .keep argument (TRUE by default) that specifies whether the grouping column should be kept in the output.

df %>% group_split(g, .keep = FALSE)

#[[1]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1 -1.21  -1.45 
#2  0.506  1.10 
#3 -0.477 -1.17 
#4 -0.110  1.45 
#5  0.134 -0.969

#[[2]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1  0.277  0.575
#2 -0.575 -0.476
#3 -0.998 -2.18 
#4 -0.511 -1.07 
#5 -0.491 -1.11 
#....

The difference between base::split and dplyr::group_split is that group_split does not name the elements of the list based on grouping. So

# group_split returns an unnamed list, so names() has nothing to report.
df1 <- df %>% group_split(g)
names(df1) #gives
#NULL

whereas

df2 <- split(df, df$g)
names(df2) #gives
#[1] "A" "B" "C" "D" "E"

data

set.seed(1234)
df <- data.frame(
      x=rnorm(25),
      y=rnorm(25),
      g=rep(factor(LETTERS[1:5]), 5)
)

Comments

2

The following loop can solve the problem:

# Example input: one row per measurement, keyed by ID_CODE.
data <- data.frame(
  ID_CODE = c("001", "001", "001", "002", "002", "003"),
  Metric1 = c("0.94", "0.68", "0.8", "0.12", "0.56", "0.87")
)

# Create one data frame per distinct ID_CODE, named df_<ID_CODE>.
# Whole rows are subset, so every column of `data` is carried over without
# having to list the columns by hand.
for (id in unique(data$ID_CODE)) {
  rows_for_id <- data[data$ID_CODE == id, , drop = FALSE]
  # Renumber the rows from 1 so each piece prints like a freshly built frame.
  rownames(rows_for_id) <- NULL
  assign(paste0("df_", id), rows_for_id)
}

The created data frames are:

> df_001
  ID_CODE Metric1
1     001    0.94
2     001    0.68
3     001     0.8
> df_002
  ID_CODE Metric1
1     002    0.12
2     002    0.56
> df_003
  ID_CODE Metric1
1     003    0.87

Created on 2023-03-27 with reprex v2.0.2

Comments

1

If you want a data.table option, you could also use the split function:

Split method for data.table. Faster and more flexible. Be aware that processing list of data.tables will be generally much slower than manipulation in single data.table by group using by argument, read more on data.table.

Here is a reproducible example:

library(data.table)
# Fixed seed so the random example data is the same on every run.
set.seed(123)
dt <- data.table(
  x = rnorm(25),
  y = rnorm(25),
  g = rep(factor(LETTERS[1:5]), 5)
)
# Shuffle the rows so the groups are no longer in a regular order.
dt <- dt[sample(.N)]
df <- as.data.frame(dt)

# Build a table in which x and y are converted to factors and g is carried
# over unchanged, then turn it into a plain data.frame.
fdt <- dt[, c(lapply(.SD, as.factor), list(g = g)), .SDcols = x:y]
fdf <- as.data.frame(fdt)
# fdf is a data.frame at this point, so this split is applied to a data.frame
# rather than to a data.table; the result is a list with one element per
# level of g.
sdf <- split(fdf, list(fdf$g))
sdf
#> $A
#>                     x                  y g
#> 7    1.78691313680308 -0.694706978920513 A
#> 11  -1.06782370598685  -1.12310858320335 A
#> 17   1.71506498688328  0.426464221476814 A
#> 23   1.22408179743946  0.688640254100091 A
#> 25 -0.560475646552213  -1.68669331074241 A
#> 
#> $B
#>                     x                  y g
#> 3  -0.217974914658295 -0.402884835299076 B
#> 8   0.359813827057364  0.553917653537589 B
#> 10  0.497850478229239 -0.207917278019599 B
#> 14  0.460916205989202 -0.295071482992271 B
#> 18  -0.23017748948328  0.837787044494525 B
#> 
#> $C
#>                    x                   y g
#> 5   1.55870831414912   0.153373117836515 C
#> 6  -1.26506123460653   0.895125661045022 C
#> 9  -1.02600444830724  -0.466655353623219 C
#> 13 0.400771450594052 -0.0619117105767217 C
#> 16 -1.96661715662964   -1.26539635156826 C
#> 
#> $D
#>                     x                  y g
#> 4    0.11068271594512 -0.305962663739917 D
#> 12  0.701355901563686   2.16895596533851 D
#> 21 -0.686852851893526  0.878133487533042 D
#> 22  0.070508391424576  -1.13813693701195 D
#> 24  -0.72889122929114  0.779965118336318 D
#> 
#> $E
#>                     x                   y g
#> 1  -0.625039267849257 -0.0833690664718293 E
#> 2  -0.472791407727934    1.20796199830499 E
#> 15 -0.555841134754075  -0.380471001012383 E
#> 19 -0.445661970099958   0.821581081637487 E
#> 20  0.129287735160946    1.25381492106993 E

Created on 2022-07-09 by the reprex package (v2.0.1)

Comments

0

The collapse package has an rsplit function that does the same thing.

collapse::rsplit(df, df$g)

Benchmarked against the other solutions posted here, it is the fastest.

library(collapse)
library(dplyr)
library(data.table)

# Benchmark input: one million rows of random x and y values with a grouping
# factor g drawn at random from the 26 letters. No seed is set here, so the
# data (and therefore the exact timings) differ from run to run.
df <- data.frame(
  x=rnorm(1e6),
  y=rnorm(1e6),
  g=sample(factor(LETTERS), 1e6, replace = TRUE)
)

# data.table version of the same data, used only by the dt_split case below.
fdf <- copy(df)
fdf <- as.data.table(fdf)


# Time the four ways of splitting by g. Each expression is labelled so the
# rows of the printed summary can be matched to the approach.
microbenchmark::microbenchmark(
  # base R split on the data.frame
  base_split = split(df, df$g),
  # dplyr::group_split on the data.frame
  group_split = group_split(df, g), 
  # split applied to the data.table copy
  dt_split = split(fdf, list(fdf$g)), 
  # collapse::rsplit on the data.frame
  rsplit = collapse::rsplit(df, df$g)
)

#Unit: milliseconds
#        expr     min       lq     mean   median       uq      max neval
#  base_split 46.6185 51.33850 68.95247 56.75210 59.26480 217.1945   100
# group_split 20.4633 21.43105 28.37164 22.47115 29.17205 199.5639   100
#    dt_split 27.1163 28.44910 35.41670 33.92700 35.82960 178.5173   100
#      rsplit 13.2479 14.23280 19.10574 14.92970 21.76650 166.4624   100

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.