Split data.frame based on levels of a factor into new data.frames

Question

I'm trying to create separate data.frame objects based on levels of a factor. So if I have:

# Example data: 25 random (x, y) pairs and a grouping factor g that cycles
# through the levels A-E.
df <- data.frame(
  x = rnorm(25),
  y = rnorm(25),
  g = factor(rep(LETTERS[1:5], times = 5))
)

How can I split df into separate data.frames for each level of g containing the corresponding x and y values? I can get most of the way there using split(df, df$g), but I'd like each level of the factor to have its own data.frame.

What's the best way to do this?

Tyler Rinker · Accepted Answer · 2014-03-20 03:14:47Z

I think that split does exactly what you want.

Notice that X is a list of data frames, as seen by str:

# Break df into one data frame per level of g, then inspect the result.
X <- split(x = df, f = df$g)
str(X)

If you want individual objects named after the groups in g, you could assign the elements of X from split to objects of those names, though this seems like extra work when you can just index the data frames from the list split creates.

# Drop the grouping column g, which is no longer needed after the split.
# Iterating over X itself (rather than over seq_along(X)) keeps the group
# names on the result, so they never have to be typed in by hand.
Y <- lapply(X, function(piece) piece[, c("x", "y")])

# Assign the data frames in the list Y to individual objects
A <- Y[[1]]
B <- Y[[2]]
C <- Y[[3]]
D <- Y[[4]]
E <- Y[[5]]

# Or assign every piece to an object named after its group in one go. A for
# loop is used because this is done purely for its side effect; lapply() would
# also build and print a list of the assigned values.
for (nm in names(Y)) {
  assign(nm, Y[[nm]], envir = .GlobalEnv)
}

Edit: Or, even better than using lapply to assign to the global environment, use list2env:

# Name the pieces after the groups they came from (taken from X rather than
# typed by hand), then copy every element into the global environment at once.
names(Y) <- names(X)
# list2env() returns the environment; invisible() keeps it from being printed.
invisible(list2env(Y, envir = .GlobalEnv))
A

Ronak Shah · Accepted Answer · 2020-09-07 02:04:18Z

Since dplyr 0.8.0, we can also use group_split, which behaves similarly to base::split.

library(dplyr)
df %>% group_split(g)

#[[1]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1 -1.21  -1.45  A    
#2  0.506  1.10  A    
#3 -0.477 -1.17  A    
#4 -0.110  1.45  A    
#5  0.134 -0.969 A    

#[[2]]
# A tibble: 5 x 3
#       x      y g    
#   <dbl>  <dbl> <fct>
#1  0.277  0.575 B    
#2 -0.575 -0.476 B    
#3 -0.998 -2.18  B    
#4 -0.511 -1.07  B    
#5 -0.491 -1.11  B  
#....

It also comes with the argument .keep (which is TRUE by default) to specify whether or not the grouping column should be kept.

df %>% group_split(g, .keep = FALSE)

#[[1]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1 -1.21  -1.45 
#2  0.506  1.10 
#3 -0.477 -1.17 
#4 -0.110  1.45 
#5  0.134 -0.969

#[[2]]
# A tibble: 5 x 2
#       x      y
#   <dbl>  <dbl>
#1  0.277  0.575
#2 -0.575 -0.476
#3 -0.998 -2.18 
#4 -0.511 -1.07 
#5 -0.491 -1.11 
#....

The difference between base::split and dplyr::group_split is that group_split does not name the elements of the list based on grouping. So

# group_split() returns an unnamed list, so there are no names to report.
df1 <- df %>% group_split(g)
names(df1) #gives
#NULL

whereas

df2 <- split(df, df$g)
names(df2) #gives
#[1] "A" "B" "C" "D" "E"

data

# Example data used for the output above; the fixed seed makes the random
# draws repeatable.
set.seed(1234)
df <- data.frame(
  x = rnorm(25),
  y = rnorm(25),
  g = rep(factor(LETTERS[1:5]), times = 5)
)

Md Ahsanul Himel · Accepted Answer · 2023-03-27 10:02:11Z

2

The following loop can solve the problem:

# Example data: an ID column and a metric column. Metric1 is kept as character
# strings, exactly as in the original example.
data <- data.frame(
  ID_CODE = c("001", "001", "001", "002", "002", "003"),
  Metric1 = c("0.94", "0.68", "0.8", "0.12", "0.56", "0.87")
)

# One data frame per ID_CODE. split() does the subsetting, so nothing has to
# be rebuilt by hand; row names are reset so every piece is numbered from 1.
pieces <- lapply(split(data, data$ID_CODE), function(piece) {
  rownames(piece) <- NULL
  piece
})

# Publish each piece as df_<ID_CODE> (df_001, df_002, df_003) in the current
# environment. Unlike a loop that assigns through temporaries, this does not
# leave helper variables such as `df` or `len` behind or overwrite existing
# ones. invisible() stops the returned environment from being printed.
names(pieces) <- paste0("df_", names(pieces))
invisible(list2env(pieces, envir = environment()))

The created data frames are:

> df_001
  ID_CODE Metric1
1     001    0.94
2     001    0.68
3     001     0.8
> df_002
  ID_CODE Metric1
1     002    0.12
2     002    0.56
> df_003
  ID_CODE Metric1
1     003    0.87

^{Created on 2023-03-27 with reprex v2.0.2}

answered Mar 27, 2023 at 10:02

Md Ahsanul Himel

345 · 1 silver badge · 11 bronze badges

Comments

Quinten · Accepted Answer · 2022-07-09 18:29:32Z

If you want a data.table option, you could also use the split function:

Split method for data.table. Faster and more flexible. Be aware that processing list of data.tables will be generally much slower than manipulation in single data.table by group using by argument, read more on data.table.

Here is a reproducible example:

library(data.table)
set.seed(123)

# 25 rows of random x/y values plus a five-level grouping factor g
dt <- data.table(
  x = rnorm(25),
  y = rnorm(25),
  g = rep(factor(LETTERS[1:5]), times = 5)
)
dt <- dt[sample(.N)] # shuffle the row order
df <- as.data.frame(dt)

# x and y are converted to factors; g is carried along unchanged
fdt <- dt[, c(lapply(.SD, as.factor), list(g = g)), .SDcols = x:y]
fdf <- as.data.frame(fdt)

# fdf is a plain data.frame at this point; the grouping is passed as a
# one-element list
sdf <- split(fdf, list(fdf$g))
sdf
#> $A
#>                     x                  y g
#> 7    1.78691313680308 -0.694706978920513 A
#> 11  -1.06782370598685  -1.12310858320335 A
#> 17   1.71506498688328  0.426464221476814 A
#> 23   1.22408179743946  0.688640254100091 A
#> 25 -0.560475646552213  -1.68669331074241 A
#> 
#> $B
#>                     x                  y g
#> 3  -0.217974914658295 -0.402884835299076 B
#> 8   0.359813827057364  0.553917653537589 B
#> 10  0.497850478229239 -0.207917278019599 B
#> 14  0.460916205989202 -0.295071482992271 B
#> 18  -0.23017748948328  0.837787044494525 B
#> 
#> $C
#>                    x                   y g
#> 5   1.55870831414912   0.153373117836515 C
#> 6  -1.26506123460653   0.895125661045022 C
#> 9  -1.02600444830724  -0.466655353623219 C
#> 13 0.400771450594052 -0.0619117105767217 C
#> 16 -1.96661715662964   -1.26539635156826 C
#> 
#> $D
#>                     x                  y g
#> 4    0.11068271594512 -0.305962663739917 D
#> 12  0.701355901563686   2.16895596533851 D
#> 21 -0.686852851893526  0.878133487533042 D
#> 22  0.070508391424576  -1.13813693701195 D
#> 24  -0.72889122929114  0.779965118336318 D
#> 
#> $E
#>                     x                   y g
#> 1  -0.625039267849257 -0.0833690664718293 E
#> 2  -0.472791407727934    1.20796199830499 E
#> 15 -0.555841134754075  -0.380471001012383 E
#> 19 -0.445661970099958   0.821581081637487 E
#> 20  0.129287735160946    1.25381492106993 E

^{Created on 2022-07-09 by the reprex package (v2.0.1)}

Ronak Shah · Accepted Answer · 2024-08-22 03:32:53Z

collapse has an rsplit function which does the same.

collapse::rsplit(df, df$g)

When compared against the current solutions, it is the fastest.

# Benchmark: time four ways of splitting the same data frame by g.
library(collapse)
library(dplyr)
library(data.table)

# One million rows; g is drawn with replacement from the 26 levels in LETTERS.
# No seed is set, so the data differ from run to run.
df <- data.frame(
  x=rnorm(1e6),
  y=rnorm(1e6),
  g=sample(factor(LETTERS), 1e6, replace = TRUE)
)

# data.table version of the same data, used only by the dt_split case below.
fdf <- copy(df)
fdf <- as.data.table(fdf)


# Each named argument is one candidate expression; microbenchmark() is called
# with its default settings.
microbenchmark::microbenchmark(
  base_split = split(df, df$g),
  group_split = group_split(df, g), 
  dt_split = split(fdf, list(fdf$g)), 
  rsplit = collapse::rsplit(df, df$g)
)

#Unit: milliseconds
#        expr     min       lq     mean   median       uq      max neval
#  base_split 46.6185 51.33850 68.95247 56.75210 59.26480 217.1945   100
# group_split 20.4633 21.43105 28.37164 22.47115 29.17205 199.5639   100
#    dt_split 27.1163 28.44910 35.41670 33.92700 35.82960 178.5173   100
#      rsplit 13.2479 14.23280 19.10574 14.92970 21.76650 166.4624   100

Collectives™ on Stack Overflow

Split data.frame based on levels of a factor into new data.frames

5 Answers 5

Comments

Comments

Comments

Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

5 Answers 5

Comments

Comments

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related