3

I have the following data and code:

# Sample data: a 50-row data.table with a numeric column `vnum1` and two
# factor columns, `vfac1` (levels "1" to "4") and `vch1` (levels "A" to "E").
mydt = structure(list(vnum1 = c(0.517551446921093, -0.997822163825322, 
3.40784990301597, -0.20990292802279, 0.171252718589118, -0.624084617915488, 
0.0979152932727754, -0.673949942523713, 0.689937370719125, -0.356403906786312, 
-0.565253563082689, -0.725285109477077, -0.343253827285705, -0.515803106223986, 
2.21193745540815, 0.179392018244011, 0.695885203438304, -0.869946981188651, 
0.170084087339536, 0.864392658315656, 0.801471783050381, 0.753880989575548, 
-0.572671791856263, -0.238511443188091, -1.1837711276515, 1.13728246296508, 
0.702244681081861, -0.851470541269798, 0.0471820411719659, 0.547952252697306, 
0.527539936397851, 0.247070882010565, -0.562100684713534, -1.05183021003772, 
0.934263969812236, -0.603673312084538, -2.00612207642211, 0.2312103046843, 
-0.214991379754579, 0.282701708464789, 0.289934023279607, 0.567328033965404, 
-0.359157137438815, 0.648221129776207, 0.857904763904759, 0.289415512264559, 
1.06555885899638, 0.333119386976963, -1.46070627726311, 0.0552050036156248
), vfac1 = structure(c(2L, 1L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 4L, 
4L, 3L, 1L, 3L, 1L, 4L, 4L, 4L, 1L, 2L, 2L, 4L, 2L, 4L, 1L, 3L, 
4L, 1L, 2L, 2L, 2L, 1L, 3L, 4L, 1L, 2L, 1L, 3L, 1L, 4L, 2L, 3L, 
2L, 1L, 2L, 2L, 2L, 3L, 4L, 2L), .Label = c("1", "2", "3", "4"
), class = "factor"), vch1 = structure(c(3L, 4L, 5L, 4L, 1L, 
5L, 5L, 3L, 3L, 4L, 1L, 4L, 3L, 5L, 1L, 3L, 4L, 5L, 1L, 3L, 5L, 
2L, 5L, 5L, 1L, 2L, 5L, 5L, 1L, 3L, 4L, 1L, 2L, 2L, 5L, 1L, 4L, 
2L, 1L, 5L, 4L, 4L, 3L, 2L, 5L, 4L, 3L, 2L, 3L, 2L), .Label = c("A", 
"B", "C", "D", "E"), class = "factor")), .Names = c("vnum1", 
"vfac1", "vch1"), class = c("data.table", "data.frame"), row.names = c(NA, 
50L))


# Mean and standard deviation of vnum1 for every (vfac1, vch1) combination.
# Groups that contain a single row get sd = NA.
mydt[,list(mean=mean(vnum1), sd=sd(vnum1)),list(vfac1, vch1)]
    vfac1 vch1        mean         sd
 1:     2    C  0.52725962 0.54536269
 2:     1    D -1.50197212 0.71297571
 3:     1    E  1.16354778 2.13889714
 4:     2    D  0.22424664 0.31039463
 5:     1    A  0.23359711 1.10743823
 6:     3    E -0.56994386 0.07656659
 7:     2    E  0.29615501 0.67455339
 8:     3    C -0.67394994         NA
 9:     1    C  0.17334177 0.73057650
10:     4    D  0.16974065 0.74408077
11:     4    A -0.56525356         NA
12:     3    D -0.07897854 0.91401552
13:     4    C -0.64065713 1.15972463
14:     4    E -0.03087801 0.67895741
15:     4    B -0.14897461 1.27683063
16:     3    B  0.28487787 0.69502367
17:     2    A -0.27824564 0.46022423
18:     1    B  0.64822113         NA
19:     2    B  0.05520500         NA

I want to create the following function, to which I can pass the column names and get the above result. However, the following function is not working:

# NOTE: this is the non-working attempt. Formal arguments must be bare names,
# so the quoted strings in the signature are a syntax error. In the body,
# 'vnum1' is a string literal rather than a column reference, and the global
# `mydt` is used instead of the `ddt` argument.
myfn = function(ddt, 'vnum1', 'vfac1', 'vch1'){
        mydt[,list(mean=mean('vnum1'), sd=sd('vnum1')),list('vfac1', 'vch1')]
}

How can I send column names (or column vector themselves) so that I can get result from a function? Thanks for your help.

2
  • I have a doubt about the se. Is it from any package? Commented Feb 5, 2015 at 6:44
  • se() is in sciplot. But I have changed question above to simple 'sd'. Commented Feb 5, 2015 at 7:20

2 Answers 2

3

You could try

 # Mean and sd of one column of a data.table, grouped by two other columns.
 #   dt     : a data.table
 #   v1     : name of the value column, as a quoted string (e.g. 'vnum1')
 #   v2, v3 : grouping columns, passed as bare (unquoted) names
 # Returns one row per group with columns `mean` and `sd`.
 myfn <- function(dt, v1, v2, v3){
   # as.name(v1) turns the string into a symbol, and eval() resolves that
   # symbol inside j, where the column is visible.
   # substitute() captures v2/v3 unevaluated and deparse() converts them to
   # the character column names passed to `by`.
   dt[, list(mean=mean(eval(as.name(v1))), sd=sd(eval(as.name(v1)))), 
        by=c(deparse(substitute(v2)), deparse(substitute(v3)))]
 }
 # The value column is quoted; the grouping columns are not.
 myfn(mydt, 'vnum1', vfac1, vch1)
 #   vfac1 vch1        mean         sd
 #1:     2    C  0.52725962 0.54536269
 #2:     1    D -1.50197212 0.71297571
 #3:     1    E  1.16354778 2.13889714
 #4:     2    D  0.22424664 0.31039463
 #5:     1    A  0.23359711 1.10743823
 #6:     3    E -0.56994386 0.07656659
 #7:     2    E  0.29615501 0.67455339
 #8:     3    C -0.67394994         NA
 #9:     1    C  0.17334177 0.73057650
#10:     4    D  0.16974065 0.74408077
#11:     4    A -0.56525356         NA
#12:     3    D -0.07897854 0.91401552
#13:     4    C -0.64065713 1.15972463
#14:     4    E -0.03087801 0.67895741
#15:     4    B -0.14897461 1.27683063
#16:     3    B  0.28487787 0.69502367
#17:     2    A -0.27824564 0.46022423
#18:     1    B  0.64822113         NA
#19:     2    B  0.05520500         NA

It also works when the column names are changed:

 # setnames() renames the columns by reference, so from here on mydt itself
 # has columns a, b and c.
 setnames(mydt, names(mydt), letters[1:3])
 head(myfn(mydt, 'a', b, c),2)
 #   b c       mean        sd
 #1: 2 C  0.5272596 0.5453627
 #2: 1 D -1.5019721 0.7129757

Or you can remove the deparse(substitute(..)) calls and pass all the variables as quoted strings:

 # Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
 # All three column names are passed as quoted strings.
 # Returns one row per group with columns `mean` and `sd`.
 myfn <- function(dt, v1, v2, v3){
    # eval(as.name(v1)) resolves the string to the column inside j;
    # `by` accepts a character vector of column names directly.
    dt[, list(mean=mean(eval(as.name(v1))), sd=sd(eval(as.name(v1)))), 
        by=c(v2, v3)]
 }
myfn(mydt, 'vnum1', 'vfac1', 'vch1')

Here is another variant, which accepts unquoted column names:

 # Mean and sd of column `v1`, grouped by `v2` and `v3`, with all three
 # column names passed as bare (unquoted) names.
 # Returns one row per group with columns `mean` and `sd`.
 myfn <- function(dt, v1, v2, v3){
   # match.call() holds the arguments as unevaluated expressions
   args  <- as.list(match.call())
   # grouping columns: convert the captured symbols to character names
   e1 <- c(deparse(args$v2), deparse(args$v3))
   # args$v1 is the captured symbol; eval() resolves it to the column inside j.
   # NOTE(review): `.()` as an alias for list() appears to need a recent
   # data.table version; use list() on older versions.
   dt[, .(mean=mean(eval(args$v1)), sd=sd(eval(args$v1))), by=e1]
  }

  head(myfn(mydt, vnum1, vfac1, vch1),2)
  #  vfac1 vch1       mean        sd
  #1:     2    C  0.5272596 0.5453627
  #2:     1    D -1.5019721 0.7129757
Sign up to request clarification or add additional context in comments.

3 Comments

Yes, it works. Even with changed names, it is working. For example, if column names in original data.table are a,b,c then myfn(mydt, 'a', 'b', 'c') works. Though the output columns 1 and 2 do not have correct names, that can be changed. Thanks for your answer.
@rnso You can change it with setnames. If it was a single group, it was easier.
@rnso Corrected the column names issue
2

I think you can simplify this and make it more data.table idiomatic by just using .SD. You also don't need to evaluate within the by statement because data.table won't look for variables within the global environment in order to aggregate by (unlike in j statement). So simply

# Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
# All three column names are passed as quoted strings.
# Returns one row per group with columns `mean` and `sd`.
myfn <- function(dt, v1, v2, v3){
  # .SD is the subset of the data for the current group; [[v1]] extracts the
  # value column by its string name. The third positional argument is `by`,
  # given here as a character vector of column names.
  dt[, .(mean = mean(.SD[[v1]]), sd = sd(.SD[[v1]])), c(v2, v3)]
}
myfn(mydt, "vnum1", "vfac1", "vch1")
##    vfac1 vch1       mean         sd
## 1:     2    C  0.5272596 0.54536269
## 2:     1    D -1.5019721 0.71297571
## 3:     1    E  1.1635478 2.13889714
## 4:     2    D  0.2242466 0.31039463
## 5:     1    A  0.2335971 1.10743823
## 6:     3    E -0.5699439 0.07656659
...

Testing for other column names

# setnames() renames the columns by reference, so from here on mydt itself
# has columns a, b and c.
setnames(mydt, letters[1:3])
head(myfn(mydt, "a", "b", "c"), 2)
#    b c       mean        sd
# 1: 2 C  0.5272596 0.5453627
# 2: 1 D -1.5019721 0.7129757

Alternatively, you could also use get as in

# Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
# All three column names are passed as quoted strings.
# Returns one row per group with columns `mean` and `sd`.
myfn <- function(dt, v1, v2, v3){
  # get(v1) looks up the column by its string name inside j; the third
  # positional argument is `by`, given as a character vector of column names.
  dt[, .(mean = mean(get(v1)), sd = sd(get(v1))), c(v2, v3)]
}

Efficiency-wise, though, @akrun's eval(as.name()) combination within the j statement should be the fastest until Arun/Matt optimise .SD.

4 Comments

There seems to be an error. The brackets are not matching. Error: unexpected ']
@rnso, forgot a bracket. Fixed now.
@rnso you are using an old data.table version. If you want to keep using it. use list instead .. I've edited back. What's your data.table version btw?
The output is not correct. It is not calculating mean and sd of groups.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.