# High Performance Computing with R
# Speeding up R programs
# NAOJ tutorials, February 2017
# Eric Feigelson
#
# Resources:
#   The Art of R Programming, N. Matloff (2011, Chapter 11)
#   At www.r-bloggers.com: FasteR! HigheR! StrongeR!, N. Ross (2013)
#   At www.r-statistics.com: Speed up your R code using a JIT compiler, T. Galili (2012)
#   Getting Started with doParallel and foreach, S. Weston & R. Calaway (2014)
#   Simple Parallel Statistical Computing in R, L. Tierney (slides, 2003)
#   State of the Art in Parallel Computing with R, M. Schmidberger et al. (J. Stat. Software, 2009)
#   Tutorial: Parallel Computing with R on Lion and Hammer (RCC/PSU, 2013)
#   HPC-R Exercises: Parallelism, D. Schmidt (2015)
#
# R is written in C, and some R functions (particularly vector operations)
# proceed at machine-code speed.  But R is an interpreted language, and some
# constructs (e.g. `for', `if/else' and `while' loops) proceed at much
# slower speeds.
#
# R functions were changed to use a `byte-code compiler' around 2012, so
# on-the-fly interpretation is reduced.  Python uses the same type of compiler.
#
# R code can often be improved for performance through improved structure &
# vectorization, by converting computationally intensive portions to
# C or Fortran, by using parallel processing within R, and by using advanced
# CRAN packages for use on large CPU/GPU clusters or cloud computing.
######################## ########################
# I  Benchmarking the problem
######################## ########################

N <- 1000000  # a million operations

# Vectorized sum of a Gaussian and a Beta random vector.
# All three operations (rnorm, rbeta, +) run in compiled C code.
test1 <- function(n) {
  foo1 <- rnorm(n)
  foo2 <- rbeta(n, 5, 5)
  foo3 <- foo1 + foo2
  return(foo3)
}
system.time(test1(N))       # vector operations, fast, R ~ 10*system
system.time(test1(N * 10))  # O(N) behavior

# Same computation, but the addition is performed with an explicit
# (interpreted) for loop over the elements.
test2 <- function(n) {
  foo3 <- vector(length = n)
  foo1 <- rnorm(n)
  foo2 <- rbeta(n, 5, 5)
  for (i in seq_len(n)) foo3[i] <- foo1[i] + foo2[i]
  return(foo3)
}
system.time(test2(N))  # for loop, 10x slower, R ~ 100*system

# Deliberately wasteful double loop that regenerates the random vectors
# n/10 * 10 = n times before summing them, to show O(N^2) scaling.
test3 <- function(n) {
  foo3 <- vector(length = n)
  # BUG FIX: the original wrote `for (i in 1:n/10)`, which parses as
  # (1:n)/10 -- a length-n numeric vector -- so the outer loop ran n
  # times instead of the intended n/10.  Also, the trailing
  # `; foo2 <- rbeta(...)` fell OUTSIDE both loops, so foo2 was only
  # generated once; both generators belong in the loop body.
  for (i in 1:(n / 10)) {
    for (j in 1:10) {
      foo1 <- rnorm(n)
      foo2 <- rbeta(n, 5, 5)
    }
  }
  for (i in seq_len(n)) {
    foo3[i] <- foo1[i] + foo2[i]
  }
  return(foo3)
}
# system.time(test3(10000))  # Double loop, very slow, R ~ 40*system
system.time(test3(3000))     # O(N^2) behavior

# Note: "for", ":" and "<-" all require function calls

######################## ########################
# II  Profiling & debugging R programs
######################## ########################

# For more complex programs, profiling with function `Rprof' can
# show which portions are slower than others.  Rprof queries the call stack
# 50 times a second to see what function is running.  CRAN packages
# `microbenchmark' and `rbenchmark' give additional profiling capabilities.

Rprof("profile.result")
invisible(test2(N))
Rprof(NULL)
summaryRprof("profile.result")

install.packages('microbenchmark')
library(microbenchmark)
library(ggplot2)
compare <- microbenchmark(test1(N / 10), test2(N / 10), times = 50)
autoplot(compare)

# R has built-in functions including `debug', `browser', `traceback',
# `options(error=recover)', and `tryCatch' to help the programmer
# understand complex codes.  CRAN packages include `debug'.  Run R
# within `gdb' to debug C code called by R scripts.
######################## ########################
# III  Speeding up R
######################## ########################

# Clever use of the following can often speed up your program: sort,
# table, outer, crossprod, expand.grid, which, any, all,
# sum, cumsum, rowSums, cumprod, %% (modulo), etc.

# Following is a slow code with many calls to a random number generator,
# and a fast code with only one call.  This illustrates the tradeoff
# between speed and memory usage.  This and other examples of R speedup
# efforts are given by N. Matloff.  A particular problem with R processing
# is that vectors are often unnecessarily copied and recalculated.

sum_ran2 <- 0
system.time(for (i in 1:N) {
  ran.2 <- rnorm(2)
  sum_ran2 <- sum_ran2 + max(ran.2)
})
system.time(ran.many <- matrix(rnorm(2 * N), ncol = 2))
system.time(sum(pmax(ran.many[, 1], ran.many[, 2])))

# Many operations can be sped up with R's `apply' functions: apply,
# sapply, lapply, etc.  lapply loops in compiled C code and can be
# fastest, although using numerics can be faster than using lists.
# lapply procedures can be parallelized using `mclapply' in package
# `parallel'.  In the example below, the `sum' function can be replaced
# by a more complex user-defined function.

test4 <- function(n) {
  foo1 <- rnorm(n)
  foo2 <- rbeta(n, 5, 5)
  # BUG FIX: the original used FUN = "+".  apply() hands each row to FUN
  # as a SINGLE argument, so "+" acted as unary plus and returned the row
  # unchanged (a 2 x n matrix), never adding the columns.  sum() computes
  # the intended row sums.
  foo4 <- apply(cbind(foo1, foo2), 1, sum)
  return(foo4)
}
system.time(test4(N))  # apply is not effective here

######################## ########################
# IV  Pre-compiling R code
######################## ########################

# Since c2012, all R/CRAN functions are pre-compiled, but user-defined
# functions are not.  You can do this yourself and speed up your code.

system.time(test2(N))
library(compiler)
comp_test2 <- cmpfun(test2)
system.time(comp_test2(N))  # factor of 3 improvement

# A related option is to use just-in-time (JIT) compiling in R that
# automatically compiles all functions their first time.  Add the following
# at the beginning of the code.
library(compiler)
enableJIT(1)

# Conclusion on speeding up R by N. Matloff:
# `The moral of the story is that performance issues can be unpredictable.'

######################## ########################
# V  Parallel processing
######################## ########################

# Important CRAN packages: multicore, snow, snowfall, doParallel, foreach, plyr
# Most parallelizations are related to `apply', so if you can run your
# task in `apply', you can parallelize.
# Note: R's default random number streams should not be naively shared
#       across parallel workers (see `rlecuyer' below).
# Note: CRAN's `foreach' is a free product of the company Revolution Analytics
#       that specialized in advanced high performance computing in R.

# Easy start to parallel processing: `doParallel' as a backend for `foreach'
library(doParallel)
ncores <- getDoParWorkers()
ncores                         # query number of cores available
clus <- makeCluster(ncores)
registerDoParallel(clus)       # register the cluster with foreach
# BUG FIX: the original did `foreach(i=1:ncores) %dopar% { rnorm.out[,i] = ... }`.
# Each worker assigns into its OWN copy of rnorm.out, so the master's matrix
# was never filled.  The foreach idiom is to RETURN the per-iteration result
# and combine the pieces (here, column-bind them) on the master.
rnorm.out <- foreach(i = 1:ncores, .combine = cbind) %dopar% rnorm(N)
stopCluster(clus)

# Easy start to parallel processing: snowfall as a backend to snow
# snow = Simple Network of Workstations
# CRAN `rlecuyer' interfaces C to random number generators with multiple streams
# Special non-R code needed to submit the job to the cluster
install.packages('snow')
library(snow)
install.packages('rlecuyer')
library(rlecuyer)
clus <- makeCluster(ncores, type = 'SOCK')  # initialize cluster
clusterSetupRNG(clus)                       # RNG = random number generator
clusterExport(clus, ls())                   # export session contents to workers
# BUG FIX: parSapply passes each element of 1:5 as the first argument to
# the function; the original zero-argument `function()' would fail with
# an "unused argument" error.  Accept (and ignore) the index.
parSapply(clus, 1:5, function(i) rnorm(10))
stopCluster(clus)