# High Performance Computing with R
# Speeding up R programs
# NAOJ tutorials February 2017
# Eric Feigelson
# Resources:
# The Art of R Programming, N. Matloff, (2011, Chpt 11)
# At www.r-bloggers.com: FasteR! HigheR! StrongeR!, N. Ross (2013)
# At www.r-statistics.com: Speed up using JIT compiler, T. Galil (2012)
# Getting Started with doParallel and foreach, S. Weston & R. Calaway (2014)
# Simple Parallel Statistical Computing in R, L. Tierney (slides, 2003)
# State of the Art in Parallel Computing with R, M. Schmidberger et al. (J Stat Software, 2009)
# Tutorial: Parallel Computing with R on Lion and Hammer (RCC/PSU, 2013)
# HPC-R Exercises: Parallelism, D. Schmidt (2015)
# R is written in C and some R functions (particularly vector operations)
# proceed at machine code speed. But R is an interpreted language, and some
# functions (e.g. `for', `if/else' and `while' loops) proceed at much
# slower speeds.
# R functions were changed to a `byte-code compiler' c2012, so on-the-fly
# compilation is reduced. Python has the same compiler type.
# R code can often be improved for performance through improved structure &
# vectorization, by converting computationally-intensive portions to
# C or Fortran, by using parallel processing within R, and by using advanced
# CRAN packages for use on large CPU/GPU clusters or cloud computing.
########################
########################
# I Benchmarking the problem
########################
########################
# Problem size used throughout the benchmarks: one million elements.
N <- 1000000 # a million operations
# Fully vectorized benchmark: draw n Normal(0,1) and n Beta(5,5)
# deviates, then add them elementwise.  Every step operates on whole
# vectors, so the work runs in compiled C code.
test1 <- function(n) {
  gauss <- rnorm(n)
  beta55 <- rbeta(n, 5, 5)
  gauss + beta55
}
# Time the vectorized version; elapsed time scales roughly linearly
# with n because all the work happens inside compiled vector code.
system.time(test1(N)) # vector operations, fast, R ~ 10*system
system.time(test1(N*10)) # O(N) behavior
# Same computation as test1, but the elementwise addition runs in an
# explicit scalar for loop, so each iteration is interpreted R code
# (roughly 10x slower than the vectorized version).
# Fixes over the original: preallocate a numeric vector (vector(length=n)
# creates a logical vector that gets coerced on first assignment), and
# iterate with seq_len(n), which is empty when n == 0 (1:n would
# incorrectly yield c(1, 0)).
test2 <- function(n) {
  foo3 <- numeric(n)
  foo1 <- rnorm(n)
  foo2 <- rbeta(n, 5, 5)
  for (i in seq_len(n)) {
    foo3[i] <- foo1[i] + foo2[i]
  }
  foo3
}
# Time the interpreted-loop version for comparison with test1.
system.time(test2(N)) # for loop, 10x slower, R ~ 100*system
# Deliberately expensive double-loop benchmark: regenerate both random
# vectors n/10 * 10 = n times, then add the final pair elementwise.
# Total cost is O(n^2) random deviates.  Assumes n >= 10.
# Fixes over the original: `1:n/10` parses as (1:n)/10 — a vector of n
# values — not 1:(n/10), so the outer loop ran n times instead of n/10;
# and the unbraced inner-loop body meant only the rnorm() call was
# inside the loops while rbeta() executed just once afterwards.
test3 <- function(n) {
  foo3 <- numeric(n)
  for (i in seq_len(n %/% 10)) {
    for (j in seq_len(10)) {
      foo1 <- rnorm(n)
      foo2 <- rbeta(n, 5, 5)
    }
  }
  for (i in seq_len(n)) {
    foo3[i] <- foo1[i] + foo2[i]
  }
  foo3
}
# Timing the double loop: cost grows quadratically with n, so only a
# small n is practical here.
# system.time(test3(10000)) # Double loop, very slow, R ~ 40*system
system.time(test3(3000)) # O(N^2) behavior
# Note: "for", ":" and "<-" all require function calls
########################
########################
# II Profiling & debugging R programs
########################
########################
# For more complex programs, profiling with function `Rprof' can
# show which portions are slower than others. Rprof queries the call stack
# 50 times a second to see what function is running. CRAN packages
# `microbenchmark' and 'rbenchmark' give additional profiling capabilities.
# Profile test2: sample the call stack to a file while it runs, stop
# profiling, then summarize the time spent in each function.
Rprof("profile.result")
invisible(test2(N))
Rprof(NULL)
summaryRprof("profile.result")
# microbenchmark re-runs each expression many times (50 here) and
# records the full distribution of run times; autoplot() visualizes it.
install.packages('microbenchmark')
library(microbenchmark) ; library(ggplot2)
compare <- microbenchmark(test1(N/10), test2(N/10), times = 50)
autoplot(compare)
# R has built-in functions including `debug', `browser', `traceback',
# `options(error=recover)', and `tryCatch' to help the programmer
# understand complex codes. CRAN packages include `debug'. Run R
# within `gdb' to debug C code called by R scripts.
########################
########################
# III Speeding up R
########################
########################
# Clever use of the following can often speed up your program: sort,
# table, outer, crossprod, expand.grid, which, which.max, any, all,
# sum, cumsum, rowSums, cumprod, %% (modulo), etc.
# Following is a slow code with many calls to a random number generator,
# and a fast code with only one call. This illustrates tradeoff
# between speed and memory usage. This and other examples of R speedup
# efforts are given by N. Matloff. A particular problem with R processing
# is that vectors are often unnecessarily copied and recalculated.
# Slow approach: call rnorm() once per loop iteration, one pair of
# deviates at a time, accumulating the running sum of pairwise maxima.
sum_ran2 <- 0
system.time(
  for (i in 1:N) {
    ran.2 <- rnorm(2)
    sum_ran2 <- sum_ran2 + max(ran.2)
  }
)
# Fast approach: draw all 2N deviates in a single call, then take the
# vectorized pairwise maximum — trading memory for speed.
system.time(ran.many <- matrix(rnorm(2 * N), ncol = 2))
system.time(sum(pmax(ran.many[, 1], ran.many[, 2])))
# Many operations can be sped up with R's `apply' functions: apply,
# sapply, lapply, etc. lapply loops in compiled C code and can be
# fastest, although using numerics can be faster than using lists.
# lapply procedures can be parallelized using `mclapply' in package
# `parallel'. In the example below, the "+" function can be replaced
# by a more complex user-defined function.
# Elementwise addition via apply(): bind the two vectors into an n x 2
# matrix and sum each row.  apply() still loops at the R level, so this
# is no faster than an explicit for loop.
# Fix over the original: apply() hands each row to FUN as a SINGLE
# argument, so FUN = "+" invoked unary plus and returned a 2 x n matrix
# instead of the rowwise sums.  sum() (or function(r) r[1] + r[2])
# produces the intended foo1 + foo2 vector.
test4 <- function(n) {
  foo1 <- rnorm(n)
  foo2 <- rbeta(n, 5, 5)
  foo4 <- apply(cbind(foo1, foo2), 1, sum)
  return(foo4)
}
# apply() brings no speedup here: it is still an R-level loop over rows.
system.time(test4(N)) # apply is not effective here
########################
########################
# IV Pre-compiling R code
########################
########################
# Since c2012, all R/CRAN functions are pre-compiled, but user-defined
# functions are not. You can do this yourself and speed up your code.
# Time the interpreted function, byte-compile it with cmpfun(), then
# time the compiled copy for comparison.
system.time(test2(N))
library(compiler)
comp_test2 <- cmpfun(test2)
system.time(comp_test2(N)) # factor of 3 improvement
# A related option is to use just-in-time (JIT) compiling in R that
# automatically compiles all functions their first time. Add the following
# at the beginning of the code.
library(compiler)
enableJIT(1) # JIT levels run 0 (off) to 3 (most aggressive compilation)
# Conclusion on speeding up R by N. Matloff: `The moral of the story
# is that performance issues can be unpredictable.'
########################
########################
# V Parallel processing
########################
########################
# Important CRAN packages: multicore, snow, snowfall, doParallel, foreach, plyr
# Most parallelizations are related to `apply', so if you can run your
# task in `apply', you can parallelize.
# Note: R's default random number streams must not be naively shared across
# parallel workers; use dedicated parallel RNG streams (e.g. L'Ecuyer)
# Note: CRAN's `foreach' is a free product of the company Revolution Analytics
# that specializes in advanced high performance computing in R.
# Easy start to parallel processing: `doParallel' as a backend for `foreach'
library(doParallel)
# Query the hardware for the core count.  (The original used
# getDoParWorkers(), which returns 1 until a parallel backend has been
# registered, so ncores was always wrong here.)
ncores <- parallel::detectCores() ; ncores
clus <- makeCluster(ncores)
registerDoParallel(clus) # Register the cluster as the foreach backend
getDoParWorkers() # now reports ncores workers
# Each %dopar% iteration runs in a separate worker process, so assigning
# into a master-session matrix inside the loop mutates only the worker's
# copy and the results are lost.  Collect results with .combine instead:
# each iteration returns rnorm(N) and cbind assembles the N x ncores matrix.
rnorm.out <- foreach(i = seq_len(ncores), .combine = cbind) %dopar% rnorm(N)
stopCluster(clus)
# Easy start to parallel processing: snowfall as a backend to snow
# snow = Simple Network of Workstations
# CRAN `rlecuyer' interfaces C to random number generators with multiple streams
# Special non-R code needed to submit the job to the cluster
install.packages('snow') ; library(snow)
install.packages('rlecuyer') ; library(rlecuyer)
clus <- makeCluster(ncores, type='SOCK') # initialize socket cluster
clusterSetupRNG(clus) # give each worker an independent L'Ecuyer RNG stream
clusterExport(clus, ls()) # Export session contents to the worker processes
# parSapply passes each element of 1:5 to FUN as its first argument, so
# FUN must accept one.  (The original zero-argument function() failed
# with an "unused argument" error.)
parSapply(clus, 1:5, function(i) rnorm(10))
stopCluster(clus)