R is the most comprehensive free, open-source statistical software available today (distributed under the GPL, not public domain).  An implementation of the S language for statistical analysis, it is increasingly popular in the statistical community. It takes only a few minutes to obtain:  go to http://www.R-project.org , choose a CRAN mirror site, and download the executable appropriate for your operating system.

The following demonstration is a series of excerpts from a weeklong set of tutorials presented at the Summer School in Statistics for Astronomers and Physicists at Penn State University (http://astrostatistics.psu.edu/su06/index.html). When this demo is viewed online, the bolded, indented lines

# like this one

are meant to be copied and pasted directly into R at the command prompt.

Quick R demo

Load the Hipparcos dataset described at http://astrostatistics.psu.edu/datasets/HIP_star.html.

attach(hip)

Create a fancy boxplot.

# Boxplots of V magnitude within half-magnitude bins of B-V color.
# notch = TRUE draws a rough median-comparison notch; varwidth = TRUE
# scales box width with sqrt(group size).  TRUE/FALSE are spelled out
# (T and F are ordinary variables and can be reassigned).
boxplot(Vmag ~ cut(B.V, breaks = (-1:6) / 2),
        notch = TRUE, varwidth = TRUE, las = 1, tcl = 0.5,
        xlab = expression("B minus V"),
        ylab = expression("V magnitude"),
        main = "Can you find the red giants?",
        cex = 1, cex.lab = 1.4, cex.axis = 0.8, cex.main = 1)
axis(2, labels = FALSE, at = 0:12, tcl = -0.25)  # unlabeled minor ticks, left
axis(4, at = 3 * (0:4))                          # duplicate scale, right side

Now for a scatterplot.

plot(RA,DE,pch=".")

# First cut: a rectangular patch of sky containing the Hyades cluster.
filter1 <- RA > 50 & RA < 100 & DE > 0 & DE < 25
# Proper motions of the selected stars; the red rectangle outlines the
# region that is zoomed into next.
plot(pmRA[filter1], pmDE[filter1], pch = 20)
rect(0, -150, 200, 50, border = 2)

# Zoom in on the dense clump of proper motions and mark the cluster box.
plot(pmRA[filter1], pmDE[filter1], pch = 20,
     xlim = c(0, 200), ylim = c(-150, 50))
rect(90, -60, 130, -10, border = 2)
# Second cut: keep only stars whose proper motion lies inside the red box.
# With spaces around "<" the comparison "pmDE < -10" is unambiguous.
filter2 <- pmRA > 90 & pmRA < 130 & pmDE > -60 & pmDE < -10
filter <- filter1 & filter2

Let's have a final look.

pairs(hip[filter,-c(1,5)],pch=20)

There is one outlying star in the e_Plx variable. Get rid of it:

# Tighten the filter: discard the star with a very large parallax error.
filter <- filter & e_Plx < 5
sum(filter)  # how many candidate Hyades members survive

A final look at the HR plot, with the Hyades highlighted:

# HR diagram: cluster members as red dots (pch 20, col 2), everything
# else as tiny black points (pch 46, col 1); the pch/col vectors are
# indexed by 1 + filter (FALSE -> 1, TRUE -> 2).  Limits zoom to members.
plot(Vmag, B.V, pch = c(46, 20)[1 + filter], col = 1 + filter,
     xlim = range(Vmag[filter]), ylim = range(B.V[filter]))

Next, a regression example involving the dataset http://astrostatistics.psu.edu/datasets/GRB_afterglow.html:

# Work on the log-log scale: GRB afterglow time and flux both span
# several decades.  Column 1 is time, column 2 is flux.
x <- log10(grb[, 1])
y <- log10(grb[, 2])
plot(x, y, xlab = "log time", ylab = "log flux")

The relationship looks roughly linear, so let's try a linear model:

# Ordinary least-squares fit of log flux on log time.
model1 <- lm(y ~ x)
abline(model1, col = 2, lwd = 2)  # overlay the fitted line in red
model1           # auto-prints, same as print(model1)
summary(model1)  # coefficients, standard errors, R^2, ...

We can look at a confidence ellipse for the slope and intercept using the "car" package, available from CRAN (http://cran.r-project.org/mirrors.html).

# Joint confidence region for (intercept, slope).  In current releases of
# the car package this function is confidenceEllipse(); the old dotted
# name confidence.ellipse() used in the original handout was removed.
library(car)
confidenceEllipse(model1)

Next up, a quadratic fit (which is still linear regression, by the way):

# Quadratic fit — still *linear* regression (linear in the coefficients);
# I() protects x^2 from formula interpretation.
model2 <- lm(y ~ x + I(x^2))
summary(model2)
plot(x, y, xlab = "log time", ylab = "log flux")
abline(model1, col = 2, lwd = 2)  # straight-line fit, red
# Evaluate the quadratic on a fine grid.  The handout's TeX-escaped
# "model2\$coef" is a syntax error in R; the coef() accessor is used
# instead of $coef.
xx <- seq(min(x), max(x), len = 200)
yy <- coef(model2) %*% rbind(1, xx, xx^2)
lines(xx, yy, lwd = 2, col = 3)   # quadratic fit, green

Let's check AIC for the two models:

AIC(model1) # Akaike information criterion of the linear fit; smaller is better
AIC(model2) # AIC of the quadratic fit, for comparison

And here is the BIC:

# BIC is AIC with penalty k = log(n); these calls are equivalent to
# BIC(model1) and BIC(model2) from the stats package.
n <- length(x)           # sample size
AIC(model1, k = log(n))  # BIC of the linear fit
AIC(model2, k = log(n))  # BIC of the quadratic fit

Let's try a nonparametric lowess fit.

# Nonparametric LOWESS smoother (locally weighted regression).  The
# classic (x, y) call is equivalent to the formula form, which lowess()
# resolves through xy.coords().
npmodel1 <- lowess(x, y)
lines(npmodel1, col = 4, lwd = 2)  # smoother in blue

It is hard to see the pattern of the lowess curve in the plot. Let's replot it with no other distractions.

# Draw the axes only (type = "n" suppresses the points), then add the
# lowess curve on its own so its shape is visible.
plot(x, y, xlab = "log time", ylab = "log flux", type = "n")
lines(npmodel1, col = 4, lwd = 2)

# Nonlinear least squares on the *original* flux scale:
# flux = 10^(a + b*x), starting from a = b = 0.  (Unlike lm formulas,
# the RHS of an nls formula is plain arithmetic, so no I() is needed.)
flux <- grb[, 2]
nlsmodel1 <- nls(flux ~ 10^(a + b * x), start = list(a = 0, b = 0))
nlsmodel1
npmodel2 <- lowess(flux ~ x)
plot(x, flux, xlab = "log time", ylab = "flux")
# Draw the fitted curve from the model's own coefficients.  The handout
# hard-coded 4.1085 and -0.9674, values copied from one run's printout,
# which silently goes stale if the data or fit change.
ab <- coef(nlsmodel1)
lines(xx, 10^(ab["a"] + ab["b"] * xx), col = 2, lwd = 2)  # nls fit, red
lines(npmodel2, col = 3, lwd = 2)                         # lowess, green

Finally, resistant regression using the lqs function in the MASS package.

# Resistant regression: least trimmed squares (lts) fits to the
# best-fitting subset of points, so outliers cannot drag the line.
library(MASS)
lqsmodel1 <- lqs(y ~ x, method = "lts")
plot(x, y, xlab = "log time", ylab = "log flux")
abline(model1, col = 2)     # ordinary least squares, red
abline(lqsmodel1, col = 3)  # least trimmed squares, green

Here is a model-based clustering example applied to the COMBO-17 dataset (http://astrostatistics.psu.edu/datasets/COMBO17.html).

# The handout lost the first line of this command: the CSV must be read
# into "combo" before it can be attached.  Empty fields become NA.
combo <- read.csv("http://astrostatistics.psu.edu/datasets/COMBO17.csv",
                  na.strings = "")
attach(combo)
# Color-magnitude data for galaxies with photometric redshift
# 0.7 < Mcz < 0.9; rows with missing photometry are dropped.
xy <- cbind(BjMAG, S280MAG - BjMAG)
xy <- xy[Mcz > .7 & Mcz < .9, ]
xy <- na.omit(xy)
plot(xy[, 1], xy[, 2], pch = 20,
     xlab = expression(M[B]),
     ylab = expression("(280-B)"[rest]),
     main = "z between 0.7 and 0.9",
     cex.lab = 1.5)

The mclust package will do the clustering.

library(mclust)
# Fit a two-component Gaussian mixture.  Current mclust specifies the
# number of components with G = 2 (the minG/maxG arguments in the
# original handout were removed long ago), and keeps the fitted
# parameters under $parameters (the handout's "mc2\$mu" was both
# TeX-escaped and an obsolete accessor).
mc2 <- Mclust(xy, G = 2)
# Overlay 50%-coverage ellipses with car::ellipse (car was loaded above).
r <- sqrt(qchisq(.5, 2))  # Mahalanobis radius for 50% bivariate coverage
for (i in 1:2) {
  ellipse(mc2$parameters$mean[, i],
          mc2$parameters$variance$sigma[, , i], r, col = 1 + i)
}

Is two components the "best" solution?

# Let Mclust choose the number of components by BIC over its default
# range of G values.
mc <- Mclust(xy)
mc$G  # the BIC-optimal number of components
plot(xy[, 1], xy[, 2], pch = 20,
     xlab = expression(M[B]),
     ylab = expression("(280-B)"[rest]),
     main = "z between 0.7 and 0.9",
     cex.lab = 1.5)
# As above: parameters live under $parameters in current mclust, and the
# handout's "mc\$mu" was TeX-escaped, not valid R.  seq_len() is safe
# even if mc$G were 0.
for (i in seq_len(mc$G)) {
  ellipse(mc$parameters$mean[, i],
          mc$parameters$variance$sigma[, , i], r, col = 1 + i)
}