```
Modern Statistical Methods for Astronomy With R Applications
Eric D. Feigelson & G. Jogesh Babu

R scripts
These scripts may be cut and pasted into any R console.
They are self-contained and should reproduce the figures
and results in the book.

******************************************************************
******************************************************************

Chapter. 2 Probability

# Set up 6 panel figure

par(mfrow=c(3,2))

# Plot upper left panel with three illustrative exponential p.d.f. distributions

xdens <- seq(0,5,0.02)
plot(xdens,dexp(xdens,rate=0.5), type='l', ylim=c(0,1.5), xlab='',
ylab='Exponential p.d.f.',lty=1)

lines(xdens,dexp(xdens,rate=1), type='l', lty=2)
lines(xdens,dexp(xdens,rate=1.5), type='l', lty=3)
legend(2, 1.45, lty=1, substitute(lambda==0.5), box.lty=0)
legend(2, 1.30, lty=2, substitute(lambda==1.0), box.lty=0)
legend(2, 1.15, lty=3, substitute(lambda==1.5), box.lty=0)

# Help files to learn these function

help(seq) ; help(plot); help(par) ; help(lines); help(legend)

# Plot upper right panel with three illustrative exponential c.d.f. distributions

plot(xdens, pexp(xdens,rate=0.5), type='l', ylim=c(0,1.0), xlab='',
ylab='Exponential c.d.f.', lty=1)
lines(xdens, pexp(xdens,rate=1), type='l', lty=2)
lines(xdens, pexp(xdens,rate=1.5),type='l',lty=3)
legend(3, 0.50, lty=1, substitute(lambda==0.5), box.lty=0)
legend(3, 0.38, lty=2, substitute(lambda==1.0), box.lty=0)
legend(3, 0.26, lty=3, substitute(lambda==1.5), box.lty=0)

# Plot middle panels with illustrative normal p.d.f. and c.d.f.

xdens <- seq(-5, 5, 0.02)
ylabdnorm <- expression(phi[mu~sigma^2] (x))
plot(xdens, dnorm(xdens, sd=sqrt(0.2)), type='l', ylim=c(0,1.0), xlab='',
ylab=ylabdnorm,lty=1)
lines(xdens,dnorm(xdens, sd=sqrt(1.0)), type='l', lty=2)
lines(xdens,dnorm(xdens, sd=sqrt(5.0)), type='l', lty=3)
lines(xdens,dnorm(xdens, mean=-2.0, sd=sqrt(0.5)), type='l', lty=4)
leg1 <- expression(mu^' '==0, mu^' '==0, mu^' '==0, mu^' '==-2)
leg2 <- expression(sigma^2==0.2, sigma^2==1.0, sigma^2==5.0, sigma^2==0.5,)
legend(0.5, 1.0, lty=1:4, leg1, lwd=2, box.lty=0)
legend(3.0, 1.01, leg2, box.lty=0)

ylabpnorm <- expression(Phi[mu~sigma^2] (x))
plot(xdens,pnorm(xdens,sd=sqrt(0.2)), type='l', ylim=c(0,1.0), xlab='',
ylab=ylabpnorm,lty=1)
lines(xdens,pnorm(xdens, sd=sqrt(1.0)), type='l', lty=2)
lines(xdens,pnorm(xdens, sd=sqrt(5.0)), type='l', lty=3)
lines(xdens,pnorm(xdens, mean=-2.0, sd=sqrt(0.5)), type='l', lty=4)
leg1 <- expression(mu^' '==0, mu^' '==0, mu^' '==0, mu^' '==-2)
leg2 <- expression(sigma^2==0.2, sigma^2==1.0, sigma^2==5.0, sigma^2==0.5,)
legend(0.5, 0.6, lty=1:4, leg1, lwd=2, box.lty=0)
legend(3.0, 0.61, leg2, box.lty=0)

# Plot bottom panels with illustrative lognormal p.d.f. and c.d.f.

xdens <- seq(0,3, 0.02)
plot(xdens, dlnorm(xdens, meanlog=0, sdlog=5), type='l', ylim=c(0,2), xlab='',
ylab='Lognormal density', lty=1)
lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1), type='l', lty=2)
lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1/2), type='l', lty=3)
lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1/8), type='l', lty=4)
leg1 <- expression(sigma==5, sigma==1, sigma==1/2, sigma==1/8)
legend(1.8,1.8,lty=1:4,leg1,box.lty=0)

plot(xdens,plnorm(xdens,meanlog=0,sdlog=5),type='l',ylim=c(0,1),xlab='x',
ylab='Lognormal distribution',lty=1)
lines(xdens,plnorm(xdens,meanlog=0,sdlog=1),type='l',lty=2)
lines(xdens,plnorm(xdens,meanlog=0,sdlog=1/2),type='l',lty=3)
lines(xdens,plnorm(xdens,meanlog=0,sdlog=1/8),type='l',lty=4)
leg1 <- expression(sigma==5,sigma==1,sigma==1/2,sigma==1/8)
legend(1.5,0.6,lty=1:4,leg1,box.lty=0)

# Return plot to single-panel format

par(mfrow=c(1,1))

******************************************************************
******************************************************************

Chapter 4  Probability Distribution Functions

# Define Pareto distributions for Salpeter IMF

dpareto <- function(x, alpha=1.35, b=1) (alpha>0)*(b>0)^alpha / x^(alpha+1)
ppareto <- function(x, alpha=1.35, b=1) (x > b)*(1-((b>0)/x)^(alpha>0))
qpareto <- function(u, alpha=1.35, b=1) (b>0)/(1-u)^(1/(alpha>0))  # 00])
lmidpts <- log(hx\$mids[counts>0])
tmp1 <- lm(ldens~lmidpts)
#   plot(lmidpts, ldens) ; abline(tmp1, col=2)
alpha.LS.hist <-  -1 - as.numeric(tmp1\$coef[2])
return(alpha.LS.hist) }
alpha.LS.hist.wt <- function(x) {
hx <- hist(x, nclass=20, plot=F)
counts <- hx\$counts
ldens <- log(hx\$density[counts>0])
lmidpts <- log(hx\$mids[counts>0])
tmp2 <- lm(ldens~lmidpts, weights=counts[counts>0])
alpha.LS.hist.wt <-  -1 - as.numeric(tmp2\$coef[2])
return(alpha.LS.hist.wt)  }
six_alphas <-  function(x) {
out <- c(alpha.MLE(x), alpha.MVUE(x), alpha.mom(x),
alpha.LS.EDF(x), alpha.LS.hist(x), alpha.LS.hist.wt(x))
return(out)  }

# Comparison of alpha estimators on simulated datasets
# Construct nsim simulated datasets with npts data points and alpha=1.35

nsim=500 ;   alpha_sim = NULL
for(i in 1:nsim) {
xtmp = rpareto(npts)
alpha_sim = rbind(alpha_sim,six_alphas(xtmp))  }
colnames(alpha_sim)=c('MLE','MVUE','Moments','LS_EDF','LS_hist',
'LS_hist_wt')

# Compute mean integrated square error

bias_sim = apply(alpha_sim,2,mean) - 1.35
var_sim =  apply(alpha_sim,2,var) *(nsim-1)/nsim
mise_sim =  bias_sim^2 + var_sim
rbind(bias_sim, var_sim, mise_sim)

# Read Milky Way Galaxy and M 31 globular cluster K magnitudes

KGC_MWG <- GC_MWG[,2]  ; KGC_M31 <- GC_M31[,2]-24.44
kseq <- seq(-15.0, -5.0, 0.25)

# Fit normal distributions

library(MASS)
par(mfrow=c(1,2))
hist(KGC_MWG, breaks=kseq, ylim=c(0,10), main='', xlab='K mag',
ylab='N',col=gray(0.5))
normfit_MWG <- fitdistr(KGC_MWG,'normal') ; normfit_MWG
lines(kseq, dnorm(kseq, mean=normfit_MWG\$estimate[[1]],
sd=normfit_MWG\$estimate[[2]]) * normfit_MWG\$n*0.25,lwd=2)
hist(KGC_M31, breaks=kseq, ylim=c(0,50), main='',xlab='K mag',ylab='N',)
normfit_M31 <- fitdistr(KGC_M31,'normal') ; normfit_M31
lines(kseq,dnorm(kseq, mean=normfit_M31\$estimate[[1]],
sd=normfit_M31\$estimate[[2]]) * normfit_M31\$n*0.25, lwd=2)

# Test for normality

install.packages('nortest') ; library(nortest)
install.packages('moments') ; library(moments)

pearson.test(KGC_M31)
skewness(KGC_M31) ; agostino.test(KGC_M31) ; jarque.test(KGC_M31)
kurtosis(KGC_M31) ; anscombe.test(KGC_M31) ; bonett.test(KGC_M31)
shapiro.test(KGC_M31) ; sf.test (KGC_M31)

# Test for multimodality

library(diptest)
dip(KGC_M31)

******************************************************************
******************************************************************

Chapter 5.  Nonparametric Statistics

# Data plots and summaries for asteroid densities

astnames <- asteroids[,1] ; dens <- asteroids[,2] ; err <- asteroids[,3]
dotchart(dens, labels=astnames, cex=0.9, xlab=expression(Density~(g/cm^3)))
plot(dens, ylim=c(0,8), xlab="Asteroids", ylab=expression(Density~(g/cm^3)),pch=20)
num <- seq(1,length(dens))
segments(num, dens+err, num, dens-err)

# Boxplot and summary statistics for asteroid data

boxplot(asteroids[,2:3], varwidth=T, notch=T, xlab="Asteroids", ylab="Density",
pars=list(boxwex=0.3,boxlwd=1.5,whisklwd=1.5,staplelwd=1.5,outlwd=1.5,
font=2))
summary(dens)
mean(dens) ; sd(dens)
weighted.mean(dens,1/err^2)

# Tests for normality

shapiro.test(dens)
install.packages('nortest') ; library('nortest')
ad.test(dens) ; cvm.test(dens) ; lillie.test(dens) ; pearson.test(dens)
install.packages('outliers') ; library('outliers')
dixon.test(dens) ; chisq.out.test(dens)
grubbs.test(dens) ; grubbs.test(dens,type=20)

# Read magnitudes for Milky Way and M 31 globular clusters

K1 <- GC1[,2] ;  K2 <- GC2[,2]
summary(K1) ; summary(K2)

# Three estimates of the distance modulus to M 31

DMmn <- mean(K2) - mean(K1) ;  DMmn
sigDMmn <- sqrt(var(K1)/length(K1) + var(K2)/length(K2)) ; sigDMmn
DMmed <- median(K2) - median(K1) ; DMmed
wilcox.test(K2, K1, conf.int=T)

# e.d.f., quantile and Q-Q plots for globular cluster magnitudes

plot(ecdf(K1), cex.points=0, verticals=T, xlab="K (mag)", ylab="e.d.f.", main="")
text(-7.5, 0.8, lab="MWG")  ;  text(-10.5,0.9, lab="M 31")

par(mfrow=c(1,3))
qqplot(K1, K2-24.90, pch=20, cex.axis=1.3, cex.lab=1.3,  xlab="MWG",
ylab="M31 - 24.90", main="")
qqnorm(K1, pch=20, cex.axis=1.3, cex.lab=1.3, main="")
qqline(K1, lty=2, lwd=1.5)
text(-2.5, -6, pos=4, cex=1.3, 'MWG normal QQ plot')
qqnorm(K2-24.90, pch=20, cex.axis=1.3, cex.lab=1.3, main="")
qqline(K2-24.90, lty=2, lwd=1.5)
text(-3, -7.5, pos=4, cex=1.3, 'M31 normal QQ plot')
par(mfrow=c(1,1))

# Plot e.d.f. with confidence bands

install.packages('sfsmisc') ; library('sfsmisc')
ecdf.ksCI(K1,ci.col='black')

# Nonparametric tests for normality

install.packages('nortest') ; library(nortest)
cvm.test(K1) ; cvm.test(K2)

# Nonparametric two-sample tests for Milky Way and M31 globular clusters

ks.test(K1, K2-24.90) # with distance modulus offset removed
wilcox.test(K1, K2-24.90)
mood.test(K1, K2-24.90)
install.packages(cramer) ; library(cramer)
cramer.test(K1, K2-24.90)

# Parametric two-sample tests

pooled_var <- ((length(K1)-1)*var(K1) + (length(K2)-1)*var(K2)) /
(length(K1)+length(K2)-2)
mean_diff <- (mean(K1) - mean(K2-24.90)) /
sqrt(pooled_var * (1/length(K1)+1/length(K2)))
mean_diff ; pnorm(mean_diff)
t.test(K1, (K2-24.90), var.eq=T)
var.test(K1, K2-24.90)

# Input star formation contingency tables

jets <- matrix(c(9,2,5,5), nrow=2)  ;  jets
evol <- matrix(c(21,1,2,0,16,14,42,1,61,90,179,2,17,4,5,5,22,95,126,10),nrow=4)
evol

# Test null hypotheses
chisq.test(jets)  ;  fisher.test(jets)
chisq.test(evol)
fisher.test(evol[2:3,]) # Chamaeleon vs. Taurus sample
fisher.test(evol[3:4,]) # Taurus vs. eta Cha sample

# Association plot
assocplot(evol,col=c("black","gray"),xlab='Regions',ylab='Evolutionary classes')

******************************************************************
******************************************************************

Chapter 6. Density Estimation or Data Smoothing

# Construct large and small samples of SDSS quasar redshifts and r-i colors

dim(qso) ; names(qso) ; summary(qso)  ;  attach(qso)
z.all <- z ;  z.200 <- z[1:200]
r.i.all <-  r_mag - i_mag ; r.i.200 <- r.i.all[1:200]
sig.r.i.all  <- sqrt(sig_r_mag^2 + sig_i_mag^2)  ; sig.r.i.200 <- sig.r.i.all[1:200]

# Plot histogram and quantile function

par(mfrow=c(1,2))
hist(z.all, breaks='scott', main='', xlab='Redshift', col='black')
plot(quantile(z.all, seq(1,100,1)/100, na.rm=T), pch=20, cex=0.5,
xlab='Percentile', ylab='Redshift')
par(mfrow=c(1,1))

# Constant kernel density estimator

plot(density(z.all), bw=bw.nrd(z.all), main='', xlab='Redshift', lwd=2)

install.packages("quantreg") ; library(quantreg)
akern.zqso <- akj(z.all, z=seq(0,5,0.01), alpha=0.5)
str(akern.zqso)
plot.window(xlim=c(0,5), ylim=c(0,0.6))
plot(seq(0,5, 0.01), akern.zqso\$dens, pch=20, cex=0.5,  xlab="Redshift",
ylab="Density")
rug(sample(z.all, 500))

# Distribution with and without measurement errors

summary(aster)  ;  dim(aster)  ;  attach(aster)
install.packages('decon')  ;  library(decon)

par(mfrow=c(1,2))
plot(ecdf(Dens), main='', xlab=expression(Asteroid~density~~g/cm^3), ylab='c.d.f.',
verticals=T, cex=0, lwd=1.5)
x <- seq(0, 6, 0.02)
cdf_sm <- DeconCdf(Dens,Err,x,bw=0.1)
lines(cdf_sm, lwd=1.5)

plot(DeconPdf(Dens,Err,x,bw=0.1), main='', xlab=expression(Asteroid~density~~
par(mfrow=c(1,1))

# Natural spline fit

install.packages("pspline") ; library(pspline)
fit=sm.spline(z.200, r.i.200)
plot(z.200, r.i.200, pch=20, cex=0.7, xlab="Redshift", ylab="r-i (mag)")
lines(fit\$x, fit\$y, lwd=2)

# Natural spline fit weighted by the variances of the measurement errors

fitw <- sm.spline(z.200, r.i.200, w=sig.r.i.200^2)
lines(fitw\$x, fitw\$y, lty=2, lwd=2)

# Bivariate Nadaraya-Watson regression estimator with bootstrap errors

install.packages("np") ; library(np)
bw.NW <- npregbw(z.200, r.i.200, regtype='lc', bwtype='fixed')
npplot(bws=bw.NW, ylim=c(-0.25,0.7), plot.errors.method="bootstrap",
plot.errors.bar='I', plot.errors.type='quantiles')
points(z.200, r.i.200, pch=20, cex=0.7)

# Local regression

q1 <- qso[qso[,10] < 0.3,] ; q1 <- q1[q1[,12]<0.3,]
dim(q1) ; names(q1) ; summary(q1)
r_i <- q1[,9] - q1[,11] ; z <- q1[,4] ; r <- q1[,9]

# 2. Plot two-dimensional smoothed distribution

install.packages('ash') ; library(ash)
nbin <- c(500, 500) ; ab <- matrix(c(0.0,-0.5,5.5,2.), 2,2)
bins <- bin2(cbind(z,r_i), ab, nbin)
f <- ash2(bins, c(5,5)) ; attributes(f)
f\$z <- log10(f\$z)

image(f\$x, f\$y, f\$z, col=gray(seq(0.5,0.2,by=-0.02)), zlim=c(-2.0,0.2),
main='', xlab="Redshift", ylab="r-i")

# 3. Construct loess local regression lines

z1 <- z[order(z)] ; r_i1 <- r_i[order(z)]
loct1 <- loess(r_i1~z1, span=0.1, data.frame(x=z1,y=r_i1))
summary(loct1)
lines(z1, predict(loct1), lwd=2, col=2)
z2 <- z1[z1>3.5]; r_i2 <- r_i1[z1>3.5]
loct2 <- loess(r_i2~z2, span=0.2, data.frame(x=z2,y=r_i2))
lines(z2, predict(loct2), lwd=2, col=3)

# 4. Save evenly-spaced loess fit to a file

x1 <- seq(0.0, 2.5, by=0.02) ; x2 <- seq(2.52, 5.0, by=0.02)
loctdat1 <- predict(loct1, data.frame(x=x1))
loctdat2=predict(loct2, data.frame(x=x2))
write(rbind(x1,loctdat1), sep=' ', ncol=2, file='qso.txt')
write(rbind(x2,loctdat2), sep=' ', ncol=2, file='qso.txt', append=T)

******************************************************************
******************************************************************

Chapter 7.  Regression

# Linear regression with heteroscedastic measurement errors
# Construct and plot dataset of SDSS quasars with 1818)),c(11:14)]
dim(qso1) ; summary(qso1) ; attach(qso1)
sig_z_mag[sig_z_mag<0.01] <- 0.01

dev.new(2)
plot(i_mag, z_mag, pch=20, cex=0.1, col=grey(0.5), xlim=c(18,21.5),
ylim=c(17.5,22), xlab="SDSS i (mag)", ylab="SDSS z (mag)")
for(i in 50:150) {
lines(c(i_mag[i],i_mag[i]),c((z_mag[i]+sig_z_mag[i]),
(z_mag[i]-sig_z_mag[i])))
lines(c((i_mag[i]+sig_i_mag[i]),(i_mag[i]-sig_i_mag[i])),
c(z_mag[i],z_mag[i]))   }

# Ordinary least squares fit

fit_ols <- lm(z_mag~i_mag)
summary(fit_ols)
confint(fit_ols, level=0.997)
dev.set(2) ;  abline(fit_ols\$coef, lty=1 ,lwd=2)

dev.new(3) ; par(mfrow=c(2,2))
plot(fit_ols, which=c(2:5), caption='', sub.caption='' ,pch=20, cex=0.3,
cex.lab=1.3, cex.axis=1.3)
par(mfrow=c(1,1))

# Weighted least squares fit

fit_wt <- lm(z_mag~i_mag, x=T, weights=1/(sig_z_mag*sig_z_mag))
summary(fit_wt)
dev.set(2) ; abline(fit_wt\$coef,lty=2,lwd=2)

# Generalized linear modeling

fit_glm_gau <- glm(z_mag ~ i_mag) ; summary(fit_glm_gau)
fit_glm_gam <- glm(z_mag ~ i_mag,family=Gamma) ; summary(fit_glm_gam)

# Robust M-estimator

library(MASS)
fit_M <- rlm(z_mag~i_mag, method='M', weights=1/(sig_z_mag*sig_z_mag),
wt.method='inv.var')
summary(fit_M)
length(which(fit_M\$w<1.0))
dev.set(3) ; hist(fit_M\$w, breaks=50, ylim=c(0,700), xlab='Robust regression
weights',  main='')
aM <- fit_M\$coef[[1]] ; bM <- fit_M\$coef[[2]]
dev.set(2) ; lines(c(18,22), c(aM+bM*18, aM+bM*22), lty=3, lwd=3)

# Thiel-Sen regression line

install.packages('zyp') ; library(zyp)
fit_ts <- zyp.sen(z_mag~i_mag, qso1[1:8000,])
confint.zyp(fit_ts)
dev.set(2) ; abline(fit_ts\$coef, lty=4, lwd=2)

# Linear quantile regression

install.packages('quantreg') ; library(quantreg)
fit_rq <- rq(z_mag~i_mag, tau=c(0.10,0.50,0.90))
print.summary.rq(fit_rq)
par(mfrow=c(1,2))
plot(i_mag, z_mag, pch=20, cex=0.1, col=grey(0.5), xlim=c(18,22),
ylim=c(17.5,22), xlab="SDSS i (mag)", ylab="SDSS z (mag)")
for(j in 0:2) {
aquant=fit_rq\$coef[[2*j+1]] ; bquant=fit_rq\$coef[[2*j+2]]
lines(c(18,22),c((aquant+bquant*18),(aquant+bquant*22)),lwd=2) }

# Nonlinear quantile regression

fit_rqss.1 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.10)
fit_rqss.5 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.50)
fit_rqss.9 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.90)
plot.rqss(fit_rqss.1, rug=F, ylim=c(17.5,22), titles='')
points(i_mag, z_mag, cex=0.1, pch=20, col=grey(0.5))
par(mfrow=c(1,1))

# Fit Sersic function to NGC 4472 elliptical galaxy surface brightness profile

NGC4472 <-
attach(NGC4472)
NGC4472.fit <-  nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)*
r.e=120.,n=4.), model=T, trace=T)
summary(NGC4472.fit)
deviance(NGC4472.fit)
logLik(NGC4472.fit)

# Plot NGC 4472 data and best-fit model

xlab="r  (arcsec)", ylab=expression(mu ~~ (mag/sq.arcsec)), ylim=c(16,28),
cex.lab=1.5, cex.axis=1.5)

# Fit and plot radial profiles of NGC 4406 and NGC 4451

NGC4406 <-
attach(NGC4406)
NGC4406.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)*
r.e=120.,n=4.), model=T, trace=T)
summary(NGC4406.fit)

NGC4551 <-
attach(NGC4551)
NGC4551.fit <-  nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)*
model=T, trace=T)
summary(NGC4551.fit)
legend(500, 20, c("NGC 4472","NGC 4406", "NGC 4551"), pch=c(20,3,5))

# NGC 4472 analysis
# Residual plot

ylab="Residuals", pch=20, cex.lab=1.5, cex.axis=1.5)
lwd=2)

# Test for normality of residuals

qqnorm(residuals(NGC4472.fit) / summary(NGC4472.fit)\$sigma)
abline(a=0,b=1)
shapiro.test(residuals(NGC4472.fit) / summary(NGC4472.fit)\$sigma)

# Bootstrap parameter estimates

install.packages('nlstools') ; library(nlstools)
NGC4472.boot <- nlsBoot(NGC4472.fit)
summary(NGC4472.boot)
curve(dnorm(x,m=5.95, sd=0.10)*58/5.95, xlim=c(5.6,6.4), ylim=c(0,50))
hist(NGC4472.boot\$coefboot[,3], breaks=50, add=T)  # not shown

******************************************************************
******************************************************************

Chapter 8.  Multivariate Analysis

# Overview of globular cluster properties

summary(GC)
manyhist <- function(x) {
par(mfrow=n2mfrow(dim(x)[2]))
for (i in 1:dim(x)[2]) { name=names(x)[i]
hist(x[,i], main='', breaks='FD', ylab='', xlab=name) }}
par(mfrow=c(1,1))
manyhist(GC[,2:13])

# Univariate boxplots and normal quantile-quantile plots for two variables

par(mfrow=c(1,4))
boxplot(GC[,8], main='', ylab=expression(Core~r~~ (pc)), pars=list(xlab='', cex.lab=1.3, cex.axis=1.3, pch=20))
qqnorm(GC[,8], main='', xlab='', ylab='', cex.lab=1.3, cex.axis=1.3, pch=20)
boxplot(GC[,20], main='', ylab=expression(CSB ~~(mag/arcmin^2)),
cex.lab=1.3, cex.axis=1.3, pars=list(xlab='', pch=20))
qqnorm(GC[,20], main='', xlab='', ylab='', cex.lab=1.3,
cex.axis=1.3, pch=20)
qqline(GC[,20])
par(mfrow=c(1,1))

#  Prepare the data
# 1. Remove objects with NA entries, remove labels

dim(GC) ; GC1 <- na.omit(GC[,-1]) ; dim(GC1)

# 2. Standardize variables

GC2 <- scale(GC1)

# 3. Separate locational and dynamical variables

GCloc <- GC2[,+c(1:4)]
GCdyn <- GC2[,-c(1:4)]

# 4. Remove two bad outliers

GCdyn1 <- GCdyn[-c(which.max(GCdyn[,4]), which.max(GCdyn[,11])),]
GCloc1 <- GCloc[-c(which.max(GCdyn[,4]), which.max(GCdyn[,11])),]

# Bivariate relationships

cor(GCdyn1, method='kendall')
var(GCdyn1)
pairs(GCdyn1[,3:8],pch=20,cex=0.4)

# Elaborated color pairs plot

library(MASS)
pairs(GCdyn1[,5:8],main='',labels=names(GCdyn1[5:8]), panel=function(x,y) {
+    	abline(lsfit(x,y)\$coef,lwd=2,col='deeppink2')
+       lines(lowess(x,y),lwd=3,col='blue3',lty=2)
+ 	points(x,y,pch=21,bg = c("red", "green3", "blue"))
+ 	rug(jitter(x,factor=3),side=1,col='lightcoral',ticksize=-.05)
+ 	rug(jitter(y,factor=3),side=2,col='cornflowerblue',ticksize=-.05)
+ 	contour(kde2d(x,y)\$x, kde2d(x,y)\$y, kde2d(x,y)\$z, drawlabels=F,add=T, col='darkblue',nlevel=4)
+ 	})

# PCA for dynamical variables.

PCdyn <- princomp(GCdyn1)
plot(PCdyn, main='')
summary(PCdyn)
biplot(PCdyn, col='black', cex=c(0.6,1))

# Add principal component values into the data frame

PCdat <- data.frame(names=row.names(GCdyn1), GCdyn1, PCdyn\$scores[,1:4])

# Multiple regression to predict globular cluster central surface brightnesses

attach(GCdyn)
CSB_fit1 <- lm(Cent.surf.bright~.-Cent.surf.bright,data=GCdyn) ; CSB_fit1
CSB_fitt <- lm(Cent.surf.bright~.,data=GCdyn[,c(7:11,13)]) ; CSB_fit2
str(CSB_fit2)
summary(CSB_fit2)
sd(CSB_fit2\$residuals)
par(mfrow=c(2,1))
plot(CSB_fit2\$fitted.values, Cent.surf.bright, pch=20)
qqnorm(CSB_fit2\$residuals, pch=20, main='')
qqline(CSB_fit2\$residuals)
par(mfrow=c(1,1))

# MARS nonlinear regression

install.packages('earth') ; library(earth)
CSB_fit3 <- earth(Cent.surf.bright~.-Cent.surf.bright, data=GCdyn) ; CSB_fit3
sd(CSB_fit3\$residuals)
qqnorm(CSB_fit2\$residuals) ; qqline(CSB_fit2\$residuals)

# Some multivariate display techniques:
# interactive 3-dim scatter plot; 4-dim bubble plot; parallel coordinates plot

library(rgl)
open3d() ; plot3d(GCdyn1[,5:7])
snapshot3d(file='GlobClus3D.png')

library(lattice)
cloud(GCdyn1[,5]~GCdyn1[,6]*GCdyn[,7], screen=list(z=60,x=45,y=20),
xlab='log.t.rad', ylab='log.rho.cen', zlab='conc', col=1, pch=1, cex=GCdyn1[,8]+1)

parallel(~GCdyn1, col=c('darkred','darkgreen','orange','blue','black'))

******************************************************************
******************************************************************

Chapter 9.  Clustering, Classification and Data Mining

# Color-magnitude diagram for low-redshift COMBO-17 galaxies

dim(COMBO) ; names(COMBO)
par(mfrow=c(1,2))
plot(COMBO_loz, pch=20, cex=0.5, xlim=c(-22,-7), ylim=c(-2,2.5),
xlab=expression(M[B]~~(mag)), ylab=expression(M[280] - M[B]~~(mag)),
main='')

# Two-dimensional kernel-density estimator

library(MASS)
COMBO_loz_sm <- kde2d(COMBO_loz[,1], COMBO_loz[,2], h=c(1.6,0.4),
lims = c(-22,-7,-2,2.5), n=500)
image(COMBO_loz_sm, col=grey(13:0/15), xlab=expression(M[B]~~(mag)),
ylab=expression(M[280] - M[B]~~(mag)), xlim=c(-22,-7), ylim=c(-2,2.5),
xaxp=c(-20,-10,2))
par(mfrow=c(1,1))

# Standardize variables

Mag_std <- scale(COMBO_loz[,1])
Color_std <- scale(COMBO_loz[,2])
COMBO_std <- cbind(Mag_std,Color_std)

# Hierarchical clustering

COMBO_dist <- dist(COMBO_std)
COMBO_hc <- hclust(COMBO_dist, method='complete')
COMBO_coph <- cophenetic(COMBO_hc)
cor(COMBO_dist, COMBO_coph)

# Cutting the tree at k=5 clusters

plclust(COMBO_hc, label=F)
COMBO_hc5a <- rect.hclust(COMBO_hc, k=5, border='black')
str(COMBO_hc5a)
COMBO_hc5b <- cutree(COMBO_hc, k=5)
str(COMBO_hc5b)
plot(COMBO_loz, pch=(COMBO_hc5b+19), cex=0.7, xlab=expression(M[B]~~(mag)),
ylab=expression(M[280] - M[B]~~(mag)), main='')

# Density-based clustering algorithm

install.packages('fpc') ; library(fpc)
COMBO_dbs <-  dbscan(COMBO_std, eps=0.1, MinPts=5, method='raw')
print.dbscan(COMBO_dbs) ; COMBO_dbs\$cluster
plot(COMBO_loz[COMBO_dbs\$cluster==0,], pch=20, cex=0.7, xlab='M_B (mag)',
ylab='M_280 - M_B (mag)')
points(COMBO_loz[COMBO_dbs\$cluster==2,], pch=2, cex=1.0)
points(COMBO_loz[COMBO_dbs\$cluster==1 | COMBO_dbs\$cluster==3,], pch=1, cex=1.0)

# r-band distribution of Sloan quasars

dim(SDSS_qso) ; summary(SDSS_qso)
qso_r <- SDSS_qso[,5] ; n_qso <- length(qso_r)
par(mfrow=c(1,2))
hist(qso_r, breaks=100, main='', xlim=c(16,26))

#  Normal mixture model

install.packages('mclust')  ; library(mclust)
fit.mm <- Mclust(qso_r,modelNames="V")
plot(fit.mm, ylim=c(-67000,-65000))

str(fit.mm)
fit.mm\$parameters\$mean
sqrt(fit.mm\$parameters\$variance\$sigmasq)
fit.mm\$parameters\$pro

par(mfrow=c(1,1))
plot(ecdf(qso_r), xlim=c(16,26), xlab=' r (mag)', main='')

r.seq <- seq(10,30,0.02)
sum.comp <- rep(0,length(r.seq))
for (i in 1:fit.mm\$G) { sum.comp <- sum.comp +  pnorm(r.seq, mean=
fit.mm\$parameters\$mean[[i]], sd=sqrt(fit.mm\$parameters\$variance\$sigmasq[[i]])) *
fit.mm\$parameters\$pro[[i]] }
lines(r.seq, sum.comp, lwd=3, lty=2)
for (i in 1:fit.mm\$G) {lines(r.seq, dnorm(r.seq, mean=fit2\$parameters\$mean[[i]],
sd=sqrt(fit2\$parameters\$variance\$sigmasq[[i]])) * fit2\$parameters\$pro[[i]])  }

# Model-based clustering

library(mclust)
COMBO_mclus <- mclustBIC(COMBO_loz,modelNames='VVV')
plot(COMBO_mclus, col='black')
COMBO_sum_mclus <- summary.mclustBIC(COMBO_mclus,COMBO_loz,3)
COMBO_sum_mclus\$parameters ; COMBO_sum_mclus\$classification
COMBO_sum_mclus\$z ; COMBO_sum_mclus\$uncertainty
plot(COMBO_loz, pch=(19+COMBO_sum_mclus\$classification), cex=1.0, xlab='M_B (mag)',
ylab='M_280 - M_B (mag)', main='COMBO-17 MVN model clustering (k=3)',
cex.lab=1.3, cex.axis=1.3)

# R script for constructing SDSS test and training datasets is given
# in Appendix C.

# Unsupervised k-means partitioning

SDSS.kmean <- kmeans(SDSS_test,6)
print(SDSS.kmean\$centers)
plot(SDSS_test[,1], SDSS_test[,2], pch=20, cex=0.3, col=gray(SDSS.kmean\$cluster/7),
xlab='u-g (mag)', ylab='g-r (mag)', xlim=c(-0.5,3), ylim=c(-0.6,1.5))

# Linear discriminant analysis

library(MASS)
SDSS_lda <- lda(SDSS_train[,1:4], as.factor(SDSS_train[,5]))
SDSS_train_lda <- predict(SDSS_lda, SDSS_train[,1:4])
SDSS_test_lda <- predict(SDSS_lda, SDSS_test[,1:4])

par(mfrow=c(2,1))
plot(SDSS_test[,1],SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20,
col=SDSS_test_lda\$class, cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)')

# k-nn classification

install.packages('class') ; library(class)
SDSS_knn4 <- knn(SDSS_train[,1:4], SDSS_test,
as.factor(SDSS_train[,5]), k=4, prob=T)
plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20,
col=SDSS_knn4, cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)')

# Validation of k-nn classification

SDSS_train_lda <- lda(SDSS_train[,1:4], as.factor(SDSS_train[,5]))
SDSS_train_knn4 <- knn(SDSS_train[,1:4], SDSS_train[,1:4], SDSS_train[,5],k=4)

plot(jitter(as.numeric(SDSS_train_lda\$class), factor=0.5), jitter(as.numeric
(SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='LDA class',
ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2))
plot(jitter(as.numeric(SDSS_train_knn4), factor=0.5), jitter(as.numeric
(SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='k-nn class',
ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2))
par(mfrow=c(1,1))

# Single layer neutral network

options(size=100, maxit=1000)
SDSS_nnet <- multinom(as.factor(SDSS_train[,5]) ~ SDSS_train[,1] + SDSS_train[,2] +
SDSS_train[,3] + SDSS_train[,4], data=SDSS_train)
SDSS_train_nnet <- predict(SDSS_nnet,SDSS_train[,1:4])
plot(jitter(as.numeric(SDSS_train_nnet), factor=0.5), jitter(as.numeric
(SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='nnet class',
ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2))

# Classification And Regression Tree model, prediction and validation

library('rpart')
SDSS_rpart_mod <- rpart(SDSS_train[,5] ~., data=SDSS_train[,1:4])
SDSS_rpart_test_pred <- predict(SDSS_rpart_mod, SDSS_test)
SDSS_rpart_train_pred <- predict(SDSS_rpart_mod, SDSS_train)
summary(SDSS_rpart_mod) ; str(SDSS_rpart_mod)

plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20,
col=round(SDSS_rpart_test_pred), cex=0.5,
main='', xlab='u-g (mag)', ylab='g-r (mag)')
plot(jitter(SDSS_rpart_train_pred, factor=5), jitter(SDSS_train[,5]), pch=20,
cex=0.5, xlab='CART class', ylab='True class',yaxp=c(1,3,2))

plot(SDSS_rpart_mod, branch=0.5, margin=0.05)
text(SDSS_rpart_mod, digits=3, use.n=T, cex=0.8)
plotcp(SDSS_rpart_mod, lwd=2, cex.axis=1.3, cex.lab=1.3)

# Support Vector Machine model, prediction and validation

install.packages('e1071') ; library(e1071)
SDSS_svm_mod <- svm(SDSS_train[,5] ~.,data=SDSS_train[,1:4],cost=100, gamma=1)
summary(SDSS_svm_mod) ; str(SDSS_svm_mod)
SDSS_svm_test_pred <- predict(SDSS_svm_mod, SDSS_test)
SDSS_svm_train_pred <- predict(SDSS_svm_mod, SDSS_train)

plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20,
col=round(SDSS_svm_test_pred), cex=0.5, main='',
xlab='u-g (mag)', ylab='g-r (mag)')
plot(SDSS_svm_train_pred, jitter(SDSS_train[,5]), pch=20, cex=0.5,
xlab='SVM class', ylab='True class', yaxp=c(1,3,2))

# Final SVM classification of the test set

par(mfrow=c(1,3))
plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), col=round(SDSS_svm_test_pred),
ylim=c(-0.7,1.8), pch=20, cex=0.5, main='', xlab='u-g (mag)',ylab='g-r (mag)')
plot(SDSS_test[,2], SDSS_test[,3], xlim=c(-0.7,1.8), col=round(SDSS_svm_test_pred),
ylim=c(-0.7,1.8), pch=20, cex=0.5, main='', xlab='g-r (mag)',ylab='r-i (mag)')
plot(SDSS_test[,3], SDSS_test[,4], xlim=c(-0.7,1.8), col=round(SDSS_svm_test_pred),
ylim=c(-1.1,1.3), pch=20, cex=0.5, main='', xlab='r-i (mag)',ylab='i-z (mag)')
par(mfrow=c(1,1))

SDSS_test_svm_out <- cbind(SDSS[,6], SDSS[,7], SDSS_test, SDSS_svm_test_pred)
names(SDSS_test_svm_out)[c(1,2,7)] <- c('R.A.', 'Dec', 'SVM Class')
write.table(format(SDSS_test_svm_out), file='SDSS_test_svm.out',sep='\\t',quote=F)

******************************************************************
******************************************************************

Chapter 10. Nondetections: Censored and Truncated Data

# Construct sample of X-ray luminosities of low-redshift (z<0.3) Sloan quasars

dim(qso) ; names(qso) ; summary(qso)
qso.lowz <- qso[qso[,2]<0.3,]
dim(qso.lowz) ; names(qso.lowz) ; summary(qso.lowz)
attach(qso.lowz)

# X-ray luminosities for detections and (approximately) for upper limits

ROSAT[ROSAT < (-8.900)] <- NaN  # X-ray nondetections
Dist <- (3*10^{10})*z*(3.09*10^{24}) / (73*10^5)
XLum <- 8.3*10^(-12)*10^(ROSAT) * (4*pi*Dist^2)
XLum[is.na(ROSAT)] <- 1.*10^(-12)*runif(1)* (4*pi*Dist[is.na(ROSAT)]^2)

par(mfrow=c(1,2))
plot(log10(XLum[!is.na(ROSAT)]), ylim=c(42.5,45), xlab='',
ylab='log(Lx) erg/s', pch=20)
plot(log10(XLum[is.na(ROSAT)]), ylim=c(42.5,45), xlab='',
ylab='log(Lx) erg/s', pch=25)
par(mfrow=c(1,1))

# Construct quasar Kaplan-Meier luminosity function and 95% confidence band

library(survival)
Xstatus <- seq(1,1,length.out=400)  # 1 = detected
Xstatus[is.na(ROSAT)] <- 0  # 0 = right-censored
survobj <- Surv(-XLum, Xstatus)
KM.XLF<- survfit(survobj~1, conf.int=0.95, conf.type='plain',
conf.lower='modified')
KM.output <- summary(survfit(survobj~1))

# Plot Kaplan-Meier estimator with confidence bands

plot(log10(-KM.XLF\$time), KM.XLF\$surv,  ylim=c(0,1), pch=20, cex=0.5, main='',
xlab=expression(log*L[x]), ylab='Kaplan-Meier estimator')
lines(log10(-KM.XLF\$time), KM.XLF\$upper, lty=2)
lines(log10(-KM.XLF\$time), KM.XLF\$lower, lty=2)

# Prepare data for survival two-sample tests
# Construct subsamples for XLFs of radio detected vs. undetected quasars
# Class = 1 are radio detections, Class = 2 are radio nondetections
# X-ray luminosities are obtained from ROSAT counts

ROSAT[ROSAT < (-8.900)] <- NaN
Xstatus <- seq(1,1,length.out=370)
Xstatus[is.na(ROSAT[1:370])] <- 0
Dist <- (3*10^{10})*z*(3.09*10^{24}) / (73*10^5)
XLum <- seq(1,1,length.out=370)
XLum[1:370] <- 8.3*10^(-12)*10^(ROSAT) * (4*pi*Dist^2)
XLum[is.na(ROSAT[1:370])] <- 1.*10^(-12)*runif(1)* (4*pi*Dist[is.na(ROSAT)]^2)

# Apply five two-sample tests for radio detected vs. nondetected subsamples

install.packages('surv2sample') ; library(surv2sample)
surv2.ks.out

# Read dataset on beryllium and lithium abundances in stars
dim(abun) ; names(abun) ; attach(abun)

# Boxplot of abundances for stars with and without planets

cen_Be <- seq(FALSE, FALSE, length=68) ; cen_Be[Ind_Be==0] <- TRUE
cen_Li <- seq(FALSE, FALSE, length=68) ; cen_Li[Ind_Li==0] <- TRUE
Type_factor <- as.factor(Type)
par(mfrow=c(1,2))
cenboxplot(logN_Be, cen_Be, Type_factor, log=FALSE, ylab='log N(Be)',
names=c('Planets','No planets'), boxwex=0.5, notch=TRUE, varwidth=TRUE,
cex.axis=1.5, cex.lab=1.5, lwd=2)
cenboxplot(logN_Li, cen_Li, Type_factor, log=FALSE, ylab='log N(Li)',
names=c('Planets','No planets'), boxwex=0.5, notch=TRUE, varwidth=TRUE,
cex.axis=1.5, cex.lab=1.5, lwd=2)
par(mfrow=c(1,1))

# Test significance of possible lithium abundance effect

logN_Li1 <- logN_Li[-c(1,23)] ; cen_Li1 <- cen_Li[-c(1,23)]
Type_factor1 <- Type_factor[-c(1,23)] # remove NaN values
cendiff(logN_Li1, cen_Li1, Type_factor1, rho=0)
cendiff(logN_Li1, cen_Li1, Type_factor1,rho=1)

# Reproduce Santos et al. (2002) plot of stellar beryllium vs. lithium abundance

ind_det1 <- which(Ind_Li==1 & Ind_Be==1 & Type==1) # filled circles
ind_det2 <- which(Ind_Li==1 & Ind_Be==1 & Type==2) # open circles
ind_left1 <- which(Ind_Li==0 & Ind_Be==1 & Type==1)
ind_left2 <- which(Ind_Li==0 & Ind_Be==1 & Type==2)
ind_down1 <- which(Ind_Li==1 & Ind_Be==0 & Type==1)
ind_down2 <- which(Ind_Li==1 & Ind_Be==0 & Type==2)
ind_both1 <- which(Ind_Li==0 & Ind_Be==0 & Type==1)
ind_both2 <- which(Ind_Li==0 & Ind_Be==0 & Type==2)

plot(logN_Li[ind_det1], logN_Be[ind_det1], xlim=c(-0.6,3.5), ylim=c(-0.2,1.5),
main="", xlab="log N(Li)",
ylab="log N(Be)", pch=16, lwd=2) # plot detections
points(logN_Li[ind_det2], logN_Be[ind_det2], pch=1)
arrows(logN_Li[ind_left1], logN_Be[ind_left1], logN_Li[ind_left1]-0.2,
logN_Be[ind_left1],length=0.1) # plot left arrows
arrows(logN_Li[ind_left2], logN_Be[ind_left2], logN_Li[ind_left2]-0.2,
logN_Be[ind_left2],length=0.1)
points(logN_Li[ind_left1], logN_Be[ind_left1], pch=16)
points(logN_Li[ind_left2], logN_Be[ind_left2], pch=1)
arrows(logN_Li[ind_down1], logN_Be[ind_down1], logN_Li[ind_down1],
logN_Be[ind_down1]-0.1, length=0.1)
arrows(logN_Li[ind_down2], logN_Be[ind_down2], logN_Li[ind_down2],
logN_Be[ind_down2]-0.1, length=0.1)
points(logN_Li[ind_down1], logN_Be[ind_down1], pch=16)
points(logN_Li[ind_down2], logN_Be[ind_down2], pch=1)

arrows(logN_Li[ind_both1], logN_Be[ind_both1],
logN_Li[ind_both1]-0.2, logN_Be[ind_both1], length=0.1) # plot double arrows
arrows(logN_Li[ind_both1], logN_Be[ind_both1], logN_Li[ind_both1],
logN_Be[ind_both1]-0.1,length=0.1)
arrows(logN_Li[ind_both2], logN_Be[ind_both2], logN_Li[ind_both2]-0.2,
logN_Be[ind_both2],length=0.1)
arrows(logN_Li[ind_both2], logN_Be[ind_both2], logN_Li[ind_both2],
logN_Be[ind_both2]-0.1,length=0.1)
points(logN_Li[ind_both1], logN_Be[ind_both1], pch=16)
points(logN_Li[ind_both2], logN_Be[ind_both2], pch=1)

# Bivariate correlation and regression using Akritas-Thiel-Sen procedure

logN_Li1 <- logN_Li[-c(1,23)] # remove two points with NaN entries
Ind_Li1 <- Ind_Li[-c(1,23)] ;
logN_Be1 <- logN_Be[-c(1,23)]
Ind_Be1 <- Ind_Be[-c(1,23)]
Li_cen <- seq(FALSE, FALSE, length=66) # construct censoring indicator variables
Li_cen[which(Ind_Be1==0)]  <- TRUE
Be_cen=seq(FALSE, FALSE, length=66)
Be_cen[which(Ind_Li1==0)] <- TRUE
cenken_out <- cenken(logN_Be1, Be_cen, logN_Li1, Li_cen)
abline(a=cenken_out\$intercept, b=cenken_out\$slope, lwd=2)

# Construct a sample of bright nearby Hipparcos stars

attach(hip) ; dim(hip); summary(hip)

# Plot luminosity distribution of stars and their truncation limits

AbsMag <- Vmag + 5*log10(Plx/1000) + 5
Lum <- 2.512^(4.84 - AbsMag)
plot(density(log10(Lum)),ylim=c(0,1.7), main='',
xlab='log L (solar, V band)', lwd=2, cex.lab=1.2)
AbsMaglim <- 10.5 + 5*log10(Plx/1000) + 5
Lumlim <- 2.512^(4.84 - AbsMaglim)
lines(density(log10(Lumlim)), lty=2, lwd=2)
text(0.7, 0.5, 'Hipparcos sample', pos=4, font=2)
text(-0.5, 1.2, 'Truncation limits', pos=4, font=2)

# Compute Lynden-Bell-Woodroofe estimator  with 90% confidence bands

install.packages('DTDA') ; library(DTDA)
LBW.hip <- efron.petrosian(log10(Lum), log10(Lumlim), boot=T, B=100, alpha=0.1)
summary(LBW.hip)
plot(LBW.hip\$time, LBW.hip\$survival, pch=20, cex=0.6, main='',
xlab='log L (solar, V band)', ylab='Density', cex.lab=1.2)
upper <- LBW.hip\$upper.Sob[-(1000:1013)]
lower <- LBW.hip\$lower.Sob[-(1000:1013)]
lines(LBW.hip\$time,upper, lty=2) ; lines(LBW.hip\$time, lower, lty=2)

# Compare with observed Wielen 1983 local star LF

Wielen.MV <- seq(0, 12, 1)
Wielen.LF <- c(35,126,209,380,676,955,1050,891,1120,1410,2140,2510,4470)
points(log10(2.512^(4.84-Wielen.MV)), Wielen.LF/2500)

# Compare with binned 1/V.max luminosity function (Schmidt 1968)

Vol.max <- (4*pi/3)*(1000/Plx)^3
bin.sum <- function(vol) sum(1/vol)
Lum.bins <- cut(log10(Lum), breaks=seq(-2.5,3.0,0.5), ord=T)
Schmidt.LF <- by(Lum, Lum.bins, bin.sum)
lines(seq(-2.25, 2.75, 0.5), Schmidt.LF/4500, pch=3, lwd=2)

******************************************************************
******************************************************************

Chapter 11. Time Series Analysis

# Read in GX 5-1 data and create time series

GX.dat <- scan("http://astrostatistics.psu.edu/MSMA/datasets/GX.dat")
GX.time <- seq(from=0, to=512, length.out=65536)
GX.ts <-  ts(GX.dat, GX.time) ; GX.ts.offset <- ts(GX.dat-30, GX.time)

# Compare histogram of counts to normal distribution

hist(GX.dat, breaks=100, xlim=c(40,100), ylim=c(0,3500), xlab='GX 5-1 counts',
font=2, font.lab=2, main='')
sd(GX.dat) / sqrt(mean(GX.dat))  # result is 1.24

# Examine raw and smoothed time series

plot.ts(GX.ts[1:6000], ylab='GX 5-1 counts', xlab='Time (1/128 sec)',
cex.lab=1.3, cex.axis=1.3)
plot(GX.time,GX.dat, ylim=c(-10,115), xlab='Time (sec)', ylab="GX 5-1 counts",
cex.lab=1.3, cex.axis=1.3, type='n')
lines(ksmooth(GX.time, GX.dat+30, 'normal', bandwidth=7), lwd=2)
text(450, 110, 'Normal kernel')
lines(filter(GX.ts, sides=2, rep(1,7)/7), lwd=2)
text(450, 85, 'Moving average')
lines(kernapply(GX.ts.offset, kernel('modified.daniell', 7)), lwd=2)
text(450, 50, 'Modified daniell')
lines(supsmu(GX.time, GX.dat-60), lwd=2)
text(400, 20, "Friedman's supersmoother")
lines(lowess(GX.time, GX.dat-80, 0.05), lwd=2)
text(450, 0, "Local regression")

# Raw periodogram

f <- 0:32768/65536
I <- (4/65536) * abs(fft(GX.ts) / sqrt(65536))^2
plot(f[2:60000], I[2:60000], type="l", ylab="Power", xlab="Frequency")

Pergram <- spec.pgram(GX.ts,log='no',main='')
summary(Pergram)
Pergram\$spec[500] # value of 500th point
2*Pergram\$spec[500] / qchisq(c(0.025,0.975),2)

# Raw and smoothed periodogram

par(mfrow=c(3,1))
spec.pgram(GX.ts, log='no', main='', sub='')
spec.pgram(GX.ts, spans=50, log='no', main='', sub='')
spec.pgram(GX.ts, spans=500, log='no', main='', sub='')

# Autocorrelation functions of the GX 5-1 time series

par(mfrow=c(1,2))
acf(GX.ts, 40, main='', ci.col='black', ylim=c(-0.05,0.3), lwd=2)
pacf(GX.ts, 40, main='', ci.col='black', ylim=c(-0.05,0.3), lwd=2)

# Autoregressive modeling

ARmod <- ar(GX.ts, method='ols')
ARmod\$order # model selection based on AIC
ARmod\$ar # best-fit parameter values
ARmod\$asy.se.coef\$ar # parameter standard errors
plot(0:29, log10(ARmod\$aic[1:30]), xlab='AR model parameters',
ylab='log(AIC)', pch=20)
arrows(27, 0.4, 27, 0.0, length=0.1)

# Spectrum of AR model

ARspec <- spec.ar(GX.ts, plot=F)
GXspec <- spec.pgram(GX.ts, span=101, main='', sub='', lwd=2)
lines(ARspec\$freq, ARspec\$spec, col='red', lwd=4)
legend(0.23, 550, c('Periodogram, Daniell smooth', 'AR(27) model'),
lty=c(1,1), lwd=c(2,4), col=c('black','red'))

# Spectral estimates of the long-range memory parameter d

install.packages('fracdiff') ; library(fracdiff)
d.FARIMA <- fracdiff(GX.ts, nar=27, nma=1, ar=ARmod\$ar) ; d.FARIMA\$d
d.GPH <- fdGPH(GX.ts)
d.Reisen <- fdSperio(GX.ts)

# Time domain estimates of the long-range memory parameter H=d+1/2

install.packages('fractal') ; library(fractal)
d.DFA <- DFA(GX.ts) ;  d.DFA
plot(d.DFA)
d.ACVF <- hurstACVF(GX.ts) ; d.ACVF
d.block <- hurstBlock(GX.ts)  ; d.block

# Discrete wavelet transform

install.packages('waveslim') ; library(waveslim)
GX.wav <-  dwt(GX.ts,n.levels=10)
par(mfrow=c(3,1))
plot.ts(up.sample(GX.wav[[4]],2^4),type='h',axes=F,ylab='') ; abline(h=0)
plot.ts(up.sample(GX.wav[[7]],2^7),type='h',axes=F,ylab='',lwd=2) ; abline(h=0)
plot.ts(up.sample(GX.wav[[10]],2^{10}),type='h',axes=F,ylab='',lwd=2) ; abline(h=0)
par(mfrow=c(1,1))

******************************************************************
******************************************************************

Chapter 12. Spatial Point Processes

# Construct three galaxy redshift datasets, plot using spatstat

attach(shap) ; dim(shap) ; summary(shap)
shap.hi <- shap[(R.A. < 205) & (R.A. > 200) & (Dec. > -34) & (Dec. < -29) ,]
shap.lo <- shap[(R.A. < 214) & (R.A. > 209) & (Dec. > -34) & (Dec. < -27) ,]
shap.clus <- shap[(R.A. <204.5) & (R.A. > 200.4) & (Dec. > -32.5) & (Dec. < -31.0)
& (Vel > 11000) & (Vel < 18000),]

# Plot in 3-dimensions using rgl, plot and scatterplot3d

install.packages('rgl') ; library(rgl)
rgl.open()
rgl.points(scale(shap[,1]), scale(shap[,2]), scale(shap[,4]))
rgl.bbox()
rgl.snapshot('Shapley.png')
rgl.close()

plot(shap.lo[,1], shap.lo[,2], cex=(scale(shap.lo[,4])+1.5)/2,
xlab='Right Ascension (degrees)', ylab='Declination (degrees)')

install.packages('scatterplot3d') ; library(scatterplot3d)
scatterplot3d(shap.hi[,c(1,2,4)], pch=20, cex.symbols=0.7, type='p', angl=40,
zlim=c(0,50000))

# Preparation of nearest neighbor lists for spdep analysis

install.packages('spdep') ; library(spdep)
shap.lo.mat <- as.matrix(shap.lo[,1:2])
nn.shap.lo <- knearneigh(shap.lo.mat, k=1) ; str(nn.shap.lo)
nb.shap.lo <- knn2nb(nn.shap.lo) ; plot.nb(nb.shap.lo, shap.lo[,1:2])
nb.wt.shap.lo <- nb2listw(nb.shap.lo,style='B') ; summary(nb.wt.shap.lo)

# Application of Moran's I and Geary's C statistics

moran(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo),
S0=Szero(nb.wt.shap.lo))
moran.test(shap.lo.mat[,1], nb.wt.shap.lo)
moran.mc(shap.lo.mat[,1], nb.wt.shap.lo, nsim=10000)
geary(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo),
n1=length(nb.shap.lo)-1, S0=Szero(nb.wt.shap.lo))
geary.test(shap.lo.mat[,1], nb.wt.shap.lo)

# Variogram analysis: gstat

install.packages('gstat') ; library(gstat)
shap.variog <- variogram(Vel~1, locations=~R.A.+Dec., data=shap)
variog.mod1 <- vgm(7e+07, "Gau", 3.0,2e+07)
variog.fit <- fit.variogram(shap.variog,variog.mod1) ; variog.fit
plot(shap.variog,model <- variog.fit, col='black', pch=20,
xlab='Distance (degree)', ylab="Semivariance (km/s*km/s)", lwd=2)

# Variogram analysis: geoR

install.packages('geoR') ; library(geoR)
shap1 <- shap[-c(which(duplicated(shap[,1:2]))),]
shap.geo <- as.geodata(shap1,coords.col=1:2, data.col=4)
points.geodata(shap.geo,cex.min=0.2, cex.max=1.0, pt.div='quart', col='gray')
plot.geodata(shap.geo, breaks=30)

shap.vario <- variog(shap.geo, uvec=seq(0, 10, by=0.5))
plot(shap.vario,lwd=2, cex.lab=1.3, cex.axis=1.3, lty=1)
shap.GRF1 <- likfit(shap.geo, ini.cov.pars=c(4e7,0.2))
summary.likGRF(shap.GRF1)
lines.variomodel(shap.GRF1, lwd=2, lty=2)

# Preparation for spatstat analysis

install.packages('spatstat') ; library(spatstat)
shap.lo.win <- owin(range(shap.lo[,1]), range(shap.lo[,2]))
centroid.owin(shap.lo.win) ; area.owin(shap.lo.win)
shap.lo.ppp <- as.ppp(shap.lo[,c(1,2,4)], shap.lo.win) # planar point pattern
summary(shap.lo.ppp)
plot(density(shap.lo.ppp,0.3), col=topo.colors(20), main='', xlab='R.A.',
ylab='Dec.')

# K function for the Shapley low density region

shap.lo.K <- Kest(shap.lo.ppp, correction='isotropic')
shap.lo.K.bias <- Kest(shap.lo.ppp, correction='none')
plot.fv(shap.lo.K, lwd=2, col='black',  main='', xlab='r (degrees)', legend=F)
plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F)

# Draw envelope of 100 simulations of CSR process

shap.lo.K.env <- envelope(shap.lo.ppp, fun=Kest, nsim=100, global=T)
xx <- c(0, shap.lo.K.env\$r, rev(shap.lo.K.env\$r), 0)
yy <- c(c(0, shap.lo.K.env\$lo), rev(c(0,shap.lo.K.env\$hi)))
polygon(xx, yy, col='gray')
plot.fv(shap.lo.K, lwd=2, col='black', main='', add=T, legend=F)
plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F)

# Similar plot for the L* function

shap.lo.L <- Lest(shap.lo.ppp, correction='isotropic')
shap.lo.L.bias <- Lest(shap.lo.ppp, correction='none')
plot(shap.lo.L\$r, (shap.lo.L\$iso - shap.lo.L\$r), lwd=2, col='black',
main='',xlab='r (degrees)', ylab='L*(r)', ty='l', ylim=c(-0.2,0.2))
lines(shap.lo.L\$r, (shap.lo.L\$theo - shap.lo.L\$r), lwd=2, lty=2)
lines(shap.lo.L\$r, (shap.lo.L.bias\$un - shap.lo.L\$r), lwd=2, lty=3)

# Baddeley J function for the Shapley low density region

plot(Jest(shap.lo.ppp), lwd=2, col='black', cex.lab=1.3, cex.axis=1.3, main='',
xlab='r (degrees)', legend=F)

# Two-point correlation function

shap.lo.pcf <- pcf(shap.lo.ppp)
plot(shap.lo.pcf, xlim=c(0.0,0.2))
plot(log10(shap.lo.pcf\$r[2:512]), log10(shap.lo.pcf\$trans[2:512]), type='l',
lwd=2, xlab='log r (degrees)', ylab='log pair correlation fn')
lines(c(-1,0), c(0.78+0.48,0.48), lwd=2, lty=2)
lines(c(-2,0), c(0,0), lwd=2, lty=3)

# Compute Dirichlet (Voronoi) tessellation

shap.lo.dir <- dirichlet(shap.lo.ppp)
summary(shap.lo.dir) ; plot(shap.lo.dir, main='')
shap.lo.tile <- tiles(shap.lo.dir) ; str(shap.lo.tile)
shap.lo.area <- list(lapply(shap.lo.tile, area.owin)) ; str(shap.lo.area)

# Select small area tiles as clusters

hist(as.numeric(shap.lo.area[[1]]), breaks=30)
shap.lo.clus <- cut(as.numeric(shap.lo.area[[1]]), breaks=c(0,0.06,1))
plot(shap.lo.dir, main='')
points(shap.lo.ppp, pch=20, cex=0.5)
points(shap.lo.ppp[shap.lo.clus=='(0,0.06]'], pch=1,lwd=2)

# IDW spatial interpolation using the gstat package

xrange <- c(193,216) ; yrange <- c(-38,-26.5)
shap.grid <- expand.grid(x=seq(from=xrange[1], to=xrange[2], by=0.05),
y <- seq(from=yrange[1], to=yrange[2], by=0.05))
names(shap.grid) <- c("R.A.","Dec.")
gridded(shap.grid) <- ~R.A.+Dec.
plot(shap.grid) ; points(shap[,1:2], pch=20)

shap.idw <- idw(shap[,4]~1, locations=~R.A.+Dec., shap[,1:2], shap.grid,
idp=1.5)
sp.theme(regions = list(col = topo.colors(100)), set=T)
spplot(shap.idw)

# Ordinary kriging using the geoR package

shap.vario <- variog(shap.geo, uvec=seq(0, 10, by=0.2))
plot(shap.vario, lwd=2, lty=1)
shap.variofit <- variofit(shap.vario, cov.model='gaussian')
lines(shap.variofit, lty=2)

shap.grid <- pred_grid(c(193,217), c(-38,-29), by=0.3)
KC <- krige.control(obj.model=shap.variofit)
shap.okrig <- krige.conv(shap.geo,loc=shap.grid, krig=KC)

image(shap.okrig, xlim=c(195,203), ylim=c(-38,-27))
image(shap.okrig, loc=shap.grid, val=sqrt(shap.okrig\$krige.var),
xlim=c(195,203), ylim=c(-38,-27), zlim=c(3800,5000))

# Preparation of circular datasets from Hipparcos proper motion catalog

dim(hip) ;  summary(hip)
hist(hip[,8], breaks=50)
hip1<- hip[hip[,8]<5,] ; hip2 = na.omit(hip1) ; hip3 = hip2
hip3[,6] <- hip2[,6] - median(hip2[,6])
hip3[,7] <- hip2[,7] - median(hip2[,7])
attach(hip3) ; dim(hip3)
nstar <- length(hip3[,6])

const=360. / (2*pi)
theta <- numeric(length=nstar)
for (i in 1:nstar) {
if(pmRA[i]>=0 & pmDE[i]>=0) theta[i] = atan(pmRA[i] / pmDE[i]) * const
if(pmDE[i]<0) theta[i] = 180. + atan(pmRA[i] / pmDE[i]) * const
if(pmRA[i]<0 & pmDE[i]>=0)   theta[i] = 360. + atan(pmRA[i] / pmDE[i]) * const
}
hist(theta, breaks=20, lwd=2,xlab='Position angle theta (degrees)', main='')

# Proper motion and circular plots

install.packages('CircStats') ; library(CircStats)
circ.summary(theta)
circ.plot(theta, cex=0.3, pch=20, stack=T, bins=50, shrink=2)
theta=theta / const
plot(pmRA, pmDE, pch=20, cex=0.6, xlab='Proper motion R.A. (mas/yr)',
ylab='Proper motion Dec. (mas/yr)',main='')
abline(0, 0, lty=2, lwd=2) ; abline(0, 100000, lty=2, lwd=2)
est.kappa(theta, bias=T)

# Basic statistics, tests for uniformity, and von Mises tests

install.packages('circular') ; library(circular)
circ.summary(theta)
sqrt(circ.disp(theta)[4])

kuiper(theta) ; r.test(theta); rao.spacing(theta)
watson(theta) ; watson(theta,dist='vm')

vm.ml(theta,bias=T)
vm_boot <- vm.bootstrap.ci(theta,bias=T)
vm_boot\$mu.ci ; vm_boot\$kappa.ci

******************************************************************
******************************************************************

Appendix B. Getting Started with R

# I. Query the session environment and make a vector of three numbers

getwd()
setwd('/Users/myself/newdir')
sessionInfo() ;  citation()
a.test <- c(33, 44, 55)
ls()  ;  class(a.test) ;  str(a.test)
a.test ;  write(file='output', a.test)

# II. Create a 500x2 bivariate dataset where the X values are evenly
# distributed between 0 and 5 and the Y values have a nonlinear
# dependence on X with heteroscedastic Gaussian scatter

help(seq) ; help(sample)  ; help(rnorm)
set.seed(1)
x <- sample(seq(0.01, 3, length.out=500))
y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2)))

# III. Examine, summarize and boxplot univariate distributions
summary(x) ; summary(y)
str(summary(x))
write.table(rbind(summary(x),summary(y)),file='summary.txt')

help(par) ; help(boxplot)
par(mfrow=c(1,2))  # set up two-panel figure
boxplot(x,  notch=T, main='Boxplot for X')
boxplot(y,  notch=T, pch=20, cex=0.5, main='Boxplot for Y')
par(mfrow=c(1,1))
dev.copy2eps(file='box.eps')

plot(ecdf(x),pch=20,cex=0.3,main='',ylab='EDF',xlab='')
text(2.0,0.5,"X") ; text(1.4,0.8,"Y")
dev.copy2eps(file='edf.eps')

# IV. Put X and Y into a data frame

help(cbind) ; help(as.data.frame) ; help(names)
xy <- cbind(x, y)  ;  str(xy)
xy <- as.data.frame(xy)
names(xy) <- c('Xvar', 'Yvar')  ; str(xy)
attach(xy)  ;  ls()

# V. Bivariate plots and correlation tests

help(plot)
par(mfrow=c(2,2))
plot(xy, pch=20, cex=0.5)
plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)')

length(x[x>2.5])
cor.test(x[x>2.5],y[x>2.5], method='pearson')
cor.test(x[x>2.5],y[x>2.5], method='kendall')

# VI. Kernel density estimator with spline fit

smoothScatter(log10(xy), lwd=4, pch=20, cex=0.2,
colramp = colorRampPalette(c("white",gray(20:5/20))))
lines(smooth.spline(log10(xy)), lwd=3)

# VII. Least-squares polynomial regression
logx <- log10(x) ; logx2 <- (log10(x))^2 ; logx3 <- (log10(x))^3
logx4 <- (log10(x))^4 ; logx5 <- (log10(x))^5
yfit <- lm(log10(y) ~ logx + logx2 + logx3 + logx4 + logx5)
str(yfit)
plot(log10(x), log10(y), pch=20, col=grey(0.5),cex=0.3)
lines(sort(log10(x)), yfit\$fit[order(log10(x))], lwd=3)
dev.copy2eps(file="smooth.eps")

******************************************************************
******************************************************************

Appendix C. Astronomical Datasets

# SDSS point sources test dataset, N=17,000 (mag<21, point sources, hi-qual)

dim(SDSS) ; summary(SDSS)
SDSS_test <- data.frame(cbind((SDSS[,1]-SDSS[,2]), (SDSS[,2]-SDSS[,3]),
(SDSS[,3]-SDSS[,4]), (SDSS[,4]-SDSS[,5])))
names(SDSS_test) <- c('u_g', 'g_r', 'r_i', 'i_z')
str(SDSS_test)

par(mfrow=c(1,3))
plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8),pch=20,
cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='u-g (mag)', ylab='g-r (mag)')
plot(SDSS_test[,2], SDSS_test[,3], xlim=c(-0.7,1.8), ylim=c(-0.7,1.8), pch=20,
cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='g-r (mag)', ylab='r-i (mag)')
plot(SDSS_test[,3], SDSS_test[,4], xlim=c(-0.7,1.8), ylim=c(-1.1,1.3), pch=20,
cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='r-i (mag)', ylab='i-z (mag)')
par(mfrow=c(1,1))

# Quasar training set, N=2000 (Class 1)

dim(qso1) ; summary(qso1)
bad_phot_qso <- which(qso1[,c(3,5,7,9,11)] > 21.0 | qso1[,3]==0)
qso3 <- cbind((qso2[,3]-qso2[,5]), (qso2[,5]-qso2[,7]), (qso2[,7]-qso2[,9]), (qso2[,9]-qso2[,11]))
qso_train <- data.frame(cbind(qso3, rep(1, length(qso3[,1]))))
names(qso_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class')
dim(qso_train) ; summary(qso_train)

# Star training set, N=5000 (Class 2)

dim(temp2) ; summary(temp2)
star <- cbind((temp2[,1]-temp2[,2]), (temp2[,2]-temp2[,3]), (temp2[,3]-temp2[,4]),
(temp2[,4]-temp2[,5]))
star_train <- data.frame(cbind(star, rep(2, length(star[,1]))))
names(star_train) <- c('u_g','g_r','r_i','i_z','Class')
dim(star_train) ; summary(star_train)

# White dwarf training set, N=2000 (Class 3)

dim(temp3) ; summary(temp3)
temp3 <- na.omit(temp3)
wd <- cbind((temp3[1:2000,2]-temp3[1:2000,3]), (temp3[1:2000,3]-temp3[1:2000,4]),
(temp3[1:2000,4]-temp3[1:2000,5]), (temp3[1:2000,5]-temp3[1:2000,6]))
wd_train <- data.frame(cbind(wd, rep(3, length(wd[,1]))))
names(wd_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class')
dim(wd_train) ; summary(wd_train)

# Combine and plot the training set (9000 objects)

SDSS_train <- data.frame(rbind(qso_train, star_train, wd_train))
names(SDSS_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class')
str(SDSS_train)

par(mfrow=c(1,3))
plot(SDSS_train[,1], SDSS_train[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20,
col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='u-g (mag)',
ylab='g-r (mag)')
legend(-0.5, 1.7, c('QSO','MS + RG','WD'), pch=20, col=c('black','red','green'),
cex=1.6)
plot(SDSS_train[,2], SDSS_train[,3], xlim=c(-0.7,1.8), ylim=c(-0.7,1.8), pch=20,
col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='g-r (mag)',
ylab='r-i (mag)')
plot(SDSS_train[,3], SDSS_train[,4], xlim=c(-0.7,1.8), ylim=c(-1.1,1.3), pch=20,
col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='r-i (mag)',
ylab='i-z (mag)')
par(mfrow=c(1,1))

```