#!/usr/bin/Rscript
#
# regcomp - compare regression lines from two sets of data

helpstr<-"
regcomp - compare regression lines from two sets of data

Usage:  regcomp [-h] [-t tchar] [-p plotfile] file ...

where:  -h		specifies that data files have a header line
	-t tchar	specifies the tab character which separates numbers
			in data files, e.g. -t ,
			(default is any white space)
	-p plotfile	specifies that a plot of the two data sets and their
			regression lines be plotted to plotfile, where suffix
			(i.e. extension) of file name specifies plot file type
			which must be .eps, .pdf, .svg, .jpg or .png
	file		specifies one or more data files, containing 2 or 3
			columns of data each.  2-column files have x and y pairs
			and 3-column files have a group name followed by x and
			y pairs on each line.  2-column files will assume the
			file name as group name.

Regcomp performs a linear regression on the two different groups of data
in the specified files, or if more than two groups are given, the first group
is taken alone and all others combined.  The two calculated regressions are
compared, first to see if their slopes are significantly different, then
by assuming a common slope, to see if their adjusted means are significantly
different.

The statistical tests for comparing regression lines are described in:
Armitage P., Berry G., Matthews J.N.S. (2002) Statistical Methods in Medical
Research, 4th ed. Blackwell Scientific Publications. Oxford UK.
ISBN 0-632-05257-0. ss. 11.4-11.5, pp. 322-335.

Copyright (c) 2012, Gilles Detillieux, Spinal Cord Research Centre,
University of Manitoba.  All Rights Reserved.
"

args<-commandArgs(T)
if (length(grep("-\\?|--*help", args)) > 0) {
	cat(helpstr)
	q()
}
hdr<-F
if (length(i<-grep("^-h$", args)) > 0) {
	hdr<-T
	args<-args[-i]		# remove the argument used
}
sep<-""
if (length(i<-grep("^-t$", args)) > 0) {
	sep<-args[i+1]
	args<-args[-c(i,i+1)]	# remove the two arguments used
}
pfile<-""
if (length(i<-grep("^-p$", args)) > 0) {
	pfile<-args[i+1]
	args<-args[-c(i,i+1)]	# remove the two arguments used
}
badarg<-F
if (length(i<-grep("^-[^-]$", args)) > 0) {
	cat("regcomp: Invalid option argument(s): ", args[i], "\n")
	badarg<-T
}
if (badarg || length(args) == 0) {
	cat("Usage:  regcomp [-h] [-t tchar] [-p plotfile] file ...\n")
	cat("\tor regcomp --help\tfor detailed usage information\n")
	q(status=1)
}

dset<-""
for (a in args) {
	if (a == "--") next
	if (a == "-") a<-"stdin"
	dset2<-read.table(a, sep=sep, header=hdr)
	grps<-rep(a, dim(dset2)[1])
	if (dim(dset2)[2] < 3) dset2<-cbind(grps, dset2)
	if (length(dim(dset)) != 2) {
		dset<-dset2
	} else {
		if (!identical(colnames(dset), colnames(dset2))) {
			cat("regcomp: Warning - column names don't match in all files\n")
			if (hdr) cat("regcomp: maybe you don't need -h option\n")
			colnames(dset2) = colnames(dset)
		}
		dset<-rbind(dset, dset2)
	}
}

if (length(dim(dset)) != 2) {
	cat("regcomp: No valid data set specified in file arguments\n")
	q(status=1)
}
if (dim(dset)[2] != 3) {
	cat("regcomp: Data not in groups of X, Y pairs - maybe wrong separator used with -t\n")
	q(status=1)
}
if (!hdr && suppressWarnings(is.na(dset[1,2]+0) || is.na(dset[1,3]+0))) {
	cat("regcomp: First row of data is non-numeric - maybe you need to specify -h\n")
	q(status=1)
}

grps<-unique(dset[,1])
if (length(grps) < 2) {
	cat("regcomp: Need at least two data groups specified in file arguments\n")
	q(status=1)
}
if (length(grps) > 2) grps[2]<-"others"
dset1<-dset[dset[,1] == grps[1], 2:3]
dset2<-dset[dset[,1] != grps[1], 2:3]

# line type (0-6: none, solid, dashed, dotted, dot-dash, longdash, dash-ldash)
#lt1<-1;  lt2<-3	# solid, dotted
lt1<-1;  lt2<-1	# solid, solid
ltcs<-3		# line type for common slope (dotted)
# symbol (0-13: sq,cir,tri,+,x,dia,tdn,xsq,x+,+dia,+cir,tupdn,+sq,xcir,
#	 14-25: tdnsq,bl,meddot,stri,sdia,dot,smdot,cir,sq,dia,tri,tdn)
#sy1<-19; sy2<-21	# dot, circle
sy1<-19; sy2<-15	# dot, block
# color (0-8: none, blk, red, grn, blu, cya, mag, yel, gry)
cl1<-4;  cl2<-2	# blue, red

fit1<-lm(dset1[,2]~dset1[,1]) 
fit2<-lm(dset2[,2]~dset2[,1]) 
n1<-dim(dset1)[1]		# n value (no. of samples)
n2<-dim(dset2)[1]
mx1<-mean(dset1[,1])		# mean X value
mx2<-mean(dset2[,1])
sdx1<-sd(dset1[,1])		# standard deviation of X values
sdx2<-sd(dset2[,1])
my1<-mean(dset1[,2])		# mean Y value
my2<-mean(dset2[,2])
sdy1<-sd(dset1[,2])		# standard deviation of Y values
sdy2<-sd(dset2[,2])
r1<-cor(dset1[,2], dset1[,1])	# r value (correlation coefficient)
r2<-cor(dset2[,2], dset2[,1])
t1<-coef(summary(fit1))[2,3]	# t value (t-test value)
t2<-coef(summary(fit2))[2,3]
p1<-coef(summary(fit1))[2,4]	# p value (probability of t value)
p2<-coef(summary(fit2))[2,4]
b1<-as.numeric(coef(fit1)[2])	# b value (slope)
b2<-as.numeric(coef(fit2)[2])
a1<-as.numeric(coef(fit1)[1])	# a value (intercept)
a2<-as.numeric(coef(fit2)[1])

regtab<-matrix(c(n1,n2, mx1,mx2, sdx1,sdx2, my1,my2, sdy1,sdy2,
		 r1,r2, t1,t2, p1,p2, b1,b2, a1,a2), ncol=2, byrow=T)
colnames(regtab)<-grps[1:2]
row.names(regtab)<-c("n","mean x","SDx","mean y","SDy","r","t","p","slope b","intercept a")

oo<-options(scipen=10)
regtab
options(oo)

df<-n1+n2-4
if (df > 0) {
	sxx1<-sum(dset1[,1]^2) - sum(dset1[,1])^2/n1
	syy1<-sum(dset1[,2]^2) - sum(dset1[,2])^2/n1
	sxy1<-sum(dset1[,1]*dset1[,2]) - sum(dset1[,1])*sum(dset1[,2])/n1
	sxx2<-sum(dset2[,1]^2) - sum(dset2[,1])^2/n2
	syy2<-sum(dset2[,2]^2) - sum(dset2[,2])^2/n2
	sxy2<-sum(dset2[,1]*dset2[,2]) - sum(dset2[,1])*sum(dset2[,2])/n2
	s2comm <- ( (syy1-((sxy1^2)/sxx1)) + (syy2-((sxy2^2)/sxx2)) ) / df
	#cat(sprintf("s2comm = %g\n", s2comm))
	varcomm<-s2comm * (1/sxx1 + 1/sxx2)
	secomm<-sqrt(varcomm)
	#cat(sprintf("A: n=%g Sxx=%g Sxy=%g Syy=%g\n",n1,sxx1,sxy1,syy1))
	#cat(sprintf("B: n=%g Sxx=%g Sxy=%g Syy=%g\n",n2,sxx2,sxy2,syy2))
	seb<-coef(summary(fit1))[2,2]
	seg<-coef(summary(fit2))[2,2]
	tcomm<-(b1-b2)/secomm
	cat(sprintf("b1-b2 = %g\n", b1-b2))
	cat(sprintf("SEdiff: %f\n", secomm))
	#cat(sprintf("Vdiff: %f\n", varcomm))
	cat(sprintf("t = %g on df = %g, p = %g\n", tcomm, df, 2*pt(-abs(tcomm),df=df)))
	bcomm<-(sxy1+sxy2)/(sxx1+sxx2)
	varcomm<-s2comm/(sxx1+sxx2)
	cat(sprintf("Common slope b: %f\n", bcomm))
	#cat(sprintf("Common var.: %f, ", varcomm))
	#cat(sprintf("Common SE: %f\n", sqrt(varcomm)))
	dadjmean <- my1 - my2 - bcomm*(mx1-mx2)
	cat(sprintf("Difference of adjusted means: %g\n", dadjmean))
	dfd<-n1+n2-3
	s2diff <- (syy1+syy2 - (sxy1+sxy2)^2/(sxx1+sxx2)) / dfd
	vardiff <- s2diff * (1/n1 + 1/n2 + (mx1-mx2)^2/(sxx1+sxx2))
	sediff<-sqrt(vardiff)
	td <- dadjmean / sediff
	cat(sprintf("SEdiff = %g\n", sediff))
	cat(sprintf("t = %g on df = %g, p = %g\n", td, dfd, 2*pt(-abs(td),df=dfd)))
}

xlb=""
ylb=""
if (hdr) {
	xlb=as.character(colnames(dset)[2])
	ylb=as.character(colnames(dset)[3])
}

for (f in pfile) {
	if (f == "") next
	suff<-sub("^.*\\.", "", f)
	if (suff == "eps" || suff == "ps") {
		postscript(f, horizontal=FALSE, onefile=FALSE, paper="special", width=8, height=6)
	} else if (suff == "svg") {
		#svg(f)
		library("RSVGTipsDevice")
		devSVGTips(f, xmlHeader=T, toolTipMode=0)
	} else if (suff == "pdf") {
		pdf(f)
	} else if (suff == "png") {
		png(f, width=800, height=600)
	} else if (suff == "jpg") {
		jpeg(f, width=800, height=600)
	} else {
		cat("regcomp: Unknown plot file type: ", f, "\n")
		next
	}
	plot(dset[,2], dset[,3], type='n', bty='l', xlab=xlb, ylab=ylb)
	points(dset1[,1], dset1[,2], pch=sy1, col=cl1)
	points(dset2[,1], dset2[,2], pch=sy2, col=cl2)
	abline(fit1, lty=lt1, col=cl1)
	abline(fit2, lty=lt2, col=cl2)
	if (df > 0) {
		aa1 <- my1 - bcomm*mx1
		aa2 <- my2 - bcomm*mx2
		abline(aa1, bcomm, lty=3, col=cl1)
		abline(aa2, bcomm, lty=3, col=cl2)
	}
	legend("topleft", as.character(grps[1:2]), lty=c(lt1,lt2), pch=c(sy1,sy2), col=c(cl1,cl2))
	invisible(dev.off())
}

