diff --git a/data/epactsMulti.R b/data/epactsMulti.R index 8673d1d..fcf206c 100644 --- a/data/epactsMulti.R +++ b/data/epactsMulti.R @@ -2,7 +2,7 @@ args <- commandArgs(trailingOnly=TRUE) bindir <- args[2] ## Directory containing the R scripts phenof <- args[3] ## phenotype file -##covf <- args[4] ## covariate file +covf <- args[4] ## covariate file or null model index file indf <- args[5] ## individual IDs (in order from VCF) vcf <- args[6] ## VCF file - must be bgzipped and tabixed region <- args[7] ## Genomic region to load @@ -29,9 +29,20 @@ dyn.load(paste(bindir,"/lib/epactsR/libs/epactsR",.Platform$dynlib.ext, sep="")) pnames <- scan(phenof,what=character(),nlines=1)[-1] phenos <- as.matrix(read.table(phenof)[,-1]) +if(test=="multi.b.sna2") +{ + nullf<-substr(covf,1,nchar(covf)-4) #remove ".cov" to get outf +} else if ( covf == "NULL" ) { + cov <- NULL +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + ind <- as.integer(read.table(indf)[,2])-1 G <- .Call("readVcf",vcf,region,field,passOnly,ind,NULL) + m <- nrow(G) if ( m > 0 ) { n <- ncol(G) @@ -61,19 +72,23 @@ if ( m > 0 ) { print("bar") - out <- matrix(NA,m,4+2*ncol(phenos)) - out[vids,3+(1:ncol(phenos))*2] <- r$p - out[vids,4+(1:ncol(phenos))*2] <- r$b + out <- matrix(NA,m,4+ncol(r$p)+ncol(r$add)) + out[vids,4+(1:ncol(r$p))] <- r$p + out[vids,4+ncol(r$p)+(1:ncol(r$add))] <- r$add out[,1] <- NS out[,2] <- AC out[,3] <- CR out[,4] <- MAF - colnames(out) <- c("NS","AC","CR","MAF",rep(NA,2*ncol(phenos))) - colnames(out)[3+(1:ncol(phenos))*2] <- paste(pnames,"P",sep=".") - colnames(out)[4+(1:ncol(phenos))*2] <- paste(pnames,"B",sep=".") + colnames(out) <- c("NS","AC","CR","MAF",rep(NA,ncol(r$p)),rep(NA,ncol(r$add))) + colnames(out)[4+(1:ncol(r$p))] <- paste(pnames,"P",sep=".") + colnames(out)[4+ncol(r$p)+(1:ncol(r$add))] <- r$cname rownames(out) <- rownames(G) + print(warnings()) .Call("writeMatrix",out,outf) } else { write.table(NULL,outf,row.names=F,col.names=F) } +rm(list=ls()) +gc() + diff --git a/data/epactsMultiNull.R b/data/epactsMultiNull.R new file mode 100644 index 0000000..7f545f3 --- /dev/null +++ b/data/epactsMultiNull.R @@ -0,0 +1,194 @@ +args <- commandArgs(trailingOnly=TRUE) +nullf <- args[2] ## output name +phenof <- args[3] ## phenotype file +covf <- args[4] ## covariate file or null model index file + +pnames <- scan(phenof,what=character(),nlines=1)[-1] +phenos <- as.matrix(read.table(phenof)[,-1]) + +if ( covf == "NULL" ) { + cov <- NULL; +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +################################################################## +## MAIN FUNCTION: null.b.sna2 +################################################################## + +phenos <- phenos - min(phenos,na.rm=T) + + k <- ncol(cov) + +g<-ncol(phenos) + + for(gind in 1:g) + { + nm<-which(is.na(phenos[,gind])==FALSE) + pheno<-phenos[nm,gind] + cov1<-cov[nm,] + + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov1)) + save.image(paste(nullf,".",pnames[gind],".null.RData",sep="")) + + } + + diff --git a/data/epactsSingle.R b/data/epactsSingle.R index 1f10f6d..0e83c08 100644 --- a/data/epactsSingle.R +++ b/data/epactsSingle.R @@ -40,11 +40,14 @@ if ( !require(epactsR,lib.loc=paste(bindir,"/lib/",sep="") ) ) { } ## the DLL file has to be loaded explictly for some machine - don't know why dyn.load(paste(bindir,"/lib/epactsR/libs/epactsR",.Platform$dynlib.ext, sep="")) - + pheno <- as.double(read.table(phenof)[,2]) pheno <- pheno - min(pheno,na.rm=T) -if ( covf == "NULL" ) { - cov <- NULL; +if(test=="single.b.sna2") +{ + nullf<-substr(covf,1,nchar(covf)-4) #remove ".cov" to get outf +} else if ( covf == "NULL" ) { + cov <- NULL } else { cov <- as.matrix(read.table(covf)[,-1]) } @@ -54,8 +57,7 @@ binaryFlag <- is.binary(pheno) ind <- as.integer(read.table(indf)[,2])-1 G <- .Call("readVcf",vcf,region,field,passOnly,ind,NULL) -rnames <- rownames(G) -rownames(G) <- seq_along(rnames) + m <- nrow(G) if ( m > 0 ) { n <- ncol(G) @@ -72,7 +74,8 @@ if ( m > 0 ) { rsq[rsq>1] <- 1 if ( minRSQ > 0 ) { vids <- which((varAC > 0) & (MAF >= minMAF) & (MAF <= maxMAF ) & ( MAC >= minMAC ) & ( MAC <= maxMAC) & (CR >= minCallRate) & (rsq >= minRSQ)) - } else { + } + else { vids <- which((varAC > 0) & (MAF >= minMAF) & (MAF <= maxMAF) & ( MAC >= minMAC ) & ( MAC <= maxMAC) & (CR >= minCallRate)) } genos <- G[vids,,drop=FALSE] @@ -88,7 +91,6 @@ if ( m > 0 ) { func <- match.fun(test) r <- func() - if ( binaryFlag ) { binary.cname <- c("NS.CASE","NS.CTRL","AF.CASE","AF.CTRL") #,"CNT.CASE","CNT.CTRL"); @@ -109,7 +111,8 @@ if ( m > 0 ) { #binary.add[,4] <- rowSums((matrix(pheno,nrow(G),ncol(G),byrow=T) == 0) * G * (1-ina), na.rm=T)/binary.add[,2] #binary.add[binary.add[,1] == 0,3] <- NA #binary.add[binary.add[,2] == 0,4] <- NA - } else { + } + else { binary.cname <- NULL binary.ncols <- 0 } @@ -123,7 +126,8 @@ if ( m > 0 ) { if ( minRSQ > 0 ) { colnames(out) <- c("NS","AC","CALLRATE","MAF","PVALUE",r$cname,"RSQ",binary.cname); out[,6+ncol(r$add)] <- rsq - } else { + } + else { colnames(out) <- c("NS","AC","CALLRATE","MAF","PVALUE",r$cname,binary.cname); } out[vids,5] <- r$p @@ -131,17 +135,8 @@ if ( m > 0 ) { if ( binaryFlag ) { out[,5+ncol(r$add)+1:4] <- binary.add } - rownames(out) <- rnames - - if (class(try(rnames[seq_along(rnames)], silent = TRUE)) == "try-error") { - tmp <- rownames(out) - goodnames <- sapply(seq_along(tmp), function (iname) { - class(try(tmp[iname], silent = TRUE)) != "try-error" - }) - out <- out[which(goodnames), ] - cat(paste(sum(!goodnames), 'variants have been excluded due to error: "Value of SET_STRING_ELT() must be a \'CHARSXP\' not a \'integer\'"'), file = paste0(outf, ".log")) - } else {} - + rownames(out) <- rownames(G) + print(warnings()) .Call("writeMatrix",out,outf) } else { diff --git a/data/epactsSingleNull.R b/data/epactsSingleNull.R new file mode 100644 index 0000000..0dd18cc --- /dev/null +++ b/data/epactsSingleNull.R @@ -0,0 +1,183 @@ +args <- commandArgs(trailingOnly=TRUE) +nullf <- args[2] ## output name +phenof <- args[3] ## phenotype file +covf <- args[4] ## covariate file or null model index file + +pheno <- as.matrix(read.table(phenof)[,-1]) + +if ( covf == "NULL" ) { + cov <- NULL; +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +################################################################## +## MAIN FUNCTION: null.b.sna2 +################################################################## + +pheno <- pheno - min(pheno,na.rm=T) + k <- ncol(cov) + + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov)) + save.image(paste(nullf,".null.RData",sep="")) + + + + diff --git a/data/multi.b.sna2.R b/data/multi.b.sna2.R new file mode 100644 index 0000000..024aa33 --- /dev/null +++ b/data/multi.b.sna2.R @@ -0,0 +1,713 @@ +## Core functions of pugwash to perform association + +################################################################## +## SINGLE VARIANT TEST +## INPUT VARIABLES: +## n : total # of individuals +## NS : number of called samples +## AC : allele count +## MAF : minor allele frequency +## vids : indices from 1:n after AF/AC threshold +## genos : genotype matrix (after AF/AC threshold) +## EXPECTED OUTPUT : list(p, addcols, addnames) for each genos row +## p : p-value +## addcols : additional columns (with proper column names) +################################################################## + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +Korg<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - mu + mu * exp(g* t1)) + out[i]<-sum(temp) + } + return(out) +} + + +K1_adj<-function(t, mu, g, q) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - mu)* exp(-g * t1) + mu + temp2<-mu *g + out[i]<-sum(temp2/temp1)-q + } + return(out) +} + +K2<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - mu)* exp(-g * t1) + mu)^2 + temp2<-(1-mu) * mu * g^2 * exp(-g*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE) + } + return(out) +} + +getroot_K1<-function(init,mu,g,q,m1,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj(t,mu,g,q) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2(t,mu,g) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj(tnew,mu,g,q) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } + + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } +} + +Get_Saddle_Prob<-function(zeta, mu, g, q) +{ + k1<-Korg(zeta, mu, g) + k2<-K2(zeta, mu, g) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 + + + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 + } + + return(pval) +} + +Saddle_Prob<-function(q, mu, g, Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL + + # + qinv = -sign(q-m1) * abs(q-m1) + m1 + + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE + + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + # + + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) + out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) + p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) + pval = p1+p2 + Is.converge=TRUE + } else { + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } + } + + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob(q, mu, g, Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } + +} + +TestSPA<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) + { + G<-2-G + n.g<-sum(G) + } + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) + + return(out) +} + +Korg_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - muNB + muNB * exp(gNB* t1)) + out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 + } + return(out) +} + + +K1_adj_fast<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - muNB)* exp(-gNB * t1) + muNB + temp2<-muNB *gNB + temp3<-NAmu+NAsigma*t1 + out[i]<-sum(temp2/temp1)+temp3-q + } + return(out) +} + +K2_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 + temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma + } + return(out) +} + +getroot_K1_fast<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj_fast(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2_fast(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj_fast(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } + + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } +} + +Get_Saddle_Prob_fast<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + k1<-Korg_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + k2<-K2_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 + + + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 + } + + return(pval) +} + +Saddle_Prob_fast<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL + + # + qinv = -sign(q-m1) * abs(q-m1) + m1 + + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE + + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + + # + + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + NAmu= m1-sum(gNB*muNB) + NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) + out.uni1<-getroot_K1_fast(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + out.uni2<-getroot_K1_fast(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob_fast(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + p2<-Get_Saddle_Prob_fast(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) + pval = p1+p2 + Is.converge=TRUE + } else { + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } + + } + + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob_fast(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } + +} + +TestSPAfast<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) + { + G<-2-G + n.g<-sum(G) + } + NAset<-which(G==0) + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + g=G1/sqrt(n.g) + if(length(NAset)/length(G)<0.5) + { + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) + + } else { + + out<-Saddle_Prob_fast(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], +muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff,alpha=alpha) + } + return(out) +} + + + +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + + +ScoreTest_SPA <-function(genos,pheno,cov,obj.null,method=c("fastSPA","SPA"),minmac=5,Cutoff=2,alpha=5*10^-8,missing.id=NA,beta.out=FALSE,beta.Cutoff=5*10^-7) +{ + method<-match.arg(method) + + if(missing(obj.null)) + { + if(missing(cov) || is.null(cov)) + { + cov<-rep(1,length(pheno)) + } + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(as.matrix(pheno) ~as.matrix(cov)) + } + cov<-obj.null$cov + pheno<-obj.null$y + + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) + } else { + m <- nrow(genos) + } + if(is.na(missing.id)==FALSE) + { + genos[which(genos==missing.id)]<-NA + } + p.value<-rep(NA,m) + p.value.NA<-rep(NA,m) + Is.converge<-rep(NA,m) + beta<-rep(NA,m) + SEbeta<-rep(NA,m) + for (i in 1:m) + { + try({ + ina<-which(is.na(genos[i,])) + if(length(ina)>0) + { + genos[i,ina]<-mean(genos[i,],na.rm=TRUE) + } + MAC<-min(sum(genos[i,]),sum(2-genos[i,])) + if(MAC>=minmac) + { + if(method=="fastSPA") + { + re <- TestSPAfast(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } else { + re <- TestSPA(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } + p.value[i] <- re$p.value + p.value.NA[i] <- re$p.value.NA + Is.converge[i]<- re$Is.converge + if(beta.out==TRUE && p.value[i] 0 ) { + genos[ina] <- matrix(AC[vids]/NS[vids],nrow(genos),ncol(genos))[ina] + } + + p <- matrix(NA,m,g) # store p-values for m markers + add <- matrix(NA,m,3*g) # extra columns: p.value.NA, Beta, SE + cname <- outer(pnames,c("P.NA","BETA","SEBETA"),FUN="paste",sep=".") + cname<-as.vector(t(cname)) + + for(gind in 1:g) + { + if ( m > 0 ) { ## If there is at least one marker to test + nm<-which(is.na(phenos[,gind])==FALSE) + pheno<-phenos[nm,gind] + geno<-as.matrix(genos[,nm]) + if(ncol(geno)==1) geno<-t(geno) + load(paste(nullf,".",pnames[gind],".null.RData",sep="")) + cov<-obj.null$X1 + + for (i in 1:m) { + re <- TestSPAfast(as.vector(geno[i,,drop=FALSE]), obj.null, Cutoff=2) + p[i,gind] <- re$p.value + beta<-NA + SEbeta<-NA + if(p[i,gind]=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +Korg<-function(t, mu, g) +{ n.t<-length(t) out<-rep(0,n.t) @@ -38,8 +83,8 @@ Korg<-function(t, mu, g){ } -K1_adj<-function(t, mu, g, q){ - +K1_adj<-function(t, mu, g, q) +{ n.t<-length(t) out<-rep(0,n.t) @@ -47,11 +92,22 @@ K1_adj<-function(t, mu, g, q){ t1<-t[i] temp1<-(1 - mu)* exp(-g * t1) + mu temp2<-mu *g - out[i]<-sum(temp2/temp1)-q } + return(out) +} + +K2<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) - + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - mu)* exp(-g * t1) + mu)^2 + temp2<-(1-mu) * mu * g^2 * exp(-g*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE) + } return(out) } @@ -108,145 +164,209 @@ getroot_K1<-function(init,mu,g,q,m1,tol=.Machine$double.eps^0.25,maxiter=1000) } } +Get_Saddle_Prob<-function(zeta, mu, g, q) +{ + k1<-Korg(zeta, mu, g) + k2<-K2(zeta, mu, g) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 -K2<-function(t, mu, g){ - - n.t<-length(t) - out<-rep(0,n.t) - for(i in 1:n.t){ - t1<-t[i] - temp1<-((1 - mu)* exp(-g * t1) + mu)^2 - temp2<-(1-mu) * mu * g^2 * exp(-g*t1) - out[i]<-sum(temp2/temp1,na.rm=TRUE) + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 } - return(out) + + return(pval) } + +Saddle_Prob<-function(q, mu, g, Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL + + # + qinv = -sign(q-m1) * abs(q-m1) + m1 -Get_Spline_Fun<-function(mu, g){ - node<-c(-5000,-2000,-1000, -100, -50.5, -38.125, -25.75, -13.375, -1, 0, - 1, 50.5, 62.875, 75.25, 100, 1000,2000,5000) - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - sfun<-splinefun(node , y1 ,method='natural') + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE -} + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + # -Get_Spline_FunH<-function(mu, g){ - - node<-c(-5000,-2000,-1000, -100, -1, 0, 1, 100, 1000,2000,5000) - flag=0 - rep<-0 - repeat{ - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - sfun<-splinefunH(node , y1 ,m=y1deriv) - xval<-NULL - for(i in 1:(length(node)-1)) - xval<-c(xval,seq(node[i],node[i+1],length.out=100)) - curve<-sfun(xval,deriv=2) - if(min(curve)<0) + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) + out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) { - ls<-unique(findInterval(xval[which(curve<0)],node,all.inside=T)) - nodeadd<-NULL - for(i in 1:length(ls)) - { - b<-(node[ls[i]]+node[ls[i]+1])/2 - nodeadd<-c(nodeadd,b) - } - nodeadd<-unique(nodeadd) - node<-c(node,nodeadd) - node<-node[order(node)] + p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) + p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) + pval = p1+p2 + Is.converge=TRUE } else { - flag=1 - } - if(flag==1 || rep>10) break - rep<-rep+1 + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } } + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob(q, mu, g, Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } - return(sfun) -} - -Get_Spline_Fun_H2<-function(mu, g){ - - node<-c(-5000,-2000,-1000, -100, -1, 0, 1, 100, 1000,2000,5000) - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - y1deriv2<-K2(node ,mu ,g) - sfun<-splineH2(nodeval=cbind(node,y1,y1deriv,y1deriv2)) - return(sfun) } -splineH2<-function(nodeval) +TestSPA<-function(G, obj.null, Cutoff=2,alpha) { - sfun<-matrix(0,nrow(nodeval),7) - sfun[,1]<-nodeval[,1] - A<-matrix(c(1,1,0,0,0,0,0,1,1,1,0,0,0,1,0,2,2,2,0,1,0,3,0,6,0,1,0, - 4,0,12,0,1,0,5,0,20),6,6) - for(i in 1:(nrow(nodeval)-1)) + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) { - f<-as.vector(nodeval[i:(i+1),2:4]) - sfun[i,2:7]<-solve(A)%*%f + G<-2-G + n.g<-sum(G) } - return(sfun) -} - -Korg_S_H2<-function(t, sfun){ - - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-t1^(0:5) - out<-sum(x*sfun[i,2:7]) - return(out) -} - - -K1_adj_S_H2<-function(t, q, sfun){ + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-(t1^(0:4))*(1:5) - out<-sum(x*sfun[i,3:7])-q return(out) } - -K2_S_H2<-function(t, sfun){ - - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-(t1^(0:3))*c(2,6,12,20) - out<-sum(x*sfun[i,4:7]) +Korg_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - muNB + muNB * exp(gNB* t1)) + out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 + } return(out) } -Korg_S<-function(t, sfun){ - out<-sfun(t, deriv=0) +K1_adj_fast<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - muNB)* exp(-gNB * t1) + muNB + temp2<-muNB *gNB + temp3<-NAmu+NAsigma*t1 + out[i]<-sum(temp2/temp1)+temp3-q + } return(out) } - -K1_adj_S<-function(t, q, sfun){ - - out<-sfun(t, deriv=1) - q +K2_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 + temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma + } return(out) } +getroot_K1_fast<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj_fast(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2_fast(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj_fast(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } - out<-sfun(t, deriv=2) - return(out) + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } } - -Get_Saddle_Prob<-function(zeta, mu, g, q) { - - # zeta<-out.uni2$root - k1<-Korg(zeta, mu, g) - k2<-K2(zeta, mu, g) +Get_Saddle_Prob_fast<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + k1<-Korg_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + k2<-K2_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) if(is.finite(k1) && is.finite(k2)) { @@ -269,12 +389,9 @@ Get_Saddle_Prob<-function(zeta, mu, g, q) { return(pval) } - - - -Saddle_Prob<-function(q, mu, g, Cutoff=2){ - #mu =pi; g=g=G1/sqrt(n.g); Spline=FALSE +Saddle_Prob_fast<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2,alpha) +{ m1<-sum(mu * g) var1<-sum(mu * (1-mu) * g^2) p1=NULL @@ -286,353 +403,59 @@ Saddle_Prob<-function(q, mu, g, Cutoff=2){ # Noadj pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) Is.converge=TRUE - + if(Cutoff=="BE"){ rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 p<-B+alpha/2 Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) } else if(Cutoff < 10^-1){ Cutoff=10^-1 } - + # - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ - #Ques: Why the 2nd condition? Isn't that redundant as q and m1 are both positives, - #and in general can cause problem if q and m1 has different signs? + if(abs(q - m1)/sqrt(var1) < Cutoff){ pval=pval.noadj } else { - out.uni1<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=q), silent=TRUE) - out.uni2<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=qinv), silent=TRUE) - -#Ques: Don't need the whole interval - # try one-more with large interval - if(class(out.uni1) == "try-error"){ - out.uni1<-try(uniroot(K1_adj, c(-1000,1000), mu=mu, g=g, q=q), silent=TRUE) + NAmu= m1-sum(gNB*muNB) + NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) + out.uni1<-getroot_K1_fast(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + out.uni2<-getroot_K1_fast(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob_fast(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + p2<-Get_Saddle_Prob_fast(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) + pval = p1+p2 + Is.converge=TRUE + } else { + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE } - if(class(out.uni1) != "try-error"){ + } - p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) - - if(class(out.uni2) != "try-error"){ - p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) - - } else { - t<-seq(-1000,1000, length.out=20) - out1<-K1_adj(t, mu, g, qinv) - idx<-which(abs(out1) == min(abs(out1), na.rm=TRUE)) - try(p2<-Get_Saddle_Prob(t[idx], mu, g, qinv), silent=TRUE) - if(class(p2) == "try-error"){ - p2 = p1 - } - - } - - pval = p1+p2 - - #if(pval < 10^-3 && pval.noadj > 0.1){ - # pval<-pval.noadj - # Is.converge=FALSE - #} - - } else { - pval<-pval.noadj - Is.converge=FALSE - } - - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) -} - -Saddle_Prob_NR<-function(q, mu, g, Cutoff=2){ - - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - p1=NULL - p2=NULL - - # - qinv = -sign(q-m1) * abs(q-m1) + m1 - - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - Is.converge=TRUE - - if(Cutoff=="BE"){ - rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) - B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 - p<-B+alpha/2 - Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) - } else if(Cutoff < 10^-1){ - Cutoff=10^-1 - } - # - - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ - - pval=pval.noadj - - } else { - out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) - out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) - if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) - { - p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) - p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) - pval = p1+p2 - Is.converge=TRUE - } else { - print("Error_Converge") - pval<-pval.noadj - Is.converge=FALSE - } - #if(pval < 10^-3 && pval.noadj > 0.1){ - # print("Error_Mismatch") - # pval<-pval.noadj - # Is.converge=FALSE - #} - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) -} - - - -Saddle_Prob_wSpline<-function(q, mu, g, Spline=FALSE){ - - #mu =pi; g=g=G1/sqrt(n.g); Spline=FALSE - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - pval.noadj<-pnorm((q - m1)/sqrt(var1), lower.tail = FALSE) - Is.converge=TRUE - - # incase of singleton - idx<-which(g> 0) #Ques: Why these steps? - if(length(idx)==1){ - if(q==0){ - pval=1 - (1-mu[idx])/2 - } else { - pval=mu[idx]/2 - } - } else if(q ==0){ - - temp<-sum(log(1-mu[idx])) - pval<-1 - exp(temp)/2 - - } else if (q==sum(g)){ - - temp<-sum(log(mu[idx])) - pval<-exp(temp)/2 - - } else if(abs(q - m1)/sqrt(var1) < 10^{-5}){ - pval=0.5 - } else if(Spline==FALSE){ - - out<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=q), silent=TRUE) - if(class(out) != "try-error"){ - zeta<-out$root - - k1<-Korg(zeta, mu, g) - k2<-K2(zeta, mu, g) - - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - v<- zeta * (k2)^{1/2} - - Z.test<-w + 1/w * log(v/w) - #pval<-pchisq(Z.test^2, lower.tail = FALSE, df=1) - pval<-pnorm(Z.test, lower.tail = FALSE) - - if(pval < 10^-2 && pval.noadj > 0.1){ - pval<-pval.noadj - Is.converge=FALSE - } - - } else { - pval<-pval.noadj - Is.converge=FALSE - } - - - } else { - n<-length(g) - sfun<-Get_Spline_Fun(mu, g, n) - - out<-uniroot(K1_adj_S, c(-3000,3000), q=q, sfun=sfun) - zeta<-out$root - - k1<-Korg_S(zeta, sfun) - k2<-K2_S(zeta, sfun) - - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - - if(k2 <= 0){ - v<-w - } else { - v<- zeta * (k2)^{1/2} - } - pval<-pnorm(w + 1/w * log(v/w), lower.tail = FALSE) - - - } - q.transe<-q - m1 - if(!is.na(pval)){ - q.transe<-qnorm(pval, lower.tail=FALSE) * sqrt(var1) - } #Ques: Why this overwriting and why is this information needed - - - return(list(pval=pval, pval.noadj=pval.noadj, q.transe=q.transe, Is.converge=Is.converge, w=w, v=v)) -} - - -# -# Input -# G: genotype -# obj.null: returned object from GLM function with the null model -# Cutoff: a cutoff value to use either saddle point approx or normal approx -# -# Output -# p.value: p-value from the saddle point approx. If the mean centered test statistic is within Cutoff *SD, p.value will be calculated using the normal approx. -# p.value.NA: p-value from the normal approx -# Is.converge: covergence status of saddle point approx. If Is.converge=FALSE, p.value will be calculated using the normal approx. -# p1 and p2: internal use only (please ignore them) -# - - - -##################################################### -# Following 3 functions are added - -ScoreTest_wSaddleApprox_Get_X1 = function(X1){ - - q1<-ncol(X1) - if(q1>=3) - { - if(sum(abs(X1[,1]-X1[,2]))==0) - { - X1=X1[,-2] - q1<-q1-1 - } - } - qr1<-qr(X1) - if(qr1$rank < q1){ - - X1.svd<-svd(X1) - X1 = X1.svd$u #Ques: Why? - } - - return(X1) - -} - - -ScoreTest_wSaddleApprox_NULL_Model = function(formula, data=NULL ){ - - - X1<-model.matrix(formula,data=data) - X1<-ScoreTest_wSaddleApprox_Get_X1(X1) - - glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values - #Ques: Why not use Firth bias correction here? Not a big deal though, and not time consuming - V = mu*(1-mu) - res = glmfit$y- mu - n1<-length(res) - - # - XV = t(X1 * V) - XVX_inv= solve(t(X1)%*%(X1 * V)) - XXVX_inv= X1 %*% XVX_inv - - re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) - class(re)<-"SA_NULL" - return(re) - - -} - - -# -# G: genotype vector -# obj.SA.null: object from ScoreTest_wSaddleApprox_NULL_Model -# - -ScoreTest_wSaddleApprox_New<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") - } - - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res - - n.g<-sum(G) - if(n.g/(2*length(G))>0.5) - { - G<-2-G - n.g<-sum(G) - } - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) - #Ques: Reference? Doesn't removing X2 make G and y effectively indep? - q<-sum(G1 * y) /sqrt(n.g) - #q1<-sum(G * y) /sqrt(n.g) - out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) - - return(out) -} - -ScoreTest_wSaddleApprox_NewNR<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") - } - - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res - - n.g<-sum(G) - if(n.g/(2*length(G))>0.5) + if(pval!=0 && pval.noadj/pval>10^3) { - G<-2-G - n.g<-sum(G) + return(Saddle_Prob_fast(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) } - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) - #Ques: Reference? Doesn't removing X2 make G and y effectively indep? - q<-sum(G1 * y) /sqrt(n.g) - #q1<-sum(G * y) /sqrt(n.g) - out<-Saddle_Prob_NR(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) - return(out) } -ScoreTest_wSaddleApprox_OldNR<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") +TestSPAfast<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") } - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res + y = obj.null$y + mu = obj.null$mu + res = obj.null$res n.g<-sum(G) if(n.g/(2*length(G))>0.5) @@ -641,234 +464,239 @@ ScoreTest_wSaddleApprox_OldNR<-function(G, obj.SA.null, Cutoff=2){ n.g<-sum(G) } NAset<-which(G==0) - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) q<-sum(G1 * y) /sqrt(n.g) g=G1/sqrt(n.g) if(length(NAset)/length(G)<0.5) { - out<-Saddle_Prob_NR(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) } else { - out<-Saddle_Prob_OldNR(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], -muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff) + out<-Saddle_Prob_fast(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], +muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff,alpha=alpha) } return(out) } -Saddle_Prob_OldNR<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2){ - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - p1=NULL - p2=NULL - # - qinv = -sign(q-m1) * abs(q-m1) + m1 +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - Is.converge=TRUE - if(Cutoff=="BE"){ - rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) - B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 - p<-B+alpha/2 - Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) - } else if(Cutoff < 10^-1){ - Cutoff=10^-1 - } - - # - - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} - pval=pval.noadj - - } else { - #NAmu= sum(gNA*muNA) - #NAsigma=sum(muNA*(1-muNA)*gNA^2) - NAmu= m1-sum(gNB*muNB) - NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) - out.uni1<-getroot_K1_Old(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) - out.uni2<-getroot_K1_Old(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) - if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) - { - p1<-Get_Saddle_Prob_Old(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - p2<-Get_Saddle_Prob_Old(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) - pval = p1+p2 - Is.converge=TRUE - } else { - print("Error_Converge") - pval<-pval.noadj - Is.converge=FALSE - } - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) } -getroot_K1_Old<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + + +ScoreTest_SPA <-function(genos,pheno,cov,obj.null,method=c("fastSPA","SPA"),minmac=5,Cutoff=2,alpha=5*10^-8,missing.id=NA,beta.out=FALSE,beta.Cutoff=5*10^-7) { - g.pos<-sum(g[which(g>0)]) - g.neg<- sum(g[which(g<0)]) - if(q>=g.pos || q<=g.neg) + method<-match.arg(method) + + if(missing(obj.null)) { - return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + if(missing(cov) || is.null(cov)) + { + cov<-rep(1,length(pheno)) + } + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(as.matrix(pheno) ~as.matrix(cov)) + } + cov<-obj.null$cov + pheno<-obj.null$y + + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) } else { - t<-init - K1_eval<-K1_adj_Old(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - prevJump<- Inf - rep<-1 - repeat + m <- nrow(genos) + } + if(is.na(missing.id)==FALSE) + { + genos[which(genos==missing.id)]<-NA + } + p.value<-rep(NA,m) + p.value.NA<-rep(NA,m) + Is.converge<-rep(NA,m) + beta<-rep(NA,m) + SEbeta<-rep(NA,m) + for (i in 1:m) + { + try({ + ina<-which(is.na(genos[i,])) + if(length(ina)>0) { - K2_eval<-K2_Old(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - tnew<-t-K1_eval/K2_eval - if(is.na(tnew)) - { - conv=FALSE - break - } - if(abs(tnew-t)=minmac) + { + if(method=="fastSPA") { - conv<-FALSE - break + re <- TestSPAfast(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } else { + re <- TestSPA(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) } - - newK1<-K1_adj_Old(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - if(sign(K1_eval)!=sign(newK1)) + p.value[i] <- re$p.value + p.value.NA[i] <- re$p.value.NA + Is.converge[i]<- re$Is.converge + if(beta.out==TRUE && p.value[i]prevJump-tol) - { - tnew<-t+sign(newK1-K1_eval)*prevJump/2 - newK1<-K1_adj_Old(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - prevJump<-prevJump/2 - } else { - prevJump<-abs(tnew-t) - } + re.firth<-fast.logistf.fit(x=cbind(t(genos[i,,drop=FALSE]),cov),y=pheno,firth=TRUE) + beta[i]<-re.firth$beta[1] + SEbeta[i]<-sqrt(re.firth$var[1,1]) } - - rep<-rep+1 - t<-tnew - K1_eval<-newK1 } - return(list(root=t,n.iter=rep,Is.converge=conv)) - } -} - -Korg_Old<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp<-log(1 - muNB + muNB * exp(gNB* t1)) - out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 + }) + if(i%%1000==0) print(paste("Processed",i,"SNPs",sep=" ")) } - return(out) + return(list(p.value=p.value,p.value.NA=p.value.NA,Is.converge=Is.converge,beta=beta,SEbeta=SEbeta)) } -K1_adj_Old<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp1<-(1 - muNB)* exp(-gNB * t1) + muNB - temp2<-muNB *gNB - temp3<-NAmu+NAsigma*t1 - out[i]<-sum(temp2/temp1)+temp3-q - } - - - return(out) -} - -K2_Old<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 - temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) - out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma - } - return(out) -} - -Get_Saddle_Prob_Old<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) { - - # zeta<-out.uni2$root - k1<-Korg_Old(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - k2<-K2_Old(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - - if(is.finite(k1) && is.finite(k2)) - { - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - v<- zeta * (k2)^{1/2} - - Z.test<-w + 1/w * log(v/w) - - if(Z.test > 0){ - pval<-pnorm(Z.test, lower.tail = FALSE) - } else { - pval<-pnorm(Z.test, lower.tail = TRUE) - } - } else { - pval<-0 - } - - return(pval) -} ################################################################## ## MAIN FUNCTION: single.b.sna2 ################################################################## single.b.sna2 <- function() { - #control <- fast.logistf.control(maxit=25) # Max iterations to converge = 25 + + beta.Cutoff=5*10^-6 n <- ncol(genos) - k <- ncol(cov) - m <- nrow(genos) + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) + } else { + m <- nrow(genos) + } ## resolve missing genotype by mean imputation ina <- is.na(genos) if ( length(vids) > 0 ) { genos[ina] <- matrix(AC[vids]/NS[vids],nrow(genos),ncol(genos))[ina] } - + p <- rep(NA,m) # store p-values for m markers - add <- matrix(NA,m,2) # extra columns: Beta, SE, Chisq - cname <- c("PVAL.NA","CONVERGE") #,"N.CASE","N.CTRL","AF.CASE","AF.CTRL") + add <- matrix(NA,m,4) # extra columns: p.value.NA, Is.converge, Beta, SE + cname <- c("PVAL.NA","CONVERGE","BETA","SEBETA") #,"N.CASE","N.CTRL","AF.CASE","AF.CTRL") if ( m > 0 ) { ## If there is at least one marker to test - obj.SA.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov)) - + load(paste(nullf,".null.RData",sep="")) + cov=obj.null$X1 for (i in 1:m) { - re <- ScoreTest_wSaddleApprox_OldNR(as.vector(genos[i,,drop=FALSE]), obj.SA.null, Cutoff=2) + re <- TestSPAfast(as.numeric(genos[i,,drop=FALSE]), obj.null, Cutoff=2) p[i] <- re$p.value - - add[i,1:2] <- c(re$p.value.NA, re$Is.converge) + beta<-NA + SEbeta<-NA + if(p[i]= 0 ) ) { - print STDERR "Performing inverse normal transformation of phenotypes\n"; - open(R,">$out.regress.R") || die "Cannot open file $out.regress.R for writing\n"; - print R "t5 <- read.table('$out.phe',nrows=2)\n"; - print R "classes <- sapply(t5,class)\n"; - print R "T <- read.table('$out.phe',colClasses=classes)\n"; - print R "n <- as.matrix(T[,1])\n"; - print R "Y <- as.matrix(T[,-1])\n"; - print R "c <- as.matrix(read.table('$out.cov')[,-1])\n"; - print R "R <- cbind(n, apply(Y,2,function(x) { lm(x~c)\$residual }))\n"; - print R "colnames(R) <- scan('$out.phe',what=character(),nlines=1)\n"; - print R "write.table(R,'$out.phe',row.names=FALSE,col.names=TRUE,quote=FALSE,sep=\"\t\")\n"; - close R; - my $cmd = "$binRscript $out.regress.R --vanilla"; - if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } - &forkExecWait($cmd); - @covs = (); ## don't regress out covariates any more - unlink("$out.regress.R"); - } - +else { +# if ( !( $invNorm ) && ( $#covs >= 0 ) ) { +# print STDERR "Performing inverse normal transformation of phenotypes\n"; +# open(R,">$out.regress.R") || die "Cannot open file $out.regress.R for writing\n"; +# print R "t5 <- read.table('$out.phe',nrows=2)\n"; +# print R "classes <- sapply(t5,class)\n"; +# print R "T <- read.table('$out.phe',colClasses=classes)\n"; +# print R "n <- as.matrix(T[,1])\n"; +# print R "Y <- as.matrix(T[,-1])\n"; +# print R "c <- as.matrix(read.table('$out.cov')[,-1])\n"; +# print R "R <- cbind(n, apply(Y,2,function(x) { lm(x~c)\$residual }))\n"; +# print R "colnames(R) <- scan('$out.phe',what=character(),nlines=1)\n"; +# print R "write.table(R,'$out.phe',row.names=FALSE,col.names=TRUE,quote=FALSE,sep=\"\t\")\n"; +# close R; +# my $cmd = "$binRscript $out.regress.R --vanilla"; +# if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } +# &forkExecWait($cmd); +# @covs = (); ## don't regress out covariates any more +# unlink("$out.regress.R"); +# } + +if ( $test eq "b.sna2" ) { #Analyze Null model +my $cmdn = "$binRscript $datadir/epactsMultiNull.R --vanilla $out $pheno $cov;"; +if ( $mosixNodes ) { $cmdn = &getMosixCmd($cmdn,$mosixNodes); } +&forkExecWait($cmdn); +} + my @tgts = (); my @cmds = (); for(my $i=0; $i < @chrs; ++$i) { @@ -421,24 +427,25 @@ else { my $op = "$out.$chrs[$i].$start.$end"; my $region = "$chrs[$i]:$start-$end"; - my $tgt = "$op.epacts.gz"; - my $cmd = "$epactsdir/bin/pEmmax multi-assoc-plain --vcf $cvcf --region $region --field $field --indf $ind --out-assocf $tgt --minMAF $minMAF --maxMAF $maxMAF --maxMAC $maxMAC --minRSQ $minRSQ --minCallRate $minCallRate --minMAC $minMAC --maxP $maxP --pheno $out.phe".($compact ? " --compact" : ""); + my $tgt = "$op.epacts"; - #die "ERROR: Cannot find $datadir/multi.$test.R\n" unless ( -s "$datadir/multi.$test.R" ); + die "ERROR: Cannot find $datadir/multi.$test.R\n" unless ( -s "$datadir/multi.$test.R" ); + + my $cmd = "$binRscript $datadir/epactsMulti.R --vanilla $epactsdir $pheno $cov $ind $cvcf $region $op.epacts $field $minMAF $maxMAF $minMAC $maxMAC $minCallRate $minRSQ ".(($pass) ? "TRUE" : "FALSE")." multi.$test;"; + - #$cmd = "$binRscript $datadir/epactsMulti.R --vanilla $epactsdir $pheno $cov $ind $cvcf $region $op.epacts $field $minMAF $maxMAF $minMAC $maxMAC $minCallRate $minRSQ ".(($pass) ? "TRUE" : "FALSE")." multi.$test; gzip $op.epacts"; if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } $cmd = "\t$cmd\n"; if ( $anno ) { if ( $mosixNodes ) { - $cmd .= "\t".&getMosixCmd("$epactsdir/bin/epacts-anno --ref $ref --in $op.epacts.gz",$mosixNodes)."\n"; + $cmd .= "\t".&getMosixCmd("$epactsdir/bin/epacts-anno --ref $ref --in $op.epacts",$mosixNodes)."\n"; } else { - $cmd .= "\t$epactsdir/bin/epacts-anno --in $op.epacts.gz --ref $ref\n"; + $cmd .= "\t$epactsdir/bin/epacts-anno --in $op.epacts --ref $ref\n"; } } - $cmd .= "\t$epactsdir/bin/tabix -f -pbed $tgt\n"; + #$cmd .= "\t$epactsdir/bin/tabix -f -pbed $tgt\n"; push(@tgts,$tgt); push(@cmds,$cmd); @@ -448,20 +455,21 @@ else { } print MAK "all: $out.epacts.OK\n\n"; - print MAK "$out.epacts.OK: ".join(".tbi ",@tgts).".tbi\n"; + print MAK "$out.epacts.OK: @tgts\n"; print MAK "\t$epactsdir/bin/epacts-cat @tgts | $epactsdir/bin/bgzip -c > $out.epacts.gz\n"; print MAK "\t$epactsdir/bin/tabix -f -pbed $out.epacts.gz\n"; - print MAK "\t$binrm -f @tgts ".join(".tbi ",@tgts).".tbi\n"; + print MAK "\t$binrm -f @tgts\n"; print MAK "\ttouch $out.epacts.OK\n"; print MAK "\n"; for(my $i=0; $i < @tgts; ++$i) { - print MAK "$tgts[$i].tbi: $vcf $ped\n"; - print MAK "\tsleep ".sprintf("%.3lfs",rand(10))."\n"; + print MAK "$tgts[$i]: $vcf $ped\n"; + #print MAK "\tsleep ".sprintf("%.3lfs",rand(10))."\n"; print MAK "$cmds[$i]\n"; } close MAK; } + print "Finished generating EPACTS Makefile\n"; if ( $run < 0 ) { print "EPACTS will run the with the following commond:\n"; diff --git a/scripts/epacts-single b/scripts/epacts-single index 0f837a7..14e63b6 100755 --- a/scripts/epacts-single +++ b/scripts/epacts-single @@ -303,6 +303,12 @@ if ( ( ! $sepchr ) && ( $vcf =~ /chr/ ) && ( ! $chr ) ) { print STDERR "*************************************************************************************\n"; } +if ( $test eq "b.sna2" ) { #Analyze Null model +my $cmdn = "$binRscript $datadir/epactsSingleNull.R --vanilla $out $pheno $cov;"; +if ( $mosixNodes ) { $cmdn = &getMosixCmd($cmdn,$mosixNodes); } +&forkExecWait($cmdn); +} + open(MAK,">$out.Makefile") || die "Cannot open file $out.Makefile for writing\n"; print MAK ".DELETE_ON_ERROR:\n\n"; diff --git a/scripts/epacts.pm b/scripts/epacts.pm index 6eca5fe..0766f53 100644 --- a/scripts/epacts.pm +++ b/scripts/epacts.pm @@ -57,7 +57,12 @@ sub parsePheno { ## convert command to mosix command sub getMosixCmd { my ($cmd,$nodes) = @_; - return "mosbatch -E/tmp -i -j$nodes sh -c '$cmd'"; + if ( $nodes =~ /srun/ ) { ## --mosixNodes "srun --partition=topmed-working --mem=13GB" + return "$nodes sh -c '$cmd'"; + } + else { + return "mosbatch -E/tmp -i -j$nodes sh -c '$cmd'"; + } } ## convert string chromosome to integer (compatible to PLINK) @@ -503,7 +508,7 @@ sub readPedVcfMulti { my @p = (); for(my $j=0; $j < @iphes; ++$j) { push(@p,&parsePheno($F[$iphes[$j]],$missing)); - die "ERROR: Missing phenotype value is detected in individual $id, phenotype $rphes->[$j]. Currently EPACTS won't run with missing phenotypes\n" if ( $p[$#p] eq $missing ); + #die "ERROR: Missing phenotype value is detected in individual $id, phenotype $rphes->[$j]. Currently EPACTS won't run with missing phenotypes\n" if ( $p[$#p] eq $missing ); } $hPhes{$id} = \@p;