From 5c314702350f95da2045bad0de5f88d8319ab40d Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 01:49:19 -0400 Subject: [PATCH 01/18] Update needed for epacts-multi with missing phenotypes Bypasses the error message for missing phenotypes and let epacts-multi handle the missing phenotype issue. --- scripts/epacts.pm | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/epacts.pm b/scripts/epacts.pm index 6eca5fe..0766f53 100644 --- a/scripts/epacts.pm +++ b/scripts/epacts.pm @@ -57,7 +57,12 @@ sub parsePheno { ## convert command to mosix command sub getMosixCmd { my ($cmd,$nodes) = @_; - return "mosbatch -E/tmp -i -j$nodes sh -c '$cmd'"; + if ( $nodes =~ /srun/ ) { ## --mosixNodes "srun --partition=topmed-working --mem=13GB" + return "$nodes sh -c '$cmd'"; + } + else { + return "mosbatch -E/tmp -i -j$nodes sh -c '$cmd'"; + } } ## convert string chromosome to integer (compatible to PLINK) @@ -503,7 +508,7 @@ sub readPedVcfMulti { my @p = (); for(my $j=0; $j < @iphes; ++$j) { push(@p,&parsePheno($F[$iphes[$j]],$missing)); - die "ERROR: Missing phenotype value is detected in individual $id, phenotype $rphes->[$j]. Currently EPACTS won't run with missing phenotypes\n" if ( $p[$#p] eq $missing ); + #die "ERROR: Missing phenotype value is detected in individual $id, phenotype $rphes->[$j]. Currently EPACTS won't run with missing phenotypes\n" if ( $p[$#p] eq $missing ); } $hPhes{$id} = \@p; From c319c7a5f6947490db134ce08d874601d0b3ea80 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 01:58:36 -0400 Subject: [PATCH 02/18] Update to run SPA null model only once per GWAS Runs epactsSingleNull.R to create the null object for SPA test before starting the GWAS, so that the null model is not needed to be calculated for every chunk (of size "unit", default 500K bp) of SNPs. After running epactsSingleNull.R, each instance of epactsSingle.R just loads the null object and performs the test. This helps to reduce the chunk size (memory efficient) and increase the number of chunks without requiring to calculate the null object for each chunk. --- scripts/epacts-single | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/epacts-single b/scripts/epacts-single index 0f837a7..14e63b6 100755 --- a/scripts/epacts-single +++ b/scripts/epacts-single @@ -303,6 +303,12 @@ if ( ( ! $sepchr ) && ( $vcf =~ /chr/ ) && ( ! $chr ) ) { print STDERR "*************************************************************************************\n"; } +if ( $test eq "b.sna2" ) { #Analyze Null model +my $cmdn = "$binRscript $datadir/epactsSingleNull.R --vanilla $out $pheno $cov;"; +if ( $mosixNodes ) { $cmdn = &getMosixCmd($cmdn,$mosixNodes); } +&forkExecWait($cmdn); +} + open(MAK,">$out.Makefile") || die "Cannot open file $out.Makefile for writing\n"; print MAK ".DELETE_ON_ERROR:\n\n"; From beae63153ea67e93bb614228bf72ecc110d5fe40 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 02:02:56 -0400 Subject: [PATCH 03/18] Modify epacts-multi for SPA test Includes SPA test (b.sna2). Also includes codes to run epactsMultiNull.R once to calculate the null object and then use it directly in each run of epactsMulti.R. --- scripts/epacts-multi | 72 ++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/scripts/epacts-multi b/scripts/epacts-multi index 9cade11..f19bd95 100644 --- a/scripts/epacts-multi +++ b/scripts/epacts-multi @@ -373,27 +373,33 @@ if ( $test eq "q.emmax" ) { ## Need to generate kinship matrix and REML first close MAK; } -else { - if ( !( $invNorm ) && ( $#covs >= 0 ) ) { - print STDERR "Performing inverse normal transformation of phenotypes\n"; - open(R,">$out.regress.R") || die "Cannot open file $out.regress.R for writing\n"; - print R "t5 <- read.table('$out.phe',nrows=2)\n"; - print R "classes <- sapply(t5,class)\n"; - print R "T <- read.table('$out.phe',colClasses=classes)\n"; - print R "n <- as.matrix(T[,1])\n"; - print R "Y <- as.matrix(T[,-1])\n"; - print R "c <- as.matrix(read.table('$out.cov')[,-1])\n"; - print R "R <- cbind(n, apply(Y,2,function(x) { lm(x~c)\$residual }))\n"; - print R "colnames(R) <- scan('$out.phe',what=character(),nlines=1)\n"; - print R "write.table(R,'$out.phe',row.names=FALSE,col.names=TRUE,quote=FALSE,sep=\"\t\")\n"; - close R; - my $cmd = "$binRscript $out.regress.R --vanilla"; - if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } - &forkExecWait($cmd); - @covs = (); ## don't regress out covariates any more - unlink("$out.regress.R"); - } - +else { +# if ( !( $invNorm ) && ( $#covs >= 0 ) ) { +# print STDERR "Performing inverse normal transformation of phenotypes\n"; +# open(R,">$out.regress.R") || die "Cannot open file $out.regress.R for writing\n"; +# print R "t5 <- read.table('$out.phe',nrows=2)\n"; +# print R "classes <- sapply(t5,class)\n"; +# print R "T <- read.table('$out.phe',colClasses=classes)\n"; +# print R "n <- as.matrix(T[,1])\n"; +# print R "Y <- as.matrix(T[,-1])\n"; +# print R "c <- as.matrix(read.table('$out.cov')[,-1])\n"; +# print R "R <- cbind(n, apply(Y,2,function(x) { lm(x~c)\$residual }))\n"; +# print R "colnames(R) <- scan('$out.phe',what=character(),nlines=1)\n"; +# print R "write.table(R,'$out.phe',row.names=FALSE,col.names=TRUE,quote=FALSE,sep=\"\t\")\n"; +# close R; +# my $cmd = "$binRscript $out.regress.R --vanilla"; +# if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } +# &forkExecWait($cmd); +# @covs = (); ## don't regress out covariates any more +# unlink("$out.regress.R"); +# } + +if ( $test eq "b.sna2" ) { #Analyze Null model +my $cmdn = "$binRscript $datadir/epactsMultiNull.R --vanilla $out $pheno $cov;"; +if ( $mosixNodes ) { $cmdn = &getMosixCmd($cmdn,$mosixNodes); } +&forkExecWait($cmdn); +} + my @tgts = (); my @cmds = (); for(my $i=0; $i < @chrs; ++$i) { @@ -421,24 +427,25 @@ else { my $op = "$out.$chrs[$i].$start.$end"; my $region = "$chrs[$i]:$start-$end"; - my $tgt = "$op.epacts.gz"; - my $cmd = "$epactsdir/bin/pEmmax multi-assoc-plain --vcf $cvcf --region $region --field $field --indf $ind --out-assocf $tgt --minMAF $minMAF --maxMAF $maxMAF --maxMAC $maxMAC --minRSQ $minRSQ --minCallRate $minCallRate --minMAC $minMAC --maxP $maxP --pheno $out.phe".($compact ? " --compact" : ""); + my $tgt = "$op.epacts"; - #die "ERROR: Cannot find $datadir/multi.$test.R\n" unless ( -s "$datadir/multi.$test.R" ); + die "ERROR: Cannot find $datadir/multi.$test.R\n" unless ( -s "$datadir/multi.$test.R" ); + + my $cmd = "$binRscript $datadir/epactsMulti.R --vanilla $epactsdir $pheno $cov $ind $cvcf $region $op.epacts $field $minMAF $maxMAF $minMAC $maxMAC $minCallRate $minRSQ ".(($pass) ? "TRUE" : "FALSE")." multi.$test;"; + - #$cmd = "$binRscript $datadir/epactsMulti.R --vanilla $epactsdir $pheno $cov $ind $cvcf $region $op.epacts $field $minMAF $maxMAF $minMAC $maxMAC $minCallRate $minRSQ ".(($pass) ? "TRUE" : "FALSE")." multi.$test; gzip $op.epacts"; if ( $mosixNodes ) { $cmd = &getMosixCmd($cmd,$mosixNodes); } $cmd = "\t$cmd\n"; if ( $anno ) { if ( $mosixNodes ) { - $cmd .= "\t".&getMosixCmd("$epactsdir/bin/epacts-anno --ref $ref --in $op.epacts.gz",$mosixNodes)."\n"; + $cmd .= "\t".&getMosixCmd("$epactsdir/bin/epacts-anno --ref $ref --in $op.epacts",$mosixNodes)."\n"; } else { - $cmd .= "\t$epactsdir/bin/epacts-anno --in $op.epacts.gz --ref $ref\n"; + $cmd .= "\t$epactsdir/bin/epacts-anno --in $op.epacts --ref $ref\n"; } } - $cmd .= "\t$epactsdir/bin/tabix -f -pbed $tgt\n"; + #$cmd .= "\t$epactsdir/bin/tabix -f -pbed $tgt\n"; push(@tgts,$tgt); push(@cmds,$cmd); @@ -448,20 +455,21 @@ else { } print MAK "all: $out.epacts.OK\n\n"; - print MAK "$out.epacts.OK: ".join(".tbi ",@tgts).".tbi\n"; + print MAK "$out.epacts.OK: @tgts\n"; print MAK "\t$epactsdir/bin/epacts-cat @tgts | $epactsdir/bin/bgzip -c > $out.epacts.gz\n"; print MAK "\t$epactsdir/bin/tabix -f -pbed $out.epacts.gz\n"; - print MAK "\t$binrm -f @tgts ".join(".tbi ",@tgts).".tbi\n"; + print MAK "\t$binrm -f @tgts\n"; print MAK "\ttouch $out.epacts.OK\n"; print MAK "\n"; for(my $i=0; $i < @tgts; ++$i) { - print MAK "$tgts[$i].tbi: $vcf $ped\n"; - print MAK "\tsleep ".sprintf("%.3lfs",rand(10))."\n"; + print MAK "$tgts[$i]: $vcf $ped\n"; + #print MAK "\tsleep ".sprintf("%.3lfs",rand(10))."\n"; print MAK "$cmds[$i]\n"; } close MAK; } + print "Finished generating EPACTS Makefile\n"; if ( $run < 0 ) { print "EPACTS will run the with the following commond:\n"; From 69df77d303fd41ebb2e8f1a5ea57c497bbc9cae8 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 02:28:33 -0400 Subject: [PATCH 04/18] Update to load null object for SPA test For b.sna2 test, loads the path of the null object through covf argument --- data/epactsSingle.R | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/data/epactsSingle.R b/data/epactsSingle.R index 1f10f6d..0e83c08 100644 --- a/data/epactsSingle.R +++ b/data/epactsSingle.R @@ -40,11 +40,14 @@ if ( !require(epactsR,lib.loc=paste(bindir,"/lib/",sep="") ) ) { } ## the DLL file has to be loaded explictly for some machine - don't know why dyn.load(paste(bindir,"/lib/epactsR/libs/epactsR",.Platform$dynlib.ext, sep="")) - + pheno <- as.double(read.table(phenof)[,2]) pheno <- pheno - min(pheno,na.rm=T) -if ( covf == "NULL" ) { - cov <- NULL; +if(test=="single.b.sna2") +{ + nullf<-substr(covf,1,nchar(covf)-4) #remove ".cov" to get outf +} else if ( covf == "NULL" ) { + cov <- NULL } else { cov <- as.matrix(read.table(covf)[,-1]) } @@ -54,8 +57,7 @@ binaryFlag <- is.binary(pheno) ind <- as.integer(read.table(indf)[,2])-1 G <- .Call("readVcf",vcf,region,field,passOnly,ind,NULL) -rnames <- rownames(G) -rownames(G) <- seq_along(rnames) + m <- nrow(G) if ( m > 0 ) { n <- ncol(G) @@ -72,7 +74,8 @@ if ( m > 0 ) { rsq[rsq>1] <- 1 if ( minRSQ > 0 ) { vids <- which((varAC > 0) & (MAF >= minMAF) & (MAF <= maxMAF ) & ( MAC >= minMAC ) & ( MAC <= maxMAC) & (CR >= minCallRate) & (rsq >= minRSQ)) - } else { + } + else { vids <- which((varAC > 0) & (MAF >= minMAF) & (MAF <= maxMAF) & ( MAC >= minMAC ) & ( MAC <= maxMAC) & (CR >= minCallRate)) } genos <- G[vids,,drop=FALSE] @@ -88,7 +91,6 @@ if ( m > 0 ) { func <- match.fun(test) r <- func() - if ( binaryFlag ) { binary.cname <- c("NS.CASE","NS.CTRL","AF.CASE","AF.CTRL") #,"CNT.CASE","CNT.CTRL"); @@ -109,7 +111,8 @@ if ( m > 0 ) { #binary.add[,4] <- rowSums((matrix(pheno,nrow(G),ncol(G),byrow=T) == 0) * G * (1-ina), na.rm=T)/binary.add[,2] #binary.add[binary.add[,1] == 0,3] <- NA #binary.add[binary.add[,2] == 0,4] <- NA - } else { + } + else { binary.cname <- NULL binary.ncols <- 0 } @@ -123,7 +126,8 @@ if ( m > 0 ) { if ( minRSQ > 0 ) { colnames(out) <- c("NS","AC","CALLRATE","MAF","PVALUE",r$cname,"RSQ",binary.cname); out[,6+ncol(r$add)] <- rsq - } else { + } + else { colnames(out) <- c("NS","AC","CALLRATE","MAF","PVALUE",r$cname,binary.cname); } out[vids,5] <- r$p @@ -131,17 +135,8 @@ if ( m > 0 ) { if ( binaryFlag ) { out[,5+ncol(r$add)+1:4] <- binary.add } - rownames(out) <- rnames - - if (class(try(rnames[seq_along(rnames)], silent = TRUE)) == "try-error") { - tmp <- rownames(out) - goodnames <- sapply(seq_along(tmp), function (iname) { - class(try(tmp[iname], silent = TRUE)) != "try-error" - }) - out <- out[which(goodnames), ] - cat(paste(sum(!goodnames), 'variants have been excluded due to error: "Value of SET_STRING_ELT() must be a \'CHARSXP\' not a \'integer\'"'), file = paste0(outf, ".log")) - } else {} - + rownames(out) <- rownames(G) + print(warnings()) .Call("writeMatrix",out,outf) } else { From ea9c427ed086c7a5c4c866de729544238480bf81 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 02:38:03 -0400 Subject: [PATCH 05/18] Updates epactsMulti.R for SPA test Includes SPA test, also includes codes to load the path of the null object through the covf argument --- data/epactsMulti.R | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/data/epactsMulti.R b/data/epactsMulti.R index 8673d1d..fcf206c 100644 --- a/data/epactsMulti.R +++ b/data/epactsMulti.R @@ -2,7 +2,7 @@ args <- commandArgs(trailingOnly=TRUE) bindir <- args[2] ## Directory containing the R scripts phenof <- args[3] ## phenotype file -##covf <- args[4] ## covariate file +covf <- args[4] ## covariate file or null model index file indf <- args[5] ## individual IDs (in order from VCF) vcf <- args[6] ## VCF file - must be bgzipped and tabixed region <- args[7] ## Genomic region to load @@ -29,9 +29,20 @@ dyn.load(paste(bindir,"/lib/epactsR/libs/epactsR",.Platform$dynlib.ext, sep="")) pnames <- scan(phenof,what=character(),nlines=1)[-1] phenos <- as.matrix(read.table(phenof)[,-1]) +if(test=="multi.b.sna2") +{ + nullf<-substr(covf,1,nchar(covf)-4) #remove ".cov" to get outf +} else if ( covf == "NULL" ) { + cov <- NULL +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + ind <- as.integer(read.table(indf)[,2])-1 G <- .Call("readVcf",vcf,region,field,passOnly,ind,NULL) + m <- nrow(G) if ( m > 0 ) { n <- ncol(G) @@ -61,19 +72,23 @@ if ( m > 0 ) { print("bar") - out <- matrix(NA,m,4+2*ncol(phenos)) - out[vids,3+(1:ncol(phenos))*2] <- r$p - out[vids,4+(1:ncol(phenos))*2] <- r$b + out <- matrix(NA,m,4+ncol(r$p)+ncol(r$add)) + out[vids,4+(1:ncol(r$p))] <- r$p + out[vids,4+ncol(r$p)+(1:ncol(r$add))] <- r$add out[,1] <- NS out[,2] <- AC out[,3] <- CR out[,4] <- MAF - colnames(out) <- c("NS","AC","CR","MAF",rep(NA,2*ncol(phenos))) - colnames(out)[3+(1:ncol(phenos))*2] <- paste(pnames,"P",sep=".") - colnames(out)[4+(1:ncol(phenos))*2] <- paste(pnames,"B",sep=".") + colnames(out) <- c("NS","AC","CR","MAF",rep(NA,ncol(r$p)),rep(NA,ncol(r$add))) + colnames(out)[4+(1:ncol(r$p))] <- paste(pnames,"P",sep=".") + colnames(out)[4+ncol(r$p)+(1:ncol(r$add))] <- r$cname rownames(out) <- rownames(G) + print(warnings()) .Call("writeMatrix",out,outf) } else { write.table(NULL,outf,row.names=F,col.names=F) } +rm(list=ls()) +gc() + From b3dcdcfed0421e0f6499e70136d5f46eaac661ed Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 02:40:04 -0400 Subject: [PATCH 06/18] Calculate null models for SPA test --- data/epactsMultiNull.R | 80 +++++++++++++++++++++++++++++++++++++++++ data/epactsSingleNull.R | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 data/epactsMultiNull.R create mode 100644 data/epactsSingleNull.R diff --git a/data/epactsMultiNull.R b/data/epactsMultiNull.R new file mode 100644 index 0000000..29ee74b --- /dev/null +++ b/data/epactsMultiNull.R @@ -0,0 +1,80 @@ +args <- commandArgs(trailingOnly=TRUE) +nullf <- args[2] ## output name +phenof <- args[3] ## phenotype file +covf <- args[4] ## covariate file or null model index file + +pnames <- scan(phenof,what=character(),nlines=1)[-1] +phenos <- as.matrix(read.table(phenof)[,-1]) + +if ( covf == "NULL" ) { + cov <- NULL; +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + mu = glmfit$fitted.values + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +################################################################## +## MAIN FUNCTION: null.b.sna2 +################################################################## + +phenos <- phenos - min(phenos,na.rm=T) + + k <- ncol(cov) + +g<-ncol(phenos) + + for(gind in 1:g) + { + nm<-which(is.na(phenos[,gind])==FALSE) + pheno<-phenos[nm,gind] + cov1<-cov[nm,] + + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov1)) + save.image(paste(nullf,".",pnames[gind],".null.RData",sep="")) + + } + + diff --git a/data/epactsSingleNull.R b/data/epactsSingleNull.R new file mode 100644 index 0000000..0a39b83 --- /dev/null +++ b/data/epactsSingleNull.R @@ -0,0 +1,69 @@ +args <- commandArgs(trailingOnly=TRUE) +nullf <- args[2] ## output name +phenof <- args[3] ## phenotype file +covf <- args[4] ## covariate file or null model index file + +pheno <- as.matrix(read.table(phenof)[,-1]) + +if ( covf == "NULL" ) { + cov <- NULL; +} else { + cov <- as.matrix(read.table(covf)[,-1]) +} + + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + mu = glmfit$fitted.values + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +################################################################## +## MAIN FUNCTION: null.b.sna2 +################################################################## + +pheno <- pheno - min(pheno,na.rm=T) + k <- ncol(cov) + + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov)) + save.image(paste(nullf,".null.RData",sep="")) + + + + From 5d2d7cb73d62a941635d3095d4eee264fd8392c1 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 03:13:52 -0400 Subject: [PATCH 07/18] Update epactsMultiNull.R --- data/epactsMultiNull.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/epactsMultiNull.R b/data/epactsMultiNull.R index 29ee74b..ef8e63d 100644 --- a/data/epactsMultiNull.R +++ b/data/epactsMultiNull.R @@ -51,7 +51,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } From 5537adfa3c9a32fb5a9deb5427df017f6e615222 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 03:14:37 -0400 Subject: [PATCH 08/18] Update epactsSingleNull.R --- data/epactsSingleNull.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/epactsSingleNull.R b/data/epactsSingleNull.R index 0a39b83..46d3ea6 100644 --- a/data/epactsSingleNull.R +++ b/data/epactsSingleNull.R @@ -49,7 +49,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } From 976f8bc4182098e98e89c9a336be883636b7a2e7 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Mon, 18 Sep 2017 10:48:47 -0400 Subject: [PATCH 09/18] Update single.b.sna2.R Loads null object created by epactsSingleNull.R Performs beta estimation for p<5*10^-6 --- data/single.b.sna2.R | 1058 +++++++++++++++++------------------------- 1 file changed, 437 insertions(+), 621 deletions(-) diff --git a/data/single.b.sna2.R b/data/single.b.sna2.R index 3dbd63f..102dd3e 100644 --- a/data/single.b.sna2.R +++ b/data/single.b.sna2.R @@ -14,18 +14,51 @@ ## addcols : additional columns (with proper column names) ################################################################## -## single.b.firth : -## Use Firth bias-reduced logistic regression to perform association -## By: Clement Ma -## -## Adapted from 'logistf' R package (v1.10) -## By: Ploner M, Dunkler D, Southworth H, Heinze G -## TRAITS : BINARY -## RETURNS : PVALUE, BETA, SEBETA, CHISQ -## MISSING VALUES : IGNORED - -Korg<-function(t, mu, g){ +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + mu = glmfit$fitted.values + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +Korg<-function(t, mu, g) +{ n.t<-length(t) out<-rep(0,n.t) @@ -38,8 +71,8 @@ Korg<-function(t, mu, g){ } -K1_adj<-function(t, mu, g, q){ - +K1_adj<-function(t, mu, g, q) +{ n.t<-length(t) out<-rep(0,n.t) @@ -47,11 +80,22 @@ K1_adj<-function(t, mu, g, q){ t1<-t[i] temp1<-(1 - mu)* exp(-g * t1) + mu temp2<-mu *g - out[i]<-sum(temp2/temp1)-q } + return(out) +} + +K2<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) - + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - mu)* exp(-g * t1) + mu)^2 + temp2<-(1-mu) * mu * g^2 * exp(-g*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE) + } return(out) } @@ -108,145 +152,209 @@ getroot_K1<-function(init,mu,g,q,m1,tol=.Machine$double.eps^0.25,maxiter=1000) } } +Get_Saddle_Prob<-function(zeta, mu, g, q) +{ + k1<-Korg(zeta, mu, g) + k2<-K2(zeta, mu, g) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 -K2<-function(t, mu, g){ - - n.t<-length(t) - out<-rep(0,n.t) - for(i in 1:n.t){ - t1<-t[i] - temp1<-((1 - mu)* exp(-g * t1) + mu)^2 - temp2<-(1-mu) * mu * g^2 * exp(-g*t1) - out[i]<-sum(temp2/temp1,na.rm=TRUE) + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 } - return(out) + + return(pval) } + +Saddle_Prob<-function(q, mu, g, Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL -Get_Spline_Fun<-function(mu, g){ - node<-c(-5000,-2000,-1000, -100, -50.5, -38.125, -25.75, -13.375, -1, 0, - 1, 50.5, 62.875, 75.25, 100, 1000,2000,5000) - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - sfun<-splinefun(node , y1 ,method='natural') + # + qinv = -sign(q-m1) * abs(q-m1) + m1 -} + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE -Get_Spline_FunH<-function(mu, g){ - - node<-c(-5000,-2000,-1000, -100, -1, 0, 1, 100, 1000,2000,5000) - flag=0 - rep<-0 - repeat{ - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - sfun<-splinefunH(node , y1 ,m=y1deriv) - xval<-NULL - for(i in 1:(length(node)-1)) - xval<-c(xval,seq(node[i],node[i+1],length.out=100)) - curve<-sfun(xval,deriv=2) - if(min(curve)<0) + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + # + + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) + out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) { - ls<-unique(findInterval(xval[which(curve<0)],node,all.inside=T)) - nodeadd<-NULL - for(i in 1:length(ls)) - { - b<-(node[ls[i]]+node[ls[i]+1])/2 - nodeadd<-c(nodeadd,b) - } - nodeadd<-unique(nodeadd) - node<-c(node,nodeadd) - node<-node[order(node)] + p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) + p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) + pval = p1+p2 + Is.converge=TRUE } else { - flag=1 - } - if(flag==1 || rep>10) break - rep<-rep+1 + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } } + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob(q, mu, g, Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } - return(sfun) -} - -Get_Spline_Fun_H2<-function(mu, g){ - - node<-c(-5000,-2000,-1000, -100, -1, 0, 1, 100, 1000,2000,5000) - y1<-Korg(node ,mu ,g) - y1deriv<-K1_adj(node ,mu ,g,0) - y1deriv2<-K2(node ,mu ,g) - sfun<-splineH2(nodeval=cbind(node,y1,y1deriv,y1deriv2)) - return(sfun) } -splineH2<-function(nodeval) +TestSPA<-function(G, obj.null, Cutoff=2,alpha) { - sfun<-matrix(0,nrow(nodeval),7) - sfun[,1]<-nodeval[,1] - A<-matrix(c(1,1,0,0,0,0,0,1,1,1,0,0,0,1,0,2,2,2,0,1,0,3,0,6,0,1,0, - 4,0,12,0,1,0,5,0,20),6,6) - for(i in 1:(nrow(nodeval)-1)) + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) { - f<-as.vector(nodeval[i:(i+1),2:4]) - sfun[i,2:7]<-solve(A)%*%f + G<-2-G + n.g<-sum(G) } - return(sfun) -} - -Korg_S_H2<-function(t, sfun){ - - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-t1^(0:5) - out<-sum(x*sfun[i,2:7]) - return(out) -} - - -K1_adj_S_H2<-function(t, q, sfun){ + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-(t1^(0:4))*(1:5) - out<-sum(x*sfun[i,3:7])-q return(out) } - -K2_S_H2<-function(t, sfun){ - - i<-findInterval(t,sfun[,1],all.inside=T) - t1<-(t-sfun[i,1])/(sfun[i+1,1]-sfun[i,1]) - x<-(t1^(0:3))*c(2,6,12,20) - out<-sum(x*sfun[i,4:7]) +Korg_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - muNB + muNB * exp(gNB* t1)) + out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 + } return(out) } -Korg_S<-function(t, sfun){ - out<-sfun(t, deriv=0) +K1_adj_fast<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - muNB)* exp(-gNB * t1) + muNB + temp2<-muNB *gNB + temp3<-NAmu+NAsigma*t1 + out[i]<-sum(temp2/temp1)+temp3-q + } return(out) } - -K1_adj_S<-function(t, q, sfun){ - - out<-sfun(t, deriv=1) - q +K2_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 + temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma + } return(out) } +getroot_K1_fast<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj_fast(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2_fast(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj_fast(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } - out<-sfun(t, deriv=2) - return(out) + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } } - -Get_Saddle_Prob<-function(zeta, mu, g, q) { - - # zeta<-out.uni2$root - k1<-Korg(zeta, mu, g) - k2<-K2(zeta, mu, g) +Get_Saddle_Prob_fast<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + k1<-Korg_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + k2<-K2_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) if(is.finite(k1) && is.finite(k2)) { @@ -269,12 +377,9 @@ Get_Saddle_Prob<-function(zeta, mu, g, q) { return(pval) } - - -Saddle_Prob<-function(q, mu, g, Cutoff=2){ - - #mu =pi; g=g=G1/sqrt(n.g); Spline=FALSE +Saddle_Prob_fast<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2,alpha) +{ m1<-sum(mu * g) var1<-sum(mu * (1-mu) * g^2) p1=NULL @@ -286,353 +391,59 @@ Saddle_Prob<-function(q, mu, g, Cutoff=2){ # Noadj pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) Is.converge=TRUE - + if(Cutoff=="BE"){ rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 p<-B+alpha/2 Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) } else if(Cutoff < 10^-1){ Cutoff=10^-1 } - + # - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ - #Ques: Why the 2nd condition? Isn't that redundant as q and m1 are both positives, - #and in general can cause problem if q and m1 has different signs? + if(abs(q - m1)/sqrt(var1) < Cutoff){ pval=pval.noadj } else { - out.uni1<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=q), silent=TRUE) - out.uni2<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=qinv), silent=TRUE) - -#Ques: Don't need the whole interval - # try one-more with large interval - if(class(out.uni1) == "try-error"){ - out.uni1<-try(uniroot(K1_adj, c(-1000,1000), mu=mu, g=g, q=q), silent=TRUE) - } - - if(class(out.uni1) != "try-error"){ - - p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) - - if(class(out.uni2) != "try-error"){ - p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) - - } else { - t<-seq(-1000,1000, length.out=20) - out1<-K1_adj(t, mu, g, qinv) - idx<-which(abs(out1) == min(abs(out1), na.rm=TRUE)) - try(p2<-Get_Saddle_Prob(t[idx], mu, g, qinv), silent=TRUE) - if(class(p2) == "try-error"){ - p2 = p1 - } - - } - + NAmu= m1-sum(gNB*muNB) + NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) + out.uni1<-getroot_K1_fast(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + out.uni2<-getroot_K1_fast(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob_fast(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + p2<-Get_Saddle_Prob_fast(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) pval = p1+p2 - - #if(pval < 10^-3 && pval.noadj > 0.1){ - # pval<-pval.noadj - # Is.converge=FALSE - #} - + Is.converge=TRUE } else { + print("Error_Converge") pval<-pval.noadj - Is.converge=FALSE - } + Is.converge=FALSE + } - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) -} - -Saddle_Prob_NR<-function(q, mu, g, Cutoff=2){ - - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - p1=NULL - p2=NULL - - # - qinv = -sign(q-m1) * abs(q-m1) + m1 - - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - Is.converge=TRUE - - if(Cutoff=="BE"){ - rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) - B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 - p<-B+alpha/2 - Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) - } else if(Cutoff < 10^-1){ - Cutoff=10^-1 - } - # - - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ - - pval=pval.noadj - - } else { - out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) - out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) - if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) - { - p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) - p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) - pval = p1+p2 - Is.converge=TRUE - } else { - print("Error_Converge") - pval<-pval.noadj - Is.converge=FALSE - } - #if(pval < 10^-3 && pval.noadj > 0.1){ - # print("Error_Mismatch") - # pval<-pval.noadj - # Is.converge=FALSE - #} - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) -} - - - -Saddle_Prob_wSpline<-function(q, mu, g, Spline=FALSE){ - - #mu =pi; g=g=G1/sqrt(n.g); Spline=FALSE - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - pval.noadj<-pnorm((q - m1)/sqrt(var1), lower.tail = FALSE) - Is.converge=TRUE - - # incase of singleton - idx<-which(g> 0) #Ques: Why these steps? - if(length(idx)==1){ - if(q==0){ - pval=1 - (1-mu[idx])/2 - } else { - pval=mu[idx]/2 - } - } else if(q ==0){ - - temp<-sum(log(1-mu[idx])) - pval<-1 - exp(temp)/2 - - } else if (q==sum(g)){ - - temp<-sum(log(mu[idx])) - pval<-exp(temp)/2 - - } else if(abs(q - m1)/sqrt(var1) < 10^{-5}){ - pval=0.5 - } else if(Spline==FALSE){ - - out<-try(uniroot(K1_adj, c(-100,100), mu=mu, g=g, q=q), silent=TRUE) - if(class(out) != "try-error"){ - zeta<-out$root - - k1<-Korg(zeta, mu, g) - k2<-K2(zeta, mu, g) - - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - v<- zeta * (k2)^{1/2} - - Z.test<-w + 1/w * log(v/w) - #pval<-pchisq(Z.test^2, lower.tail = FALSE, df=1) - pval<-pnorm(Z.test, lower.tail = FALSE) - - if(pval < 10^-2 && pval.noadj > 0.1){ - pval<-pval.noadj - Is.converge=FALSE - } - - } else { - pval<-pval.noadj - Is.converge=FALSE - } - - - } else { - n<-length(g) - sfun<-Get_Spline_Fun(mu, g, n) - - out<-uniroot(K1_adj_S, c(-3000,3000), q=q, sfun=sfun) - zeta<-out$root - - k1<-Korg_S(zeta, sfun) - k2<-K2_S(zeta, sfun) - - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - - if(k2 <= 0){ - v<-w - } else { - v<- zeta * (k2)^{1/2} - } - pval<-pnorm(w + 1/w * log(v/w), lower.tail = FALSE) - - } - q.transe<-q - m1 - if(!is.na(pval)){ - q.transe<-qnorm(pval, lower.tail=FALSE) * sqrt(var1) - } #Ques: Why this overwriting and why is this information needed - - - return(list(pval=pval, pval.noadj=pval.noadj, q.transe=q.transe, Is.converge=Is.converge, w=w, v=v)) -} - - -# -# Input -# G: genotype -# obj.null: returned object from GLM function with the null model -# Cutoff: a cutoff value to use either saddle point approx or normal approx -# -# Output -# p.value: p-value from the saddle point approx. If the mean centered test statistic is within Cutoff *SD, p.value will be calculated using the normal approx. -# p.value.NA: p-value from the normal approx -# Is.converge: covergence status of saddle point approx. If Is.converge=FALSE, p.value will be calculated using the normal approx. -# p1 and p2: internal use only (please ignore them) -# - - -##################################################### -# Following 3 functions are added - -ScoreTest_wSaddleApprox_Get_X1 = function(X1){ - - q1<-ncol(X1) - if(q1>=3) + if(pval!=0 && pval.noadj/pval>10^3) { - if(sum(abs(X1[,1]-X1[,2]))==0) - { - X1=X1[,-2] - q1<-q1-1 - } - } - qr1<-qr(X1) - if(qr1$rank < q1){ - - X1.svd<-svd(X1) - X1 = X1.svd$u #Ques: Why? - } - - return(X1) - -} - - -ScoreTest_wSaddleApprox_NULL_Model = function(formula, data=NULL ){ - - - X1<-model.matrix(formula,data=data) - X1<-ScoreTest_wSaddleApprox_Get_X1(X1) - - glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values - #Ques: Why not use Firth bias correction here? Not a big deal though, and not time consuming - V = mu*(1-mu) - res = glmfit$y- mu - n1<-length(res) - - # - XV = t(X1 * V) - XVX_inv= solve(t(X1)%*%(X1 * V)) - XXVX_inv= X1 %*% XVX_inv - - re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) - class(re)<-"SA_NULL" - return(re) - - -} - - -# -# G: genotype vector -# obj.SA.null: object from ScoreTest_wSaddleApprox_NULL_Model -# - -ScoreTest_wSaddleApprox_New<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") - } - - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res - - n.g<-sum(G) - if(n.g/(2*length(G))>0.5) - { - G<-2-G - n.g<-sum(G) - } - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) - #Ques: Reference? Doesn't removing X2 make G and y effectively indep? - q<-sum(G1 * y) /sqrt(n.g) - #q1<-sum(G * y) /sqrt(n.g) - out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) - - return(out) -} - -ScoreTest_wSaddleApprox_NewNR<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") - } - - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res - - n.g<-sum(G) - if(n.g/(2*length(G))>0.5) - { - G<-2-G - n.g<-sum(G) + return(Saddle_Prob_fast(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) } - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) - #Ques: Reference? Doesn't removing X2 make G and y effectively indep? - q<-sum(G1 * y) /sqrt(n.g) - #q1<-sum(G * y) /sqrt(n.g) - out<-Saddle_Prob_NR(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) - return(out) } -ScoreTest_wSaddleApprox_OldNR<-function(G, obj.SA.null, Cutoff=2){ - - if(class(obj.SA.null) != "SA_NULL"){ - stop("obj.SA.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") +TestSPAfast<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") } - y = obj.SA.null$y - mu = obj.SA.null$mu - res = obj.SA.null$res + y = obj.null$y + mu = obj.null$mu + res = obj.null$res n.g<-sum(G) if(n.g/(2*length(G))>0.5) @@ -641,234 +452,239 @@ ScoreTest_wSaddleApprox_OldNR<-function(G, obj.SA.null, Cutoff=2){ n.g<-sum(G) } NAset<-which(G==0) - G1<-G - obj.SA.null$XXVX_inv %*% (obj.SA.null$XV %*% G) + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) q<-sum(G1 * y) /sqrt(n.g) g=G1/sqrt(n.g) if(length(NAset)/length(G)<0.5) { - out<-Saddle_Prob_NR(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) } else { - out<-Saddle_Prob_OldNR(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], -muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff) + out<-Saddle_Prob_fast(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], +muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff,alpha=alpha) } return(out) } -Saddle_Prob_OldNR<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2){ - m1<-sum(mu * g) - var1<-sum(mu * (1-mu) * g^2) - p1=NULL - p2=NULL - # - qinv = -sign(q-m1) * abs(q-m1) + m1 +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) - # Noadj - pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) - Is.converge=TRUE - if(Cutoff=="BE"){ - rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) - B<-0.56*rho*var1^(-3/2) - alpha<-5*10^-5 - p<-B+alpha/2 - Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) - } else if(Cutoff < 10^-1){ - Cutoff=10^-1 - } - - # - - if(abs(q - m1)/sqrt(var1) < Cutoff || abs(-q - m1)/sqrt(var1) < Cutoff){ + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} - pval=pval.noadj - - } else { - #NAmu= sum(gNA*muNA) - #NAsigma=sum(muNA*(1-muNA)*gNA^2) - NAmu= m1-sum(gNB*muNB) - NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) - out.uni1<-getroot_K1_Old(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) - out.uni2<-getroot_K1_Old(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) - if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) - { - p1<-Get_Saddle_Prob_Old(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - p2<-Get_Saddle_Prob_Old(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) - pval = p1+p2 - Is.converge=TRUE - } else { - print("Error_Converge") - pval<-pval.noadj - Is.converge=FALSE - } - - } - - return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) } -getroot_K1_Old<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) + +ScoreTest_SPA <-function(genos,pheno,cov,obj.null,method=c("fastSPA","SPA"),minmac=5,Cutoff=2,alpha=5*10^-8,missing.id=NA,beta.out=FALSE,beta.Cutoff=5*10^-7) { - g.pos<-sum(g[which(g>0)]) - g.neg<- sum(g[which(g<0)]) - if(q>=g.pos || q<=g.neg) + method<-match.arg(method) + + if(missing(obj.null)) { - return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + if(missing(cov) || is.null(cov)) + { + cov<-rep(1,length(pheno)) + } + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(as.matrix(pheno) ~as.matrix(cov)) + } + cov<-obj.null$cov + pheno<-obj.null$y + + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) } else { - t<-init - K1_eval<-K1_adj_Old(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - prevJump<- Inf - rep<-1 - repeat + m <- nrow(genos) + } + if(is.na(missing.id)==FALSE) + { + genos[which(genos==missing.id)]<-NA + } + p.value<-rep(NA,m) + p.value.NA<-rep(NA,m) + Is.converge<-rep(NA,m) + beta<-rep(NA,m) + SEbeta<-rep(NA,m) + for (i in 1:m) + { + try({ + ina<-which(is.na(genos[i,])) + if(length(ina)>0) { - K2_eval<-K2_Old(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - tnew<-t-K1_eval/K2_eval - if(is.na(tnew)) - { - conv=FALSE - break - } - if(abs(tnew-t)=minmac) + { + if(method=="fastSPA") { - conv<-FALSE - break + re <- TestSPAfast(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } else { + re <- TestSPA(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) } - - newK1<-K1_adj_Old(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - if(sign(K1_eval)!=sign(newK1)) + p.value[i] <- re$p.value + p.value.NA[i] <- re$p.value.NA + Is.converge[i]<- re$Is.converge + if(beta.out==TRUE && p.value[i]prevJump-tol) - { - tnew<-t+sign(newK1-K1_eval)*prevJump/2 - newK1<-K1_adj_Old(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) - prevJump<-prevJump/2 - } else { - prevJump<-abs(tnew-t) - } + re.firth<-fast.logistf.fit(x=cbind(t(genos[i,,drop=FALSE]),cov),y=pheno,firth=TRUE) + beta[i]<-re.firth$beta[1] + SEbeta[i]<-sqrt(re.firth$var[1,1]) } - - rep<-rep+1 - t<-tnew - K1_eval<-newK1 } - return(list(root=t,n.iter=rep,Is.converge=conv)) + }) + if(i%%1000==0) print(paste("Processed",i,"SNPs",sep=" ")) } + return(list(p.value=p.value,p.value.NA=p.value.NA,Is.converge=Is.converge,beta=beta,SEbeta=SEbeta)) } -Korg_Old<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp<-log(1 - muNB + muNB * exp(gNB* t1)) - out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 - } - return(out) -} - - -K1_adj_Old<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp1<-(1 - muNB)* exp(-gNB * t1) + muNB - temp2<-muNB *gNB - temp3<-NAmu+NAsigma*t1 - out[i]<-sum(temp2/temp1)+temp3-q - } - - - return(out) -} - -K2_Old<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma){ - - n.t<-length(t) - out<-rep(0,n.t) - - for(i in 1:n.t){ - t1<-t[i] - temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 - temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) - out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma - } - return(out) -} - -Get_Saddle_Prob_Old<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) { - - # zeta<-out.uni2$root - k1<-Korg_Old(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - k2<-K2_Old(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) - - if(is.finite(k1) && is.finite(k2)) - { - temp1<-zeta * q - k1 - - - w<-sign(zeta) * (2 *temp1)^{1/2} - v<- zeta * (k2)^{1/2} - - Z.test<-w + 1/w * log(v/w) - - if(Z.test > 0){ - pval<-pnorm(Z.test, lower.tail = FALSE) - } else { - pval<-pnorm(Z.test, lower.tail = TRUE) - } - } else { - pval<-0 - } - - return(pval) -} ################################################################## ## MAIN FUNCTION: single.b.sna2 ################################################################## single.b.sna2 <- function() { - #control <- fast.logistf.control(maxit=25) # Max iterations to converge = 25 + + beta.Cutoff=5*10^-6 n <- ncol(genos) - k <- ncol(cov) - m <- nrow(genos) + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) + } else { + m <- nrow(genos) + } ## resolve missing genotype by mean imputation ina <- is.na(genos) if ( length(vids) > 0 ) { genos[ina] <- matrix(AC[vids]/NS[vids],nrow(genos),ncol(genos))[ina] } - + p <- rep(NA,m) # store p-values for m markers - add <- matrix(NA,m,2) # extra columns: Beta, SE, Chisq - cname <- c("PVAL.NA","CONVERGE") #,"N.CASE","N.CTRL","AF.CASE","AF.CTRL") + add <- matrix(NA,m,4) # extra columns: p.value.NA, Is.converge, Beta, SE + cname <- c("PVAL.NA","CONVERGE","BETA","SEBETA") #,"N.CASE","N.CTRL","AF.CASE","AF.CTRL") if ( m > 0 ) { ## If there is at least one marker to test - obj.SA.null<-ScoreTest_wSaddleApprox_NULL_Model(pheno ~as.matrix(cov)) - + load(paste(nullf,".null.RData",sep="")) + cov=obj.null$cov for (i in 1:m) { - re <- ScoreTest_wSaddleApprox_OldNR(as.vector(genos[i,,drop=FALSE]), obj.SA.null, Cutoff=2) + re <- TestSPAfast(as.numeric(genos[i,,drop=FALSE]), obj.null, Cutoff=2) p[i] <- re$p.value - - add[i,1:2] <- c(re$p.value.NA, re$Is.converge) + beta<-NA + SEbeta<-NA + if(p[i] Date: Mon, 18 Sep 2017 10:50:26 -0400 Subject: [PATCH 10/18] Performs sna2 test with multiple phenotypes --- data/multi.b.sna2.R | 701 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 701 insertions(+) create mode 100644 data/multi.b.sna2.R diff --git a/data/multi.b.sna2.R b/data/multi.b.sna2.R new file mode 100644 index 0000000..34dfd5d --- /dev/null +++ b/data/multi.b.sna2.R @@ -0,0 +1,701 @@ +## Core functions of pugwash to perform association + +################################################################## +## SINGLE VARIANT TEST +## INPUT VARIABLES: +## n : total # of individuals +## NS : number of called samples +## AC : allele count +## MAF : minor allele frequency +## vids : indices from 1:n after AF/AC threshold +## genos : genotype matrix (after AF/AC threshold) +## EXPECTED OUTPUT : list(p, addcols, addnames) for each genos row +## p : p-value +## addcols : additional columns (with proper column names) +################################################################## + +ScoreTest_wSaddleApprox_Get_X1 = function(X1) +{ + q1<-ncol(X1) + if(q1>=2) + { + if(sum(abs(X1[,1]-X1[,2]))==0) + { + X1=X1[,-2] + q1<-q1-1 + } + } + qr1<-qr(X1) + if(qr1$rank < q1){ + + X1.svd<-svd(X1) + X1 = X1.svd$u[,1:qr1$rank] + } + + return(X1) +} + + +ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) +{ + X1<-model.matrix(formula,data=data) + X1<-ScoreTest_wSaddleApprox_Get_X1(X1) + + glmfit= glm(formula, data=data, family = "binomial") + mu = glmfit$fitted.values + V = mu*(1-mu) + res = glmfit$y- mu + n1<-length(res) + + XV = t(X1 * V) + XVX_inv= solve(t(X1)%*%(X1 * V)) + XXVX_inv= X1 %*% XVX_inv + + re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + class(re)<-"SA_NULL" + return(re) +} + +Korg<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - mu + mu * exp(g* t1)) + out[i]<-sum(temp) + } + return(out) +} + + +K1_adj<-function(t, mu, g, q) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - mu)* exp(-g * t1) + mu + temp2<-mu *g + out[i]<-sum(temp2/temp1)-q + } + return(out) +} + +K2<-function(t, mu, g) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - mu)* exp(-g * t1) + mu)^2 + temp2<-(1-mu) * mu * g^2 * exp(-g*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE) + } + return(out) +} + +getroot_K1<-function(init,mu,g,q,m1,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj(t,mu,g,q) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2(t,mu,g) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj(tnew,mu,g,q) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } + + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } +} + +Get_Saddle_Prob<-function(zeta, mu, g, q) +{ + k1<-Korg(zeta, mu, g) + k2<-K2(zeta, mu, g) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 + + + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 + } + + return(pval) +} + +Saddle_Prob<-function(q, mu, g, Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL + + # + qinv = -sign(q-m1) * abs(q-m1) + m1 + + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE + + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + # + + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + out.uni1<-getroot_K1(0, mu=mu, g=g, q=q) + out.uni2<-getroot_K1(0, mu=mu, g=g, q=qinv) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob(out.uni1$root, mu, g, q) + p2<-Get_Saddle_Prob(out.uni2$root, mu, g, qinv) + pval = p1+p2 + Is.converge=TRUE + } else { + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } + } + + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob(q, mu, g, Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } + +} + +TestSPA<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) + { + G<-2-G + n.g<-sum(G) + } + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) + + return(out) +} + +Korg_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp<-log(1 - muNB + muNB * exp(gNB* t1)) + out[i]<-sum(temp)+NAmu*t1+0.5*NAsigma*t1^2 + } + return(out) +} + + +K1_adj_fast<-function(t, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-(1 - muNB)* exp(-gNB * t1) + muNB + temp2<-muNB *gNB + temp3<-NAmu+NAsigma*t1 + out[i]<-sum(temp2/temp1)+temp3-q + } + return(out) +} + +K2_fast<-function(t, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + n.t<-length(t) + out<-rep(0,n.t) + + for(i in 1:n.t){ + t1<-t[i] + temp1<-((1 - muNB)* exp(-gNB * t1) + muNB)^2 + temp2<-(1-muNB) * muNB * gNB^2 * exp(-gNB*t1) + out[i]<-sum(temp2/temp1,na.rm=TRUE)+NAsigma + } + return(out) +} + +getroot_K1_fast<-function(init,mu,g,q,m1,gNA,gNB,muNA,muNB,NAmu,NAsigma,tol=.Machine$double.eps^0.25,maxiter=1000) +{ + g.pos<-sum(g[which(g>0)]) + g.neg<- sum(g[which(g<0)]) + if(q>=g.pos || q<=g.neg) + { + return(list(root=Inf,n.iter=0,Is.converge=TRUE)) + } else { + t<-init + K1_eval<-K1_adj_fast(t,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<- Inf + rep<-1 + repeat + { + K2_eval<-K2_fast(t,mu,g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + tnew<-t-K1_eval/K2_eval + if(is.na(tnew)) + { + conv=FALSE + break + } + if(abs(tnew-t)prevJump-tol) + { + tnew<-t+sign(newK1-K1_eval)*prevJump/2 + newK1<-K1_adj_fast(tnew,mu,g,q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + prevJump<-prevJump/2 + } else { + prevJump<-abs(tnew-t) + } + } + + rep<-rep+1 + t<-tnew + K1_eval<-newK1 + } + return(list(root=t,n.iter=rep,Is.converge=conv)) + } +} + +Get_Saddle_Prob_fast<-function(zeta, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) +{ + k1<-Korg_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + k2<-K2_fast(zeta, mu, g,gNA,gNB,muNA,muNB,NAmu,NAsigma) + + if(is.finite(k1) && is.finite(k2)) + { + temp1<-zeta * q - k1 + + + w<-sign(zeta) * (2 *temp1)^{1/2} + v<- zeta * (k2)^{1/2} + + Z.test<-w + 1/w * log(v/w) + + if(Z.test > 0){ + pval<-pnorm(Z.test, lower.tail = FALSE) + } else { + pval<-pnorm(Z.test, lower.tail = TRUE) + } + } else { + pval<-0 + } + + return(pval) +} + +Saddle_Prob_fast<-function(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=2,alpha) +{ + m1<-sum(mu * g) + var1<-sum(mu * (1-mu) * g^2) + p1=NULL + p2=NULL + + # + qinv = -sign(q-m1) * abs(q-m1) + m1 + + # Noadj + pval.noadj<-pchisq((q - m1)^2/var1, lower.tail = FALSE, df=1) + Is.converge=TRUE + + if(Cutoff=="BE"){ + rho<-sum(((abs(g))^3)*mu*(1-mu)*(mu^2+(1-mu)^2)) + B<-0.56*rho*var1^(-3/2) + p<-B+alpha/2 + Cutoff=ifelse(p>=0.496,0.01,qnorm(p,lower.tail=F)) + } else if(Cutoff < 10^-1){ + Cutoff=10^-1 + } + + # + + if(abs(q - m1)/sqrt(var1) < Cutoff){ + + pval=pval.noadj + + } else { + NAmu= m1-sum(gNB*muNB) + NAsigma=var1-sum(muNB*(1-muNB)*gNB^2) + out.uni1<-getroot_K1_fast(0, mu=mu, g=g, q=q,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + out.uni2<-getroot_K1_fast(0, mu=mu, g=g, q=qinv,gNA=gNA,gNB=gNB,muNA=muNA,muNB=muNB,NAmu=NAmu,NAsigma=NAsigma) + if(out.uni1$Is.converge==TRUE && out.uni2$Is.converge==TRUE) + { + p1<-Get_Saddle_Prob_fast(out.uni1$root, mu, g, q,gNA,gNB,muNA,muNB,NAmu,NAsigma) + p2<-Get_Saddle_Prob_fast(out.uni2$root, mu, g, qinv,gNA,gNB,muNA,muNB,NAmu,NAsigma) + pval = p1+p2 + Is.converge=TRUE + } else { + print("Error_Converge") + pval<-pval.noadj + Is.converge=FALSE + } + + } + + if(pval!=0 && pval.noadj/pval>10^3) + { + return(Saddle_Prob_fast(q, g,mu,gNA,gNB,muNA,muNB,Cutoff=Cutoff*2,alpha)) + } else { + return(list(p.value=pval, p.value.NA=pval.noadj, Is.converge=Is.converge, p1=p1, p2=p2)) + } + +} + +TestSPAfast<-function(G, obj.null, Cutoff=2,alpha) +{ + if(class(obj.null) != "SA_NULL"){ + stop("obj.null should be a returned object from ScoreTest_wSaddleApprox_NULL_Model") + } + + y = obj.null$y + mu = obj.null$mu + res = obj.null$res + + n.g<-sum(G) + if(n.g/(2*length(G))>0.5) + { + G<-2-G + n.g<-sum(G) + } + NAset<-which(G==0) + G1<-G - obj.null$XXVX_inv %*% (obj.null$XV %*% G) + q<-sum(G1 * y) /sqrt(n.g) + g=G1/sqrt(n.g) + if(length(NAset)/length(G)<0.5) + { + out<-Saddle_Prob(q, mu=mu, g=G1/sqrt(n.g), Cutoff=Cutoff,alpha=alpha) + + } else { + + out<-Saddle_Prob_fast(q,g=g,mu=mu,gNA=g[NAset],gNB=g[-NAset], +muNA=mu[NAset],muNB=mu[-NAset],Cutoff=Cutoff,alpha=alpha) + } + return(out) +} + + + +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + + +ScoreTest_SPA <-function(genos,pheno,cov,obj.null,method=c("fastSPA","SPA"),minmac=5,Cutoff=2,alpha=5*10^-8,missing.id=NA,beta.out=FALSE,beta.Cutoff=5*10^-7) +{ + method<-match.arg(method) + + if(missing(obj.null)) + { + if(missing(cov) || is.null(cov)) + { + cov<-rep(1,length(pheno)) + } + obj.null<-ScoreTest_wSaddleApprox_NULL_Model(as.matrix(pheno) ~as.matrix(cov)) + } + cov<-obj.null$cov + pheno<-obj.null$y + + genos<-as.matrix(genos) + if(ncol(genos)==1) + { + m<-1 + genos<-t(genos) + } else { + m <- nrow(genos) + } + if(is.na(missing.id)==FALSE) + { + genos[which(genos==missing.id)]<-NA + } + p.value<-rep(NA,m) + p.value.NA<-rep(NA,m) + Is.converge<-rep(NA,m) + beta<-rep(NA,m) + SEbeta<-rep(NA,m) + for (i in 1:m) + { + try({ + ina<-which(is.na(genos[i,])) + if(length(ina)>0) + { + genos[i,ina]<-mean(genos[i,],na.rm=TRUE) + } + MAC<-min(sum(genos[i,]),sum(2-genos[i,])) + if(MAC>=minmac) + { + if(method=="fastSPA") + { + re <- TestSPAfast(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } else { + re <- TestSPA(as.vector(genos[i,,drop=FALSE]), obj.null, Cutoff=Cutoff, alpha=alpha) + } + p.value[i] <- re$p.value + p.value.NA[i] <- re$p.value.NA + Is.converge[i]<- re$Is.converge + if(beta.out==TRUE && p.value[i] 0 ) { + genos[ina] <- matrix(AC[vids]/NS[vids],nrow(genos),ncol(genos))[ina] + } + + p <- matrix(NA,m,g) # store p-values for m markers + add <- matrix(NA,m,3*g) # extra columns: p.value.NA, Beta, SE + cname <- outer(pnames,c("P.NA","BETA","SEBETA"),FUN="paste",sep=".") + cname<-as.vector(t(cname)) + + for(gind in 1:g) + { + if ( m > 0 ) { ## If there is at least one marker to test + nm<-which(is.na(phenos[,gind])==FALSE) + pheno<-phenos[nm,gind] + geno<-as.matrix(genos[,nm]) + if(ncol(geno)==1) geno<-t(geno) + load(paste(nullf,".",pnames[gind],".null.RData",sep="")) + cov<-obj.null$cov + + for (i in 1:m) { + re <- TestSPAfast(as.vector(geno[i,,drop=FALSE]), obj.null, Cutoff=2) + p[i,gind] <- re$p.value + beta<-NA + SEbeta<-NA + if(p[i,gind] Date: Thu, 2 Nov 2017 19:36:09 -0400 Subject: [PATCH 11/18] Fixed issue with null model not converging properly Detect where null model is not converging using glm, and apply Firth instead for those cases --- data/epactsMultiNull.R | 118 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 2 deletions(-) diff --git a/data/epactsMultiNull.R b/data/epactsMultiNull.R index ef8e63d..a585619 100644 --- a/data/epactsMultiNull.R +++ b/data/epactsMultiNull.R @@ -13,6 +13,108 @@ if ( covf == "NULL" ) { } +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + ScoreTest_wSaddleApprox_Get_X1 = function(X1) { @@ -42,7 +144,19 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) X1<-ScoreTest_wSaddleApprox_Get_X1(X1) glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + V = mu*(1-mu) res = glmfit$y- mu n1<-length(res) @@ -51,7 +165,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } From b05442370c0d0c793f879f55036f57feb0f51bee Mon Sep 17 00:00:00 2001 From: rounakdey Date: Thu, 2 Nov 2017 19:38:34 -0400 Subject: [PATCH 12/18] Fixed issue with null model not converging properly Detect where null model is not converging using glm, and apply Firth instead for those cases --- data/epactsSingleNull.R | 118 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 2 deletions(-) diff --git a/data/epactsSingleNull.R b/data/epactsSingleNull.R index 46d3ea6..f50adae 100644 --- a/data/epactsSingleNull.R +++ b/data/epactsSingleNull.R @@ -12,6 +12,108 @@ if ( covf == "NULL" ) { } +fast.logistf.fit <- function (x, y, weight = NULL, offset = NULL, firth = TRUE, col.fit = NULL, + init = NULL, control) { + n <- nrow(x) + k <- ncol(x) + if (is.null(init)) + init = rep(0, k) + if (is.null(col.fit)) + col.fit = 1:k + if (is.null(offset)) + offset = rep(0, n) + if (is.null(weight)) + weight = rep(1, n) + if (col.fit[1] == 0) + maxit <- 0 + if (missing(control)) + control <- fast.logistf.control() + maxit <- control$maxit + maxstep <- control$maxstep + maxhs <- control$maxhs + lconv <- control$lconv + gconv <- control$gconv + xconv <- control$xconv + beta <- init + iter <- 0 + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + evals <- 1 + repeat { + beta.old <- beta + XW2 <- t(x * (weight * pi * (1-pi))^0.5) + myQR <- qr(t(XW2)) + Q <- qr.Q(myQR) + h <- (Q*Q) %*% rep(1, ncol(Q)) + if (firth) + U.star <- crossprod(x, weight * (y - pi) + h * (0.5 - pi)) + else U.star <- crossprod(x, weight * (y - pi)) + XX.covs <- matrix(0, k, k) + if (col.fit[1] != 0) { + XX.XW2 <- t(x[, col.fit, drop=FALSE] * (weight * pi * (1-pi))^0.5) + XX.Fisher <- crossprod(t(XX.XW2)) + XX.covs[col.fit, col.fit] <- fast.invFisher(XX.Fisher) ###### HERE IS THE PROBLEM!!! + } + if(all(is.na(XX.covs)) == T) { + break + } + delta <- as.vector(XX.covs %*% U.star) + delta[is.na(delta)] <- 0 + mx <- max(abs(delta))/maxstep + if (mx > 1) + delta <- delta/mx + evals <- evals + 1 + if (maxit > 0) { + iter <- iter + 1 + beta <- beta + delta + pi <- as.vector(1/(1 + exp(-x %*% beta - offset))) + + + } + if (iter == maxit | ((max(abs(delta)) <= xconv) & (all(abs(U.star[col.fit]) < + gconv)))) + break + } + # Error catching (if chol(x) not positive definite) + if(all(is.na(XX.covs))==T) { + var <- XX.covs + list(beta = NA, var = var, pi = NA, hat.diag = NA, + iter = NA, evals = NA, conv = c(NA, + NA, NA)) + } else { + var <- XX.covs + list(beta = beta, var = var, pi = pi, hat.diag = h, + iter = iter, evals = evals, conv = c(max(abs(U.star)), + max(abs(delta)))) + } +} + + +fast.logistf.control <- function (maxit = 50, maxhs = 15, maxstep = 15, lconv = 1e-05, + gconv = 1e-05, xconv = 1e-05) +{ + list(maxit = maxit, maxhs = maxhs, maxstep = maxstep, lconv = lconv, + gconv = gconv, xconv = xconv) +} + +fast.logDet <- function (x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (2 * sum(log(diag(my.chol)))) + } +} + +fast.invFisher <- function(x) { + my.chol <- tryCatch(chol(x),error=function(e) {NA}) + if (all(is.na(my.chol))==T) { + return(NA) + } else { + return (chol2inv(my.chol)) + } + #ifelse(is.na(my.chol), NA, chol2inv(my.chol)) +} + ScoreTest_wSaddleApprox_Get_X1 = function(X1) { q1<-ncol(X1) @@ -40,7 +142,19 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) X1<-ScoreTest_wSaddleApprox_Get_X1(X1) glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + V = mu*(1-mu) res = glmfit$y- mu n1<-length(res) @@ -49,7 +163,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } From 3c9482b3443c954e8f528b3de8ee8e76b333256d Mon Sep 17 00:00:00 2001 From: rounakdey Date: Thu, 2 Nov 2017 19:40:11 -0400 Subject: [PATCH 13/18] Fixed issue with null model not converging properly Detect where null model is not converging using glm, and apply Firth instead for those cases --- data/multi.b.sna2.R | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/data/multi.b.sna2.R b/data/multi.b.sna2.R index 34dfd5d..171619a 100644 --- a/data/multi.b.sna2.R +++ b/data/multi.b.sna2.R @@ -42,7 +42,19 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) X1<-ScoreTest_wSaddleApprox_Get_X1(X1) glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + V = mu*(1-mu) res = glmfit$y- mu n1<-length(res) @@ -51,7 +63,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } @@ -675,7 +687,7 @@ g<-ncol(phenos) geno<-as.matrix(genos[,nm]) if(ncol(geno)==1) geno<-t(geno) load(paste(nullf,".",pnames[gind],".null.RData",sep="")) - cov<-obj.null$cov + cov<-obj.null$X1 for (i in 1:m) { re <- TestSPAfast(as.vector(geno[i,,drop=FALSE]), obj.null, Cutoff=2) From 930fdaca59ebc745d107c10bb35496b52b7c9557 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Thu, 2 Nov 2017 19:41:05 -0400 Subject: [PATCH 14/18] Fixed issue with null model not converging properly Detect where null model is not converging using glm, and apply Firth instead for those cases --- data/single.b.sna2.R | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/data/single.b.sna2.R b/data/single.b.sna2.R index 102dd3e..967b1f9 100644 --- a/data/single.b.sna2.R +++ b/data/single.b.sna2.R @@ -43,7 +43,19 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) X1<-ScoreTest_wSaddleApprox_Get_X1(X1) glmfit= glm(formula, data=data, family = "binomial") - mu = glmfit$fitted.values + convflag=0 + if(glmfit$converged) + { + mu = glmfit$fitted.values + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + } + if(convflag==0) + { + firthfit=fast.logistf.fit(x=X1,y=glmfit$y) + eta<-X1%*%firthfit$beta + mu = as.vector(exp(eta)/(1+exp(eta))) + } + V = mu*(1-mu) res = glmfit$y- mu n1<-length(res) @@ -52,7 +64,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) XVX_inv= solve(t(X1)%*%(X1 * V)) XXVX_inv= X1 %*% XVX_inv - re<-list(y=glmfit$y, cov=X1, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) + re<-list(y=glmfit$y, mu=mu, res=res, V=V, X1=X1, XV=XV, XXVX_inv =XXVX_inv) class(re)<-"SA_NULL" return(re) } @@ -668,7 +680,7 @@ single.b.sna2 <- function() { if ( m > 0 ) { ## If there is at least one marker to test load(paste(nullf,".null.RData",sep="")) - cov=obj.null$cov + cov=obj.null$X1 for (i in 1:m) { re <- TestSPAfast(as.numeric(genos[i,,drop=FALSE]), obj.null, Cutoff=2) p[i] <- re$p.value From d67fdf4746a4bd9b6e2edbd9bb6e9789ac9682f8 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Fri, 3 Nov 2017 11:04:41 -0400 Subject: [PATCH 15/18] Small coding error convflag <-1 instead of convflag==1 --- data/epactsMultiNull.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/epactsMultiNull.R b/data/epactsMultiNull.R index a585619..7f545f3 100644 --- a/data/epactsMultiNull.R +++ b/data/epactsMultiNull.R @@ -148,7 +148,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) if(glmfit$converged) { mu = glmfit$fitted.values - if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm } if(convflag==0) { From 7d23741a03d795f15857c465925da605929791a8 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Fri, 3 Nov 2017 11:05:19 -0400 Subject: [PATCH 16/18] Small coding error convflag <-1 instead of convflag==1 --- data/epactsSingleNull.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/epactsSingleNull.R b/data/epactsSingleNull.R index f50adae..0dd18cc 100644 --- a/data/epactsSingleNull.R +++ b/data/epactsSingleNull.R @@ -146,7 +146,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) if(glmfit$converged) { mu = glmfit$fitted.values - if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm } if(convflag==0) { From 1806b29911207cdea7e93fe1e49f0f1e18271005 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Fri, 3 Nov 2017 11:06:07 -0400 Subject: [PATCH 17/18] Small coding error convflag <-1 instead of convflag==1 --- data/multi.b.sna2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/multi.b.sna2.R b/data/multi.b.sna2.R index 171619a..024aa33 100644 --- a/data/multi.b.sna2.R +++ b/data/multi.b.sna2.R @@ -46,7 +46,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) if(glmfit$converged) { mu = glmfit$fitted.values - if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm } if(convflag==0) { From 51b1cff23e49cfc8e4155ef8ad23e12f6f7eb362 Mon Sep 17 00:00:00 2001 From: rounakdey Date: Fri, 3 Nov 2017 11:06:40 -0400 Subject: [PATCH 18/18] Small coding error convflag <-1 instead of convflag==1 --- data/single.b.sna2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/single.b.sna2.R b/data/single.b.sna2.R index 967b1f9..d3bd2ce 100644 --- a/data/single.b.sna2.R +++ b/data/single.b.sna2.R @@ -47,7 +47,7 @@ ScoreTest_wSaddleApprox_NULL_Model <- function(formula, data=NULL) if(glmfit$converged) { mu = glmfit$fitted.values - if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag==1 #Check that the null model converged properly with glm + if(mean(mu)/mean(glmfit$y)>0.001 & (1-mean(mu))/(1-mean(glmfit$y))>0.001) convflag<-1 #Check that the null model converged properly with glm } if(convflag==0) {