From 51b8eed0834a05f879c6f481c3a8b828f6629659 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 08:15:52 -0400 Subject: [PATCH 1/7] Update set-path-variable.sh Update Sailfish version --- simulationScripts/set-path-variable.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simulationScripts/set-path-variable.sh b/simulationScripts/set-path-variable.sh index 4f477df..ef8da59 100644 --- a/simulationScripts/set-path-variable.sh +++ b/simulationScripts/set-path-variable.sh @@ -11,7 +11,7 @@ anaconda/1.9.1 rsem-1.2.15/ express/express-1.5.1-linux_x86_64 tigar2 -export LD_LIBRARY_PATH=YOUR_SAILFISH_PATH_HERE/sailfish/Sailfish-0.6.3-Linux_x86-64/lib:$LD_LIBRARY_PATH -export PATH=YOUR_SAILFISH_PATH_HERE/sailfish/Sailfish-0.6.3-Linux_x86-64/bin:$PATH +export LD_LIBRARY_PATH=YOUR_SAILFISH_PATH_HERE/sailfish/Sailfish-0.7.3-Linux_x86-64/lib:$LD_LIBRARY_PATH +export PATH=YOUR_SAILFISH_PATH_HERE/sailfish/Sailfish-0.7.3-Linux_x86-64/bin:$PATH compilers/gcc/4.8.2 From 1f05c48f4a63fcf98d93bc5c311a268800ae39d7 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 08:16:51 -0400 Subject: [PATCH 2/7] Remove deprecated SF bias correction --- simulationScripts/sailfishNames.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/simulationScripts/sailfishNames.R b/simulationScripts/sailfishNames.R index fb9005c..0ab2942 100644 --- a/simulationScripts/sailfishNames.R +++ b/simulationScripts/sailfishNames.R @@ -9,7 +9,7 @@ while (length(oneLine <- readLines(myFile, n = 1, warn = FALSE)) > 0){ } close(myFile) par(mfrow = c(2,3)) -A1 <- read.table("A1/quant_bias_corrected.sf") +A1 <- read.table("A1/quant.sf") ind <- A1[,2] groundTruth <- read.table("../A1/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -25,7 +25,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA1.txt") -A1 <- read.table("A2/quant_bias_corrected.sf") +A1 <- read.table("A2/quant.sf") ind <- A1[,2] groundTruth <- read.table("../A2/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -41,7 +41,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA2.txt") -A1 <- read.table("A3/quant_bias_corrected.sf") +A1 <- read.table("A3/quant.sf") ind <- A1[,2] groundTruth <- read.table("../A3/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -58,7 +58,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) write.table(sailA1,file = "sailA3.txt") -A1 <- read.table("A4/quant_bias_corrected.sf") +A1 <- read.table("A4/quant.sf") ind <- A1[,2] groundTruth <- read.table("../A4/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -74,7 +74,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA4.txt") -A1 <- read.table("A5/quant_bias_corrected.sf") +A1 <- read.table("A5/quant.sf") ind <- A1[,2] groundTruth <- read.table("../A5/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) From 651f0bb20908b4a4dee4d93a70c281806a17229b Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 08:18:58 -0400 Subject: [PATCH 3/7] Change to new Sailfish library format string --- simulationScripts/commands.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/simulationScripts/commands.sh b/simulationScripts/commands.sh index 45937c3..3f6b0e6 100644 --- a/simulationScripts/commands.sh +++ b/simulationScripts/commands.sh @@ -226,11 +226,11 @@ cd .. # # Run Sailfish # mkdir sailfish # cd sailfish # -sailfish quant -i ../../transcriptome_data/indDir -p 4 -l "T=PE" -1 ../A1/out1.fq -2 ../A1/out2.fq -o A1 # -sailfish quant -i ../../transcriptome_data/indDir -p 4 -l "T=PE" -1 ../A2/out1.fq -2 ../A2/out2.fq -o A2 # -sailfish quant -i ../../transcriptome_data/indDir -p 4 -l "T=PE" -1 ../A3/out1.fq -2 ../A3/out2.fq -o A3 # -sailfish quant -i ../../transcriptome_data/indDir -p 4 -l "T=PE" -1 ../A4/out1.fq -2 ../A4/out2.fq -o A4 # -sailfish quant -i ../../transcriptome_data/indDir -p 4 -l "T=PE" -1 ../A5/out1.fq -2 ../A5/out2.fq -o A5 # +sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A1/out1.fq -2 ../A1/out2.fq -o A1 # +sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A2/out1.fq -2 ../A2/out2.fq -o A2 # +sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A3/out1.fq -2 ../A3/out2.fq -o A3 # +sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A4/out1.fq -2 ../A4/out2.fq -o A4 # +sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A5/out1.fq -2 ../A5/out2.fq -o A5 # # Process output # R CMD BATCH ../../sailfishNames.R # R CMD BATCH ../../withinGeneSailfish.R # From a4690049e1f3a4dd2e44e039e8ae6e8e66a043f3 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 08:24:55 -0400 Subject: [PATCH 4/7] Convert to new SF output format Any line starting with '#' is a header. The estimated number of reads is now in column 4 instead of 7. --- simulationScripts/sailfishNames.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/simulationScripts/sailfishNames.R b/simulationScripts/sailfishNames.R index 0ab2942..1bbde26 100644 --- a/simulationScripts/sailfishNames.R +++ b/simulationScripts/sailfishNames.R @@ -9,7 +9,7 @@ while (length(oneLine <- readLines(myFile, n = 1, warn = FALSE)) > 0){ } close(myFile) par(mfrow = c(2,3)) -A1 <- read.table("A1/quant.sf") +A1 <- read.table("A1/quant.sf", comment.char="#") ind <- A1[,2] groundTruth <- read.table("../A1/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -17,7 +17,7 @@ colnames(sailA1) <- c("id","true","est") for (i in 1:dim(groundTruth)[1]){ j <- which(sailNames[,2] == as.character(sailA1[i,1])) k <- which(A1[,1] == sailNames[j,1]) - sailA1[i,3] <- A1[k,7] + sailA1[i,3] <- A1[k,4] if(i%%1000 == 0){print(i)} } sailA1[,2] <- log(sailA1[,2]/sum(sailA1[,2])) @@ -25,7 +25,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA1.txt") -A1 <- read.table("A2/quant.sf") +A1 <- read.table("A2/quant.sf", comment.char="#") ind <- A1[,2] groundTruth <- read.table("../A2/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -33,7 +33,7 @@ colnames(sailA1) <- c("id","true","est") for (i in 1:dim(groundTruth)[1]){ j <- which(sailNames[,2] == as.character(sailA1[i,1])) k <- which(A1[,1] == sailNames[j,1]) - sailA1[i,3] <- A1[k,7] + sailA1[i,3] <- A1[k,4] if(i%%1000 == 0){print(i)} } sailA1[,2] <- log(sailA1[,2]/sum(sailA1[,2])) @@ -41,7 +41,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA2.txt") -A1 <- read.table("A3/quant.sf") +A1 <- read.table("A3/quant.sf", comment.char="#") ind <- A1[,2] groundTruth <- read.table("../A3/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -49,7 +49,7 @@ colnames(sailA1) <- c("id","true","est") for (i in 1:dim(groundTruth)[1]){ j <- which(sailNames[,2] == as.character(sailA1[i,1])) k <- which(A1[,1] == sailNames[j,1]) - sailA1[i,3] <- A1[k,7] + sailA1[i,3] <- A1[k,4] if(i%%1000 == 0){print(i)} } sailA1[,2] <- log(sailA1[,2]/sum(sailA1[,2])) @@ -58,7 +58,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) write.table(sailA1,file = "sailA3.txt") -A1 <- read.table("A4/quant.sf") +A1 <- read.table("A4/quant.sf", comment.char="#") ind <- A1[,2] groundTruth <- read.table("../A4/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -66,7 +66,7 @@ colnames(sailA1) <- c("id","true","est") for (i in 1:dim(groundTruth)[1]){ j <- which(sailNames[,2] == as.character(sailA1[i,1])) k <- which(A1[,1] == sailNames[j,1]) - sailA1[i,3] <- A1[k,7] + sailA1[i,3] <- A1[k,4] if(i%%1000 == 0){print(i)} } sailA1[,2] <- log(sailA1[,2]/sum(sailA1[,2])) @@ -74,7 +74,7 @@ sailA1[,3] <- log(sailA1[,3]/sum(sailA1[,3])) #smoothScatter(sailA1[,-1]);points(c(-30,10),c(-30,10),type = "l",col = 2) write.table(sailA1,file = "sailA4.txt") -A1 <- read.table("A5/quant.sf") +A1 <- read.table("A5/quant.sf", comment.char="#") ind <- A1[,2] groundTruth <- read.table("../A5/transcriptNames.txt") sailA1 <- cbind(groundTruth,numeric(dim(groundTruth)[1])) @@ -82,7 +82,7 @@ colnames(sailA1) <- c("id","true","est") for (i in 1:dim(groundTruth)[1]){ j <- which(sailNames[,2] == as.character(sailA1[i,1])) k <- which(A1[,1] == sailNames[j,1]) - sailA1[i,3] <- A1[k,7] + sailA1[i,3] <- A1[k,4] if(i%%1000 == 0){print(i)} } sailA1[,2] <- log(sailA1[,2]/sum(sailA1[,2])) From 90f646a60a1b7848e7b9d9e3a4cf0b2336ed69a0 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 10:56:19 -0400 Subject: [PATCH 5/7] Create processSailfishNames.py Since sailfish includes the entire FASTA header as the name of the target, we have to process these output files for the rest of the pipeline to work. --- simulationScripts/processSailfishNames.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 simulationScripts/processSailfishNames.py diff --git a/simulationScripts/processSailfishNames.py b/simulationScripts/processSailfishNames.py new file mode 100644 index 0000000..b4ba421 --- /dev/null +++ b/simulationScripts/processSailfishNames.py @@ -0,0 +1,33 @@ +from __future__ import print_function +import argparse +import os +import sys +import re + +def main(args): + ifile = args.input + if (not os.path.isfile(ifile)): + print('Could not find input file {}'.format(ifile), file=sys.stderr) + sys.exit(1) + + lines = open(ifile).readlines() + headers = [l for l in lines if l.startswith('#')] + data = [l for l in lines if not l.startswith('#')] + + with open(args.output, 'w') as ofile: + for l in headers: + ofile.write(l) + for l in data: + toks = l.rstrip('\n').split('\t') + name = re.split('\s+', toks[0])[0] + newline = '\t'.join([name] + toks[1:]) + ofile.write("{}\n".format(newline)) + + print("successfully modified file", file=sys.stderr) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process crazy transcript names') + parser.add_argument('--input', type=str, required=True, help='the quant.sf file') + parser.add_argument('--output', type=str, required=True, help='where the processed file should be placed') + args = parser.parse_args() + main(args) From ea3216b973e85c36ce21bcaf423691089c705d73 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 10:57:47 -0400 Subject: [PATCH 6/7] Update commands.sh Add code to fix the target names --- simulationScripts/commands.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/simulationScripts/commands.sh b/simulationScripts/commands.sh index 3f6b0e6..3b91738 100644 --- a/simulationScripts/commands.sh +++ b/simulationScripts/commands.sh @@ -232,6 +232,19 @@ sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A3/out1.fq -2 sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A4/out1.fq -2 ../A4/out2.fq -o A4 # sailfish quant -i ../../transcriptome_data/indDir -p 4 -l IU -1 ../A5/out1.fq -2 ../A5/out2.fq -o A5 # # Process output # +# Fix the target names +echo "fixing output names" +python ../../processSailfishNames.py --input A1/quant.sf --output A1/quant.fixed.sf +python ../../processSailfishNames.py --input A2/quant.sf --output A2/quant.fixed.sf +python ../../processSailfishNames.py --input A3/quant.sf --output A3/quant.fixed.sf +python ../../processSailfishNames.py --input A4/quant.sf --output A4/quant.fixed.sf +python ../../processSailfishNames.py --input A5/quant.sf --output A5/quant.fixed.sf +mv A1/quant.fixed.sf A1/quant.sf +mv A2/quant.fixed.sf A2/quant.sf +mv A3/quant.fixed.sf A3/quant.sf +mv A4/quant.fixed.sf A4/quant.sf +mv A5/quant.fixed.sf A5/quant.sf +echo "done" R CMD BATCH ../../sailfishNames.R # R CMD BATCH ../../withinGeneSailfish.R # cd .. # From 6223620f0a8d2ec2884401434c1dd188925f1104 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 4 Sep 2015 10:58:13 -0400 Subject: [PATCH 7/7] Update generate-known-fa-files.sh -k argument no longer required for sailfish --- simulationScripts/generate-known-fa-files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulationScripts/generate-known-fa-files.sh b/simulationScripts/generate-known-fa-files.sh index 869331f..2966d5e 100644 --- a/simulationScripts/generate-known-fa-files.sh +++ b/simulationScripts/generate-known-fa-files.sh @@ -11,7 +11,7 @@ rm -r A1.tophat.out cd transcriptome_data rsem-prepare-reference --bowtie2 known.fa ref #this is for rsem sed -e 's/\,.*//' known.fa > output.fasta #this is for making tigar names work -sailfish index -t known.fa -o indDir -k 20 -p 4 #this is for sailfish +sailfish index -t known.fa -o indDir -p 4 #this is for sailfish