From cc0a10093bb19d79ab07ab2669f87923e7c8532d Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 27 Sep 2018 16:31:30 -0400 Subject: [PATCH 1/7] Add dependencies to htslib and libbam. Use my fork of libStatGen for development. --- CMakeLists.txt | 4 +++- requirements.txt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ee72b8..f42f472 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,8 @@ endif() find_package(ZLIB REQUIRED) find_library(STATGEN_LIBRARY StatGen) +find_library(HTS_LIBRARY hts) +find_library(BAM_LIBRARY bam) add_executable(minimac4 src/Analysis.cpp src/Analysis.h @@ -40,6 +42,6 @@ add_executable(minimac4 src/Estimation.cpp src/Estimation.h) -target_link_libraries(minimac4 ${STATGEN_LIBRARY} ${ZLIB_LIBRARIES}) +target_link_libraries(minimac4 ${STATGEN_LIBRARY} ${HTS_LIBRARY} ${BAM_LIBRARY} ${ZLIB_LIBRARIES}) install(TARGETS minimac4 RUNTIME DESTINATION bin) diff --git a/requirements.txt b/requirements.txt index 6601d8a..44bcf30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -statgen/libStatGen --cmake dep/libstatgen.cmake +daheise/libStatGen@parallel-bgzip --cmake dep/libstatgen.cmake From 974c05e3029e4dc7aec38eb66e24f0ce0d189d35 Mon Sep 17 00:00:00 2001 From: David Heise Date: Mon, 1 Oct 2018 13:28:29 -0400 Subject: [PATCH 2/7] Use CPUs argument to set number of threads for parallel IO. --- src/Analysis.cpp | 44 +++++++++++++++++++++++--------------------- src/DosageData.cpp | 6 ++++-- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/Analysis.cpp b/src/Analysis.cpp index 2723617..ded8cea 100644 --- a/src/Analysis.cpp +++ b/src/Analysis.cpp @@ -36,7 +36,7 @@ void MyTokenize(vector &result, const char *input, const char *delimiter } - + bool Analysis::CreateRecombinationMap() { @@ -77,9 +77,9 @@ bool Analysis::CreateRecombinationMap() SecondIndex++; } int h=0; - + return true; - + } @@ -102,7 +102,7 @@ String Analysis::RunAnalysis(String &Reffilename, String &Tarfilename, String &R cout << "\n Program Exiting ... \n\n"; return "Genetic.Map.Load.Error"; } - + if (!targetPanel.ScaffoldGWAStoReference(referencePanel,*MyAllVariables)) { cout << "\n Program Exiting ... \n\n"; @@ -408,6 +408,8 @@ void Analysis::AppendtoMainLooVcfFaster(int ChunkNo, int MaxIndex) void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) { + char bgzf_mode[16]; + snprintf(bgzf_mode, 16-2, "r@%d", MyAllVariables->myModelVariables.cpus); VcfPrintStringLength=0; @@ -429,7 +431,7 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) strs1<<(ChunkNo+1); tempFileIndex+=(".chunk."+(string)(strs1.str())+".dose.part." + (string)(strs.str())+".vcf.gz"); - vcfdosepartialList[i-1] = ifopen(tempFileIndex.c_str(), "r"); + vcfdosepartialList[i-1] = ifopen(tempFileIndex.c_str(), bgzf_mode); } string line; int i=0; @@ -1287,7 +1289,7 @@ void Analysis::GetCurrentPanelReady(int ChunkNo, HaplotypeSet &ThisRefPanel, if(i==(ThisRefPanel.NoBlocks-1)) Mapper[j]=i; } - + } @@ -1398,7 +1400,7 @@ void Analysis::GetCurrentPanelReady(int ChunkNo, HaplotypeSet &ThisRefPanel, ThisRefPanel.CalculateAlleleFreq(); ThisTarPanel.CalculateGWASOnlyAlleleFreq(); - + if(!MyAllVariables->myModelVariables.minimac3) { int time_prev = time(0); @@ -1426,20 +1428,20 @@ void Analysis::GetCurrentPanelReady(int ChunkNo, HaplotypeSet &ThisRefPanel, thisDataFast.MainMarkovModel[i].AssignPanels(ThisRefPanel,ThisRefChipOnlyPanel,ThisTarPanel,ThisTarPanel,MyAllVariables); thisDataFast.MainMarkovModel[i].initializeMatrices(); - } + } } else { int time_prev = time(0); - - + + for(int i=0;imyModelVariables.cpus;i++) { thisDataFast.MainMarkovModel[i].AssignPanels(ThisRefPanel,ThisTarPanel,MyAllVariables); thisDataFast.MainMarkovModel[i].initializeMatricesMinimac3(); - } + } } @@ -1669,8 +1671,8 @@ void Analysis::GetCurrentPanelReady(int ChunkNo, HaplotypeSet &ThisRefPanel, } - - + + bool Analysis::CheckGeneticMapFile() { std::cout << "\n Checking genetic map file : "<myHapDataVariables.mapFile << endl< Pieces(4); vector AppendPiece(2); double PrevSumVal=0.0; - + if(fileStream) - { + { while(fileStream->readLine(line)!=-1) { MyTokenize(Pieces, line.c_str(), tabSep,4); @@ -1707,21 +1709,21 @@ bool Analysis::CheckGeneticMapFile() cout << "\n Program Exiting ... \n\n"; return false; } - - + + if(GeneticMapData.size()<2) { cout << "\n ERROR !!! Chromosome "<myHapDataVariables.mapFile << endl; cout << "\n Program Exiting ... \n\n"; return false; } - + ifclose(fileStream); - + cout<<" Successful !!! "< #include "DosageData.h" #include "HaplotypeSet.h" @@ -13,7 +14,8 @@ void DosageData::FlushPartialVcf(int NovcfParts) PrintEmpStringLength=0; int time_Start = time(0); - + char bgzf_mode[16]; + snprintf(bgzf_mode, 16-3, "wb@%d", MyAllVariables->myModelVariables.cpus); string PartialVcfFileName(MyAllVariables->myOutFormat.OutPrefix); string PartialMetaVcfFileName(MyAllVariables->myOutFormat.OutPrefix); @@ -23,7 +25,7 @@ void DosageData::FlushPartialVcf(int NovcfParts) PartialVcfFileName+=(".chunk."+(string)(strs1.str())+ ".dose.part."+ (string)(strs.str())+".vcf.gz"); PartialMetaVcfFileName+=(".chunk."+(string)(strs1.str())+ ".empiricalDose.part."+ (string)(strs.str())+".vcf.gz"); - IFILE vcfdosepartial = ifopen(PartialVcfFileName.c_str(), "wb", InputFile::BGZF); + IFILE vcfdosepartial = ifopen(PartialVcfFileName.c_str(), bgzf_mode, InputFile::BGZF); IFILE vcfLoodosepartial = NULL; if(MyAllVariables->myOutFormat.meta) From 318c6d997c2f2e21e942356237fc3563aeb221c0 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 4 Oct 2018 10:01:39 -0400 Subject: [PATCH 3/7] Fix a threadbomb and multithread final write. --- src/Analysis.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/Analysis.cpp b/src/Analysis.cpp index ded8cea..c6bc088 100644 --- a/src/Analysis.cpp +++ b/src/Analysis.cpp @@ -408,8 +408,24 @@ void Analysis::AppendtoMainLooVcfFaster(int ChunkNo, int MaxIndex) void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) { - char bgzf_mode[16]; - snprintf(bgzf_mode, 16-2, "r@%d", MyAllVariables->myModelVariables.cpus); + char bgzf_partial_mode[16]; + char bgzf_final_mode[16]; + // To avoid forkbombing on reads, only use more than one CPU on reads + // if there is more than one CPU per files to read. + int readingCpus = (MyAllVariables->myModelVariables.cpus-1)/MaxIndex; + if(MyAllVariables->myModelVariables.cpus > 1 && readingCpus > 1){ + snprintf(bgzf_partial_mode, 16-2, "r@%d", readingCpus); + } else { + snprintf(bgzf_partial_mode, 16-2, "r@%d", 1); + } + + // Use all available CPUs for writing + if(MyAllVariables->myModelVariables.cpus > 1){ + snprintf(bgzf_final_mode, 16-2, "a@%d", MyAllVariables->myModelVariables.cpus-1); + } else { + // Otherwise, do a single threaded read. + snprintf(bgzf_final_mode, 16-2, "a@%d", 1); + } VcfPrintStringLength=0; @@ -420,7 +436,7 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) cout< vcfdosepartialList(MaxIndex); - vcfdosepartial = ifopen(MyAllVariables->myOutFormat.OutPrefix + ".dose.vcf" + (MyAllVariables->myOutFormat.gzip ? ".gz" : ""), "a", MyAllVariables->myOutFormat.gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED); + vcfdosepartial = ifopen(MyAllVariables->myOutFormat.OutPrefix + ".dose.vcf" + (MyAllVariables->myOutFormat.gzip ? ".gz" : ""), bgzf_final_mode, MyAllVariables->myOutFormat.gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED); for(int i=1;i<=MaxIndex;i++) @@ -431,7 +447,7 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) strs1<<(ChunkNo+1); tempFileIndex+=(".chunk."+(string)(strs1.str())+".dose.part." + (string)(strs.str())+".vcf.gz"); - vcfdosepartialList[i-1] = ifopen(tempFileIndex.c_str(), bgzf_mode); + vcfdosepartialList[i-1] = ifopen(tempFileIndex.c_str(), bgzf_partial_mode); } string line; int i=0; From 8dd2b4f657f8dc98906213864de402443ee8da7e Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 10 Oct 2018 11:43:20 -0400 Subject: [PATCH 4/7] Changes to support libStatGen with cram support --- CMakeLists.txt | 2 +- dep/libstatgen.cmake | 4 ++-- install.sh | 2 +- requirements.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f42f472..cdcbaae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ endif() find_package(ZLIB REQUIRED) find_library(STATGEN_LIBRARY StatGen) find_library(HTS_LIBRARY hts) -find_library(BAM_LIBRARY bam) +#find_library(BAM_LIBRARY bam) add_executable(minimac4 src/Analysis.cpp src/Analysis.h diff --git a/dep/libstatgen.cmake b/dep/libstatgen.cmake index 9557be4..c67fe44 100644 --- a/dep/libstatgen.cmake +++ b/dep/libstatgen.cmake @@ -5,11 +5,11 @@ project(libStatGen VERSION 1.0.0) add_custom_target(libStatGen ALL COMMAND make WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Builing libStatGen ...") -file(GLOB_RECURSE LSG_HEADER_LIST "bam/*.h" "fastq/*.h" "general/*.h" "glf/*.h" "samtools/*.h" "vcf/*.h") +file(GLOB_RECURSE LSG_HEADER_LIST "bam/*.h" "fastq/*.h" "general/*.h" "glf/*.h" "vcf/*.h") install(FILES ${LSG_HEADER_LIST} DESTINATION include) if (BUILD_SHARED_LIBS) install(FILES ${CMAKE_SHARED_LIBRARY_PREFIX}StatGen${CMAKE_SHARED_LIBRARY_SUFFIX} DESTINATION lib) else() install(FILES ${CMAKE_STATIC_LIBRARY_PREFIX}StatGen${CMAKE_STATIC_LIBRARY_SUFFIX} DESTINATION lib) -endif() \ No newline at end of file +endif() diff --git a/install.sh b/install.sh index cd7b275..dd9aad4 100755 --- a/install.sh +++ b/install.sh @@ -6,7 +6,7 @@ cget install -f requirements.txt &> install.log mkdir release-build cd release-build/ echo -e "Generating MakeFiles ..." -cmake -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_CXX_FLAGS=-pg -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake -DCMAKE_BUILD_TYPE=Release .. make echo "Binary created at /release-build/minimac4" diff --git a/requirements.txt b/requirements.txt index 44bcf30..6f6af07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -daheise/libStatGen@parallel-bgzip --cmake dep/libstatgen.cmake +../libStatGen --cmake dep/libstatgen.cmake From 737fcc04a70428a0d0f8a9a9aeb0250b439c1693 Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 10 Oct 2018 11:44:42 -0400 Subject: [PATCH 5/7] Trying more parallelization. --- src/Analysis.cpp | 44 +++++++++++++++++++++++++++----------------- src/DosageData.cpp | 2 ++ src/MyVariables.h | 8 ++++---- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/Analysis.cpp b/src/Analysis.cpp index c6bc088..b2461e7 100644 --- a/src/Analysis.cpp +++ b/src/Analysis.cpp @@ -357,13 +357,17 @@ void Analysis::AppendtoMainLooVcfFaster(int ChunkNo, int MaxIndex) tempVariant.altAlleleString.c_str()); VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\tTYPED\tGT:LDS"); + vector lines(MaxIndex); + #pragma omp parallel for for(int j=1;j<=MaxIndex;j++) { - line.clear(); - vcfLoodosepartialList[j-1]->readLine(line); - VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",line.c_str()); - + lines[j-1].clear(); + vcfLoodosepartialList[j-1]->readLine(lines[j-1]); + } + for(int j=1;j<=MaxIndex;j++){ + VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",lines[j-1].c_str()); } + VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\n"); } @@ -410,10 +414,10 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) { char bgzf_partial_mode[16]; char bgzf_final_mode[16]; - // To avoid forkbombing on reads, only use more than one CPU on reads - // if there is more than one CPU per files to read. + // To avoid threadbombing on reads, only use more than one CPU on reads + // if there is more than one CPU per file to read. int readingCpus = (MyAllVariables->myModelVariables.cpus-1)/MaxIndex; - if(MyAllVariables->myModelVariables.cpus > 1 && readingCpus > 1){ + if(readingCpus > 1){ snprintf(bgzf_partial_mode, 16-2, "r@%d", readingCpus); } else { snprintf(bgzf_partial_mode, 16-2, "r@%d", 1); @@ -421,7 +425,7 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) // Use all available CPUs for writing if(MyAllVariables->myModelVariables.cpus > 1){ - snprintf(bgzf_final_mode, 16-2, "a@%d", MyAllVariables->myModelVariables.cpus-1); + snprintf(bgzf_final_mode, 16-2, "a@%d", MyAllVariables->myModelVariables.cpus); } else { // Otherwise, do a single threaded read. snprintf(bgzf_final_mode, 16-2, "a@%d", 1); @@ -482,14 +486,17 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\t%s",MyAllVariables->myOutFormat.formatStringForVCF.c_str()); + vector lines(MaxIndex); + #pragma omp parallel for for(int j=1;j<=MaxIndex;j++) { - line.clear(); - vcfdosepartialList[j-1]->readLine(line); - VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",line.c_str()); - + lines[j-1].clear(); + vcfdosepartialList[j-1]->readLine(lines[j-1]); } - VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\n"); + for(int j=1;j<=MaxIndex;j++){ + VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",lines[j-1].c_str()); + } + VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\n"); } @@ -516,12 +523,15 @@ void Analysis::AppendtoMainVcfFaster(int ChunkNo, int MaxIndex) freq, freq > 0.5 ? 1.0 - freq : freq); VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\t%s",MyAllVariables->myOutFormat.formatStringForVCF.c_str()); + vector lines(MaxIndex); + #pragma omp parallel for for(int j=1;j<=MaxIndex;j++) { - line.clear(); - vcfdosepartialList[j-1]->readLine(line); - VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",line.c_str()); - + lines[j-1].clear(); + vcfdosepartialList[j-1]->readLine(lines[j-1]); + } + for(int j=1;j<=MaxIndex;j++){ + VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"%s",lines[j-1].c_str()); } VcfPrintStringLength+=sprintf(VcfPrintStringPointer + VcfPrintStringLength,"\n"); } diff --git a/src/DosageData.cpp b/src/DosageData.cpp index f8b24b5..d1bd192 100644 --- a/src/DosageData.cpp +++ b/src/DosageData.cpp @@ -15,6 +15,8 @@ void DosageData::FlushPartialVcf(int NovcfParts) int time_Start = time(0); char bgzf_mode[16]; + // Use all available CPUs for writing. Parallel writes work best with large + // print buffers. snprintf(bgzf_mode, 16-3, "wb@%d", MyAllVariables->myModelVariables.cpus); string PartialVcfFileName(MyAllVariables->myOutFormat.OutPrefix); diff --git a/src/MyVariables.h b/src/MyVariables.h index e775288..3540d7b 100644 --- a/src/MyVariables.h +++ b/src/MyVariables.h @@ -216,8 +216,8 @@ class ModelVariable int minimac3; bool referenceEstimates; String intermediate; - - + + ModelVariable() { intermediate=""; @@ -560,8 +560,8 @@ class HaplotypeDataVariables } } - - + + CHR=chr; START=start; END=end; From d62447554403b86f750906243e474476e0937fc0 Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 10 Oct 2018 11:49:24 -0400 Subject: [PATCH 6/7] Fix a dependency and debug line left behind. --- CMakeLists.txt | 4 ++-- install.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cdcbaae..12e5a5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ endif() find_package(ZLIB REQUIRED) find_library(STATGEN_LIBRARY StatGen) find_library(HTS_LIBRARY hts) -#find_library(BAM_LIBRARY bam) + add_executable(minimac4 src/Analysis.cpp src/Analysis.h @@ -42,6 +42,6 @@ add_executable(minimac4 src/Estimation.cpp src/Estimation.h) -target_link_libraries(minimac4 ${STATGEN_LIBRARY} ${HTS_LIBRARY} ${BAM_LIBRARY} ${ZLIB_LIBRARIES}) +target_link_libraries(minimac4 ${STATGEN_LIBRARY} ${HTS_LIBRARY} ${ZLIB_LIBRARIES}) install(TARGETS minimac4 RUNTIME DESTINATION bin) diff --git a/install.sh b/install.sh index dd9aad4..cd7b275 100755 --- a/install.sh +++ b/install.sh @@ -6,7 +6,7 @@ cget install -f requirements.txt &> install.log mkdir release-build cd release-build/ echo -e "Generating MakeFiles ..." -cmake -DCMAKE_CXX_FLAGS=-pg -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake -DCMAKE_BUILD_TYPE=Release .. make echo "Binary created at /release-build/minimac4" From 520076e8bde60996fa5bc6de707543a649837c70 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 11 Oct 2018 09:27:15 -0400 Subject: [PATCH 7/7] Change cget requirement to github code. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6f6af07..7bf3d70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -../libStatGen --cmake dep/libstatgen.cmake +daheise/libStatGen@parallel-bgzip-cram --cmake dep/libstatgen.cmake