From ea215f8ee09cfe1a7b75d900edbb582bf2dde9e3 Mon Sep 17 00:00:00 2001 From: I-Hsuan Lin <9032946+ycl6@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:57:47 +0000 Subject: [PATCH 1/2] Add a '--mem' flag to define memory per thread used during 'samtools sort' --- CPU/juicer.sh | 36 ++++++++++++++++++++++++++--------- CPU/mega_from_bams_diploid.sh | 32 ++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/CPU/juicer.sh b/CPU/juicer.sh index 24b5f7b..d18271c 100755 --- a/CPU/juicer.sh +++ b/CPU/juicer.sh @@ -106,9 +106,11 @@ singleend=0 sampleName="HiC_sample" # library name for RG tag libraryName="HiC_library" +# samtools sort memory per thread +memory="1G" ## Read arguments -usageHelp="Usage: ${0##*/} [-g genomeID] [-d topDir] [-s site]\n [-a about] [-S stage] [-p chrom.sizes path]\n [-y restriction site file] [-z reference genome file]\n [-D Juicer scripts parent dir] [-b ligation] [-t threads]\n [-T threadsHic] [-i sample] [-k library] [-w wobble]\n [-e] [-h] [-f] [-j] [-u] [-m] [--assembly] [--cleanup] [--qc]" +usageHelp="Usage: ${0##*/} [-g genomeID] [-d topDir] [-s site]\n [-a about] [-S stage] [-p chrom.sizes path]\n [-y restriction site file] [-z reference genome file]\n [-D Juicer scripts parent dir] [-b ligation] [-t threads] [--mem memory]\n [-T threadsHic] [-i sample] [-k library] [-w wobble]\n [-e] [-h] [-f] [-j] [-u] [-m] [--assembly] [--cleanup] [--qc]" genomeHelp="* [genomeID] must be defined in the script, e.g. \"hg19\" or \"mm10\" (default \n \"$genomeID\"); alternatively, it can be defined using the -z command" dirHelp="* [topDir] is the top level directory (default\n \"$topDir\")\n [topDir]/fastq must contain the fastq files\n [topDir]/splits will be created to contain the temporary split files\n [topDir]/aligned will be created for the final alignment" siteHelp="* [site] must be defined in the script, e.g. \"HindIII\" or \"MboI\" \n (default \"$site\")" @@ -119,7 +121,8 @@ siteFileHelp="* [restriction site file]: enter path for restriction site file (l scriptDirHelp="* [Juicer scripts directory]: set the Juicer directory,\n which should have scripts/ references/ and restriction_sites/ underneath it\n (default ${juiceDir})" refSeqHelp="* [reference genome file]: enter path for reference sequence file, BWA index\n files must be in same directory" ligationHelp="* [ligation junction]: use this string when counting ligation junctions" -threadsHelp="* [threads]: number of threads when running BWA alignment" +threadsHelp="* [threads]: number of threads when running BWA alignment and samtools view/sort/merge" +memoryHelp="* [memory]: maximum memory per thread when running samtools sort (default 1G).\n Suffix K/M/G recognized, gigabyte (G) is used if suffix is not provided" threadsHicHelp="* [threads for hic file creation]: number of threads when building hic file" sampleHelp="* [sample name]: will be added to the SM portion of the read group (RG) tag" libraryHelp="* [library name]: will be added to the LB portion of the read group (RG) tag" @@ -149,6 +152,7 @@ printHelpAndExit() { echo -e "$scriptDirHelp" echo -e "$ligationHelp" echo -e "$threadsHelp" + echo -e "$memoryHelp" echo -e "$threadsHicHelp" echo -e "$sampleHelp" echo -e "$libraryHelp" @@ -192,7 +196,8 @@ while getopts "d:g:a:hs:p:y:z:S:D:b:t:jfuecT:1:2:i:-:w:k:m" opt; do m) methylation=1 ;; 1) read1files=$OPTARG ;; 2) read2files=$OPTARG ;; - -) case "${OPTARG}" in + -) case "${OPTARG}" in + mem) memory=$2 ;; assembly) earlyexit=1; assembly=1 ;; cleanup) cleanup=1 ;; qc) qc=1 ;; @@ -311,7 +316,7 @@ then echo "Using $site_file as site file" fi -## Set threads for sending appropriate parameters to cluster and string for BWA call +## Set threads for sending appropriate parameters to cluster and string for BWA and samtools calls if [ -z "$threads" ] then threads="$(getconf _NPROCESSORS_ONLN)" @@ -322,6 +327,19 @@ else sthreadstring="-@ $threads" fi +## Set memory for sending appropriate parameters to cluster and string for samtools sort calls +if [[ $memory =~ ^[0-9]+$ ]] +then + smemorystring="-m ${memory}G" +elif [[ $memory =~ ^[0-9]+[KMG]$ ]] +then + smemorystring="-m ${memory}" +else + echo "Memory unit suffix not recognised. Use default setting: 1G" + echo $memory + smemorystring="-m 1G" +fi + if [ -n "$read2files" ] && [ -z "$read1files" ] then echo "***! When fastqs for read2 are specified with \"-2\", corresponding read1 fastqs must be specified with \"-1\" " @@ -542,17 +560,17 @@ then then if [ $singleend -eq 1 ] then - awk -v stem=${name}${ext}_norm -v site_file=$site_file -v singleend=$singleend -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v stem=${name}${ext}_norm -v site_file=$site_file -v singleend=$singleend -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam else - awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam fi else if [ $singleend -eq 1 ] then - awk -v stem=${name}${ext}_norm -v singleend=$singleend -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v stem=${name}${ext}_norm -v singleend=$singleend -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam else awk -v stem=${name}${ext}_norm -f $juiceDir/scripts/common/chimeric_sam.awk $name$ext.sam > $name$ext.sam2 - awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/common/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/common/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam fi fi @@ -679,7 +697,7 @@ if [ -z $postproc ] if [ "$methylation" = 1 ] then $load_methyl - samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam + samtools sort $sthreadstring $smemorystring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam $call_methyl extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam $call_methyl extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam ${juiceDir}/scripts/common/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt diff --git a/CPU/mega_from_bams_diploid.sh b/CPU/mega_from_bams_diploid.sh index 22e0b8f..de22f88 100755 --- a/CPU/mega_from_bams_diploid.sh +++ b/CPU/mega_from_bams_diploid.sh @@ -16,7 +16,7 @@ USAGE=" ***************************************************** Simplified mega script for ENCODE DCC hic-pipeline. -USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] ... +USAGE: ./mega.sh -c|--chrom-sizes [-g|--genome-id genome_id] [-r|--resolutions resolutions_string] [-v|--vcf ] [-C|--exclude-chr-from-diploid chr_list] [--separate-homologs] [-p|--psf ] [--reads-to-homologs ] [--juicer-dir ] [--phaser-dir ] [-t|--threads thread_count] [--mem memory] [-T|--threads-hic hic_thread_count] [--from-stage stage] [--to-stage stage] ... DESCRIPTION: This is a simplied mega.sh script to produce aggregate Hi-C maps and chromatic accessibility tracks from multiple experiments. The pipeline includes optional diploid steps which produce diploid versions of the contact maps and chromatin accessibility tracks. @@ -58,6 +58,10 @@ WORKFLOW CONTROL: -t|--threads [num] Indicate how many threads to use. Default: half of available cores as calculated by parallel --number-of-cores. +--mem [num] + Indicate the maximum memory per thread when running samtools sort (default 1G). Suffix K/M/G recognized, gigabyte (G) is used if suffix is not provided. + + -T|--threads-hic [num] Indicate how many threads to use when generating the Hi-C file. Default: 24. @@ -81,9 +85,10 @@ resolutionsToBuildString="-r 2500000,1000000,500000,250000,100000,50000,25000,10 exclude_chr="Y|chrY|MT|chrM" separate_homologs=false mapq=1 ##lowest mapq of interest +memory="1G" # samtools sort memory per thread # multithreading -threads=`parallel --number-of-cores` +threads="$(getconf _NPROCESSORS_ONLN)" threads=$((threads/2)) # adjust for mem usage tmp=`awk '/MemTotal/ {threads=int($2/1024/1024/2/6-1)}END{print threads+0}' /proc/meminfo 2>/dev/null` @@ -190,6 +195,10 @@ while :; do fi shift ;; + --mem) OPTARG=$2 + memory=$OPTARG + shift + ;; -T|--threads-hic) OPTARG=$2 re='^[0-9]+$' if [[ $OPTARG =~ $re ]]; then @@ -266,6 +275,19 @@ fi [ -z $genome_id ] && { echo >&2 ":| Warning: no genome_id is provided. Please provide a genome_id if using one of the common references to be able to run the motif finder. Ignoring motif finder!"; } +## Set memory for sending appropriate parameters to cluster and string for samtools sort calls +if [[ $memory =~ ^[0-9]+$ ]] +then + smemorystring="-m ${memory}G" +elif [[ $memory =~ ^[0-9]+[KMG]$ ]] +then + smemorystring="-m ${memory}" +else + echo "Memory unit suffix not recognised. Use default setting: 1G" + echo $memory + smemorystring="-m 1G" +fi + ############### HANDLE DEPENDENCIES ############### ## Juicer & Phaser @@ -306,7 +328,7 @@ if [ "$first_stage" == "prep" ]; then samtools merge --no-PG -f mega_header.bam ${header_list} rm ${header_list} - samtools cat -@ $((threads * 2)) -h mega_header.bam $bam | samtools view -u -d "rt:0" -d "rt:1" -d "rt:2" -d "rt:3" -d "rt:4" -d "rt:5" -@ $((threads * 2)) -F 0x400 -q $mapq - | samtools sort -@ $threads -m 6G -o reads.sorted.bam + samtools cat -@ $((threads * 2)) -h mega_header.bam $bam | samtools view -u -d "rt:0" -d "rt:1" -d "rt:2" -d "rt:3" -d "rt:4" -d "rt:5" -@ $((threads * 2)) -F 0x400 -q $mapq - | samtools sort -@ $threads $smemorystring -o reads.sorted.bam [ `echo "${PIPESTATUS[@]}" | tr -s ' ' + | bc` -eq 0 ] || { echo ":( Pipeline failed at bam sorting. See stderr for more info. Exiting!" | tee -a /dev/stderr && exit 1; } rm mega_header.bam @@ -522,7 +544,7 @@ if [ "$first_stage" == "diploid_hic" ]; then export pipeline=${phaser_dir} export reads_to_homologs=$reads_to_homologs doit () { - samtools view -@ 2 -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{OFS="\t"}FILENAME==ARGV[1]{if($2==chr"-r"||$2==chr"-a"){if(keep[$1]&&keep[$1]!=$2){delete keep[$1]}else{keep[$1]=$2}};next}$0~/^@SQ/{$2=$2"-r"; print; $2=substr($2,1,length($2)-2)"-a";print;next}$0~/^@/{print;next}($1 in keep)&&($7=="="||$7=="*"){$3=keep[$1];print}' $reads_to_homologs - | samtools sort -n -m 1G -O sam | awk '$0~/^@/{next}($1!=prev){if(n==2){sub("\t","",str); print str}; str=""; n=0}{for(i=12;i<=NF;i++){if($i~/^ip:i:/){$4=substr($i,6);break;}};str=str"\t"n"\t"$3"\t"$4"\t"n; n++; prev=$1}END{if(n==2){sub("\t","",str); print str}}' | sort -k 2,2 -S 6G + samtools view -@ 2 -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{OFS="\t"}FILENAME==ARGV[1]{if($2==chr"-r"||$2==chr"-a"){if(keep[$1]&&keep[$1]!=$2){delete keep[$1]}else{keep[$1]=$2}};next}$0~/^@SQ/{$2=$2"-r"; print; $2=substr($2,1,length($2)-2)"-a";print;next}$0~/^@/{print;next}($1 in keep)&&($7=="="||$7=="*"){$3=keep[$1];print}' $reads_to_homologs - | samtools sort -@ 2 -n $smemorystring -O sam | awk '$0~/^@/{next}($1!=prev){if(n==2){sub("\t","",str); print str}; str=""; n=0}{for(i=12;i<=NF;i++){if($i~/^ip:i:/){$4=substr($i,6);break;}};str=str"\t"n"\t"$3"\t"$4"\t"n; n++; prev=$1}END{if(n==2){sub("\t","",str); print str}}' | sort -k 2,2 -S 6G } export -f doit @@ -576,7 +598,7 @@ if [ "$first_stage" == "diploid_dhs" ]; then export junction_rt_string=${junction_rt_string} export reads_to_homologs=${reads_to_homologs} doit () { -samtools view -@2 ${junction_rt_string} -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{ +samtools view -@ 2 ${junction_rt_string} -h reads.sorted.bam $1 | awk -v chr=$1 'BEGIN{ OFS="\t"}FILENAME==ARGV[1]{ if($2==chr"-r"||$2==chr"-a"){ if(keep[$1]&&keep[$1]!=$2){ From c868fc2528c3f6b608f8b5f146612a99b2198ced Mon Sep 17 00:00:00 2001 From: I-Hsuan Lin <9032946+ycl6@users.noreply.github.com> Date: Tue, 29 Oct 2024 21:18:33 +0000 Subject: [PATCH 2/2] Set memory usage using '-m' to 'samtools sort' in PBS and Slurm requests --- PBS/scripts/juicer.sh | 13 +++++++++++-- PBS_without_launch/scripts/juicer.sh | 13 +++++++++++-- SLURM/scripts/juicer.sh | 4 ++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/PBS/scripts/juicer.sh b/PBS/scripts/juicer.sh index d2209c9..457e3c8 100755 --- a/PBS/scripts/juicer.sh +++ b/PBS/scripts/juicer.sh @@ -621,13 +621,22 @@ ALGNR1 date +"%Y-%m-%d %H:%M:%S" $load_samtools export LC_ALL=C + + # Set memory for samtools sort (maximum memory is set above with 'mem=24gb') + smemory=$(echo 24 / $threads | bc) + if [ "$smemory" -le 1 ] + then + smemory=1 + fi + smemorystring="-m ${smemory}G" + # call chimeric_blacklist.awk to deal with chimeric reads; sorted file is sorted by read name at this point if [ "$site" != "none" ] && [ -e "$site_file" ] then - awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam else awk -v stem=${name}${ext}_norm -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam > $name$ext.sam2 - awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam fi if [ \$? -ne 0 ] then diff --git a/PBS_without_launch/scripts/juicer.sh b/PBS_without_launch/scripts/juicer.sh index 6600be3..3e99f92 100755 --- a/PBS_without_launch/scripts/juicer.sh +++ b/PBS_without_launch/scripts/juicer.sh @@ -608,13 +608,22 @@ ALGNR1 date +"%Y-%m-%d %H:%M:%S" $load_samtools export LC_ALL=C + + # Set memory for samtools sort (maximum memory is set above with 'mem=24gb') + smemory=$(echo 24 / $threads | bc) + if [ "$smemory" -le 1 ] + then + smemory=1 + fi + smemorystring="-m ${smemory}G" + # call chimeric_blacklist.awk to deal with chimeric reads; sorted file is sorted by read name at this point if [ "$site" != "none" ] && [ -e "$site_file" ] then - awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v stem=${name}${ext}_norm -v site_file=$site_file -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam else awk -v stem=${name}${ext}_norm -f $juiceDir/scripts/chimeric_sam.awk $name$ext.sam > $name$ext.sam2 - awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring > ${name}${ext}.bam + awk -v avgInsertFile=${name}${ext}_norm.txt.res.txt -f $juiceDir/scripts/adjust_insert_size.awk $name$ext.sam2 | samtools sort -t cb -n $sthreadstring $smemorystring > ${name}${ext}.bam fi if [ $? -ne 0 ] then diff --git a/SLURM/scripts/juicer.sh b/SLURM/scripts/juicer.sh index 7e8acd4..6cf0ad9 100755 --- a/SLURM/scripts/juicer.sh +++ b/SLURM/scripts/juicer.sh @@ -964,7 +964,7 @@ MRGALL3` $userstring ${load_samtools} #we should probably set the -m based on memory / num of threads -if time samtools sort -t cb -n -O SAM -@ $sortthreads -l 0 -m 2G $name$ext.sam3 > ${name}${ext}.sam +if time samtools sort -t cb -n -O SAM $sthreadstring -l 0 -m 2G $name$ext.sam3 > ${name}${ext}.sam then rm -f $name$ext.sam2 $name$ext.sam3 touch $touchfile @@ -1326,7 +1326,7 @@ BAMRM` $userstring ${load_samtools} export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/gpfs0/home/neva/lib - samtools sort $sthreadstring ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam + samtools sort $sthreadstring -m 10G ${outputdir}/merged_dedup.bam > ${outputdir}/merged_dedup_sort.bam /gpfs0/home/neva/bin/MethylDackel extract -F 1024 --keepSingleton --keepDiscordant $refSeq ${outputdir}/merged_dedup_sort.bam /gpfs0/home/neva/bin/MethylDackel extract -F 1024 --keepSingleton --keepDiscordant --cytosine_report --CHH --CHG $refSeq ${outputdir}/merged_dedup_sort.bam ${juiceDir}/scripts/conversion.sh ${outputdir}/merged_dedup_sort.cytosine_report.txt > ${outputdir}/conversion_report.txt