From 82838651faad7bbf6e2146498ea4e1389b24db89 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 24 Sep 2018 16:55:30 +1000
Subject: [PATCH 1/6] calculate number of reads on decoy within identify_locus;
 STR_counts stage no longer required. Additional functionality commented out

---
 pipelines/STRetch_exome_pipeline.groovy   |  1 -
 pipelines/STRetch_wgs_bam_pipeline.groovy |  5 +--
 pipelines/STRetch_wgs_pipeline.groovy     |  1 -
 pipelines/pipeline_stages.groovy          | 16 ++------
 scripts/estimateSTR.py                    | 50 +++++++++--------------
 scripts/identify_locus.py                 | 26 ++++++++++--
 6 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/pipelines/STRetch_exome_pipeline.groovy b/pipelines/STRetch_exome_pipeline.groovy
index 0e9b168..12268ee 100644
--- a/pipelines/STRetch_exome_pipeline.groovy
+++ b/pipelines/STRetch_exome_pipeline.groovy
@@ -15,7 +15,6 @@ run {
         set_sample_info +
         align_bwa + index_bam +
         median_cov_region +
-        STR_coverage +
         STR_locus_counts
     ] +
     estimate_size
diff --git a/pipelines/STRetch_wgs_bam_pipeline.groovy b/pipelines/STRetch_wgs_bam_pipeline.groovy
index dfd482a..7a5ede8 100644
--- a/pipelines/STRetch_wgs_bam_pipeline.groovy
+++ b/pipelines/STRetch_wgs_bam_pipeline.groovy
@@ -32,10 +32,9 @@ run {
             mosdepth_dist + mosdepth_median,
             shards * [ init_shard + align_bwa_bam + index_bam
-            ] + merge_bams
-        ] +
-        STR_coverage +
+            ] + merge_bams +
             STR_locus_counts
+        ]
     ] +
     estimate_size
 }
diff --git a/pipelines/STRetch_wgs_pipeline.groovy b/pipelines/STRetch_wgs_pipeline.groovy
index cd8ad9d..9d83e5a 100644
--- a/pipelines/STRetch_wgs_pipeline.groovy
+++ b/pipelines/STRetch_wgs_pipeline.groovy
@@ -12,7 +12,6 @@ run {
         set_sample_info +
         align_bwa + index_bam +
         median_cov +
-        STR_coverage +
         STR_locus_counts
     ] +
     estimate_size
diff --git a/pipelines/pipeline_stages.groovy b/pipelines/pipeline_stages.groovy
index 2cafc28..eabe472 100644
--- a/pipelines/pipeline_stages.groovy
+++ b/pipelines/pipeline_stages.groovy
@@ -165,20 +165,9 @@ index_bam = {
     forward input
 }
 
-STR_coverage = {
-    transform("bam") to ("STR_counts") {
-        exec """
-            $bedtools coverage -counts
-            -sorted
-            -g ${REF}.genome
-            -a $DECOY_BED
-            -b $input.bam > $output.STR_counts
-        """
-    }
-}
-
 STR_locus_counts = {
-    transform("bam") to ("locus_counts") {
+
+    transform("bam") to ("locus_counts", "STR_counts") {
         exec """
             STRPATH=$PATH;
             PATH=$STRETCH/tools/bin:$PATH;
@@ -186,6 +175,7 @@ STR_locus_counts = {
                 --bam $input.bam
                 --bed $STR_BED
                 --output $output.locus_counts
+                --STR_counts $output.STR_counts
             ;PATH=$STRPATH
         """
     }
diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index 138203a..c03b87e 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -59,13 +59,12 @@ def parse_STRcov(filename):
     """Parse all STR coverage"""
     sample_id = get_sample(filename)
     try:
-        cov_data = pd.read_table(filename, delim_whitespace = True,
-            names = ['chrom', 'start', 'end', 'decoycov'])
+        cov_data = pd.read_table(filename, delim_whitespace = True)
     except pd.io.common.EmptyDataError:
         sys.exit('ERROR: file {0} was empty.\n'.format(filename))
     cov_data['sample'] = sample_id
     cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
-    cov_data = cov_data[['sample', 'repeatunit', 'decoycov']]
+    cov_data = cov_data[['sample', 'repeatunit', 'decoycov', 'decoycov_pairs']]
     return(cov_data)
 
 def parse_locuscov(filename):
@@ -224,32 +223,23 @@ def main():
         factor = 100
 
     STRcov_data = pd.merge(STRcov_data, genomecov_data)
-    #STRcov_data['decoycov_norm'] = factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov']
-    #STRcov_data['decoycov_log'] = np.log2(STRcov_data['decoycov_norm'])
-    #XXX combines the previous two lines into one. Keeping commented out in case coverage_norm is required later
-    STRcov_data['decoycov_log'] = np.log2(factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov'])
-
-    # #XXX ***Not fully implemented***
-    # # Look for repeat units where the number of reads mapping to the decoy can't be
-    # # explained by those mapping to all loci with that repeat unit
-    #
-    # # Sum the counts over all loci for each repeat unit in each sample
-    # locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].aggregate(np.sum)
-    # locus_totals = pd.DataFrame(locus_totals).reset_index() # convert to DataFrame and make indices into columns
-    # # Calculate the difference between reads assigned to a decoy and the sum of
-    # # all reads assigned to loci with that repeat unit
-    # all_differences = pd.merge(STRcov_data, locus_totals, how='left')
-    # all_differences['difference'] = all_differences['decoycov'] - all_differences['locuscoverage']
-    # # Normalise differences by median coverage and take the log2
-    # all_differences['difference_log'] = np.log2(factor * (all_differences['difference'] + 1) / all_differences['genomecov'])
-    #
-    # locus_totals = pd.merge(locus_totals, STRcov_data)
-    #
-    # # Assign decoy counts to each locus, based on what proportion of the counts for that repeat unit they already have
-
-    locus_totals = pd.merge(locuscov_data, STRcov_data, how = 'left')
-
-    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove this line if implementing the above
+
+# # XXX this code works, but not sure if it's a good idea
+# # Use pairs of reads where both map to the same STR decoy chromosome to increase
+# # size estimates and calculate outliers
+# # Sum the counts over all loci for each repeat unit in each sample and count loci
+# # with same repeat unit
+# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
+# 'count']).reset_index()
+# locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
+# # Assign decoy counts to each locus, based on what proportion of the counts for that
+# # repeat unit they already have
+# locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
+# # NaNs introduced where count = 0, fill these with locuscoverage
+# locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
+    locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
+    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove if implementing above
+
     locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])
 
     # For each locus, calculate if that sample is an outlier relative to others
@@ -392,7 +382,7 @@ def main():
     # Specify output data columns
     write_data = locus_totals[['chrom', 'start', 'end',
                             'sample', 'repeatunit', 'reflen',
-                            'locuscoverage',
+                            'locuscoverage', 'total_assigned',
                             'outlier', 'p_adj',
                             'bpInsertion', 'repeatUnits'
                             ]]
diff --git a/scripts/identify_locus.py b/scripts/identify_locus.py
index 3951f1c..07d4821 100644
--- a/scripts/identify_locus.py
+++ b/scripts/identify_locus.py
@@ -33,7 +33,10 @@ def parse_args():
         help='bed file containing genomic locations of STR loci. Genomic locations should be relative to the fasta reference used to create the bam')
     parser.add_argument(
         '--output', type=str, required=False,
-        help='Output file name. Defaults to stdout.')
+        help='Output file name for locus counts. Defaults to stdout.')
+    parser.add_argument(
+        '--STR_counts', type=str, required=False,
+        help='Output file name for total counts of reads aligned to each STR decoy chromosome. If not given, these will not be reported.')
     parser.add_argument(
         '--dist', type=int, required=False, default=500,
         help='Counts are only assigned to an STR locus that is at most this many bp away. The best choice is probably the insert size. (default: %(default)s)')
@@ -95,6 +98,13 @@ def main():
     # Read bam
     bam = pysam.Samfile(bamfile, 'rb')
 
+    # Get count of reads aligned to STR decoy chromosomes
+    STR_counts = {}
+    index_stats = bam.get_index_statistics()
+    for x in index_stats:
+        if x.contig.startswith("STR-"):
+            STR_counts[x.contig] = [x.mapped, None] # cols: mapped, both on same decoy
+
     # Get relevant chromosomes
     required_chroms = []
     unpaired = 0
@@ -105,8 +115,8 @@ def main():
     # Check if any STR- chromosomes
     if len(required_chroms) == 0:
         sys.exit('ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'.format(bamfile))
-
     for chrom in required_chroms:
+        STR_counts[chrom][1] = 0
         motif = chrom.split('-')[1]
         all_positions = []
         all_segments = bam.fetch(reference=chrom)
@@ -121,7 +131,8 @@ def main():
             mate_start = read.next_reference_start
             mate_stop = mate_start + readlen
             all_positions.append([mate_chr, mate_start, mate_stop])
-
+            if mate_chr == chrom: # check if both in pair map to the same STR decoy
+                STR_counts[chrom][1] += 1
     # Strategy:
     # Merge all overlapping intervals
     # Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
@@ -182,6 +193,15 @@ def main():
         sys.stderr.write('WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'.format(unpaired, total))
 
     # Write results
+
+    if args.STR_counts:
+        STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
+            columns=['decoycov', 'decoycov_pairs'])
+        with open(args.STR_counts, 'w') as STR_outstream:
+            STR_outstring = STR_counts_df.to_csv(sep='\t', header=True, index=True,
+                index_label='chrom')
+            STR_outstream.write(STR_outstring)
+
     if outfile:
         outstream = open(outfile, 'w')
     else:
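
Note on the change above: identify_locus.py now reads the per-decoy read counts
straight from the BAM index instead of running bedtools coverage, which is why
the STR_coverage stage could be dropped. A minimal sketch of that logic,
assuming an indexed BAM mapped to an STRetch reference with decoy contigs named
STR-<motif>; "example.bam" is a placeholder path:

    # Count reads mapped to each STR decoy contig using only the BAM index,
    # mirroring the get_index_statistics() call added in this patch.
    import pysam

    def count_decoy_reads(bam_path):
        counts = {}
        with pysam.AlignmentFile(bam_path, 'rb') as bam:
            # get_index_statistics() requires the .bai index and returns one
            # record per contig with fields contig, mapped, unmapped, total
            for stat in bam.get_index_statistics():
                if stat.contig.startswith('STR-'):
                    counts[stat.contig] = stat.mapped
        return counts

    if __name__ == '__main__':
        print(count_decoy_reads('example.bam'))

Reading the index makes this a near-constant-time lookup rather than a second
pass over the reads.
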
From 1528f8e7b0ea3f6eb0e5f82f40e1d4f0995ad542 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Sat, 29 Sep 2018 14:09:04 +1000
Subject: [PATCH 2/6] Generate max estimated allele size by allocating
 both-in-pair-on-decoy reads. Use locuscoverage only for stats

---
 scripts/estimateSTR.py | 69 +++++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index c03b87e..ba6ee6e 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -224,26 +224,28 @@ def main():
 
     STRcov_data = pd.merge(STRcov_data, genomecov_data)
 
-# # XXX this code works, but not sure if it's a good idea
-# # Use pairs of reads where both map to the same STR decoy chromosome to increase
-# # size estimates and calculate outliers
-# # Sum the counts over all loci for each repeat unit in each sample and count loci
-# # with same repeat unit
-# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
-# 'count']).reset_index()
-# locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
-# # Assign decoy counts to each locus, based on what proportion of the counts for that
-# # repeat unit they already have
-# locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
-# # NaNs introduced where count = 0, fill these with locuscoverage
-# locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
-    locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
-    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove if implementing above
-
+    # Use pairs of reads where both map to the same STR decoy chromosome to increase
+    # size estimates and calculate outliers
+    # Sum the counts over all loci for each repeat unit in each sample and count loci
+    # with same repeat unit
+    locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
+        'count']).reset_index()
+    locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
+    # Assign decoy counts to each locus, based on what proportion of the counts for that
+    # repeat unit they already have
+    locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
+    # NaNs introduced where count = 0, fill these with locuscoverage
+    locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
+
+#XXX Uncomment if not implementing the above
+# locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
+# locus_totals['total_assigned'] = locus_totals['locuscoverage']
+
+    locus_totals['locuscoverage_log'] = np.log2(factor * (locus_totals['locuscoverage'] + 1) / locus_totals['genomecov'])
     locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])
 
     # For each locus, calculate if that sample is an outlier relative to others
-    total_assigned_wide = locus_totals.pivot(index='locus', columns='sample', values='total_assigned_log')
+    locuscoverage_wide = locus_totals.pivot(index='locus', columns='sample', values='locuscoverage_log')
 
     # Calculate values for if there were zero reads at a locus in all samples
     null_locus_counts = np.log2(factor * (0 + 1) / genomecov_data['genomecov'])
@@ -258,7 +260,7 @@ def main():
 
     # Use Huber's M-estimator to calculate median and SD across all samples
     # for each locus
-    sample_estimates = total_assigned_wide.apply(hubers_est, axis=1)
+    sample_estimates = locuscoverage_wide.apply(hubers_est, axis=1)
 
     # Where sd is NA, replace with the minimum non-zero sd from all loci
     min_sd = np.min(sample_estimates['sd'][sample_estimates['sd'] > 0])
     sample_estimates['sd'].fillna(min_sd, inplace=True)
@@ -274,7 +276,7 @@ def main():
 
         sample_estimates.loc['null_locus_counts'] = null_locus_counts_est
 
-    n = len(total_assigned_wide.columns)
+    n = len(locuscoverage_wide.columns)
     sample_estimates['n'] = n
 
     sample_estimates.to_csv(emit_file, sep= '\t')
@@ -286,30 +288,30 @@ def main():
         control_estimates = parse_controls(control_file)
         # Get a list of all loci in the control file but not the sample data
         control_loci_df = control_estimates.iloc[control_estimates.index != 'null_locus_counts']
-        control_loci = [x for x in control_loci_df.index if x not in total_assigned_wide.index]
+        control_loci = [x for x in control_loci_df.index if x not in locuscoverage_wide.index]
 
         # Extract and order just those control estimates appearing in the current data
-        mu_sd_estimates = control_estimates.reindex(total_assigned_wide.index)
+        mu_sd_estimates = control_estimates.reindex(locuscoverage_wide.index)
         # Fill NaNs with null_locus_counts values
         mu_sd_estimates.fillna(control_estimates.loc['null_locus_counts'], inplace=True)
     else:
         # Extract and order estimates to match the current data
-        mu_sd_estimates = sample_estimates.reindex(total_assigned_wide.index)
+        mu_sd_estimates = sample_estimates.reindex(locuscoverage_wide.index)
 
     # calculate z scores
-    z = z_score(total_assigned_wide, mu_sd_estimates)
+    z = z_score(locuscoverage_wide, mu_sd_estimates)
 
     # If a control file is given, effectively add zero counts at all loci in
     # controls but not in the samples.
     # These extra rows will disappear due to a later merge
     if control_file != '':
-        # Create a total_assigned_wide as if all loci have zero counts
-        null_total_assigned_wide = pd.DataFrame(columns = sample_names, index = control_loci)
-        null_total_assigned_wide.fillna(null_locus_counts, inplace = True)
+        # Create a locuscoverage_wide as if all loci have zero counts
+        null_locuscoverage_wide = pd.DataFrame(columns = sample_names, index = control_loci)
+        null_locuscoverage_wide.fillna(null_locus_counts, inplace = True)
         # Calculate z scores
-        null_z = z_score(null_total_assigned_wide,
-            control_estimates.reindex(null_total_assigned_wide.index))
+        null_z = z_score(null_locuscoverage_wide,
+            control_estimates.reindex(null_locuscoverage_wide.index))
 
         loci_with_counts = z.index
         z = z.append(null_z)
@@ -365,14 +367,19 @@ def main():
     X_train = np.log2(STRcov_model['coverage_norm']).values.reshape(-1, 1)
     Y_train = np.log2(STRcov_model['allele2'])
     regr.fit(X_train, Y_train)
-    # Make a prediction
-    Y_pred = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
+    # Make a prediction based on locuscoverage
+    Y_pred = regr.predict(locus_totals['locuscoverage_log'].values.reshape(-1, 1))
     predict = np.power(2, Y_pred)
     locus_totals['bpInsertion'] = predict
+    # Make a prediction based on total_assigned
+    Y_pred_max = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
+    predict_max = np.power(2, Y_pred_max)
+    locus_totals['bpInsertion_max'] = predict_max
 
     # Get the estimated size in terms of repeat units (total, not relative to ref)
     repeatunit_lens = [len(x) for x in locus_totals['repeatunit']]
     locus_totals['repeatUnits'] = (locus_totals['bpInsertion']/repeatunit_lens) + locus_totals['reflen']
+    locus_totals['repeatUnits_max'] = (locus_totals['bpInsertion_max']/repeatunit_lens) + locus_totals['reflen']
 
     # Split locus into 3 columns: chrom start end
     locuscols = pd.DataFrame([x.split('-') for x in locus_totals['locus']],
@@ -384,7 +391,7 @@ def main():
                             'sample', 'repeatunit', 'reflen',
                             'locuscoverage', 'total_assigned',
                             'outlier', 'p_adj',
-                            'bpInsertion', 'repeatUnits'
+                            'bpInsertion', 'repeatUnits', 'repeatUnits_max'
                             ]]
 
     #sort by outlier score then estimated size (bpInsertion), both descending
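
The proportional allocation this patch switches on is easiest to see on toy
numbers. A self-contained sketch; column names follow estimateSTR.py, but every
count below is invented:

    import pandas as pd

    # Two AGC loci in one sample, plus 20 read pairs where both mates hit
    # the same STR-AGC decoy (the new decoycov_pairs column).
    locuscov_data = pd.DataFrame({
        'sample': ['s1', 's1'],
        'repeatunit': ['AGC', 'AGC'],
        'locus': ['chr13-70713515-70713561', 'chr1-100-150'],
        'locuscoverage': [30, 10]})
    STRcov_data = pd.DataFrame({
        'sample': ['s1'], 'repeatunit': ['AGC'], 'decoycov_pairs': [20]})

    # Sum counts over all loci sharing a repeat unit, then hand out the decoy
    # pairs in proportion to each locus's share of that sum.
    locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(
        ['sum', 'count']).reset_index()
    locus_totals = locuscov_data.merge(locus_totals, how='left').merge(STRcov_data, how='left')
    locus_totals['total_assigned'] = (locus_totals['locuscoverage'] +
        locus_totals['decoycov_pairs'] * locus_totals['locuscoverage'] / locus_totals['sum'])
    # NaN where a repeat unit has no locus counts (0/0) or no decoy pairs:
    # fall back to locuscoverage alone.
    locus_totals['total_assigned'] = locus_totals['total_assigned'].fillna(
        locus_totals['locuscoverage'])

    print(locus_totals[['locus', 'locuscoverage', 'total_assigned']])
    # 30 -> 30 + 20*30/40 = 45.0 and 10 -> 10 + 20*10/40 = 15.0

The 20 pairs are split 15/5, so loci that already attract more reads absorb
more of the decoy signal. That is what makes total_assigned a maximum-style
size estimate, while locuscoverage remains the conservative count used for the
outlier statistics.
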
From dcb3b8b59dbcac468ec0e7473eb31cf15085fd91 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 1 Oct 2018 14:23:20 +1000
Subject: [PATCH 3/6] limit the number of decimal places reported in the final
 STRetch results

---
 scripts/estimateSTR.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index ba6ee6e..f693463 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -397,6 +397,11 @@ def main():
 
     #sort by outlier score then estimated size (bpInsertion), both descending
     write_data = write_data.sort_values(['outlier', 'bpInsertion'], ascending=[False, False]) #XXX check for duplicate rows?
+    # Convert outlier and p_adj to numeric type and do some rounding/formatting
+    write_data['outlier'] = pd.to_numeric(write_data['outlier'])
+    write_data['p_adj'] = [ format(x, '.2g') for x in pd.to_numeric(write_data['p_adj']) ]
+    write_data = write_data.round({'total_assigned': 1, 'outlier': 1,
+        'bpInsertion': 1, 'repeatUnits': 1, 'repeatUnits_max': 1})
 
     # Write individual files for each sample, remove rows where locuscoverage == 0
     samples = set(write_data['sample'])
From 54a079fb88c9283827512954d0e6d686f93a0211 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 1 Oct 2018 14:34:55 +1000
Subject: [PATCH 4/6] update test data reference output file for Travis

---
 .testing/STRs.benchmark.tsv | 12 ++++++------
 .travis.yml                 |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.testing/STRs.benchmark.tsv b/.testing/STRs.benchmark.tsv
index 2b0f96b..3b471a6 100644
--- a/.testing/STRs.benchmark.tsv
+++ b/.testing/STRs.benchmark.tsv
@@ -1,6 +1,6 @@
-chrom	start	end	sample	repeatunit	reflen	locuscoverage	outlier	p_adj	bpInsertion	repeatUnits
-chr13	70713515	70713561	11	AGC	15.3	35	1.88685648944145	0.0295898153640992	316.950118548597	120.950039516199
-chr13	70713515	70713561	69	AGC	15.3	8	0.4297790752454	0.333678177749521	76.1210174457791	40.6736724819264
-chr13	70713515	70713561	1	AGC	15.3	3	-0.426744171073212	0.665217162914456	32.9115187238725	26.2705062412908
-chr13	70713515	70713561	54	AGC	15.3	2	-0.730726313557163	0.7675268301731	24.4403744973217	23.4467914991072
-chr13	70713515	70713561	49	AGC	15.3	1	-1.15916508005647	0.876805548823274	16.0677184130193	20.6559061376731
+chrom	start	end	sample	repeatunit	reflen	locuscoverage	total_assigned	outlier	p_adj	bpInsertion	repeatUnits	repeatUnits_max
+chr13	70713515	70713561	11_L001_R1	AGC	15.3	35	52.0	1.9	0.03	317.0	121.0	172.9
+chr13	70713515	70713561	69_L001_R1	AGC	15.3	8	8.0	0.4	0.33	76.1	40.7	40.7
+chr13	70713515	70713561	1_L001_R1	AGC	15.3	3	3.0	-0.4	0.67	32.9	26.3	26.3
+chr13	70713515	70713561	54_L001_R1	AGC	15.3	2	2.0	-0.7	0.77	24.4	23.4	23.4
+chr13	70713515	70713561	49_L001_R1	AGC	15.3	1	1.0	-1.2	0.88	16.1	20.7	20.7
diff --git a/.travis.yml b/.travis.yml
index 6e58058..6c33799 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,7 @@ script:
 # Run the test data
 - ../tools/bin/bpipe run ../pipelines/STRetch_exome_pipeline.groovy ../test-data/*.fastq.gz
 - if diff STRs.tsv ../.testing/STRs.benchmark.tsv; then echo exit 0; else echo exit 1; fi
+#- diff STRs.tsv ../.testing/STRs.benchmark.tsv
 after_script:
 - head *.locus_counts *.STR_counts *.median_cov
 - head *.tsv
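
The rounding added in PATCH 3 is what turns the long floats in the old
benchmark rows into the one-decimal values in the new ones. A standalone
illustration using the first data row of the old STRs.benchmark.tsv:

    import pandas as pd

    # Unrounded values from the old benchmark TSV (first data row)
    write_data = pd.DataFrame({
        'outlier': ['1.88685648944145'],
        'p_adj': ['0.0295898153640992'],
        'bpInsertion': [316.950118548597],
        'repeatUnits': [120.950039516199]})

    # p_adj keeps two significant figures since it spans orders of magnitude;
    # the remaining numeric columns are rounded to one decimal place.
    write_data['outlier'] = pd.to_numeric(write_data['outlier'])
    write_data['p_adj'] = [format(x, '.2g') for x in pd.to_numeric(write_data['p_adj'])]
    write_data = write_data.round({'outlier': 1, 'bpInsertion': 1, 'repeatUnits': 1})

    print(write_data)  # 1.9  0.03  317.0  121.0, matching the new benchmark row
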
From 6e26e955823c9c89c7bb2c947a70098cd1c1d54a Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Tue, 23 Oct 2018 14:35:43 +1100 Subject: [PATCH 5/6] set sample name differently depending on number of underscores in the fastq filename --- pipelines/pipeline_stages.groovy | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/pipeline_stages.groovy b/pipelines/pipeline_stages.groovy index eabe472..a37ab77 100644 --- a/pipelines/pipeline_stages.groovy +++ b/pipelines/pipeline_stages.groovy @@ -55,9 +55,13 @@ set_sample_info = { // sample_flowcell_library_lane_read.fastq.gz // however sample can also include underscores def info = get_info(input) - if (info.length >= 5) { + if (info.length == 2) { - //branch.sample = info[0..-5].join("_") + branch.sample = info[0..-2].join("_") + } + if (info.length >= 5) { + + branch.sample = info[0..-5].join("_") branch.flowcell = info[-4] branch.library = info[-3] branch.lane = info[-2] From 2e939ed2ca8f0fb408a3f1edb3c58fe7f0ef7f1a Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Thu, 14 Feb 2019 11:35:22 +1100 Subject: [PATCH 6/6] report in STRetch output: locuscoverage_log, total_assigned_log, genomecov --- scripts/estimateSTR.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py index f693463..70d3f37 100644 --- a/scripts/estimateSTR.py +++ b/scripts/estimateSTR.py @@ -390,6 +390,7 @@ def main(): write_data = locus_totals[['chrom', 'start', 'end', 'sample', 'repeatunit', 'reflen', 'locuscoverage', 'total_assigned', + 'locuscoverage_log', 'total_assigned_log', 'genomecov', 'outlier', 'p_adj', 'bpInsertion', 'repeatUnits', 'repeatUnits_max' ]]
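
For reference, the sample-naming rule that PATCH 5 implements in Groovy,
rendered in Python for illustration. It assumes, per the Groovy comment, that
fastq names look like sample_flowcell_library_lane_read.fastq.gz with possible
underscores inside the sample name; the filenames below are invented, and the
fall-through behaviour for other field counts (which would leave names like
11_L001_R1 from the PATCH 4 benchmark intact) is an inference, not something
the patch itself confirms:

    def sample_name(fastq_filename):
        stem = fastq_filename.split('.')[0]
        info = stem.split('_')
        if len(info) == 2:
            return '_'.join(info[:-1])   # Groovy info[0..-2]: drop the last field
        if len(info) >= 5:
            return '_'.join(info[:-4])   # Groovy info[0..-5]: drop flowcell/library/lane/read
        return stem                      # other shapes: leave the name as-is

    assert sample_name('mysample_R1.fastq.gz') == 'mysample'
    assert sample_name('my_sample_FLOW1_LIB1_L001_R1.fastq.gz') == 'my_sample'
    assert sample_name('11_L001_R1.fastq.gz') == '11_L001_R1'

Groovy's inclusive range info[0..-5] corresponds to Python's info[:-4]; both
keep everything before the trailing four filename fields.
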