From 82838651faad7bbf6e2146498ea4e1389b24db89 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 24 Sep 2018 16:55:30 +1000
Subject: [PATCH 1/6] calculate number of reads on decoy within identify_locus;
 STR_counts stage no longer required. Additional functionality commented out

---
 pipelines/STRetch_exome_pipeline.groovy   |  1 -
 pipelines/STRetch_wgs_bam_pipeline.groovy |  5 +--
 pipelines/STRetch_wgs_pipeline.groovy     |  1 -
 pipelines/pipeline_stages.groovy          | 16 ++------
 scripts/estimateSTR.py                    | 50 +++++++++--------------
 scripts/identify_locus.py                 | 26 ++++++++++--
 6 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/pipelines/STRetch_exome_pipeline.groovy b/pipelines/STRetch_exome_pipeline.groovy
index 0e9b168..12268ee 100644
--- a/pipelines/STRetch_exome_pipeline.groovy
+++ b/pipelines/STRetch_exome_pipeline.groovy
@@ -15,7 +15,6 @@ run {
         set_sample_info +
         align_bwa + index_bam +
         median_cov_region +
-        STR_coverage +
         STR_locus_counts
     ] +
     estimate_size
diff --git a/pipelines/STRetch_wgs_bam_pipeline.groovy b/pipelines/STRetch_wgs_bam_pipeline.groovy
index dfd482a..7a5ede8 100644
--- a/pipelines/STRetch_wgs_bam_pipeline.groovy
+++ b/pipelines/STRetch_wgs_bam_pipeline.groovy
@@ -32,10 +32,9 @@ run {
             mosdepth_dist + mosdepth_median,
             shards * [ init_shard + align_bwa_bam + index_bam
-            ] + merge_bams
-        ] +
-        STR_coverage +
+            ] + merge_bams +
             STR_locus_counts
+        ]
     ] +
     estimate_size
 }
diff --git a/pipelines/STRetch_wgs_pipeline.groovy b/pipelines/STRetch_wgs_pipeline.groovy
index cd8ad9d..9d83e5a 100644
--- a/pipelines/STRetch_wgs_pipeline.groovy
+++ b/pipelines/STRetch_wgs_pipeline.groovy
@@ -12,7 +12,6 @@ run {
         set_sample_info +
         align_bwa + index_bam +
         median_cov +
-        STR_coverage +
         STR_locus_counts
     ] +
     estimate_size
diff --git a/pipelines/pipeline_stages.groovy b/pipelines/pipeline_stages.groovy
index 2cafc28..eabe472 100644
--- a/pipelines/pipeline_stages.groovy
+++ b/pipelines/pipeline_stages.groovy
@@ -165,20 +165,9 @@ index_bam = {
     forward input
 }
 
-STR_coverage = {
-    transform("bam") to ("STR_counts") {
-        exec """
-            $bedtools coverage -counts
-            -sorted
-            -g ${REF}.genome
-            -a $DECOY_BED
-            -b $input.bam > $output.STR_counts
-        """
-    }
-}
-
 STR_locus_counts = {
-    transform("bam") to ("locus_counts") {
+
+    transform("bam") to ("locus_counts", "STR_counts") {
         exec """
             STRPATH=$PATH;
             PATH=$STRETCH/tools/bin:$PATH;
@@ -186,6 +175,7 @@ STR_locus_counts = {
                 --bam $input.bam
                 --bed $STR_BED
                 --output $output.locus_counts
+                --STR_counts $output.STR_counts
             ;PATH=$STRPATH
         """
     }
diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index 138203a..c03b87e 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -59,13 +59,12 @@ def parse_STRcov(filename):
     """Parse all STR coverage"""
     sample_id = get_sample(filename)
     try:
-        cov_data = pd.read_table(filename, delim_whitespace = True,
-            names = ['chrom', 'start', 'end', 'decoycov'])
+        cov_data = pd.read_table(filename, delim_whitespace = True)
     except pd.io.common.EmptyDataError:
         sys.exit('ERROR: file {0} was empty.\n'.format(filename))
     cov_data['sample'] = sample_id
     cov_data['repeatunit'] = [x.split('-')[1] for x in cov_data['chrom']]
-    cov_data = cov_data[['sample', 'repeatunit', 'decoycov']]
+    cov_data = cov_data[['sample', 'repeatunit', 'decoycov', 'decoycov_pairs']]
     return(cov_data)
 
 def parse_locuscov(filename):
@@ -224,32 +223,23 @@ def main():
         factor = 100
 
     STRcov_data = pd.merge(STRcov_data, genomecov_data)
-    #STRcov_data['decoycov_norm'] = factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov']
-    #STRcov_data['decoycov_log'] = np.log2(STRcov_data['decoycov_norm'])
-    #XXX combines the previous two lines into one. Keeping commented out in case coverage_norm is required later
-    STRcov_data['decoycov_log'] = np.log2(factor * (STRcov_data['decoycov'] + 1) / STRcov_data['genomecov'])
-
-    # #XXX ***Not fully implemented***
-    # # Look for repeat units where the number of reads mapping to the decoy can't be
-    # # explained by those mapping to all loci with that repeat unit
-    #
-    # # Sum the counts over all loci for each repeat unit in each sample
-    # locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].aggregate(np.sum)
-    # locus_totals = pd.DataFrame(locus_totals).reset_index() # convert to DataFrame and make indices into columns
-    # # Calculate the difference between reads assigned to a decoy and the sum of
-    # # all reads assigned to loci with that repeat unit
-    # all_differences = pd.merge(STRcov_data, locus_totals, how='left')
-    # all_differences['difference'] = all_differences['decoycov'] - all_differences['locuscoverage']
-    # # Normalise differences by median coverage and take the log2
-    # all_differences['difference_log'] = np.log2(factor * (all_differences['difference'] + 1) / all_differences['genomecov'])
-    #
-    # locus_totals = pd.merge(locus_totals, STRcov_data)
-    #
-    # # Assign decoy counts to each locus, based on what proportion of the counts for that repeat unit they already have
-
-    locus_totals = pd.merge(locuscov_data, STRcov_data, how = 'left')
-
-    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove this line if implementing the above
+
+# # XXX this code works, but not sure if it's a good idea
+# # Use pairs of reads where both map to the same STR decoy chromosome to increase
+# # size estimates and calculate outliers
+# # Sum the counts over all loci for each repeat unit in each sample and count loci
+# # with same repeat unit
+# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
+# 'count']).reset_index()
+# locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
+# # Assign decoy counts to each locus, based on what proportion of the counts for that
+# # repeat unit they already have
+# locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
+# # NaNs introduced where count = 0, fill these with locuscoverage
+# locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
+    locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
+    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove if implementing above
+
     locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])
 
     # For each locus, calculate if that sample is an outlier relative to others
@@ -392,7 +382,7 @@ def main():
     # Specify output data columns
     write_data = locus_totals[['chrom', 'start', 'end',
                             'sample', 'repeatunit', 'reflen',
-                            'locuscoverage',
+                            'locuscoverage', 'total_assigned',
                             'outlier', 'p_adj',
                             'bpInsertion', 'repeatUnits'
                             ]]
diff --git a/scripts/identify_locus.py b/scripts/identify_locus.py
index 3951f1c..07d4821 100644
--- a/scripts/identify_locus.py
+++ b/scripts/identify_locus.py
@@ -33,7 +33,10 @@ def parse_args():
         help='bed file containing genomic locations of STR loci. Genomic locations should be relative to the fasta reference used to create the bam')
     parser.add_argument(
         '--output', type=str, required=False,
-        help='Output file name. Defaults to stdout.')
+        help='Output file name for locus counts. Defaults to stdout.')
+    parser.add_argument(
+        '--STR_counts', type=str, required=False,
+        help='Output file name for total counts of reads aligned to each STR decoy chromosome. If not given, these will not be reported.')
     parser.add_argument(
         '--dist', type=int, required=False, default=500,
         help='Counts are only assigned to an STR locus that is at most this many bp away. The best choice is probably the insert size. (default: %(default)s)')
@@ -95,6 +98,13 @@ def main():
     # Read bam
     bam = pysam.Samfile(bamfile, 'rb')
 
+    # Get count of reads aligned to STR decoy chromosomes
+    STR_counts = {}
+    index_stats = bam.get_index_statistics()
+    for x in index_stats:
+        if x.contig.startswith("STR-"):
+            STR_counts[x.contig] = [x.mapped, None] # cols: mapped, both on same decoy
+
     # Get relevant chromosomes
     required_chroms = []
     unpaired = 0
@@ -105,8 +115,8 @@ def main():
     # Check if any STR- chromosomes
     if len(required_chroms) == 0:
         sys.exit('ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'.format(bamfile))
-
     for chrom in required_chroms:
+        STR_counts[chrom][1] = 0
         motif = chrom.split('-')[1]
         all_positions = []
         all_segments = bam.fetch(reference=chrom)
@@ -121,7 +131,8 @@ def main():
             mate_start = read.next_reference_start
             mate_stop = mate_start + readlen
             all_positions.append([mate_chr, mate_start, mate_stop])
-
+            if mate_chr == chrom: # check if both in pair map to the same STR decoy
+                STR_counts[chrom][1] += 1
     # Strategy:
     # Merge all overlapping intervals
     # Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
@@ -182,6 +193,15 @@ def main():
         sys.stderr.write('WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'.format(unpaired, total))
 
     # Write results
+
+    if args.STR_counts:
+        STR_counts_df = pd.DataFrame.from_dict(STR_counts, orient='index',
+            columns=['decoycov', 'decoycov_pairs'])
+        with open(args.STR_counts, 'w') as STR_outstream:
+            STR_outstring = STR_counts_df.to_csv(sep='\t', header=True, index=True,
+                index_label='chrom')
+            STR_outstream.write(STR_outstring)
+
     if outfile:
         outstream = open(outfile, 'w')
     else:
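
Note on the change above: identify_locus.py now reads the per-decoy read counts
straight from the BAM index instead of running bedtools coverage, which is why
the STR_coverage stage could be dropped. A minimal sketch of that logic,
assuming an indexed BAM mapped to an STRetch reference with decoy contigs named
STR-<motif>; "example.bam" is a placeholder path:

    # Count reads mapped to each STR decoy contig using only the BAM index,
    # mirroring the get_index_statistics() call added in this patch.
    import pysam

    def count_decoy_reads(bam_path):
        counts = {}
        with pysam.AlignmentFile(bam_path, 'rb') as bam:
            # get_index_statistics() requires the .bai index and returns one
            # record per contig with fields contig, mapped, unmapped, total
            for stat in bam.get_index_statistics():
                if stat.contig.startswith('STR-'):
                    counts[stat.contig] = stat.mapped
        return counts

    if __name__ == '__main__':
        print(count_decoy_reads('example.bam'))

Reading the index makes this a near-constant-time lookup rather than a second
pass over the reads.
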
From 1528f8e7b0ea3f6eb0e5f82f40e1d4f0995ad542 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Sat, 29 Sep 2018 14:09:04 +1000
Subject: [PATCH 2/6] Generate max estimated allele size by allocating
 both-in-pair-on-decoy reads. Use locuscoverage only for stats

---
 scripts/estimateSTR.py | 69 +++++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index c03b87e..ba6ee6e 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -224,26 +224,28 @@ def main():
 
     STRcov_data = pd.merge(STRcov_data, genomecov_data)
 
-# # XXX this code works, but not sure if it's a good idea
-# # Use pairs of reads where both map to the same STR decoy chromosome to increase
-# # size estimates and calculate outliers
-# # Sum the counts over all loci for each repeat unit in each sample and count loci
-# # with same repeat unit
-# locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
-# 'count']).reset_index()
-# locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
-# # Assign decoy counts to each locus, based on what proportion of the counts for that
-# # repeat unit they already have
-# locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
-# # NaNs introduced where count = 0, fill these with locuscoverage
-# locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
-    locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
-    locus_totals['total_assigned'] = locus_totals['locuscoverage'] #XXX remove if implementing above
-
+    # Use pairs of reads where both map to the same STR decoy chromosome to increase
+    # size estimates and calculate outliers
+    # Sum the counts over all loci for each repeat unit in each sample and count loci
+    # with same repeat unit
+    locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(['sum',
+        'count']).reset_index()
+    locus_totals = locuscov_data.merge(locus_totals, how = 'left').merge(STRcov_data, how = 'left')
+    # Assign decoy counts to each locus, based on what proportion of the counts for that
+    # repeat unit they already have
+    locus_totals['total_assigned'] = locus_totals['locuscoverage'] + locus_totals['decoycov_pairs'] * locus_totals['locuscoverage']/locus_totals['sum']
+    # NaNs introduced where count = 0, fill these with locuscoverage
+    locus_totals['total_assigned'].fillna(locus_totals['locuscoverage'], inplace=True)
+
+#XXX Uncomment if not implementing the above
+# locus_totals = locuscov_data.merge(STRcov_data, how = 'left')
+# locus_totals['total_assigned'] = locus_totals['locuscoverage']
+
+    locus_totals['locuscoverage_log'] = np.log2(factor * (locus_totals['locuscoverage'] + 1) / locus_totals['genomecov'])
     locus_totals['total_assigned_log'] = np.log2(factor * (locus_totals['total_assigned'] + 1) / locus_totals['genomecov'])
 
     # For each locus, calculate if that sample is an outlier relative to others
-    total_assigned_wide = locus_totals.pivot(index='locus', columns='sample', values='total_assigned_log')
+    locuscoverage_wide = locus_totals.pivot(index='locus', columns='sample', values='locuscoverage_log')
 
     # Calculate values for if there were zero reads at a locus in all samples
     null_locus_counts = np.log2(factor * (0 + 1) / genomecov_data['genomecov'])
@@ -258,7 +260,7 @@ def main():
 
     # Use Huber's M-estimator to calculate median and SD across all samples
     # for each locus
-    sample_estimates = total_assigned_wide.apply(hubers_est, axis=1)
+    sample_estimates = locuscoverage_wide.apply(hubers_est, axis=1)
 
     # Where sd is NA, replace with the minimum non-zero sd from all loci
     min_sd = np.min(sample_estimates['sd'][sample_estimates['sd'] > 0])
     sample_estimates['sd'].fillna(min_sd, inplace=True)
@@ -274,7 +276,7 @@ def main():
 
         sample_estimates.loc['null_locus_counts'] = null_locus_counts_est
 
-    n = len(total_assigned_wide.columns)
+    n = len(locuscoverage_wide.columns)
     sample_estimates['n'] = n
 
     sample_estimates.to_csv(emit_file, sep= '\t')
@@ -286,30 +288,30 @@ def main():
         control_estimates = parse_controls(control_file)
         # Get a list of all loci in the control file but not the sample data
         control_loci_df = control_estimates.iloc[control_estimates.index != 'null_locus_counts']
-        control_loci = [x for x in control_loci_df.index if x not in total_assigned_wide.index]
+        control_loci = [x for x in control_loci_df.index if x not in locuscoverage_wide.index]
 
         # Extract and order just those control estimates appearing in the current data
-        mu_sd_estimates = control_estimates.reindex(total_assigned_wide.index)
+        mu_sd_estimates = control_estimates.reindex(locuscoverage_wide.index)
         # Fill NaNs with null_locus_counts values
         mu_sd_estimates.fillna(control_estimates.loc['null_locus_counts'], inplace=True)
     else:
         # Extract and order estimates to match the current data
-        mu_sd_estimates = sample_estimates.reindex(total_assigned_wide.index)
+        mu_sd_estimates = sample_estimates.reindex(locuscoverage_wide.index)
 
     # calculate z scores
-    z = z_score(total_assigned_wide, mu_sd_estimates)
+    z = z_score(locuscoverage_wide, mu_sd_estimates)
 
     # If a control file is given, effectively add zero counts at all loci in
     # controls but not in the samples.
     # These extra rows will disappear due to a later merge
     if control_file != '':
-        # Create a total_assigned_wide as if all loci have zero counts
-        null_total_assigned_wide = pd.DataFrame(columns = sample_names, index = control_loci)
-        null_total_assigned_wide.fillna(null_locus_counts, inplace = True)
+        # Create a locuscoverage_wide as if all loci have zero counts
+        null_locuscoverage_wide = pd.DataFrame(columns = sample_names, index = control_loci)
+        null_locuscoverage_wide.fillna(null_locus_counts, inplace = True)
         # Calculate z scores
-        null_z = z_score(null_total_assigned_wide,
-            control_estimates.reindex(null_total_assigned_wide.index))
+        null_z = z_score(null_locuscoverage_wide,
+            control_estimates.reindex(null_locuscoverage_wide.index))
 
         loci_with_counts = z.index
         z = z.append(null_z)
@@ -365,14 +367,19 @@ def main():
     X_train = np.log2(STRcov_model['coverage_norm']).values.reshape(-1, 1)
     Y_train = np.log2(STRcov_model['allele2'])
     regr.fit(X_train, Y_train)
-    # Make a prediction
-    Y_pred = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
+    # Make a prediction based on locuscoverage
+    Y_pred = regr.predict(locus_totals['locuscoverage_log'].values.reshape(-1, 1))
     predict = np.power(2, Y_pred)
     locus_totals['bpInsertion'] = predict
+    # Make a prediction based on total_assigned
+    Y_pred_max = regr.predict(locus_totals['total_assigned_log'].values.reshape(-1, 1))
+    predict_max = np.power(2, Y_pred_max)
+    locus_totals['bpInsertion_max'] = predict_max
 
     # Get the estimated size in terms of repeat units (total, not relative to ref)
     repeatunit_lens = [len(x) for x in locus_totals['repeatunit']]
     locus_totals['repeatUnits'] = (locus_totals['bpInsertion']/repeatunit_lens) + locus_totals['reflen']
+    locus_totals['repeatUnits_max'] = (locus_totals['bpInsertion_max']/repeatunit_lens) + locus_totals['reflen']
 
     # Split locus into 3 columns: chrom start end
     locuscols = pd.DataFrame([x.split('-') for x in locus_totals['locus']],
@@ -384,7 +391,7 @@ def main():
                             'sample', 'repeatunit', 'reflen',
                             'locuscoverage', 'total_assigned',
                             'outlier', 'p_adj',
-                            'bpInsertion', 'repeatUnits'
+                            'bpInsertion', 'repeatUnits', 'repeatUnits_max'
                             ]]
 
     #sort by outlier score then estimated size (bpInsertion), both descending
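
The proportional allocation this patch switches on is easiest to see on toy
numbers. A self-contained sketch; column names follow estimateSTR.py, but every
count below is invented:

    import pandas as pd

    # Two AGC loci in one sample, plus 20 read pairs where both mates hit
    # the same STR-AGC decoy (the new decoycov_pairs column).
    locuscov_data = pd.DataFrame({
        'sample': ['s1', 's1'],
        'repeatunit': ['AGC', 'AGC'],
        'locus': ['chr13-70713515-70713561', 'chr1-100-150'],
        'locuscoverage': [30, 10]})
    STRcov_data = pd.DataFrame({
        'sample': ['s1'], 'repeatunit': ['AGC'], 'decoycov_pairs': [20]})

    # Sum counts over all loci sharing a repeat unit, then hand out the decoy
    # pairs in proportion to each locus's share of that sum.
    locus_totals = locuscov_data.groupby(['sample', 'repeatunit'])['locuscoverage'].agg(
        ['sum', 'count']).reset_index()
    locus_totals = locuscov_data.merge(locus_totals, how='left').merge(STRcov_data, how='left')
    locus_totals['total_assigned'] = (locus_totals['locuscoverage'] +
        locus_totals['decoycov_pairs'] * locus_totals['locuscoverage'] / locus_totals['sum'])
    # NaN where a repeat unit has no locus counts (0/0) or no decoy pairs:
    # fall back to locuscoverage alone.
    locus_totals['total_assigned'] = locus_totals['total_assigned'].fillna(
        locus_totals['locuscoverage'])

    print(locus_totals[['locus', 'locuscoverage', 'total_assigned']])
    # 30 -> 30 + 20*30/40 = 45.0 and 10 -> 10 + 20*10/40 = 15.0

The 20 pairs are split 15/5, so loci that already attract more reads absorb
more of the decoy signal. That is what makes total_assigned a maximum-style
size estimate, while locuscoverage remains the conservative count used for the
outlier statistics.
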
From dcb3b8b59dbcac468ec0e7473eb31cf15085fd91 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 1 Oct 2018 14:23:20 +1000
Subject: [PATCH 3/6] limit the number of decimal places reported in the final
 STRetch results

---
 scripts/estimateSTR.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py
index ba6ee6e..f693463 100644
--- a/scripts/estimateSTR.py
+++ b/scripts/estimateSTR.py
@@ -397,6 +397,11 @@ def main():
 
     #sort by outlier score then estimated size (bpInsertion), both descending
     write_data = write_data.sort_values(['outlier', 'bpInsertion'], ascending=[False, False]) #XXX check for duplicate rows?
+    # Convert outlier and p_adj to numeric type and do some rounding/formatting
+    write_data['outlier'] = pd.to_numeric(write_data['outlier'])
+    write_data['p_adj'] = [ format(x, '.2g') for x in pd.to_numeric(write_data['p_adj']) ]
+    write_data = write_data.round({'total_assigned': 1, 'outlier': 1,
+        'bpInsertion': 1, 'repeatUnits': 1, 'repeatUnits_max': 1})
 
     # Write individual files for each sample, remove rows where locuscoverage == 0
     samples = set(write_data['sample'])
From 54a079fb88c9283827512954d0e6d686f93a0211 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow
Date: Mon, 1 Oct 2018 14:34:55 +1000
Subject: [PATCH 4/6] update test data reference output file for Travis

---
 .testing/STRs.benchmark.tsv | 12 ++++++------
 .travis.yml                 |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.testing/STRs.benchmark.tsv b/.testing/STRs.benchmark.tsv
index 2b0f96b..3b471a6 100644
--- a/.testing/STRs.benchmark.tsv
+++ b/.testing/STRs.benchmark.tsv
@@ -1,6 +1,6 @@
-chrom	start	end	sample	repeatunit	reflen	locuscoverage	outlier	p_adj	bpInsertion	repeatUnits
-chr13	70713515	70713561	11	AGC	15.3	35	1.88685648944145	0.0295898153640992	316.950118548597	120.950039516199
-chr13	70713515	70713561	69	AGC	15.3	8	0.4297790752454	0.333678177749521	76.1210174457791	40.6736724819264
-chr13	70713515	70713561	1	AGC	15.3	3	-0.426744171073212	0.665217162914456	32.9115187238725	26.2705062412908
-chr13	70713515	70713561	54	AGC	15.3	2	-0.730726313557163	0.7675268301731	24.4403744973217	23.4467914991072
-chr13	70713515	70713561	49	AGC	15.3	1	-1.15916508005647	0.876805548823274	16.0677184130193	20.6559061376731
+chrom	start	end	sample	repeatunit	reflen	locuscoverage	total_assigned	outlier	p_adj	bpInsertion	repeatUnits	repeatUnits_max
+chr13	70713515	70713561	11_L001_R1	AGC	15.3	35	52.0	1.9	0.03	317.0	121.0	172.9
+chr13	70713515	70713561	69_L001_R1	AGC	15.3	8	8.0	0.4	0.33	76.1	40.7	40.7
+chr13	70713515	70713561	1_L001_R1	AGC	15.3	3	3.0	-0.4	0.67	32.9	26.3	26.3
+chr13	70713515	70713561	54_L001_R1	AGC	15.3	2	2.0	-0.7	0.77	24.4	23.4	23.4
+chr13	70713515	70713561	49_L001_R1	AGC	15.3	1	1.0	-1.2	0.88	16.1	20.7	20.7
diff --git a/.travis.yml b/.travis.yml
index 6e58058..6c33799 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,7 @@ script:
 # Run the test data
 - ../tools/bin/bpipe run ../pipelines/STRetch_exome_pipeline.groovy ../test-data/*.fastq.gz
 - if diff STRs.tsv ../.testing/STRs.benchmark.tsv; then echo exit 0; else echo exit 1; fi
+#- diff STRs.tsv ../.testing/STRs.benchmark.tsv
 after_script:
 - head *.locus_counts *.STR_counts *.median_cov
 - head *.tsv
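
The rounding added in PATCH 3 is what turns the long floats in the old
benchmark rows into the one-decimal values in the new ones. A standalone
illustration using the first data row of the old STRs.benchmark.tsv:

    import pandas as pd

    # Unrounded values from the old benchmark TSV (first data row)
    write_data = pd.DataFrame({
        'outlier': ['1.88685648944145'],
        'p_adj': ['0.0295898153640992'],
        'bpInsertion': [316.950118548597],
        'repeatUnits': [120.950039516199]})

    # p_adj keeps two significant figures since it spans orders of magnitude;
    # the remaining numeric columns are rounded to one decimal place.
    write_data['outlier'] = pd.to_numeric(write_data['outlier'])
    write_data['p_adj'] = [format(x, '.2g') for x in pd.to_numeric(write_data['p_adj'])]
    write_data = write_data.round({'outlier': 1, 'bpInsertion': 1, 'repeatUnits': 1})

    print(write_data)  # 1.9  0.03  317.0  121.0, matching the new benchmark row
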
From 6e26e955823c9c89c7bb2c947a70098cd1c1d54a Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Tue, 23 Oct 2018 14:35:43 +1100 Subject: [PATCH 5/6] set sample name differently depending on number of underscores in the fastq filename --- pipelines/pipeline_stages.groovy | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/pipeline_stages.groovy b/pipelines/pipeline_stages.groovy index eabe472..a37ab77 100644 --- a/pipelines/pipeline_stages.groovy +++ b/pipelines/pipeline_stages.groovy @@ -55,9 +55,13 @@ set_sample_info = { // sample_flowcell_library_lane_read.fastq.gz // however sample can also include underscores def info = get_info(input) - if (info.length >= 5) { + if (info.length == 2) { - //branch.sample = info[0..-5].join("_") + branch.sample = info[0..-2].join("_") + } + if (info.length >= 5) { + + branch.sample = info[0..-5].join("_") branch.flowcell = info[-4] branch.library = info[-3] branch.lane = info[-2] From 2e939ed2ca8f0fb408a3f1edb3c58fe7f0ef7f1a Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Thu, 14 Feb 2019 11:35:22 +1100 Subject: [PATCH 6/6] report in STRetch output: locuscoverage_log, total_assigned_log, genomecov --- scripts/estimateSTR.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/estimateSTR.py b/scripts/estimateSTR.py index f693463..70d3f37 100644 --- a/scripts/estimateSTR.py +++ b/scripts/estimateSTR.py @@ -390,6 +390,7 @@ def main(): write_data = locus_totals[['chrom', 'start', 'end', 'sample', 'repeatunit', 'reflen', 'locuscoverage', 'total_assigned', + 'locuscoverage_log', 'total_assigned_log', 'genomecov', 'outlier', 'p_adj', 'bpInsertion', 'repeatUnits', 'repeatUnits_max' ]]
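
For reference, the sample-naming rule that PATCH 5 implements in Groovy,
rendered in Python for illustration. It assumes, per the Groovy comment, that
fastq names look like sample_flowcell_library_lane_read.fastq.gz with possible
underscores inside the sample name; the filenames below are invented, and the
fall-through behaviour for other field counts (which would leave names like
11_L001_R1 from the PATCH 4 benchmark intact) is an inference, not something
the patch itself confirms:

    def sample_name(fastq_filename):
        stem = fastq_filename.split('.')[0]
        info = stem.split('_')
        if len(info) == 2:
            return '_'.join(info[:-1])   # Groovy info[0..-2]: drop the last field
        if len(info) >= 5:
            return '_'.join(info[:-4])   # Groovy info[0..-5]: drop flowcell/library/lane/read
        return stem                      # other shapes: leave the name as-is

    assert sample_name('mysample_R1.fastq.gz') == 'mysample'
    assert sample_name('my_sample_FLOW1_LIB1_L001_R1.fastq.gz') == 'my_sample'
    assert sample_name('11_L001_R1.fastq.gz') == '11_L001_R1'

Groovy's inclusive range info[0..-5] corresponds to Python's info[:-4]; both
keep everything before the trailing four filename fields.
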