From fca7e7de2846f2c7a99d41b96d554813a1890e7f Mon Sep 17 00:00:00 2001 From: Greg Pinero Date: Fri, 5 Jul 2013 11:37:40 -0400 Subject: [PATCH] Update grepfqparser.py Fixed grep to remove -- lines and removed extra code. --- grepfqparser.py | 52 +++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/grepfqparser.py b/grepfqparser.py index 1922790..34e6908 100644 --- a/grepfqparser.py +++ b/grepfqparser.py @@ -43,21 +43,24 @@ def main(): #parse command line options try: - opts, arg = getopt.getopt(sys.argv[1:],"h", ["help"]) + opts, arg = getopt.getopt(sys.argv[1:],"ht:", ["help"]) except getopt.error, msg: print msg print "for help use --help" sys.exit(2) # process options + offset = 0 for o, a in opts: + if o == '-t': + offset = int(a) + print "using offset", offset if o in ("-h", "--help"): print __doc__ sys.exit(0) if len(arg) < 3: - print "\nUsage: python grepfqparser.py \m" + print "\nUsage: python grepfqparser.py [options] \m" sys.exit(0) #process arguments - fqFile = arg[0] bcFile = arg[1] OutFolder = arg[2] @@ -83,7 +86,7 @@ def main(): else: gzbool = "NO" - print "fq file gzipped? = %s" %(gzbool) + print "fq file gzipped? = %s" %(gzbool) if gzbool == "YES": print "unzipping file" tempfq = open("tempfq",'w') @@ -101,31 +104,18 @@ def main(): barcode_up = barcode.upper() name = lineItems[1] print barcode - - parsed_file_step1_name = str(OutFolder + "/indiv" + name + "_" + barcode + "firstgrep") - parsed_file_step1 = open(parsed_file_step1_name,'w') - errlog = open("errlog2",'w') - cmd = 'grep -B 1 -A 2 ^%s %s' % (barcode_up, fqFile) - subprocess.call(cmd, shell=True,stdout=parsed_file_step1, stderr=errlog) - #(note: pipe into sed to remove barcodes and associated quality scores from each line) - cmd = "grep -B 1 -A 2 ^%s %s | sed '2~2s/^%s//g'" % (barcode_up, fqFile, '.'*len(barcode_up)) - subprocess.call(cmd, shell=True,stdout=parsed_file_step1, stderr=errlog) - errlog.close() - parsed_file_step1.close() - - """grep with -B and -A produces spacer marks '--' in file. Cannot figure out how to suppress these, - so remove and paste into new file, then delete original""" - parsed_file_name = str(OutFolder + "/indiv" + name + "_" + barcode) parsed_file = open(parsed_file_name,'w') - errlog = open("errlog3",'w') - cmd = 'awk "!/^--$/" %s' % (parsed_file_step1_name) - subprocess.check_call(cmd,shell=True,stdout=parsed_file,stderr=errlog) - errlog.close() - parsed_file.close() - cmd = 'rm %s' % (parsed_file_step1_name) - subprocess.check_call(cmd,shell=True) - + errlog = open("errlog2",'w') + #First grep finds lines starting with the barcode and includes the line above, and two lines below each match + #Pipe into grep again to filter out -- between matches which some (versions??) of grep insert + #Pipe into sed to remove barcodes (optional offset) and associated quality scores from each line + cmd = """grep -B 1 -A 2 ^%s %s | grep -v "^--$" | sed '2~2s/^%s//g'""" % (barcode_up, fqFile, '.'*(len(barcode_up)+offset)) + try: + failed = subprocess.call(cmd, shell=True,stdout=parsed_file, stderr=errlog) + finally: + errlog.close() + parsed_file.close() bc.close() """Now collect all unparsed reads""" @@ -146,9 +136,11 @@ def main(): nomatch_file = open(OutFolder + "/nomatches",'w') errlog = open("errlog4",'w') cmd = "awk 'NR%%4==2' %s | grep -f %s -v" % (fqFile, bconly) - nomatch = subprocess.call(cmd, shell=True, stdout=nomatch_file,stderr=errlog) - errlog.close() - nomatch_file.close() + try: + nomatch = subprocess.call(cmd, shell=True, stdout=nomatch_file,stderr=errlog) + finally: + errlog.close() + nomatch_file.close() """delete tempfq, the gunzipped original file""" cmd = 'rm tempfq bcOnly errlog1 errlog2 errlog3 errlog4'