diff --git a/NeoPredPipe.py b/NeoPredPipe.py index ec3ec2c..69a1de3 100644 --- a/NeoPredPipe.py +++ b/NeoPredPipe.py @@ -10,48 +10,62 @@ import glob import argparse try: - import ConfigParser as configparser # for python 2 + import ConfigParser as configparser # for python 2 except: - import configparser # for python 3 + import configparser # for python 3 import shutil from collections import Counter -from vcf_manipulate import convert_to_annovar, annovar_annotation, get_coding_change,\ +from vcf_manipulate import convert_to_annovar, annovar_annotation, get_coding_change, \ ReformatFasta, MakeTempFastas -from predict_binding import predict_neoantigens +# from predict_binding import predict_neoantigens +from predict_binding import predict_neoantigens, predict_neoantigens2 from hla_preprocess import ConstructAlleles, ConstructAlleles_typeII, composeHLAFile, composeHLA2File, readInHLAwinners from postprocessing import DigestIndSample, AppendDigestedEps from process_expression import GetExpressionFiles +from plugins import mergeMTWT, mergeMTWT_class2 + def Parser(): # get user variables parser = argparse.ArgumentParser() parser.add_argument("-E", "--epitopes", dest="epitopes", nargs='+', type=int, default=[8, 9, 10], help="Epitope lengths for predictions. Default: 8 9 10") - parser.add_argument("-l", dest="cleanLog", default=True, action='store_false', help="Specifies whether to delete the ANNOVAR log file. Default: True. Note: Use for debugging.") - parser.add_argument("-d", dest="deleteInt", default=True, action='store_false', help="Specifies whether to delete intermediate files created by program. Default: True. Note: Set flag to resume job.") - parser.add_argument("-r", "--cleanrun", dest="makeitclean", default=False, action='store_true', help="Specify this alone with no other options to clean-up a run. 
Be careful that you mean to do this!!") - parser.add_argument('-p', "--preponly", dest="preponly", default=False, action='store_true',help="Prep files only without running neoantigen predictions. The prediction step takes the most time.") - parser.add_argument("--manualproc", dest="manualproc", default=False, action='store_true',help="Process vcf files into annovar-input format manually, to avoid issues from non 'genotype-calling' formats.") + parser.add_argument("-l", dest="cleanLog", default=True, action='store_false', + help="Specifies whether to delete the ANNOVAR log file. Default: True. Note: Use for debugging.") + parser.add_argument("-d", dest="deleteInt", default=True, action='store_false', + help="Specifies whether to delete intermediate files created by program. Default: True. Note: Set flag to resume job.") + parser.add_argument("-r", "--cleanrun", dest="makeitclean", default=False, action='store_true', + help="Specify this alone with no other options to clean-up a run. Be careful that you mean to do this!!") + parser.add_argument('-p', "--preponly", dest="preponly", default=False, action='store_true', + help="Prep files only without running neoantigen predictions. The prediction step takes the most time.") + parser.add_argument("--manualproc", dest="manualproc", default=False, action='store_true', + help="Process vcf files into annovar-input format manually, to avoid issues from non 'genotype-calling' formats.") parser.add_argument("--EL", dest="ELpred", default=False, action='store_true', - help="Flag to perform netMHCpan predictions with Eluted Ligand option (without the -BA flag). Please note that the output will NOT be compatible with downstream Recognition Potential analysis. Default=False (BA predictions)") + help="Flag to perform netMHCpan predictions with Eluted Ligand option (without the -BA flag). Please note that the output will NOT be compatible with downstream Recognition Potential analysis. 
Default=False (BA predictions)") requiredNamed = parser.add_argument_group('Required arguments') requiredNamed.add_argument("-I", dest="vcfdir", default=None, type=str, help="Input vcf file directory location. Example: -I ./Example/input_vcfs/") - requiredNamed.add_argument("-H", dest="hlafile", default=None, type=str, help="HLA file for vcf patient samples OR directory with patient-specific directories from running POLYSOLVER (see Readme).") - requiredNamed.add_argument("-o", dest="OutputDir", default=None, type=str, help="Output Directory Path") - requiredNamed.add_argument("-n", dest="outName", default="AllSamples", type=str, help="Name of the output file for neoantigen predictions") + requiredNamed.add_argument("-H", dest="hlafile", default=None, type=str, + help="HLA file for vcf patient samples OR directory with patient-specific directories from running POLYSOLVER (see Readme).") + requiredNamed.add_argument( + "-o", dest="OutputDir", default=None, type=str, help="Output Directory Path") + requiredNamed.add_argument("-n", dest="outName", default="AllSamples", + type=str, help="Name of the output file for neoantigen predictions") postProcess = parser.add_argument_group('Post Processing Options') - postProcess.add_argument("-pp", dest="postprocess", default=True, action='store_false', help="Flag to perform post processing. Default=True.") + postProcess.add_argument("-pp", dest="postprocess", default=True, + action='store_false', help="Flag to perform post processing. Default=True.") postProcess.add_argument("-c", dest="colRegions", default=None, nargs='+', - help="Columns of regions within vcf that are not normal within a multiregion vcf file after the format field. Example: 0 is normal in test samples, tumor are the other columns. 
Program can handle different number of regions per vcf file.") - postProcess.add_argument("-a", dest="includeall", default=False, action='store_true', help="Flag to not filter neoantigen predictions and keep all regardless of prediction value.") + help="Columns of regions within vcf that are not normal within a multiregion vcf file after the format field. Example: 0 is normal in test samples, tumor are the other columns. Program can handle different number of regions per vcf file.") + postProcess.add_argument("-a", dest="includeall", default=False, action='store_true', + help="Flag to not filter neoantigen predictions and keep all regardless of prediction value.") postProcess.add_argument("-m", dest="checkPeptides", default=False, action='store_true', help="Specifies whether to perform check if predicted epitopes match any normal peptide. If set to True, output is added as a column to neoantigens file. Requires PeptideMatch specified in usr_paths.ini. Default=False") postProcess.add_argument("-x", "--expression", dest="expression", default=None, type=str, - help="RNAseq expression quantification file(s), if specified, expression information is added to output tables.") + help="RNAseq expression quantification file(s), if specified, expression information is added to output tables.") postProcess.add_argument("--expmulti", dest="expMultiregion", default=False, action='store_true', - help="Flag to specify if expression file(s) has information on multiple regions in multiple columns. Default=False.") - postProcess.add_argument("-t", dest="buildSumTable", default=True, action='store_false', help="Flag to turn off a neoantigen burden summary table. Default=True.") + help="Flag to specify if expression file(s) has information on multiple regions in multiple columns. Default=False.") + postProcess.add_argument("-t", dest="buildSumTable", default=True, action='store_false', + help="Flag to turn off a neoantigen burden summary table. 
Default=True.") Options = parser.parse_args() # main user args Options.typeII = False @@ -59,9 +73,11 @@ def Parser(): CleanUp(Options) sys.exit("Process Complete") if not Options.vcfdir or not Options.hlafile or not Options.OutputDir: - parser.error("Some of the required arguments were not provided. Please check required arguments.") + parser.error( + "Some of the required arguments were not provided. Please check required arguments.") + + return (Options) - return(Options) def ConfigSectionMap(section, Config): dict1 = {} @@ -76,6 +92,7 @@ def ConfigSectionMap(section, Config): dict1[option] = None return dict1 + class Sample(): ''' Use this to run and execute the pipeline on an individual patient vcf file. @@ -85,53 +102,71 @@ def __init__(self, FilePath, patID, vcfFile, hla, annovar, netmhcpan, pepmatchPa self.patID = patID self.vcfFile = vcfFile self.hla = hla - self.avReadyFile=None - self.annotationReady=None - self.fastaChange=None - self.fastaChangeFormat=None - self.peptideFastas = None # Will be a dictionary of tmp files for predictions + self.avReadyFile = None + self.annotationReady = None + self.fastaChange = None + self.fastaChangeFormat = None + self.peptideFastas = None # Will be a dictionary of tmp files for predictions self.epcalls = None self.digestedEpitopes = None self.appendedEpitopes = None self.regionsPresent = None + # add for WT + self.peptideFastasWT = None + self.epcallsWT = None + self.digestedEpitopesWT = None + self.appendedEpitopesWT = None + self.ProcessAnnovar(FilePath, annovar, Options) if Options.typeII: - self.hlasnormed = ConstructAlleles_typeII(self.hla, FilePath, self.patID) + self.hlasnormed = ConstructAlleles_typeII( + self.hla, FilePath, self.patID) else: self.hlasnormed = ConstructAlleles(self.hla, FilePath, self.patID) if Options.preponly: - print("INFO: Input files prepared and completed for %s" % (self.patID)) + print("INFO: Input files prepared and completed for %s" % + (self.patID)) else: 
self.callNeoantigens(FilePath, netmhcpan, Options) if Options.postprocess: self.digestIndSample(FilePath, pepmatchPaths, Options) + self.digestIndSampleWT(FilePath, pepmatchPaths, Options) def ProcessAnnovar(self, FilePath, annovar, Options): # Prepare ANNOVAR input files if os.path.isfile(Options.OutputDir+"avready/"+self.patID+'.avinput'): - print("INFO: ANNOVAR Ready files for %s already present."%(self.patID)) + print("INFO: ANNOVAR Ready files for %s already present." % + (self.patID)) self.avReadyFile = Options.OutputDir+"avready/"+self.patID+'.avinput' else: - self.avReadyFile = convert_to_annovar(Options.OutputDir, self.patID, self.vcfFile, annovar, Options.manualproc) + self.avReadyFile = convert_to_annovar( + Options.OutputDir, self.patID, self.vcfFile, annovar, Options.manualproc) # Prepare ANNOVAR annotated input files if os.path.isfile(Options.OutputDir+"avannotated/"+self.patID+'.avannotated.exonic_variant_function'): - print("INFO: ANNOVAR Annotation files for %s already present." % (self.patID)) - self.annotationReady = Options.OutputDir+"avannotated/"+self.patID+'.avannotated.exonic_variant_function' + print("INFO: ANNOVAR Annotation files for %s already present." % + (self.patID)) + self.annotationReady = Options.OutputDir+"avannotated/" + \ + self.patID+'.avannotated.exonic_variant_function' else: - self.annotationReady = annovar_annotation(Options.OutputDir, self.patID, self.avReadyFile, annovar) + self.annotationReady = annovar_annotation( + Options.OutputDir, self.patID, self.avReadyFile, annovar) # Get Coding Change if os.path.isfile(Options.OutputDir+"fastaFiles/"+self.patID+'.fasta'): - print("INFO: Coding change fasta files for %s already present." % (self.patID)) - self.fastaChange = Options.OutputDir+"fastaFiles/"+self.patID+'.fasta' + print("INFO: Coding change fasta files for %s already present." 
% + (self.patID)) + self.fastaChange = Options.OutputDir+"fastaFiles/"+self.patID+'.fasta' else: - self.fastaChange = get_coding_change(Options.OutputDir, self.patID, self.annotationReady, annovar) + self.fastaChange = get_coding_change( + Options.OutputDir, self.patID, self.annotationReady, annovar) if os.path.isfile(Options.OutputDir+"fastaFiles/"+self.patID+'.reformat.fasta'): - print("INFO: Coding change fasta files %s has already been reformatted." % (self.patID)) - self.fastaChangeFormat = Options.OutputDir+"fastaFiles/"+self.patID+'.reformat.fasta' + print("INFO: Coding change fasta files %s has already been reformatted." % ( + self.patID)) + self.fastaChangeFormat = Options.OutputDir + \ + "fastaFiles/"+self.patID+'.reformat.fasta' else: self.fastaChangeFormat = ReformatFasta(self.fastaChange) @@ -140,55 +175,101 @@ def callNeoantigens(self, FilePath, netmhcpan, Options): i = 0 pepTmp = {} for n in Options.epitopes: - if os.path.isfile(Options.OutputDir+"fastaFiles/%s.tmp.%s.fasta"%(self.patID,n)) and os.path.isfile("fastaFiles/%s.tmp.%s.fasta"%(self.patID,str(n)+'.Indels')): - pepTmp.update({n:Options.OutputDir+"fastaFiles/%s.tmp.%s.fasta"%(self.patID,n)}) - pepTmp.update({str(n)+'.Indels':Options.OutputDir+"fastaFiles/%s.tmp.%s.fasta"%(self.patID,str(n)+'.Indels')}) - print("INFO: Tmp fasta files %s has already been created for netMHCpan length %s." % (self.patID,n)) - i+=1 + if os.path.isfile(Options.OutputDir+"fastaFiles/%s.tmp.%s.fasta" % (self.patID, n)) and os.path.isfile("fastaFiles/%s.tmp.%s.fasta" % (self.patID, str(n)+'.Indels')): + pepTmp.update( + {n: Options.OutputDir+"fastaFiles/%s.tmp.%s.fasta" % (self.patID, n)}) + pepTmp.update({str(n)+'.Indels': Options.OutputDir + + "fastaFiles/%s.tmp.%s.fasta" % (self.patID, str(n)+'.Indels')}) + print("INFO: Tmp fasta files %s has already been created for netMHCpan length %s." 
% ( + self.patID, n)) + i += 1 if i == len(Options.epitopes): self.peptideFastas = pepTmp if i != len(Options.epitopes): - self.peptideFastas = MakeTempFastas(self.fastaChangeFormat, Options.epitopes) + # self.peptideFastas = MakeTempFastas(self.fastaChangeFormat, Options.epitopes) + self.peptideFastas, self.peptideFastasWT = MakeTempFastas( + self.fastaChangeFormat, Options.epitopes) # Predict neoantigens i = 0 epTmp = [] for n in Options.epitopes: - if os.path.isfile(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID,n)): - epTmp.append(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID,n)) + if os.path.isfile(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID, n)): + epTmp.append(Options.OutputDir + + "tmp/%s.epitopes.%s.txt" % (self.patID, n)) i += 1 - print("INFO: Epitope prediction files %s have already been created for netMHCpan length %s." % (self.patID,n)) - if os.path.isfile(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID,str(n)+'.Indels')): - epTmp.append(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID,str(n)+'.Indels')) + print("INFO: Epitope prediction files %s have already been created for netMHCpan length %s." % ( + self.patID, n)) + if os.path.isfile(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % (self.patID, str(n)+'.Indels')): + epTmp.append(Options.OutputDir+"tmp/%s.epitopes.%s.txt" % + (self.patID, str(n)+'.Indels')) i += 1 - print("INFO: Epitope prediction files %s have already been created for netMHCpan length %s." % (self.patID,str(n)+'.Indels')) + print("INFO: Epitope prediction files %s have already been created for netMHCpan length %s." 
% ( + self.patID, str(n)+'.Indels')) if i == 2*len(Options.epitopes): self.epcalls = epTmp - if i!=2*len(Options.epitopes): - if i>0: - os.system("rm "+Options.OutputDir+"tmp/"+self.patID+".epitopes.*.txt") # if doing predictions, remove existing files to ensure double predicting happens - self.epcalls = predict_neoantigens(Options.OutputDir, self.patID, self.peptideFastas, self.hlasnormed , Options.epitopes, netmhcpan, Options) + if i != 2*len(Options.epitopes): + if i > 0: + # if doing predictions, remove existing files to ensure double predicting happens + os.system("rm "+Options.OutputDir+"tmp/" + + self.patID+".epitopes.*.txt") + # self.epcalls = predict_neoantigens(Options.OutputDir, self.patID, self.peptideFastas, self.hlasnormed , Options.epitopes, netmhcpan, Options) + self.epcalls = predict_neoantigens2( + Options.OutputDir, self.patID, self.peptideFastas, self.hlasnormed, Options.epitopes, netmhcpan, Options, 'MT') + self.epcallsWT = predict_neoantigens2( + Options.OutputDir, self.patID, self.peptideFastasWT, self.hlasnormed, Options.epitopes, netmhcpan, Options, 'WT') def digestIndSample(self, FilePath, pmPaths, Options): if self.epcalls != []: - toDigestSNVs = filter(lambda y: 'Indels.txt' not in y, self.epcalls) + toDigestSNVs = filter( + lambda y: 'Indels.txt' not in y, self.epcalls) toDigestIndels = filter(lambda y: 'Indels.txt' in y, self.epcalls) if toDigestSNVs != []: - self.digestedEpitopes = DigestIndSample(toDigestSNVs, self.patID, Options, pmPaths) - self.appendedEpitopes, self.regionsPresent = AppendDigestedEps(FilePath, self.digestedEpitopes, self.patID, self.annotationReady, self.avReadyFile, Options) + self.digestedEpitopes = DigestIndSample( + toDigestSNVs, self.patID, Options, pmPaths) + # self.appendedEpitopes, self.regionsPresent = AppendDigestedEps(FilePath, self.digestedEpitopes, self.patID, self.annotationReady, self.avReadyFile, Options) + self.appendedEpitopesMT, self.regionsPresentMT = AppendDigestedEps( + FilePath, 
self.digestedEpitopes, self.patID, self.annotationReady, self.avReadyFile, Options) else: - self.appendedEpitopes = None - self.regionsPresent = None + # self.appendedEpitopes = None + # self.regionsPresent = None + self.appendedEpitopesMT = None + self.regionsPresentMT = None if toDigestIndels != []: - self.digestedEpitopesIndels = DigestIndSample(toDigestIndels, self.patID, Options, pmPaths, True) - self.appendedEpitopesIndels, self.regionsPresentIndels = AppendDigestedEps(FilePath, self.digestedEpitopesIndels, self.patID, self.annotationReady, self.avReadyFile, Options) + self.digestedEpitopesIndels = DigestIndSample( + toDigestIndels, self.patID, Options, pmPaths, True) + self.appendedEpitopesIndels, self.regionsPresentIndels = AppendDigestedEps( + FilePath, self.digestedEpitopesIndels, self.patID, self.annotationReady, self.avReadyFile, Options) else: self.appendedEpitopesIndels = None self.regionsPresentIndels = None + def digestIndSampleWT(self, FilePath, pmPaths, Options): + """ + 说明: + Indel 的位点,没有野生型 + + Args: + FilePath (_type_): _description_ + pmPaths (_type_): _description_ + Options (_type_): _description_ + """ + if self.epcalls != []: + toDigestSNVs = filter( + lambda y: 'Indels.txt' not in y, self.epcallsWT) + if toDigestSNVs != []: + self.digestedEpitopesWT = DigestIndSample( + toDigestSNVs, self.patID, Options, pmPaths) + self.appendedEpitopesWT, self.regionsPresentWT = AppendDigestedEps( + FilePath, self.digestedEpitopesWT, self.patID, self.annotationReady, self.avReadyFile, Options) + else: + self.appendedEpitopesWT = None + self.regionsPresentWT = None + + def PrepClasses(FilePath, Options): if Options.vcfdir[len(Options.vcfdir)-1] != "/": - Options.vcfdir+="/" + Options.vcfdir += "/" getFiles = Options.vcfdir + "*.vcf" allFiles = glob.glob(getFiles) if all([os.path.isfile(f) for f in allFiles]): @@ -200,7 +281,7 @@ def PrepClasses(FilePath, Options): pass else: sys.exit("Unable to locate vcf files.") - + hlas = dict() if 
os.path.isfile(Options.hlafile): with open(Options.hlafile, 'r') as hlaFile: @@ -220,10 +301,10 @@ def PrepClasses(FilePath, Options): if not os.path.exists(Options.OutputDir): os.mkdir(Options.OutputDir) - + try: os.mkdir(Options.OutputDir+'avready') - + except OSError as e: print("INFO: Proper directory already exists. Continue.") @@ -242,30 +323,36 @@ def PrepClasses(FilePath, Options): except OSError as e: print("INFO: Proper directory already exists. Continue.") - return(allFiles, hlas) + return (allFiles, hlas) + -def FinalOut(sampleClasses, Options, indelProcess=False): +def FinalOut(sampleClasses, Options, intype, indelProcess=False): + assert intype in ['MT', 'WT'] if indelProcess: - epitopesToProcess = 'appendedEpitopesIndels' #Process this special set of predicted epitopes + # Process this special set of predicted epitopes + epitopesToProcess = 'appendedEpitopesIndels' regionsToProcess = 'regionsPresentIndels' filePostFix = '.neoantigens.Indels' else: - epitopesToProcess = 'appendedEpitopes' - regionsToProcess = 'regionsPresent' + epitopesToProcess = 'appendedEpitopes' + intype + regionsToProcess = 'regionsPresent' + intype filePostFix = '.neoantigens' if Options.includeall: - outFile = Options.OutputDir + Options.outName + filePostFix + ".unfiltered.txt" + outFile = Options.OutputDir + Options.outName + \ + '.' + intype + filePostFix + ".unfiltered.txt" else: - outFile = Options.OutputDir + Options.outName + filePostFix + ".txt" + outFile = Options.OutputDir + Options.outName + \ + '.' + intype + filePostFix + ".txt" - outTable = Options.OutputDir + Options.outName + filePostFix + ".summarytable.txt" + outTable = Options.OutputDir + Options.outName + \ + '.' 
+ intype + filePostFix + ".summarytable.txt" summaryTable = [] with open(outFile, 'w') as pentultimateFile: - if Options.includeall==True: + if Options.includeall == True: for i in range(0, len(sampleClasses)): - appendedEps = getattr(sampleClasses[i],epitopesToProcess) + appendedEps = getattr(sampleClasses[i], epitopesToProcess) if appendedEps is not None: pentultimateFile.write('\n'.join(appendedEps) + '\n') @@ -275,37 +362,37 @@ def FinalOut(sampleClasses, Options, indelProcess=False): summaryTable.append(line) else: for i in range(0, len(sampleClasses)): - appendedEps = getattr(sampleClasses[i],epitopesToProcess) + appendedEps = getattr(sampleClasses[i], epitopesToProcess) if appendedEps is not None: for line in appendedEps: if '<=' in line: pentultimateFile.write(line+"\n") summaryTable.append(line) - + # 统计计数 summaries = {} for z in range(0, len(sampleClasses)): - appendedEps = getattr(sampleClasses[z],epitopesToProcess) + appendedEps = getattr(sampleClasses[z], epitopesToProcess) if appendedEps is not None: - total_count=0 - wbind=0 - sbind=0 + total_count = 0 + wbind = 0 + sbind = 0 if Options.colRegions is not None: # Prep counts for multiregion data - region_count=[0 for k in Options.colRegions*3] + region_count = [0 for k in Options.colRegions*3] # Final Counters for each subtype of neoantigen - clonal=0 + clonal = 0 Wclonal = 0 Sclonal = 0 - subclonal=0 + subclonal = 0 Wsubclonal = 0 - Ssubclonal=0 - shared=0 + Ssubclonal = 0 + shared = 0 Wshared = 0 Sshared = 0 - regionsPesent = getattr(sampleClasses[z],regionsToProcess) + regionsPesent = getattr(sampleClasses[z], regionsToProcess) overallRegions = Counter(regionsPesent) for line in appendedEps: @@ -314,35 +401,38 @@ def FinalOut(sampleClasses, Options, indelProcess=False): rw = 0 rs = 0 - if '<=' in line and not (Options.checkPeptides and line[-1]=='0'): - total_count+=1 + if '<=' in line and not (Options.checkPeptides and line[-1] == '0'): + total_count += 1 if Options.colRegions is not None: - 
regions = [int(g) for g in line.split('\t')[1:len(Options.colRegions) + 1]] + regions = [int(g) for g in line.split('\t')[ + 1:len(Options.colRegions) + 1]] - for s in range(0,len(Options.colRegions)): + for s in range(0, len(Options.colRegions)): if regions[s] > 0: - region_count[s*3]+=1 - r +=1 + region_count[s*3] += 1 + r += 1 if "<=\tWB" in line or "<=WB" in line: - wbind +=1 + wbind += 1 if Options.colRegions is not None: - regions = [int(g) for g in line.split('\t')[1:len(Options.colRegions) + 1]] + regions = [int(g) for g in line.split('\t')[ + 1:len(Options.colRegions) + 1]] for s in range(0, len(Options.colRegions)): if regions[s] > 0: region_count[(s * 3)+1] += 1 - rw +=1 + rw += 1 elif "<=\tSB" in line or "<=SB" in line: - sbind+=1 + sbind += 1 if Options.colRegions is not None: - regions = [int(g) for g in line.split('\t')[1:len(Options.colRegions) + 1]] + regions = [int(g) for g in line.split('\t')[ + 1:len(Options.colRegions) + 1]] for s in range(0, len(Options.colRegions)): if regions[s] > 0: region_count[(s * 3)+2] += 1 - rs+=1 + rs += 1 if Options.colRegions is not None: if r == overallRegions['+']: @@ -351,7 +441,7 @@ def FinalOut(sampleClasses, Options, indelProcess=False): subclonal += 1 elif r > 1 and overallRegions['+'] > 2: shared += 1 - elif r==0: + elif r == 0: pass else: sys.exit('Error with mutliregion counter.') @@ -362,7 +452,7 @@ def FinalOut(sampleClasses, Options, indelProcess=False): Wsubclonal += 1 elif rw > 1 and overallRegions['+'] > 2: Wshared += 1 - elif rw==0: + elif rw == 0: pass else: sys.exit('Error with mutliregion counter.') @@ -373,48 +463,54 @@ def FinalOut(sampleClasses, Options, indelProcess=False): Ssubclonal += 1 elif rs > 1 and overallRegions['+'] > 2: Sshared += 1 - elif rs==0: + elif rs == 0: pass else: sys.exit('Error with mutliregion counter.') if Options.colRegions is not None: - summaries.update({sampleClasses[z].patID:{'Total':total_count,'WB':wbind,'SB':sbind, - 'Regions':region_count, 'Clonal':clonal, 
'Subclonal':subclonal, 'Shared':shared, - 'clonal_w':Wclonal, 'clonal_s':Sclonal, 'subclonal_w':Wsubclonal, 'subclonal_s':Ssubclonal, - 'shared_w':Wshared,'shared_s':Sshared}}) + summaries.update({sampleClasses[z].patID: {'Total': total_count, 'WB': wbind, 'SB': sbind, + 'Regions': region_count, 'Clonal': clonal, 'Subclonal': subclonal, 'Shared': shared, + 'clonal_w': Wclonal, 'clonal_s': Sclonal, 'subclonal_w': Wsubclonal, 'subclonal_s': Ssubclonal, + 'shared_w': Wshared, 'shared_s': Sshared}}) else: - summaries.update({sampleClasses[z].patID:{'Total':total_count,'WB':wbind,'SB':sbind}}) + summaries.update( + {sampleClasses[z].patID: {'Total': total_count, 'WB': wbind, 'SB': sbind}}) with open(outTable, 'w') as finalFile: if Options.colRegions is not None: - header = ['Sample','Total','Total_WB','Total_SB','\t'.join(["Total_Region_%s"%(n) for n in range(0,len(Options.colRegions))]), - '\t'.join(["Total_WB_Region_%s" % (n) for n in range(0, len(Options.colRegions))]), '\t'.join(["Total_SB_Region_%s"%(n) for n in range(0,len(Options.colRegions))]), - 'Clonal','Subclonal','Shared','Clonal_WB','Clonal_SB','Subclonal_WB','Subclonal_SB','Shared_WB','Shared_SB'] + header = ['Sample', 'Total', 'Total_WB', 'Total_SB', '\t'.join(["Total_Region_%s" % (n) for n in range(0, len(Options.colRegions))]), + '\t'.join(["Total_WB_Region_%s" % (n) for n in range(0, len(Options.colRegions))]), '\t'.join( + ["Total_SB_Region_%s" % (n) for n in range(0, len(Options.colRegions))]), + 'Clonal', 'Subclonal', 'Shared', 'Clonal_WB', 'Clonal_SB', 'Subclonal_WB', 'Subclonal_SB', 'Shared_WB', 'Shared_SB'] finalFile.write('\t'.join(header) + '\n') for patient in summaries: line = [patient, summaries[patient]['Total'], summaries[patient]['WB'], summaries[patient]['SB'], - '\t'.join([str(summaries[patient]['Regions'][i]) for i in range(0,len(region_count), 3)]), '\t'.join([str(summaries[patient]['Regions'][i]) for i in range(1,len(region_count), 3)]), - 
'\t'.join([str(summaries[patient]['Regions'][i]) for i in range(2,len(region_count), 3)]), - summaries[patient]['Clonal'],summaries[patient]['Subclonal'],summaries[patient]['Shared'], - summaries[patient]['clonal_w'],summaries[patient]['clonal_s'],summaries[patient]['subclonal_w'], - summaries[patient]['subclonal_s'],summaries[patient]['shared_w'],summaries[patient]['shared_s'] + '\t'.join([str(summaries[patient]['Regions'][i]) for i in range(0, len(region_count), 3)]), '\t'.join( + [str(summaries[patient]['Regions'][i]) for i in range(1, len(region_count), 3)]), + '\t'.join([str(summaries[patient]['Regions'][i]) + for i in range(2, len(region_count), 3)]), + summaries[patient]['Clonal'], summaries[patient]['Subclonal'], summaries[patient]['Shared'], + summaries[patient]['clonal_w'], summaries[patient]['clonal_s'], summaries[patient]['subclonal_w'], + summaries[patient]['subclonal_s'], summaries[patient]['shared_w'], summaries[patient]['shared_s'] ] line = [str(i) for i in line] finalFile.write('\t'.join(line) + '\n') else: - header = ['Sample','Total','Total_WB','Total_SB'] + header = ['Sample', 'Total', 'Total_WB', 'Total_SB'] finalFile.write('\t'.join(header) + '\n') for patient in summaries: - line = [patient, summaries[patient]['Total'], summaries[patient]['WB'], summaries[patient]['SB']] + line = [patient, summaries[patient]['Total'], + summaries[patient]['WB'], summaries[patient]['SB']] line = [str(i) for i in line] finalFile.write('\t'.join(line) + '\n') print("INFO: Summary Tables Complete.") + return outFile def CleanUp(Options): if Options.cleanLog or Options.makeitclean: @@ -448,35 +544,45 @@ def CleanUp(Options): print("ERROR: Unable to clean tmp files.") print(e) + def main(): # Pull information about usr system files - localpath = os.path.abspath(__file__).replace('NeoPredPipe.py', '') # path to scripts working directory + localpath = os.path.abspath(__file__).replace( + 'NeoPredPipe.py', '') # path to scripts working directory Options = Parser() 
Config = configparser.ConfigParser() Config.read(localpath + "usr_paths.ini") - annPaths = ConfigSectionMap(Config.sections()[0], Config) # get annovar script paths - annPaths['build'] = annPaths['gene_table'].split('/')[-1].split('_')[0] # get build version from name of gene table (hg19/hg38_refGene...) - annPaths['gene_model'] = annPaths['gene_table'].split('/')[-1].split('_')[1].replace(".txt","") # get gene model from X_somethingGene.txt + # get annovar script paths + annPaths = ConfigSectionMap(Config.sections()[0], Config) + # get build version from name of gene table (hg19/hg38_refGene...) + annPaths['build'] = annPaths['gene_table'].split('/')[-1].split('_')[0] + annPaths['gene_model'] = annPaths['gene_table'].split( + '/')[-1].split('_')[1].replace(".txt", "") # get gene model from X_somethingGene.txt if annPaths['build'] not in ['hg19', 'hg38', 'hg18', 'mm10']: - print("WARNING: Unexpected genome build detected in annovar reference files: %s. Please check path for 'gene_table'. Build hg19 is used for analysis."%(annPaths['build'])) + print("WARNING: Unexpected genome build detected in annovar reference files: %s. Please check path for 'gene_table'. Build hg19 is used for analysis." % ( + annPaths['build'])) annPaths['build'] = 'hg19' else: - print("INFO: Annovar reference files of build %s were given, using this build for all analysis."%(annPaths['build'])) - netMHCpanPaths = ConfigSectionMap(Config.sections()[1], Config) # get netmhcpan paths - if netMHCpanPaths['netmhcpan'].rstrip('\n').split('/')[-1]=='netMHCIIpan': #set typeII if netMHCIIpan is supplied instead of netMHCpan + print("INFO: Annovar reference files of build %s were given, using this build for all analysis." 
% ( + annPaths['build'])) + netMHCpanPaths = ConfigSectionMap( + Config.sections()[1], Config) # get netmhcpan paths + # set typeII if netMHCIIpan is supplied instead of netMHCpan + if netMHCpanPaths['netmhcpan'].rstrip('\n').split('/')[-1] == 'netMHCIIpan': Options.typeII = True try: - pepmatchPaths = ConfigSectionMap(Config.sections()[2], Config) # get PeptideMatch paths + pepmatchPaths = ConfigSectionMap( + Config.sections()[2], Config) # get PeptideMatch paths except IndexError as e: pepmatchPaths = None - #Make sure outputdir ends with '/' - if Options.OutputDir[len(Options.OutputDir)-1]=="/": + # Make sure outputdir ends with '/' + if Options.OutputDir[len(Options.OutputDir)-1] == "/": pass else: Options.OutputDir = Options.OutputDir + "/" - #Check if PeptideMatch paths are provided, ignore -m if not + # Check if PeptideMatch paths are provided, ignore -m if not if Options.checkPeptides and pepmatchPaths is None: print("WARNING: You chose to perform peptide match checking for epitopes, but did not provide paths for PeptideMatch. The pipeline will ignore the -m flag") Options.checkPeptides = False @@ -493,32 +599,47 @@ def main(): Options.allExpFiles = GetExpressionFiles(Options) # Check VCF and HLA - assert len(allFiles) > 0, "No input vcf files detected. Perhaps they are compressed?" - if len(allFiles)>len(hlas.keys()): + assert len( + allFiles) > 0, "No input vcf files detected. Perhaps they are compressed?" + if len(allFiles) > len(hlas.keys()): print("WARNING: Less samples are detected in HLA file than in VCF folder. 
Only samples included in HLA file will be processed.") # Prepare samples t = [] for patFile in allFiles: - patname = patFile.replace('.vcf', '').split("/")[len(patFile.replace('.vcf', '').split("/")) - 1] + patname = patFile.replace('.vcf', '').split( + "/")[len(patFile.replace('.vcf', '').split("/")) - 1] if patname in hlas.keys(): - t.append(Sample(localpath, patname, patFile, hlas[patname], annPaths, netMHCpanPaths, pepmatchPaths, Options)) - - if len(t)line7;NM_001099852;c.G247A;p.D83N;protein-altering;;(position;83;changed;from;D;to;N) + :ETFQAVLNGLDALLT + Args: + infa (_type_): _description_ + outfa (_type_): _description_ + Add: + vcf_manipulate.py line 191 + """ + records = [] + for seq_record in SeqIO.parse(infa, "fasta"): + seqid = str(seq_record.id) + tag = seqid.split(";") + aa_wildType = tag[-3] + aa_mutant = tag[-1] + half_len = int(len(seq_record.seq) / 2) + # ic(seq_record.id) + # ic(seq_record.seq) + # ic(aa_wildType, aa_mutant, seq_record.seq[half_len]) + # 赋值 + tag[4] = "wildType" + nid = ";".join(tag) + seq = str(seq_record.seq) + seq = seq[:half_len] + aa_wildType + seq[half_len+1:] + tmp_rec = SeqRecord(Seq(seq), id=nid, description="") + records.append(tmp_rec) + # ic(seq_record.seq) + # ic(tmp_rec.seq) + SeqIO.write(records, outfa, "fasta") + + +def getdf(infile): + """_summary_ + + Args: + infile (_type_): 读取class1 的 df 表格 TestRun.MT.neoantigens.txt + + Returns: + _type_: _description_ + """ + columns = ["Sample", "R1", "R2", "Line", "chr", "allelepos", "ref", "alt", "GeneName:RefSeqID", "pos", "hla", "peptide", "Core", "Of", + "Gp", "Gl", "Ip", "Il", "Icore", "Identity", "Score_EL", "%Rank_EL", "Score_BA", "%Rank_BA", "Aff(nM)", "Candidate", "BindLevel"] + df = pd.read_csv(infile, sep="\t", names=columns) + df["peplen"] = len(df["peptide"].astype(str)) + df = df.drop(["R1", "R2", "Core", "Of", "Gp", + "Gl", "Ip", "Il", "Icore"], axis=1) + return df + + +def getdf_class2(infile): + """_summary_ + + Args: + infile (_type_): 读取class2 的 df 表格 
TestRun.MT.neoantigens.txt + + ../netMHCIIpan -f example.pep -inptype 1 -a DRB1_0101 -BA + + INFO: Running Epitope Predictions for test2 on epitopes of length 13 + + Returns: + pandas.DataFrame: parsed table with helper columns dropped. + """ + columns = ["Sample", "R1", "R2", "Line", "chr", "allelepos", "ref", "alt", "GeneName:RefSeqID", "pos", "hla", "peptide", "Of", + "Core", "Core_Rel", "Identity", "Score_EL", "%Rank_EL", "Exp_Bind", "Score_BA", "Aff(nM)", "%Rank_BA", "BindLevel"] + df = pd.read_csv(infile, sep="\t", names=columns) + df["peplen"] = df["peptide"].astype(str).str.len() + df = df.drop(["R1", "R2", "Core", "Of", "Core_Rel", + "Exp_Bind"], axis=1) + return df + + +def mergeMTWT(infile1, infile2, outfile): + """ + infile1: MT + infile2: WT + """ + df1 = getdf(infile1) + df2 = getdf(infile2) + on_columns = ["Sample", "Line", "chr", "allelepos", + "ref", "alt", "GeneName:RefSeqID", "pos", "hla", "Identity"] # , "peplen" + df = pd.merge(df1, df2, on=on_columns, + how='inner', suffixes=('_MT', '_WT')) + df["Aff(nM)_FC"] = df["Aff(nM)_WT"] / df["Aff(nM)_MT"] + # df.to_excel(outfile, index = True) + df.to_csv(outfile, encoding='utf-8', sep='\t', index=False) + return df + + +def mergeMTWT_class2(infile1, infile2, outfile): + """ + infile1: MT + infile2: WT + """ + df1 = getdf_class2(infile1) + df2 = getdf_class2(infile2) + on_columns = ["Sample", "Line", "chr", "allelepos", + "ref", "alt", "GeneName:RefSeqID", "pos", "hla", "Identity"] # , "peplen" + df = pd.merge(df1, df2, on=on_columns, + how='inner', suffixes=('_MT', '_WT')) + df["Aff(nM)_FC"] = df["Aff(nM)_WT"] / df["Aff(nM)_MT"] + # df.to_excel(outfile, index = True) + df.to_csv(outfile, encoding='utf-8', sep='\t', index=False) + return df + +def main(): + # infile = "/home/report/lixy/neo/tc1/fastaFiles/test1.tmp.9.fasta" + # outfile = "xx.fasta" + # ConstructWildtypeFa(infile, outfile) + + # infile1 = "/home/report/lixy/neo/tc1/TestRun.MT.neoantigens.txt" + # infile2 = "/home/report/lixy/neo/tc1/TestRun.WT.neoantigens.txt" + # outfile = 
'aa.txt' + # df = mergeMTWT(infile1, infile2, outfile) + # ic(df) + + + infile1 = "/home/report/lixy/neo/tc2/TestRun.MT.neoantigens.txt" + infile2 = "/home/report/lixy/neo/tc2/TestRun.WT.neoantigens.txt" + outfile = 'aa2.txt' + + df = mergeMTWT_class2(infile1, infile2, outfile) + # ic(df) + + pass + + +if __name__ == "__main__": + main() diff --git a/postprocessing.py b/postprocessing.py index 1331548..be108db 100755 --- a/postprocessing.py +++ b/postprocessing.py @@ -22,7 +22,6 @@ def DigestIndSample(toDigest, patName, Options, pepmatchPaths, indels=False): ''' # temp_files = None # output_file = "%s%s.digested.txt" % (FilePath, toDigest[0].split('/')[len(toDigest[0].split('/')) - 1].split('.epitopes.')[0]) - lines = [] if indels: pmInputFile = Options.OutputDir+'tmp/'+patName+'.epitopes.Indels.peptidematch.input' @@ -170,12 +169,11 @@ def AppendDigestedEps(FilePath,digestedEps, patName, exonicVars, avReady, Option genoTypesPresent = [] for ep in digestedEps: if Options.typeII: - epID = int(ep.split('\t')[3].split(';')[0].replace('line','')) + epID = int(ep.split('\t')[6].split(';')[0].replace('line','')) else: epID = int(ep.split('\t')[10].split('_')[0].replace('line','')) exonicLine = exonicInfo[epID] avReadyLine = varInfo[epID] - chrom = avReadyLine.split('\t')[0] pos = avReadyLine.split('\t')[1] ref = avReadyLine.split('\t')[3] alt = avReadyLine.split('\t')[4] @@ -188,6 +186,7 @@ def AppendDigestedEps(FilePath,digestedEps, patName, exonicVars, avReady, Option geneList = [':'.join(item.split(':')[0:2]) for item in exonicLine.split('\t')[2].split(',') if item != ''] nmList = [item.split(':')[1] for item in geneList] genes = ','.join(geneList) + chrom = avReadyLine.split('\t')[0] if Options.expression is not None: geneExp = 'NA' diff --git a/predict_binding.py b/predict_binding.py index 815322a..357cc8b 100644 --- a/predict_binding.py +++ b/predict_binding.py @@ -9,6 +9,7 @@ import os import subprocess + def predict_neoantigens(FilePath, patName, inFile, 
hlasnormed, epitopeLens, netMHCpan, Options): ''' Strips out all WILDTYPE and IMMEDIATE-STOPGAIN from fasta file. @@ -31,7 +32,7 @@ def predict_neoantigens(FilePath, patName, inFile, hlasnormed, epitopeLens, netM cmd = "wc -l %s" % (inFile[n]) pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout k = int(pipe.read().decode("utf-8").lstrip(" ").split(" ")[0]) - checks[n]=k + checks[n] = k epcalls = [] for n in inFile: @@ -39,21 +40,33 @@ def predict_neoantigens(FilePath, patName, inFile, hlasnormed, epitopeLens, netM output_file = FilePath+'tmp/%s.epitopes.%s.txt' % (patName, n) epcalls.append(output_file) with open(output_file, 'a') as epitope_pred: - print("INFO: Running Epitope Predictions for %s on epitopes of length %s"%(patName,n)) + print("INFO: Running Epitope Predictions for %s on epitopes of length %s" % ( + patName, n)) if Options.ELpred: - cmd = [netMHCpan['netmhcpan'], '-l', str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + # NOTE(review): adding '-BA' under --EL contradicts the flag's "without the -BA flag" help text — confirm intent. Was: cmd = [netMHCpan['netmhcpan'], '-l', + # str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + cmd = [netMHCpan['netmhcpan'], '-BA', '-l', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] elif Options.typeII: - cmd = [netMHCpan['netmhcpan'], '-length', str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + # cmd = [netMHCpan['netmhcpan'], '-length', + # str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + # also pass -BA for class II (netMHCIIpan-4.2) so binding-affinity columns are emitted + cmd = [netMHCpan['netmhcpan'], '-BA', '-length', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] else: - cmd = [netMHCpan['netmhcpan'], '-BA', '-l', str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] - netMHC_run = subprocess.Popen(cmd, stdout=epitope_pred, stderr=epitope_pred) + cmd = [netMHCpan['netmhcpan'], '-BA', '-l', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + netMHC_run = subprocess.Popen( + cmd, stdout=epitope_pred, 
stderr=epitope_pred) netMHC_run.wait() else: print("INFO: Skipping Sample! No peptides to predict for %s" % (patName)) - print("INFO: Predictions complete for %s on epitopes of length %s" % (patName, n)) + print("INFO: Predictions complete for %s on epitopes of length %s" % + (patName, n)) + + return (epcalls) - return(epcalls) def predict_neoantigensWT(FilePath, patName, inFile, hlasnormed, epitopeLens, netMHCpan): ''' @@ -74,26 +87,91 @@ def predict_neoantigensWT(FilePath, patName, inFile, hlasnormed, epitopeLens, ne cmd = "wc -l %s" % (inFile[n]) pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout k = int(pipe.read().decode("utf-8").lstrip(" ").split(" ")[0]) - checks[n]=k + checks[n] = k epcalls = [] for n in epitopeLens: if checks[n] > 0: - output_file = '%s%s.wildtype.epitopes.%s.txt' % (FilePath, patName, n) + output_file = '%s%s.wildtype.epitopes.%s.txt' % ( + FilePath, patName, n) epcalls.append(output_file) - if os.path.isfile(output_file)==False: + if os.path.isfile(output_file) == False: print("INFO: Predicting neoantigens for %s" % (patName)) with open(output_file, 'a') as epitope_pred: - print("INFO: Running Epitope Predictions for %s on epitopes of length %s"%(patName,n)) - cmd = [netMHCpan['netmhcpan'], '-BA', '-l', str(n), '-a', ','.join(hlasnormed), '-f', inFile[n]] - netMHC_run = subprocess.Popen(cmd, stdout=epitope_pred, stderr=epitope_pred) + print("INFO: Running Epitope Predictions for %s on epitopes of length %s" % ( + patName, n)) + cmd = [netMHCpan['netmhcpan'], '-BA', '-l', + str(n), '-a', ','.join(hlasnormed), '-f', inFile[n]] + netMHC_run = subprocess.Popen( + cmd, stdout=epitope_pred, stderr=epitope_pred) netMHC_run.wait() - print("INFO: Predictions complete for %s on epitopes of length %s" % (patName, n)) + print( + "INFO: Predictions complete for %s on epitopes of length %s" % (patName, n)) else: - print("INFO: Neoantigen predictions already complete for %s epitopes of length %s" % (patName, n)) + print("INFO: 
Neoantigen predictions already complete for %s epitopes of length %s" % ( + patName, n)) + else: + print("INFO: Skipping Sample! No peptides to predict for %s" % (patName)) + + return (epcalls) + + +def predict_neoantigens2(FilePath, patName, inFile, hlasnormed, epitopeLens, netMHCpan, Options, inType): + ''' + Strips out all WILDTYPE and IMMEDIATE-STOPGAIN from fasta file. + + :param FilePath: Primary path of working directory + :param patName: ID associated with a patient + :param inFile: Fasta file with reformatted coding changes. + :param hlas: HLA types for the patient. + :param epitopeLens: List of epitope lengths to predict + :param netMHCpan: Dictionary housing netMHCpan specific script locations and data. See README.md. + :param ELpred: Logical for EL (true) or BA (false) predictions + :return: netMHCpan predictions for each file. + :inType: MT or WT (string) + ''' + assert inType in ['MT', 'WT'] + print("INFO: Predicting neoantigens for %s" % (patName)) + + # Verify that the fasta file has information in it to avoid any errors thrown from netMHCpan + checks = dict.fromkeys(inFile.keys()) + for n in inFile: + cmd = "wc -l %s" % (inFile[n]) + pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout + k = int(pipe.read().decode("utf-8").lstrip(" ").split(" ")[0]) + checks[n] = k + + epcalls = [] + for n in inFile: + if checks[n] > 0: + output_file = FilePath + \ + 'tmp/%s.epitopes.%s.%s.txt' % (patName, str(inType), n) + epcalls.append(output_file) + with open(output_file, 'a') as epitope_pred: + print("INFO: Running Epitope Predictions for %s on epitopes of length %s" % ( + patName, n)) + if Options.ELpred: + # cmd = [netMHCpan['netmhcpan'], '-l', + # str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + cmd = [netMHCpan['netmhcpan'], '-BA', '-l', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + elif Options.typeII: + # cmd = [netMHCpan['netmhcpan'], '-length', + # str(n).split('.')[0], '-a', 
','.join(hlasnormed), '-f', inFile[n]] + cmd = [netMHCpan['netmhcpan'], '-BA', '-length', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + else: + cmd = [netMHCpan['netmhcpan'], '-BA', '-l', + str(n).split('.')[0], '-a', ','.join(hlasnormed), '-f', inFile[n]] + netMHC_run = subprocess.Popen( + cmd, stdout=epitope_pred, stderr=epitope_pred) + netMHC_run.wait() else: print("INFO: Skipping Sample! No peptides to predict for %s" % (patName)) - return(epcalls) + print("INFO: Predictions complete for %s on epitopes of length %s" % + (patName, n)) + + return (epcalls) diff --git a/singularity_images.def b/singularity_images.def new file mode 100644 index 0000000..94ab641 --- /dev/null +++ b/singularity_images.def @@ -0,0 +1,27 @@ +Bootstrap: docker +From: python:3.9.18 +Stage: build + + +%files + annovar /opt + NeoPredPipe_class1 /opt + NeoPredPipe_class2 /opt + /home/report/lixy/sif/neo/opt/netMHCpan-4.1 /opt + /home/report/lixy/sif/neo/opt/netMHCIIpan-4.2 /opt + /home/report/lixy/neo/ncbi-blast-2.13.0+/bin/blastp /opt + /home/report/lixy/neo/PeptideMatchCMD_1.1.jar /opt +%post + ln -s /opt/netMHCIIpan-4.2/netMHCIIpan /usr/local/bin/ + ln -s /opt/netMHCpan-4.1/netMHCpan /usr/local/bin/ + ln -s /opt/blastp /usr/local/bin/ + apt-get update && apt-get install -y tcsh perl ccrypt + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple biopython pandas numpy + +%environment + export PATH=/opt:$PATH + +%labels + Author biolxy@aliyun.com + +# singularity build --fakeroot NeoPredPipe.sif singularity_images.def diff --git a/vcf_manipulate.py b/vcf_manipulate.py index c2b6b92..885665e 100755 --- a/vcf_manipulate.py +++ b/vcf_manipulate.py @@ -10,6 +10,8 @@ import subprocess from Bio import SeqIO +from plugins import ConstructWildtypeFa + # Convert to annovar format def convert_to_annovar(FilePath, patName, inFile, annovar, manualproc=False): ''' @@ -183,6 +185,7 @@ def MakeTempFastas(inFile, epitopeLens): epsIndels[n] = mySeqsIndels tmpFiles = {} + 
tmpFilesWT = {} for n in epitopeLens: tmpFasta = inFile.replace(".reformat.fasta",".tmp.%s.fasta"%(n)) tmpFastaIndels = inFile.replace(".reformat.fasta",".tmp.%s.Indels.fasta"%(n)) @@ -194,5 +197,12 @@ def MakeTempFastas(inFile, epitopeLens): with open(tmpFastaIndels, 'w') as outFile: for line in epsIndels[n]: outFile.write(line) - - return(tmpFiles) + # ConstructWildtypeFa + tmpFastaWT = tmpFasta.replace(".fasta", ".WT.fasta") + tmpFastaIndelsWT = tmpFastaIndels.replace(".fasta", ".WT.fasta") + ConstructWildtypeFa(tmpFasta, tmpFastaWT) + ConstructWildtypeFa(tmpFastaIndels, tmpFastaIndelsWT) + tmpFilesWT.update({n:tmpFastaWT}) + tmpFilesWT.update({str(n)+'.Indels':tmpFastaIndelsWT}) + # return(tmpFiles) + return tmpFiles, tmpFilesWT \ No newline at end of file