diff --git a/docs/conf.py b/docs/conf.py index d698c80..03cb96d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,8 +41,8 @@ master_doc = 'index' # General information about the project. -project = u'poretools' -copyright = u'2014' +project = 'poretools' +copyright = '2014' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -185,7 +185,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'poretools.tex', u'poretools Documentation', u'Nick Loman and Aaron Quinlan', 'manual'), + ('index', 'poretools.tex', 'poretools Documentation', 'Nick Loman and Aaron Quinlan', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -217,7 +217,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'gemini', u'poretools Documentation', [u'Nick Loman and Aaron Quinlan'], 1) + ('index', 'gemini', 'poretools Documentation', ['Nick Loman and Aaron Quinlan'], 1) ] diff --git a/poretools/Event.py b/poretools/Event.py index 1363fef..9339b5c 100644 --- a/poretools/Event.py +++ b/poretools/Event.py @@ -1,74 +1,74 @@ class Event(object): - """ - Very basic class to represent a nanopore - translocation event for a single pore - based upon data in the Events table of - a Oxford Nanopore FAST5 (HDF5) file - """ - def __init__(self, row): - self.row = row - try: - self.mean = row['mean'] - except Exception: - self.mean = "" - try: - self.start = row['start'] - except Exception: - self.start = "" - try: - self.stdv = row['stdv'] - except Exception: - self.stdv = "" - try: - self.length = row['length'] - except Exception: - self.length = "" - try: - self.model_state = row['model_state'] - except Exception: - self.model_state = "" - try: - self.model_level = row['model_level'] - except Exception: - self.model_level = "" - try: - self.move = row['move'] - except Exception: - self.move = "" - try: - self.p_model_state = row['p_model_state'] - except Exception: - self.p_model_state = "" - try: - self.mp_state = row['mp_state'] - except Exception: - self.mp_state = "" - try: - self.p_mp_state = row['p_mp_state'] - except Exception: - self.p_mp_state = "" - try: - self.p_A = row['p_A'] - except Exception: - self.p_A = "" - try: - self.p_C = row['p_C'] - except Exception: - self.p_C = "" - try: - self.p_G = row['p_G'] - except Exception: - self.p_G = "" - try: - self.p_T = row['p_T'] - except Exception: - self.p_T = "" + """ + Very basic class to represent a nanopore + translocation event for a single pore + based upon data in the Events table of + a Oxford Nanopore FAST5 (HDF5) file + """ + def __init__(self, row): + self.row = row + try: + self.mean = row['mean'] + except Exception: + self.mean = "" + try: + self.start = row['start'] + except Exception: + self.start = "" + try: + self.stdv = row['stdv'] + except Exception: + self.stdv = "" + try: + self.length = row['length'] + except Exception: + self.length = "" + try: + self.model_state = row['model_state'] + except Exception: + self.model_state = "" + try: + self.model_level = row['model_level'] + except Exception: + self.model_level = "" + try: + self.move = row['move'] + except Exception: + self.move = "" + try: + self.p_model_state = row['p_model_state'] + except Exception: + self.p_model_state = "" + try: + 
self.mp_state = row['mp_state'] + except Exception: + self.mp_state = "" + try: + self.p_mp_state = row['p_mp_state'] + except Exception: + self.p_mp_state = "" + try: + self.p_A = row['p_A'] + except Exception: + self.p_A = "" + try: + self.p_C = row['p_C'] + except Exception: + self.p_C = "" + try: + self.p_G = row['p_G'] + except Exception: + self.p_G = "" + try: + self.p_T = row['p_T'] + except Exception: + self.p_T = "" - def __repr__(self): - return '\t'.join([str(s) for s in [self.mean, self.start, self.stdv, - self.length, self.model_state, - self.model_level, self.move, - self.p_model_state, - self.mp_state, self.p_mp_state, - self.p_A, self.p_C, - self.p_G, self.p_T]]) + def __repr__(self): + return '\t'.join([str(s) for s in [self.mean, self.start, self.stdv, + self.length, self.model_state, + self.model_level, self.move, + self.p_model_state, + self.mp_state, self.p_mp_state, + self.p_A, self.p_C, + self.p_G, self.p_T]]) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 5421fdf..b794c4c 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -11,8 +11,8 @@ # poretools imports -import formats -from Event import Event +from . import formats +from .Event import Event fastq_paths = { 'closed' : {}, @@ -63,7 +63,7 @@ def clear(self): def __iter__(self): return self - def next(self): + def __next__(self): if len(self.files) > 0: return self.files.pop(0) else: @@ -72,253 +72,253 @@ def next(self): class Fast5FileSet(object): - def __init__(self, fileset, group=0): - if isinstance(fileset, list): - self.fileset = fileset - elif isinstance(fileset, str): - self.fileset = [fileset] - self.set_type = None - self.num_files_in_set = None - self.group = group - self._extract_fast5_files() - - def get_num_files(self): - """ - Return the number of files in the FAST5 set. - """ - if self.num_files_in_set is None and self.set_type == FAST5SET_TARBALL: - self.num_files_in_set = len(self.files) - return self.num_files_in_set - - def __iter__(self): - return self - - def next(self): - try: - return Fast5File(self.files.next(), self.group) - except Exception as e: - # cleanup our mess - if self.set_type == FAST5SET_TARBALL: - shutil.rmtree(PORETOOLS_TMPDIR) - raise StopIteration - - def _extract_fast5_files(self): - - # return as-is if list of files - if len(self.fileset) > 1: - self.files = iter(self.fileset) - self.num_files_in_set = len(self.fileset) - self.set_type = FAST5SET_FILELIST - elif len(self.fileset) == 1: - # e.g. ['/path/to/dir'] or ['/path/to/file'] - f = self.fileset[0] - # is it a directory? - if os.path.isdir(f): - pattern = f + '/' + '*.fast5' - files = glob.glob(pattern) - self.files = iter(files) - self.num_files_in_set = len(files) - self.set_type = FAST5SET_DIRECTORY - if not len(files): - logger.warning("Directory is empty!") - - # is it a tarball? - elif tarfile.is_tarfile(f): - if os.path.isdir(PORETOOLS_TMPDIR): - shutil.rmtree(PORETOOLS_TMPDIR) - os.mkdir(PORETOOLS_TMPDIR) - - self.files = TarballFileIterator(f) - # set to None to delay initialisation - self.num_files_in_set = None - self.set_type = FAST5SET_TARBALL - - # just a single FAST5 file. - else: - self.files = iter([f]) - self.num_files_in_set = 1 - self.set_type = FAST5SET_SINGLEFILE - else: - logger.error("Directory %s could not be opened. 
Exiting.\n" % dir) - sys.exit() + def __init__(self, fileset, group=0): + if isinstance(fileset, list): + self.fileset = fileset + elif isinstance(fileset, str): + self.fileset = [fileset] + self.set_type = None + self.num_files_in_set = None + self.group = group + self._extract_fast5_files() + + def get_num_files(self): + """ + Return the number of files in the FAST5 set. + """ + if self.num_files_in_set is None and self.set_type == FAST5SET_TARBALL: + self.num_files_in_set = len(self.files) + return self.num_files_in_set + + def __iter__(self): + return self + + def __next__(self): + try: + return Fast5File(next(self.files), self.group) + except Exception as e: + # cleanup our mess + if self.set_type == FAST5SET_TARBALL: + shutil.rmtree(PORETOOLS_TMPDIR) + raise StopIteration + + def _extract_fast5_files(self): + + # return as-is if list of files + if len(self.fileset) > 1: + self.files = iter(self.fileset) + self.num_files_in_set = len(self.fileset) + self.set_type = FAST5SET_FILELIST + elif len(self.fileset) == 1: + # e.g. ['/path/to/dir'] or ['/path/to/file'] + f = self.fileset[0] + # is it a directory? + if os.path.isdir(f): + pattern = f + '/' + '*.fast5' + files = glob.glob(pattern) + self.files = iter(files) + self.num_files_in_set = len(files) + self.set_type = FAST5SET_DIRECTORY + if not len(files): + logger.warning("Directory is empty!") + + # is it a tarball? + elif tarfile.is_tarfile(f): + if os.path.isdir(PORETOOLS_TMPDIR): + shutil.rmtree(PORETOOLS_TMPDIR) + os.mkdir(PORETOOLS_TMPDIR) + + self.files = TarballFileIterator(f) + # set to None to delay initialisation + self.num_files_in_set = None + self.set_type = FAST5SET_TARBALL + + # just a single FAST5 file. + else: + self.files = iter([f]) + self.num_files_in_set = 1 + self.set_type = FAST5SET_SINGLEFILE + else: + logger.error("Directory %s could not be opened. 
Exiting.\n" % dir) + sys.exit() class TarballFileIterator: - def _fast5_filename_filter(self, filename): - return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') + def _fast5_filename_filter(self, filename): + return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') - def __init__(self, tarball): - self._tarball = tarball - self._tarfile = tarfile.open(tarball) + def __init__(self, tarball): + self._tarball = tarball + self._tarfile = tarfile.open(tarball) - def __del__(self): - self._tarfile.close() + def __del__(self): + self._tarfile.close() - def __iter__(self): - return self + def __iter__(self): + return self - def next(self): - while True: - tarinfo = self._tarfile.next() - if tarinfo is None: - raise StopIteration - elif self._fast5_filename_filter(tarinfo.name): - break - self._tarfile.extract(tarinfo, path=PORETOOLS_TMPDIR) - return os.path.join(PORETOOLS_TMPDIR, tarinfo.name) + def __next__(self): + while True: + tarinfo = next(self._tarfile) + if tarinfo is None: + raise StopIteration + elif self._fast5_filename_filter(tarinfo.name): + break + self._tarfile.extract(tarinfo, path=PORETOOLS_TMPDIR) + return os.path.join(PORETOOLS_TMPDIR, tarinfo.name) - def __len__(self): - with tarfile.open(self._tarball) as tar: - return len(tar.getnames()) + def __len__(self): + with tarfile.open(self._tarball) as tar: + return len(tar.getnames()) class Fast5File(object): - def __init__(self, filename, group=0): - self.filename = filename - self.group = group - self.is_open = self.open() - if self.is_open: - self.version = self.guess_version() - else: - self.version = 'closed' - - self.fastas = {} - self.fastqs = {} - - # pre-load the FASTQ data - #self._extract_fastqs_from_fast5() - - # booleans for lazy loading (speed) - self.have_fastqs = False - self.have_fastas = False - self.have_templates = False - self.have_complements = False - self.have_pre_basecalled = False - self.have_metadata = False - - - def __del__(self): - self.close() - - #################################################################### - # Public API methods - #################################################################### - - def open(self): - """ - Open an ONT Fast5 file, assuming HDF5 format - """ - try: - self.hdf5file = h5py.File(self.filename, 'r') - return True - except Exception, e: - logger.warning("Cannot open file: %s. Perhaps it is corrupt? Moving on.\n" % self.filename) - return False - - def guess_version(self): - """ - Try and guess the location of template/complement blocks - """ - try: - self.hdf5file["/Analyses/Basecall_RNN_1D_%03d/BaseCalled_template" % (self.group)] - return 'r9rnn' - except KeyError: - pass - - try: - self.hdf5file["/Analyses/Basecall_2D_%03d/BaseCalled_template" % (self.group)] - return 'classic' - except KeyError: - pass - - try: - self.hdf5file["/Analyses/Basecall_1D_%03d/BaseCalled_template" % (self.group)] - return 'metrichor1.16' - except KeyError: - pass - - return 'prebasecalled' - - def close(self): - """ - Close an open an ONT Fast5 file, assuming HDF5 format - """ - if self.is_open: - self.hdf5file.close() - self.is_open = False - - def has_2D(self): - """ - Return TRUE if the FAST5 has a 2D base-called sequence. - Return FALSE otherwise. 
- """ - if self.have_fastas is False: - self._extract_fastas_from_fast5() - self.have_fastas = True - - if self.fastas.get('twodirections') is not None: - return True - return False - - def get_fastqs(self, choice): - """ - Return the set of base called sequences in the FAST5 - in FASTQ format. - """ - if self.have_fastqs is False: - self._extract_fastqs_from_fast5() - self.have_fastqs = True - - fqs = [] - if choice == "all": - for fastq in self.fastqs: - fqs.append(self.fastqs[fastq]) - elif choice == "fwd": - fqs.append(self.fastqs.get('template')) - elif choice == "rev": - fqs.append(self.fastqs.get('complement')) - elif choice == "2D": - fqs.append(self.fastqs.get('twodirections')) - elif choice == "fwd,rev": - fqs.append(self.fastqs.get('template')) - fqs.append(self.fastqs.get('complement')) - elif choice == "best": - fqs.append(self.fastqs.get(self.get_best_type())) - - return fqs - - - def get_fastas(self, choice): - """ - Return the set of base called sequences in the FAST5 - in FASTQ format. - """ - if self.have_fastas is False: - self._extract_fastas_from_fast5() - self.have_fastas = True - - fas = [] - if choice == "all": - for fasta in self.fastas: - fas.append(self.fastas[fasta]) - elif choice == "fwd": - fas.append(self.fastas.get('template')) - elif choice == "rev": - fas.append(self.fastas.get('complement')) - elif choice == "2D": - fas.append(self.fastas.get('twodirections')) - elif choice == "fwd,rev": - fas.append(self.fastas.get('template')) - fas.append(self.fastas.get('complement')) - elif choice == "best": - if self.have_fastqs is False: - self._extract_fastqs_from_fast5() - self.have_fastqs = True - fas.append(self.fastas.get(self.get_best_type())) - - return fas - - def get_fastas_dict(self): + def __init__(self, filename, group=0): + self.filename = filename + self.group = group + self.is_open = self.open() + if self.is_open: + self.version = self.guess_version() + else: + self.version = 'closed' + + self.fastas = {} + self.fastqs = {} + + # pre-load the FASTQ data + #self._extract_fastqs_from_fast5() + + # booleans for lazy loading (speed) + self.have_fastqs = False + self.have_fastas = False + self.have_templates = False + self.have_complements = False + self.have_pre_basecalled = False + self.have_metadata = False + + + def __del__(self): + self.close() + + #################################################################### + # Public API methods + #################################################################### + + def open(self): + """ + Open an ONT Fast5 file, assuming HDF5 format + """ + try: + self.hdf5file = h5py.File(self.filename, 'r') + return True + except Exception as e: + logger.warning("Cannot open file: %s. Perhaps it is corrupt? Moving on.\n" % self.filename) + return False + + def guess_version(self): + """ + Try and guess the location of template/complement blocks + """ + try: + self.hdf5file["/Analyses/Basecall_RNN_1D_%03d/BaseCalled_template" % (self.group)] + return 'r9rnn' + except KeyError: + pass + + try: + self.hdf5file["/Analyses/Basecall_2D_%03d/BaseCalled_template" % (self.group)] + return 'classic' + except KeyError: + pass + + try: + self.hdf5file["/Analyses/Basecall_1D_%03d/BaseCalled_template" % (self.group)] + return 'metrichor1.16' + except KeyError: + pass + + return 'prebasecalled' + + def close(self): + """ + Close an open an ONT Fast5 file, assuming HDF5 format + """ + if self.is_open: + self.hdf5file.close() + self.is_open = False + + def has_2D(self): + """ + Return TRUE if the FAST5 has a 2D base-called sequence. 
+ Return FALSE otherwise. + """ + if self.have_fastas is False: + self._extract_fastas_from_fast5() + self.have_fastas = True + + if self.fastas.get('twodirections') is not None: + return True + return False + + def get_fastqs(self, choice): + """ + Return the set of base called sequences in the FAST5 + in FASTQ format. + """ + if self.have_fastqs is False: + self._extract_fastqs_from_fast5() + self.have_fastqs = True + + fqs = [] + if choice == "all": + for fastq in self.fastqs: + fqs.append(self.fastqs[fastq]) + elif choice == "fwd": + fqs.append(self.fastqs.get('template')) + elif choice == "rev": + fqs.append(self.fastqs.get('complement')) + elif choice == "2D": + fqs.append(self.fastqs.get('twodirections')) + elif choice == "fwd,rev": + fqs.append(self.fastqs.get('template')) + fqs.append(self.fastqs.get('complement')) + elif choice == "best": + fqs.append(self.fastqs.get(self.get_best_type())) + + return fqs + + + def get_fastas(self, choice): + """ + Return the set of base called sequences in the FAST5 + in FASTQ format. + """ + if self.have_fastas is False: + self._extract_fastas_from_fast5() + self.have_fastas = True + + fas = [] + if choice == "all": + for fasta in self.fastas: + fas.append(self.fastas[fasta]) + elif choice == "fwd": + fas.append(self.fastas.get('template')) + elif choice == "rev": + fas.append(self.fastas.get('complement')) + elif choice == "2D": + fas.append(self.fastas.get('twodirections')) + elif choice == "fwd,rev": + fas.append(self.fastas.get('template')) + fas.append(self.fastas.get('complement')) + elif choice == "best": + if self.have_fastqs is False: + self._extract_fastqs_from_fast5() + self.have_fastqs = True + fas.append(self.fastas.get(self.get_best_type())) + + return fas + + def get_fastas_dict(self): """ Return the set of base called sequences in the FAST5 in FASTQ format. @@ -327,330 +327,330 @@ def get_fastas_dict(self): self._extract_fastas_from_fast5() self.have_fastas = True - return self.fastas - - def get_fastq(self): - """ - Return the base called sequence in the FAST5 - in FASTQ format. Try 2D then template, then complement. - If all fail, return None - """ - if self.have_fastqs is False: - self._extract_fastqs_from_fast5() - self.have_fastqs = True - - if not self.fastqs: - return None - elif self.fastqs.get('twodirections') is not None: - return self.fastqs.get('twodirections') - elif self.fastqs.get('template') is not None: - return self.fastqs.get('template') - elif self.fastqs.get('complement') is not None: - return self.fastqs.get('complement') - - - def get_fasta(self): - """ - Return the base called sequence in the FAST5 - in FASTA format. Try 2D then template, then complement. 
- If all fail, return None - """ - if not self.fastas: - return None - elif self.fastas.get('twodirections') is not None: - return self.fastas.get('twodirections') - elif self.fastas.get('template') is not None: - return self.fastas.get('template') - elif self.fastas.get('complement') is not None: - return self.fastas.get('complement') - - def get_template_events(self): - """ - Return the table of event data for the template strand - """ - if self.have_templates is False: - self._extract_template_events() - self.have_templates = True - - return self.template_events - - def get_complement_events(self): - """ - Return the table of event data for the complement strand - """ - if self.have_complements is False: - self._extract_complement_events() - self.have_complements = True - - return self.complement_events - - def get_pre_basecalled_events(self): - """ - Return the table of pre-basecalled events - """ - if self.have_pre_basecalled is False: - self._extract_pre_basecalled_events() - self.have_pre_basecalled = True - - return self.pre_basecalled_events - - #################################################################### - # Flowcell Metadata methods - #################################################################### - - def get_exp_start_time(self): - """ - Return the starting time at which signals were collected - for the given read. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return int(self.keyinfo['tracking_id'].attrs['exp_start_time']) - except: - return None - - def get_channel_number(self): - """ - Return the channel (pore) number at which signals were collected - for the given read. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['channel_id'].attrs['channel_number'] - except: - pass - - try: - return self.keyinfo['read_id'].attrs['channel_number'] - except: - return None - - def find_read_number_block_link(self): - """ - Old-style FAST5/HDF5 structure: - Inside /Analyses/Basecall_XXXX there is an 'InputEvents' - link that points to the location of the Read in the HDF5 file. - - Return the Read's node if found, or None if not found. - """ - if self.version == 'classic': - path = "/Analyses/Basecall_2D_000" - else: - path = "/Analyses/Basecall_1D_000" - - basecall = self.hdf5file[path] - path = basecall.get('InputEvents', getlink=True) - if path is None: - return None - - # the soft link target seems broken? - newpath = "/" + "/".join(path.path.split("/")[:-1]) - - node = self.hdf5file[newpath] - - return node - - def hdf_internal_error(self,reason): - """Report an error and exit in case of an invalid + return self.fastas + + def get_fastq(self): + """ + Return the base called sequence in the FAST5 + in FASTQ format. Try 2D then template, then complement. + If all fail, return None + """ + if self.have_fastqs is False: + self._extract_fastqs_from_fast5() + self.have_fastqs = True + + if not self.fastqs: + return None + elif self.fastqs.get('twodirections') is not None: + return self.fastqs.get('twodirections') + elif self.fastqs.get('template') is not None: + return self.fastqs.get('template') + elif self.fastqs.get('complement') is not None: + return self.fastqs.get('complement') + + + def get_fasta(self): + """ + Return the base called sequence in the FAST5 + in FASTA format. Try 2D then template, then complement. 
+ If all fail, return None + """ + if not self.fastas: + return None + elif self.fastas.get('twodirections') is not None: + return self.fastas.get('twodirections') + elif self.fastas.get('template') is not None: + return self.fastas.get('template') + elif self.fastas.get('complement') is not None: + return self.fastas.get('complement') + + def get_template_events(self): + """ + Return the table of event data for the template strand + """ + if self.have_templates is False: + self._extract_template_events() + self.have_templates = True + + return self.template_events + + def get_complement_events(self): + """ + Return the table of event data for the complement strand + """ + if self.have_complements is False: + self._extract_complement_events() + self.have_complements = True + + return self.complement_events + + def get_pre_basecalled_events(self): + """ + Return the table of pre-basecalled events + """ + if self.have_pre_basecalled is False: + self._extract_pre_basecalled_events() + self.have_pre_basecalled = True + + return self.pre_basecalled_events + + #################################################################### + # Flowcell Metadata methods + #################################################################### + + def get_exp_start_time(self): + """ + Return the starting time at which signals were collected + for the given read. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return int(self.keyinfo['tracking_id'].attrs['exp_start_time']) + except: + return None + + def get_channel_number(self): + """ + Return the channel (pore) number at which signals were collected + for the given read. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['channel_id'].attrs['channel_number'] + except: + pass + + try: + return self.keyinfo['read_id'].attrs['channel_number'] + except: + return None + + def find_read_number_block_link(self): + """ + Old-style FAST5/HDF5 structure: + Inside /Analyses/Basecall_XXXX there is an 'InputEvents' + link that points to the location of the Read in the HDF5 file. + + Return the Read's node if found, or None if not found. + """ + if self.version == 'classic': + path = "/Analyses/Basecall_2D_000" + else: + path = "/Analyses/Basecall_1D_000" + + basecall = self.hdf5file[path] + path = basecall.get('InputEvents', getlink=True) + if path is None: + return None + + # the soft link target seems broken? + newpath = "/" + "/".join(path.path.split("/")[:-1]) + + node = self.hdf5file[newpath] + + return node + + def hdf_internal_error(self,reason): + """Report an error and exit in case of an invalid (or unknown) HDF5 structure. Hurrah for ONT!""" - msg = """poretools internal error in file '%s': + msg = """poretools internal error in file '%s': %s Please report this error (with the offending file) to: https://github.com/arq5x/poretools/issues""" % (self.filename, reason) - sys.exit(msg) + sys.exit(msg) def find_read_number_block_fixed_raw(self): - """ - New-style FAST5/HDF5 structure: - There is a fixed 'Raw/Reads' node with only one 'read_NNN' item - inside it (no more 'InputEvents' link). - - Return the Read's node if found, or None if not found. - """ - raw_reads = self.hdf5file.get('Raw/Reads') - if raw_reads is None: - return None - - reads = raw_reads.keys() - if len(reads)==0: - self.hdf_internal_error("Raw/Reads group does not contain any items") - if len(reads)>1: - # This should not happen, based on information from ONT developers. 
- self.hdf_internal_error("Raw/Reads group contains more than one item") - path = 'Raw/Reads/%s' % ( reads[0] ) - node = self.hdf5file.get(path) - if node is None: - self.hdf_internal_error("Failed to get HDF5 item '%s'"% (path)) - return node + """ + New-style FAST5/HDF5 structure: + There is a fixed 'Raw/Reads' node with only one 'read_NNN' item + inside it (no more 'InputEvents' link). + + Return the Read's node if found, or None if not found. + """ + raw_reads = self.hdf5file.get('Raw/Reads') + if raw_reads is None: + return None + + reads = list(raw_reads.keys()) + if len(reads)==0: + self.hdf_internal_error("Raw/Reads group does not contain any items") + if len(reads)>1: + # This should not happen, based on information from ONT developers. + self.hdf_internal_error("Raw/Reads group contains more than one item") + path = 'Raw/Reads/%s' % ( reads[0] ) + node = self.hdf5file.get(path) + if node is None: + self.hdf_internal_error("Failed to get HDF5 item '%s'"% (path)) + return node def find_read_number_block(self): - """Returns the node of the 'Read_NNN' information, or None if not - found""" - node = self.find_read_number_block_link() - if node is not None: - return node - - node = self.find_read_number_block_fixed_raw() - if node is not None: - return node - - # Couldn't find the node, bail out. - self.hdf_internal_error("unknown HDF5 structure: can't find read block item") - - def find_event_timing_block(self): - path = fastq_paths[self.version]['template'] % (self.group) - try: - node = self.hdf5file[path] - path = node.get('Events') + """Returns the node of the 'Read_NNN' information, or None if not + found""" + node = self.find_read_number_block_link() + if node is not None: + return node + + node = self.find_read_number_block_fixed_raw() + if node is not None: + return node + + # Couldn't find the node, bail out. + self.hdf_internal_error("unknown HDF5 structure: can't find read block item") + + def find_event_timing_block(self): + path = fastq_paths[self.version]['template'] % (self.group) + try: + node = self.hdf5file[path] + path = node.get('Events') #, getlink=True) - return path - except Exception: - return None - - def get_read_number(self): - """ - Return the read number for the pore representing the given read. - """ - node = self.find_read_number_block() - if node: - try: - return node.attrs['read_number'] - except: - return None - return None - - def get_duration(self): - node = self.find_event_timing_block() - if node: - #NOTE: 'duration' in the HDF is a float-point number, - # and can be less than one - which will return 0. - #TODO: consider supporing floating-point, or at least - # rounding values instead of truncating to int. - return int(node.attrs['duration']) - return None - - def get_start_time(self): - exp_start_time = self.get_exp_start_time() - - node = self.find_event_timing_block() - if node: - return int(exp_start_time) + int(node.attrs['start_time']) - - return None - - def get_end_time(self): - exp_start_time = self.get_exp_start_time() - start_time = self.get_start_time() - duration = self.get_duration() - - # 'duration' can be zero and still valid - # (if the duration of the template was less than 1 second). - # Check for None instead of False. - if start_time and (duration is not None): - return start_time + duration - else: - return None - - def get_version_name(self): - """ - Return the flow cell version name. 
- """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['context_tags'].attrs['version_name'] - except: - return None - - def get_run_id(self): - """ - Return the run id. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['run_id'] - except: - return None - - def get_heatsink_temp(self): - """ - Return the heatsink temperature. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['heatsink_temp'] - except: - return None - - def get_asic_temp(self): - """ - Return the ASIC temperature. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['asic_temp'] - except: - return None - - def get_flowcell_id(self): - """ - Return the flowcell_id. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['flowcell_id'] - except: - return None - - def get_run_purpose(self): - """ - Return the exp_script_purpose. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['exp_script_purpose'] - except: - return None - - def get_asic_id(self): - """ - Return the flowcell's ASIC id. - """ - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True - - try: - return self.keyinfo['tracking_id'].attrs['asic_id'] - except: - return None - - if self.have_metadata is False: - self._get_metadata() - self.have_metadata = True + return path + except Exception: + return None + + def get_read_number(self): + """ + Return the read number for the pore representing the given read. + """ + node = self.find_read_number_block() + if node: + try: + return node.attrs['read_number'] + except: + return None + return None + + def get_duration(self): + node = self.find_event_timing_block() + if node: + #NOTE: 'duration' in the HDF is a float-point number, + # and can be less than one - which will return 0. + #TODO: consider supporing floating-point, or at least + # rounding values instead of truncating to int. + return int(node.attrs['duration']) + return None + + def get_start_time(self): + exp_start_time = self.get_exp_start_time() + + node = self.find_event_timing_block() + if node: + return int(exp_start_time) + int(node.attrs['start_time']) + + return None + + def get_end_time(self): + exp_start_time = self.get_exp_start_time() + start_time = self.get_start_time() + duration = self.get_duration() + + # 'duration' can be zero and still valid + # (if the duration of the template was less than 1 second). + # Check for None instead of False. + if start_time and (duration is not None): + return start_time + duration + else: + return None + + def get_version_name(self): + """ + Return the flow cell version name. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['context_tags'].attrs['version_name'] + except: + return None + + def get_run_id(self): + """ + Return the run id. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['run_id'] + except: + return None + + def get_heatsink_temp(self): + """ + Return the heatsink temperature. 
+ """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['heatsink_temp'] + except: + return None + + def get_asic_temp(self): + """ + Return the ASIC temperature. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['asic_temp'] + except: + return None + + def get_flowcell_id(self): + """ + Return the flowcell_id. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['flowcell_id'] + except: + return None + + def get_run_purpose(self): + """ + Return the exp_script_purpose. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['exp_script_purpose'] + except: + return None + + def get_asic_id(self): + """ + Return the flowcell's ASIC id. + """ + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True + + try: + return self.keyinfo['tracking_id'].attrs['asic_id'] + except: + return None + + if self.have_metadata is False: + self._get_metadata() + self.have_metadata = True def get_host_name(self): """ @@ -669,155 +669,155 @@ def get_host_name(self): self._get_metadata() self.have_metadata = True - def get_device_id(self): - """ - Return the flowcell's device id. - """ + def get_device_id(self): + """ + Return the flowcell's device id. + """ if self.have_metadata is False: self._get_metadata() self.have_metadata = True - try: - return self.keyinfo['tracking_id'].attrs['device_id'] - except: - return None + try: + return self.keyinfo['tracking_id'].attrs['device_id'] + except: + return None - def get_sample_name(self): - """ - Return the user supplied sample name - """ + def get_sample_name(self): + """ + Return the user supplied sample name + """ if self.have_metadata is False: self._get_metadata() self.have_metadata = True - try: - return self.keyinfo['context_tags'].attrs['user_filename_input'] - except Exception, e: - return None - - - def get_template_events_count(self): - """ - Pull out the event count for the template strand - """ - try: - table = self.hdf5file[fastq_paths[self.version]['template'] % self.group] - return len(table['Events'][()]) - except Exception, e: - return 0 - - def get_complement_events_count(self): - """ - Pull out the event count for the complementary strand - """ - try: - table = self.hdf5file[fastq_paths[self.version]['complement'] % self.group] - return len(table['Events'][()]) - except Exception, e: - return 0 - - def is_high_quality(self): - if self.get_complement_events_count() >= \ - self.get_template_events_count(): - return True - else: - return False - - def get_best_type(self): - """ - Returns the type with the anticipated highest quality: - 'twodirections', 'template', 'complement' or None. 
- """ - try: - if 'twodirections' in self.fastqs: - return 'twodirections' - fwd = 'template' in self.fastqs - rev = 'complement' in self.fastqs - if fwd and not rev: - return 'template' - elif rev and not fwd: - return 'complement' - else: - fwd_err_rate = self.fastqs['template'].est_error_rate() - rev_err_rate = self.fastqs['complement'].est_error_rate() - if fwd_err_rate <= rev_err_rate: - return 'template' - else: - return 'complement' - except Exception, e: - return None - - #################################################################### - # Private API methods - #################################################################### - - def _extract_fastqs_from_fast5(self): - """ - Return the sequence in the FAST5 file in FASTQ format - """ - for id, h5path in fastq_paths[self.version].iteritems(): - try: - table = self.hdf5file[h5path % self.group] - fq = formats.Fastq(table['Fastq'][()]) - fq.name += " " + self.filename - self.fastqs[id] = fq - except Exception, e: - pass - - def _extract_fastas_from_fast5(self): - """ - Return the sequence in the FAST5 file in FASTA format - """ - for id, h5path in fastq_paths[self.version].iteritems(): - try: - table = self.hdf5file[h5path % self.group] - fa = formats.Fasta(table['Fastq'][()]) - fa.name += " " + self.filename - self.fastas[id] = fa - except Exception, e: - pass - - def _extract_template_events(self): - """ - Pull out the event information for the template strand - """ - try: - table = self.hdf5file[fastq_paths[self.version]['template'] % self.group] - self.template_events = [Event(x) for x in table['Events'][()]] - except Exception, e: - self.template_events = [] - - def _extract_complement_events(self): - """ - Pull out the event information for the complementary strand - """ - try: - table = self.hdf5file[fastq_paths[self.version]['complement'] % self.group] - self.complement_events = [Event(x) for x in table['Events'][()]] - except Exception, e: - self.complement_events = [] - - def _extract_pre_basecalled_events(self): - """ - Pull out the pre-basecalled event information - """ - # try: - table = self.hdf5file[fastq_paths[self.version]['pre_basecalled']] - events = [] - for read in table: - events.extend(table[read]["Events"][()]) - self.pre_basecalled_events = [Event(x) for x in events] - # except Exception, e: - # self.pre_basecalled_events = [] - - def _get_metadata(self): - try: - self.keyinfo = self.hdf5file['/UniqueGlobalKey'] - except Exception, e: - try: - self.keyinfo = self.hdf5file['/Key'] - except Exception, e: - self.keyinfo = None - logger.warning("Cannot find keyinfo. Exiting.\n") + try: + return self.keyinfo['context_tags'].attrs['user_filename_input'] + except Exception as e: + return None + + + def get_template_events_count(self): + """ + Pull out the event count for the template strand + """ + try: + table = self.hdf5file[fastq_paths[self.version]['template'] % self.group] + return len(table['Events'][()]) + except Exception as e: + return 0 + + def get_complement_events_count(self): + """ + Pull out the event count for the complementary strand + """ + try: + table = self.hdf5file[fastq_paths[self.version]['complement'] % self.group] + return len(table['Events'][()]) + except Exception as e: + return 0 + + def is_high_quality(self): + if self.get_complement_events_count() >= \ + self.get_template_events_count(): + return True + else: + return False + + def get_best_type(self): + """ + Returns the type with the anticipated highest quality: + 'twodirections', 'template', 'complement' or None. 
+ """ + try: + if 'twodirections' in self.fastqs: + return 'twodirections' + fwd = 'template' in self.fastqs + rev = 'complement' in self.fastqs + if fwd and not rev: + return 'template' + elif rev and not fwd: + return 'complement' + else: + fwd_err_rate = self.fastqs['template'].est_error_rate() + rev_err_rate = self.fastqs['complement'].est_error_rate() + if fwd_err_rate <= rev_err_rate: + return 'template' + else: + return 'complement' + except Exception as e: + return None + + #################################################################### + # Private API methods + #################################################################### + + def _extract_fastqs_from_fast5(self): + """ + Return the sequence in the FAST5 file in FASTQ format + """ + for id, h5path in fastq_paths[self.version].items(): + try: + table = self.hdf5file[h5path % self.group] + fq = formats.Fastq(table['Fastq'][()]) + fq.name += " " + self.filename + self.fastqs[id] = fq + except Exception as e: + pass + + def _extract_fastas_from_fast5(self): + """ + Return the sequence in the FAST5 file in FASTA format + """ + for id, h5path in fastq_paths[self.version].items(): + try: + table = self.hdf5file[h5path % self.group] + fa = formats.Fasta(table['Fastq'][()]) + fa.name += " " + self.filename + self.fastas[id] = fa + except Exception as e: + pass + + def _extract_template_events(self): + """ + Pull out the event information for the template strand + """ + try: + table = self.hdf5file[fastq_paths[self.version]['template'] % self.group] + self.template_events = [Event(x) for x in table['Events'][()]] + except Exception as e: + self.template_events = [] + + def _extract_complement_events(self): + """ + Pull out the event information for the complementary strand + """ + try: + table = self.hdf5file[fastq_paths[self.version]['complement'] % self.group] + self.complement_events = [Event(x) for x in table['Events'][()]] + except Exception as e: + self.complement_events = [] + + def _extract_pre_basecalled_events(self): + """ + Pull out the pre-basecalled event information + """ + # try: + table = self.hdf5file[fastq_paths[self.version]['pre_basecalled']] + events = [] + for read in table: + events.extend(table[read]["Events"][()]) + self.pre_basecalled_events = [Event(x) for x in events] + # except Exception, e: + # self.pre_basecalled_events = [] + + def _get_metadata(self): + try: + self.keyinfo = self.hdf5file['/UniqueGlobalKey'] + except Exception as e: + try: + self.keyinfo = self.hdf5file['/Key'] + except Exception as e: + self.keyinfo = None + logger.warning("Cannot find keyinfo. Exiting.\n") diff --git a/poretools/__init__.py b/poretools/__init__.py index c5c2412..aef3168 100644 --- a/poretools/__init__.py +++ b/poretools/__init__.py @@ -1,5 +1,5 @@ import os import sys -import scripts -from Fast5File import * -from version import __version__ +from . import scripts +from .Fast5File import * +from .version import __version__ diff --git a/poretools/combine.py b/poretools/combine.py index 4d165d3..81b0690 100644 --- a/poretools/combine.py +++ b/poretools/combine.py @@ -1,6 +1,6 @@ import tarfile import sys -import Fast5File +from . 
import Fast5File #logging import logging @@ -8,23 +8,23 @@ def run(parser, args): - - if args.tar_filename.endswith('.tar'): - tar = tarfile.open(args.tar_filename, mode='w') - elif args.tar_filename.endswith('.gz'): - tar = tarfile.open(args.tar_filename, mode='w:gz') - elif args.tar_filename.endswith('.bz2'): - tar = tarfile.open(args.tar_filename, mode='w:bz2') - else: - logger.error("Unrecognized FAST5 archive extension. Exiting.\n") - sys.exit() + + if args.tar_filename.endswith('.tar'): + tar = tarfile.open(args.tar_filename, mode='w') + elif args.tar_filename.endswith('.gz'): + tar = tarfile.open(args.tar_filename, mode='w:gz') + elif args.tar_filename.endswith('.bz2'): + tar = tarfile.open(args.tar_filename, mode='w:bz2') + else: + logger.error("Unrecognized FAST5 archive extension. Exiting.\n") + sys.exit() - file_count = 0 - for fast5 in Fast5File.Fast5FileSet(args.files): - tar.add(fast5.filename) - fast5.close() - file_count += 1 - tar.close() + file_count = 0 + for fast5 in Fast5File.Fast5FileSet(args.files): + tar.add(fast5.filename) + fast5.close() + file_count += 1 + tar.close() - logger.info("%s successfully created from %d FAST5 files.\n" % \ - (args.tar_filename, file_count)) + logger.info("%s successfully created from %d FAST5 files.\n" % \ + (args.tar_filename, file_count)) diff --git a/poretools/events.py b/poretools/events.py index c0315ec..91a0d78 100644 --- a/poretools/events.py +++ b/poretools/events.py @@ -1,24 +1,24 @@ -import Fast5File +from . import Fast5File def run(parser, args): - # print header. - keys = ['file', 'strand', 'mean', 'start', 'stdv', \ - 'length', 'model_state', 'model_level', 'move', \ - 'p_model_state', 'mp_model_state', 'p_mp_model_state', \ - 'p_A', 'p_C', 'p_G', 'p_T', 'raw_index'] - print "\t".join(keys) + # print header. + keys = ['file', 'strand', 'mean', 'start', 'stdv', \ + 'length', 'model_state', 'model_level', 'move', \ + 'p_model_state', 'mp_model_state', 'p_mp_model_state', \ + 'p_A', 'p_C', 'p_G', 'p_T', 'raw_index'] + print("\t".join(keys)) - if args.pre_basecalled: - for fast5 in Fast5File.Fast5FileSet(args.files): - for event in fast5.get_pre_basecalled_events(): - print '\t'.join([fast5.filename, 'pre_basecalled', str(event)]) - else: - for fast5 in Fast5File.Fast5FileSet(args.files): - for event in fast5.get_template_events(): - print '\t'.join([fast5.filename, 'template', str(event)]) - for event in fast5.get_complement_events(): - print '\t'.join([fast5.filename, 'complement', str(event)]) + if args.pre_basecalled: + for fast5 in Fast5File.Fast5FileSet(args.files): + for event in fast5.get_pre_basecalled_events(): + print('\t'.join([fast5.filename, 'pre_basecalled', str(event)])) + else: + for fast5 in Fast5File.Fast5FileSet(args.files): + for event in fast5.get_template_events(): + print('\t'.join([fast5.filename, 'template', str(event)])) + for event in fast5.get_complement_events(): + print('\t'.join([fast5.filename, 'complement', str(event)])) - fast5.close() + fast5.close() diff --git a/poretools/fasta.py b/poretools/fasta.py index c65b58f..16382cb 100644 --- a/poretools/fasta.py +++ b/poretools/fasta.py @@ -1,48 +1,48 @@ -import Fast5File +from . 
import Fast5File import sys def run(parser, args): - for fast5 in Fast5File.Fast5FileSet(args.files, args.group): - - if args.start_time or args.end_time: - read_start_time = fast5.get_start_time() - read_end_time = fast5.get_end_time() - if args.start_time and args.start_time > read_start_time: - fast5.close() - continue - if args.end_time and args.end_time < read_end_time: - fast5.close() - continue - - fas = fast5.get_fastas(args.type) - - # high quality 2D: means there are more nanopore events on the - # complement strand than on the template strand. We also - # require there to be a 2D base-called sequence from Metrichor. - if args.high_quality: - if (fast5.get_complement_events_count() <= \ - fast5.get_template_events_count()) or not fast5.has_2D(): - fast5.close() - continue - - # norem quality 2D : means there are less (or equal) nanopore - # events on the complement strand than on the template strand. - # We also require there to be a 2D base-called sequence from Metrichor. - if args.normal_quality: - if (fast5.get_complement_events_count() > \ - fast5.get_template_events_count()) or not fast5.has_2D(): - fast5.close() - continue - - for fa in fas: - if fa is None or \ - len(fa.seq) < args.min_length or \ - (len(fa.seq) > args.max_length and \ - args.max_length > 0): - continue - - print fa - - fast5.close() + for fast5 in Fast5File.Fast5FileSet(args.files, args.group): + + if args.start_time or args.end_time: + read_start_time = fast5.get_start_time() + read_end_time = fast5.get_end_time() + if args.start_time and args.start_time > read_start_time: + fast5.close() + continue + if args.end_time and args.end_time < read_end_time: + fast5.close() + continue + + fas = fast5.get_fastas(args.type) + + # high quality 2D: means there are more nanopore events on the + # complement strand than on the template strand. We also + # require there to be a 2D base-called sequence from Metrichor. + if args.high_quality: + if (fast5.get_complement_events_count() <= \ + fast5.get_template_events_count()) or not fast5.has_2D(): + fast5.close() + continue + + # norem quality 2D : means there are less (or equal) nanopore + # events on the complement strand than on the template strand. + # We also require there to be a 2D base-called sequence from Metrichor. + if args.normal_quality: + if (fast5.get_complement_events_count() > \ + fast5.get_template_events_count()) or not fast5.has_2D(): + fast5.close() + continue + + for fa in fas: + if fa is None or \ + len(fa.seq) < args.min_length or \ + (len(fa.seq) > args.max_length and \ + args.max_length > 0): + continue + + print(fa) + + fast5.close() diff --git a/poretools/fastq.py b/poretools/fastq.py index 314d264..10a54ce 100644 --- a/poretools/fastq.py +++ b/poretools/fastq.py @@ -1,48 +1,48 @@ -import Fast5File +from . import Fast5File import sys def run(parser, args): - - for fast5 in Fast5File.Fast5FileSet(args.files, args.group): - - if args.start_time or args.end_time: - read_start_time = fast5.get_start_time() - read_end_time = fast5.get_end_time() - if args.start_time and args.start_time > read_start_time: - fast5.close() - continue - if args.end_time and args.end_time < read_end_time: - fast5.close() - continue - - fas = fast5.get_fastqs(args.type) - - # high quality 2D: means there are more nanopore events on the - # complement strand than on the template strand. We also - # require there to be a 2D base-called sequence from Metrichor. 
- if args.high_quality: - if (fast5.get_complement_events_count() <= \ - fast5.get_template_events_count()) or not fast5.has_2D(): - fast5.close() - continue - - # norem quality 2D : means there are less (or equal) nanopore - # events on the complement strand than on the template strand. - # We also require there to be a 2D base-called sequence from Metrichor. - if args.normal_quality: - if (fast5.get_complement_events_count() > \ - fast5.get_template_events_count()) or not fast5.has_2D(): - fast5.close() - continue - - for fa in fas: - if fa is None or \ - len(fa.seq) < args.min_length or \ - (len(fa.seq) > args.max_length and \ - args.max_length > 0): - continue - - print fa - - fast5.close() + + for fast5 in Fast5File.Fast5FileSet(args.files, args.group): + + if args.start_time or args.end_time: + read_start_time = fast5.get_start_time() + read_end_time = fast5.get_end_time() + if args.start_time and args.start_time > read_start_time: + fast5.close() + continue + if args.end_time and args.end_time < read_end_time: + fast5.close() + continue + + fas = fast5.get_fastqs(args.type) + + # high quality 2D: means there are more nanopore events on the + # complement strand than on the template strand. We also + # require there to be a 2D base-called sequence from Metrichor. + if args.high_quality: + if (fast5.get_complement_events_count() <= \ + fast5.get_template_events_count()) or not fast5.has_2D(): + fast5.close() + continue + + # norem quality 2D : means there are less (or equal) nanopore + # events on the complement strand than on the template strand. + # We also require there to be a 2D base-called sequence from Metrichor. + if args.normal_quality: + if (fast5.get_complement_events_count() > \ + fast5.get_template_events_count()) or not fast5.has_2D(): + fast5.close() + continue + + for fa in fas: + if fa is None or \ + len(fa.seq) < args.min_length or \ + (len(fa.seq) > args.max_length and \ + args.max_length > 0): + continue + + print(fa) + + fast5.close() diff --git a/poretools/formats.py b/poretools/formats.py index 29bc7b5..9f4ef0f 100644 --- a/poretools/formats.py +++ b/poretools/formats.py @@ -1,37 +1,37 @@ class Fastq(object): - def __init__(self, s): - self.s = s - self.parse() + def __init__(self, s): + self.s = s + self.parse() - def parse(self): - (self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n') + def parse(self): + (self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n') - def __repr__(self): - return '\n'.join([self.name, self.seq, self.sep, self.qual]) + def __repr__(self): + return '\n'.join([self.name, self.seq, self.sep, self.qual]) - def est_error_rate(self): - """ - Returns an error rate estimate using the Phred quality scores. - """ - try: - error_count = 0.0 - for score in self.qual: - phred = ord(score) - 33 - error_count += 10.0 ** (-phred / 10.0) - return error_count / len(self.qual) - except Exception, e: - return 0.0 + def est_error_rate(self): + """ + Returns an error rate estimate using the Phred quality scores. 
+ """ + try: + error_count = 0.0 + for score in self.qual: + phred = ord(score) - 33 + error_count += 10.0 ** (-phred / 10.0) + return error_count / len(self.qual) + except Exception as e: + return 0.0 class Fasta(object): - def __init__(self, s): - self.s = s - self.parse() + def __init__(self, s): + self.s = s + self.parse() - def parse(self): - (self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n') - self.name = self.name.lstrip('@') + def parse(self): + (self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n') + self.name = self.name.lstrip('@') - def __repr__(self): - return '\n'.join(['>'+self.name, self.seq]) \ No newline at end of file + def __repr__(self): + return '\n'.join(['>'+self.name, self.seq]) \ No newline at end of file diff --git a/poretools/hist.py b/poretools/hist.py index b16c683..1130c05 100644 --- a/poretools/hist.py +++ b/poretools/hist.py @@ -6,7 +6,7 @@ from matplotlib import pyplot as plt import seaborn as sns -import Fast5File +from . import Fast5File import logging logger = logging.getLogger('poretools') diff --git a/poretools/index.py b/poretools/index.py index abc1af0..96b2ac8 100644 --- a/poretools/index.py +++ b/poretools/index.py @@ -1,66 +1,66 @@ -import Fast5File +from . import Fast5File import datetime ############ # -# index +# index # # A tool to extract -# all info needed to -# identify a pile of -# unsorted reads from -# multiple MinION -# sequencing -# experiments. +# all info needed to +# identify a pile of +# unsorted reads from +# multiple MinION +# sequencing +# experiments. # ############ def run(parser, args): - print "source_filename\ttemplate_fwd_length\tcomplement_rev_length\t2d_length\tasic_id\tasic_temp\theatsink_temp\tchannel\texp_start_time\texp_start_time_string_date\texp_start_time_string_time\tstart_time\tstart_time_string_date\tstart_time_string_time\tduration\tfast5_version" + print("source_filename\ttemplate_fwd_length\tcomplement_rev_length\t2d_length\tasic_id\tasic_temp\theatsink_temp\tchannel\texp_start_time\texp_start_time_string_date\texp_start_time_string_time\tstart_time\tstart_time_string_date\tstart_time_string_time\tduration\tfast5_version") - for fast5 in Fast5File.Fast5FileSet(args.files): - - - # run and flowcell parameters - asic_temp = fast5.get_asic_temp() - asic_id = fast5.get_asic_id() - heatsink_temp = fast5.get_heatsink_temp() - channel_number = fast5.get_channel_number() - - # try and get timing info - try: - start_time = fast5.get_start_time() - start_time_string = datetime.datetime.fromtimestamp(float(start_time)).strftime("%Y-%b-%d (%a)\t%H:%M:%S") - exp_start_time = fast5.get_exp_start_time() - exp_start_time_string = datetime.datetime.fromtimestamp(float(exp_start_time)).strftime("%Y-%b-%d (%a)\t%H:%M:%S") - duration = fast5.get_duration() - except KeyError: - start_time = "Not found" - start_time_string = "NA\tNA" - exp_start_time = "Not found" - exp_start_time_string = "NA\tNA" - duration = "Not found" - - # sequence file info - fast5_version = fast5.guess_version() - - # read info - fastq_reads = fast5.get_fastqs('all') - length_template = None - length_complement = None - length_2d = None - if (len(fastq_reads) > 0): - length_template = len(fastq_reads[0].seq) - if (len(fastq_reads) > 2): - length_complement = len(fastq_reads[1].seq) - length_2d = len(fastq_reads[2].seq) + for fast5 in Fast5File.Fast5FileSet(args.files): + + + # run and flowcell parameters + asic_temp = fast5.get_asic_temp() + asic_id = fast5.get_asic_id() + heatsink_temp = fast5.get_heatsink_temp() + 
channel_number = fast5.get_channel_number() + + # try and get timing info + try: + start_time = fast5.get_start_time() + start_time_string = datetime.datetime.fromtimestamp(float(start_time)).strftime("%Y-%b-%d (%a)\t%H:%M:%S") + exp_start_time = fast5.get_exp_start_time() + exp_start_time_string = datetime.datetime.fromtimestamp(float(exp_start_time)).strftime("%Y-%b-%d (%a)\t%H:%M:%S") + duration = fast5.get_duration() + except KeyError: + start_time = "Not found" + start_time_string = "NA\tNA" + exp_start_time = "Not found" + exp_start_time_string = "NA\tNA" + duration = "Not found" + + # sequence file info + fast5_version = fast5.guess_version() + + # read info + fastq_reads = fast5.get_fastqs('all') + length_template = None + length_complement = None + length_2d = None + if (len(fastq_reads) > 0): + length_template = len(fastq_reads[0].seq) + if (len(fastq_reads) > 2): + length_complement = len(fastq_reads[1].seq) + length_2d = len(fastq_reads[2].seq) - print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( - fast5.filename, - length_template, - length_complement, - length_2d, - asic_id, asic_temp, heatsink_temp,channel_number,exp_start_time,exp_start_time_string,start_time,start_time_string,duration,fast5_version) + print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( + fast5.filename, + length_template, + length_complement, + length_2d, + asic_id, asic_temp, heatsink_temp,channel_number,exp_start_time,exp_start_time_string,start_time,start_time_string,duration,fast5_version)) - fast5.close() + fast5.close() diff --git a/poretools/metadata.py b/poretools/metadata.py index 958a675..a1c91fe 100644 --- a/poretools/metadata.py +++ b/poretools/metadata.py @@ -1,22 +1,22 @@ -import Fast5File +from . import Fast5File def run(parser, args): - if args.read: - for i, fast5 in enumerate(Fast5File.Fast5FileSet(args.files)): - for metadata_dict in fast5.read_metadata: - if i == 0: - header = metadata_dict.keys() - print "\t".join(["filename"] + header) - print "\t".join([fast5.filename] + [str( metadata_dict[k] ) for k in header]) - else: - print "asic_id\tasic_temp\theatsink_temp" - for fast5 in Fast5File.Fast5FileSet(args.files): + if args.read: + for i, fast5 in enumerate(Fast5File.Fast5FileSet(args.files)): + for metadata_dict in fast5.read_metadata: + if i == 0: + header = list(metadata_dict.keys()) + print("\t".join(["filename"] + header)) + print("\t".join([fast5.filename] + [str( metadata_dict[k] ) for k in header])) + else: + print("asic_id\tasic_temp\theatsink_temp") + for fast5 in Fast5File.Fast5FileSet(args.files): - asic_temp = fast5.get_asic_temp() - asic_id = fast5.get_asic_id() - heatsink_temp = fast5.get_heatsink_temp() + asic_temp = fast5.get_asic_temp() + asic_id = fast5.get_asic_id() + heatsink_temp = fast5.get_heatsink_temp() - print "%s\t%s\t%s" % (asic_id, asic_temp, heatsink_temp) + print("%s\t%s\t%s" % (asic_id, asic_temp, heatsink_temp)) - fast5.close() + fast5.close() diff --git a/poretools/nucdist.py b/poretools/nucdist.py index 94a7fae..ba9e9ee 100644 --- a/poretools/nucdist.py +++ b/poretools/nucdist.py @@ -1,19 +1,19 @@ -import Fast5File +from . 
diff --git a/poretools/nucdist.py b/poretools/nucdist.py
index 94a7fae..ba9e9ee 100644
--- a/poretools/nucdist.py
+++ b/poretools/nucdist.py
@@ -1,19 +1,19 @@
-import Fast5File
+from . import Fast5File
 from collections import Counter
 
 def run(parser, args):
-    nuc_count = Counter()
-    total_nucs = 0
+    nuc_count = Counter()
+    total_nucs = 0
 
-    for fast5 in Fast5File.Fast5FileSet(args.files):
-        fq = fast5.get_fastq()
-        if fq is not None:
-            for n in fq.seq:
-                nuc_count[n] += 1
-                total_nucs += 1
-        fast5.close()
+    for fast5 in Fast5File.Fast5FileSet(args.files):
+        fq = fast5.get_fastq()
+        if fq is not None:
+            for n in fq.seq:
+                nuc_count[n] += 1
+                total_nucs += 1
+        fast5.close()
 
-    for n in nuc_count:
-        print '\t'.join(str(s) for s in [n, nuc_count[n],
-            total_nucs, float(nuc_count[n]) / float(total_nucs)])
\ No newline at end of file
+    for n in nuc_count:
+        print('\t'.join(str(s) for s in [n, nuc_count[n],
+            total_nucs, float(nuc_count[n]) / float(total_nucs)]))
\ No newline at end of file
diff --git a/poretools/occupancy.py b/poretools/occupancy.py
index 532d6c2..74df532 100644
--- a/poretools/occupancy.py
+++ b/poretools/occupancy.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 from collections import Counter
 import sys
 import pandas as pd
@@ -35,8 +35,8 @@ def plot_performance(parser, args, pore_measure):
             pore_values.append(0)
 
     # make a data frame of the lists
-    d = {'rownum': range(1,17)*32,
-        'colnum': sorted(range(1,33)*16),
+    d = {'rownum': list(range(1,17))*32,
+        'colnum': sorted(list(range(1,33))*16),
         'tot_reads': pore_values,
         'labels': flowcell_layout}
     df = pd.DataFrame(d)
@@ -55,7 +55,7 @@ def run(parser, args):
     tot_reads_per_pore = Counter()
    tot_bp_per_pore = Counter()
 
-    print "\t".join(['channel_number', 'start_time', 'duration'])
+    print("\t".join(['channel_number', 'start_time', 'duration']))
     for fast5 in Fast5File.Fast5FileSet(args.files):
         if fast5.is_open:
             fq = fast5.get_fastq()
@@ -70,10 +70,10 @@
                 tot_reads_per_pore[int(pore_id)] += 1
                 tot_bp_per_pore[int(pore_id)] += len(fq.seq)
 
-                print "\t".join([
+                print("\t".join([
                     str(pore_id),
                     str(start_time),
-                    str(fast5.get_duration())])
+                    str(fast5.get_duration())]))
             fast5.close()
 
     if args.plot_type == 'read_count':
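
The occupancy.py change above is needed because range() in Python 3 returns a lazy sequence object, so repeating it with * (as the flowcell row and column labels do) raises a TypeError until it is materialised with list(). A short illustration of the pattern, outside any poretools code:

# Python 2: range(1, 5) * 2      -> [1, 2, 3, 4, 1, 2, 3, 4]
# Python 3: range(1, 5) * 2      -> TypeError
rownum = list(range(1, 5)) * 2   # materialise first, then repeat
print(rownum)                    # [1, 2, 3, 4, 1, 2, 3, 4]
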
diff --git a/poretools/organise.py b/poretools/organise.py
index 7b3e578..8ab643b 100644
--- a/poretools/organise.py
+++ b/poretools/organise.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 import sys
 import os
 from os import makedirs
@@ -11,29 +11,29 @@ logger.setLevel(logging.INFO)
 
 def run(parser, args):
 
-    if not os.path.isdir(args.dest):
+    if not os.path.isdir(args.dest):
         logger.error('destination directory needs to exist')
         return
 
-    for fast5 in Fast5File.Fast5FileSet(args.files):
+    for fast5 in Fast5File.Fast5FileSet(args.files):
 
-        #offset = fast5.get_start_time() - fast5.get_exp_start_time()
+        #offset = fast5.get_start_time() - fast5.get_exp_start_time()
 
-        specific_id = fast5.get_sample_name()
-        if not specific_id:
-            specific_id = fast5.get_asic_id()
+        specific_id = fast5.get_sample_name()
+        if not specific_id:
+            specific_id = fast5.get_asic_id()
 
-        path = "%s/%s" % (args.dest, specific_id)
-        if not os.path.isdir(path):
-            makedirs(path)
+        path = "%s/%s" % (args.dest, specific_id)
+        if not os.path.isdir(path):
+            makedirs(path)
 
-        #fas = fast5.get_fastas(args.type)
+        #fas = fast5.get_fastas(args.type)
 
-        fast5.close()
+        fast5.close()
 
-        filename = os.path.split(fast5.filename)[1]
-        if args.copy:
-            shutil.copyfile(fast5.filename, path + '/' + filename)
-        else:
-            shutil.move(fast5.filename, path + '/' + filename)
+        filename = os.path.split(fast5.filename)[1]
+        if args.copy:
+            shutil.copyfile(fast5.filename, path + '/' + filename)
+        else:
+            shutil.move(fast5.filename, path + '/' + filename)
diff --git a/poretools/poretools_main.py b/poretools/poretools_main.py
old mode 100755
new mode 100644
index 22d0aa9..7e6bbea
--- a/poretools/poretools_main.py
+++ b/poretools/poretools_main.py
@@ -13,43 +13,43 @@
 def run_subtool(parser, args):
     if args.command == 'combine':
-        import combine as submodule
+        from . import combine as submodule
     elif args.command == 'events':
-        import events as submodule
+        from . import events as submodule
     elif args.command == 'fasta':
-        import fasta as submodule
+        from . import fasta as submodule
     elif args.command == 'fastq':
-        import fastq as submodule
+        from . import fastq as submodule
     elif args.command == 'hist':
-        import hist as submodule
+        from . import hist as submodule
     elif args.command == 'metadata':
-        import metadata as submodule
+        from . import metadata as submodule
    elif args.command == 'nucdist':
-        import nucdist as submodule
+        from . import nucdist as submodule
     elif args.command == 'occupancy':
-        import occupancy as submodule
+        from . import occupancy as submodule
     elif args.command == 'qualdist':
-        import qualdist as submodule
+        from . import qualdist as submodule
     elif args.command == 'qualpos':
-        import qual_v_pos as submodule
+        from . import qual_v_pos as submodule
     elif args.command == 'readstats':
-        import readstats as submodule
+        from . import readstats as submodule
     elif args.command == 'stats':
-        import stats as submodule
+        from . import stats as submodule
     elif args.command == 'tabular':
-        import tabular as submodule
+        from . import tabular as submodule
     elif args.command == 'times':
-        import times as submodule
+        from . import times as submodule
     elif args.command == 'squiggle':
-        import squiggle as submodule
+        from . import squiggle as submodule
     elif args.command == 'winner':
-        import winner as submodule
+        from . import winner as submodule
     elif args.command == 'yield_plot':
-        import yield_plot as submodule
+        from . import yield_plot as submodule
     elif args.command == 'index':
-        import index as submodule
+        from . import index as submodule
     elif args.command == 'organise':
-        import organise as submodule
+        from . import organise as submodule
 
     # run the chosen submodule.
     submodule.run(parser, args)
@@ -57,7 +57,7 @@
 class ArgumentParserWithDefaults(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super(ArgumentParserWithDefaults, self).__init__(*args, **kwargs)
-        self.add_argument("-q", "--quiet", help="Do not output warnings to stderr",
+        self.add_argument("-q", "--quiet", help="Do not output warnings to stderr",
             action="store_true",
             dest="quiet")
@@ -531,7 +531,7 @@ def main():
 
     try:
        args.func(parser, args)
-    except IOError, e:
+    except IOError as e:
        if e.errno != 32:  # ignore SIGPIPE
            raise
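
The last poretools_main.py hunk is the Python 3 exception syntax ("except IOError as e" rather than "except IOError, e"); the unchanged errno test keeps the existing behaviour of ignoring broken pipes when output is piped into tools such as head. A hedged, standalone sketch of that idiom (this is not the tool's actual entry point):

import errno

def main():
    try:
        for i in range(10 ** 6):
            print(i)
    except IOError as e:
        if e.errno != errno.EPIPE:  # EPIPE == 32: ignore SIGPIPE, re-raise anything else
            raise

if __name__ == '__main__':
    main()
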
diff --git a/poretools/qual_v_pos.py b/poretools/qual_v_pos.py
index c310b1e..433d92a 100644
--- a/poretools/qual_v_pos.py
+++ b/poretools/qual_v_pos.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 from collections import defaultdict
 import pandas
 import matplotlib.pyplot as plt
@@ -32,7 +32,7 @@ def run(parser, args):
             continue
 
         for fq in fqs:
-            if fq is None or len(fq.seq) < args.min_length or len(fq.seq) > args.max_length:
+            if fq is None or len(fq.seq) < args.min_length or len(fq.seq) > args.max_length:
                 continue
 
             ctr = 0
diff --git a/poretools/qualdist.py b/poretools/qualdist.py
index 66fbb26..414b935 100644
--- a/poretools/qualdist.py
+++ b/poretools/qualdist.py
@@ -1,19 +1,19 @@
-import Fast5File
+from . import Fast5File
 from collections import Counter
 
 def run(parser, args):
-    qual_count = Counter()
-    total_nucs = 0
+    qual_count = Counter()
+    total_nucs = 0
 
-    for fast5 in Fast5File.Fast5FileSet(args.files):
-        fq = fast5.get_fastq()
-        if fq is not None:
-            for q in fq.qual:
-                qual_count[ord(q)-33] += 1
-                total_nucs += 1
-        fast5.close()
+    for fast5 in Fast5File.Fast5FileSet(args.files):
+        fq = fast5.get_fastq()
+        if fq is not None:
+            for q in fq.qual:
+                qual_count[ord(q)-33] += 1
+                total_nucs += 1
+        fast5.close()
 
-    for q in qual_count:
-        print '\t'.join(str(s) for s in [chr(q+33), q, qual_count[q],
-            total_nucs, float(qual_count[q]) / float(total_nucs)])
\ No newline at end of file
+    for q in qual_count:
+        print('\t'.join(str(s) for s in [chr(q+33), q, qual_count[q],
+            total_nucs, float(qual_count[q]) / float(total_nucs)]))
\ No newline at end of file
diff --git a/poretools/readstats.py b/poretools/readstats.py
index 3d7658f..dce9d8d 100644
--- a/poretools/readstats.py
+++ b/poretools/readstats.py
@@ -1,27 +1,27 @@
-import Fast5File
+from . import Fast5File
 
 def run(parser, args):
-    print "start_time\tchannel_number\tread_number\ttemplate_events\tcomplement_events"
+    print("start_time\tchannel_number\tread_number\ttemplate_events\tcomplement_events")
 
-    for fast5 in Fast5File.Fast5FileSet(args.files):
+    for fast5 in Fast5File.Fast5FileSet(args.files):
 
-        start_time = fast5.get_start_time()
-        channel_number = fast5.get_channel_number()
-        read_number = fast5.get_read_number()
+        start_time = fast5.get_start_time()
+        channel_number = fast5.get_channel_number()
+        read_number = fast5.get_read_number()
 
-        template_events = fast5.get_template_events()
-        if template_events is not None:
-            template_len = len(template_events)
-        else:
-            template_len = 0
+        template_events = fast5.get_template_events()
+        if template_events is not None:
+            template_len = len(template_events)
+        else:
+            template_len = 0
 
-        complement_events = fast5.get_complement_events()
-        if complement_events is not None:
-            complement_len = len(complement_events)
-        else:
-            complement_len = 0
+        complement_events = fast5.get_complement_events()
+        if complement_events is not None:
+            complement_len = len(complement_events)
+        else:
+            complement_len = 0
 
-        print "%s\t%s\t%s\t%s\t%s" % (start_time, channel_number, read_number, template_len, complement_len)
+        print("%s\t%s\t%s\t%s\t%s" % (start_time, channel_number, read_number, template_len, complement_len))
 
-        fast5.close()
+        fast5.close()
diff --git a/poretools/squiggle.py b/poretools/squiggle.py
index cd8d667..14949ef 100644
--- a/poretools/squiggle.py
+++ b/poretools/squiggle.py
@@ -10,7 +10,7 @@
 import logging
 logger = logging.getLogger('poretools')
 
-import Fast5File
+from . import Fast5File
 
 def plot_squiggle(args, filename, start_times, mean_signals):
     """
@@ -70,7 +70,7 @@ def run(parser, args):
 
     fast5_set = Fast5File.Fast5FileSet(args.files)
 
-    first_fast5 = fast5_set.next()
+    first_fast5 = next(fast5_set)
     for fast5 in fast5_set:
         # only create a squiggle plot for multiple reads if saving to file.
         if args.saveas is None:
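
The squiggle.py hunk switches from the removed Python 2 iterator method fast5_set.next() to the builtin next(fast5_set); under Python 3 an iterable class defines __next__ and callers go through next(). A minimal sketch of the protocol with an invented class (it is not Fast5FileSet):

class Countdown(object):
    def __init__(self, n):
        self.n = n

    def __iter__(self):
        return self

    def __next__(self):          # was "def next(self)" in Python 2
        if self.n <= 0:
            raise StopIteration
        self.n -= 1
        return self.n

it = Countdown(3)
print(next(it))    # 2; the builtin next() dispatches to __next__
print(list(it))    # [1, 0]
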
""" - if isinstance(l, list) and isinstance(x, list): - l = sorted(l) - x = sorted(x) - total = sum(l) + if isinstance(l, list) and isinstance(x, list): + l = sorted(l) + x = sorted(x) + total = sum(l) nxsum = 0 nxvalues = {e:0 for e in x} - for e in x: + for e in x: xpct = total*e/100.0 while nxsum < xpct and l: nxsum += l[-1] @@ -47,5 +47,5 @@ def NX(l, x=[25,50,75]): nxvalues[e] = lastsize return nxvalues - else: - return None + else: + return None diff --git a/poretools/stats.py b/poretools/stats.py index b2b9920..9de76af 100644 --- a/poretools/stats.py +++ b/poretools/stats.py @@ -1,63 +1,63 @@ -import statistics as stat -import Fast5File +from . import statistics as stat +from . import Fast5File import logging from collections import defaultdict logger = logging.getLogger('poretools') def run(parser, args): - if args.full_tsv: - files = 0 - basecalled_files = 0 - stats = defaultdict(list) - for fast5 in Fast5File.Fast5FileSet(args.files): - files += 1 - fas = fast5.get_fastas_dict() - if len(fas) > 0: - basecalled_files += 1 - for category, fa in fas.iteritems(): - if fa is not None: - stats[category].append(len(fa.seq)) - if category == 'twodirections': - if fast5.is_high_quality(): - stats['2D_hq'].append(len(fa.seq)) + if args.full_tsv: + files = 0 + basecalled_files = 0 + stats = defaultdict(list) + for fast5 in Fast5File.Fast5FileSet(args.files): + files += 1 + fas = fast5.get_fastas_dict() + if len(fas) > 0: + basecalled_files += 1 + for category, fa in fas.items(): + if fa is not None: + stats[category].append(len(fa.seq)) + if category == 'twodirections': + if fast5.is_high_quality(): + stats['2D_hq'].append(len(fa.seq)) - fast5.close() + fast5.close() - print "files\ttotal reads\t%d" % (files) - print "files\ttotal base-called reads\t%d" % (basecalled_files) - for category in sorted(stats.keys()): - sizes = stats[category] + print("files\ttotal reads\t%d" % (files)) + print("files\ttotal base-called reads\t%d" % (basecalled_files)) + for category in sorted(stats.keys()): + sizes = stats[category] - if len(sizes) > 0: - print "%s\ttotal reads\t%d" % (category, len(sizes)) - print "%s\ttotal base pairs\t%d" % (category, sum(sizes)) - print "%s\tmean\t%.2f" % (category, stat.mean(sizes)) - print "%s\tmedian\t%d" % (category, stat.median(sizes)) - print "%s\tmin\t%d" % (category, min(sizes)) - print "%s\tmax\t%d" % (category, max(sizes)) - nxvalues = stat.NX(sizes, [25,50,75]) - print "%s\tN25\t%d" % (category, nxvalues[25]) - print "%s\tN50\t%d" % (category, nxvalues[50]) - print "%s\tN75\t%d" % (category, nxvalues[75]) - else: - logger.warning("No valid sequences observed.\n") - else: - sizes = [] - for fast5 in Fast5File.Fast5FileSet(args.files): - fas = fast5.get_fastas(args.type) - sizes.extend([len(fa.seq) for fa in fas if fa is not None]) - fast5.close() + if len(sizes) > 0: + print("%s\ttotal reads\t%d" % (category, len(sizes))) + print("%s\ttotal base pairs\t%d" % (category, sum(sizes))) + print("%s\tmean\t%.2f" % (category, stat.mean(sizes))) + print("%s\tmedian\t%d" % (category, stat.median(sizes))) + print("%s\tmin\t%d" % (category, min(sizes))) + print("%s\tmax\t%d" % (category, max(sizes))) + nxvalues = stat.NX(sizes, [25,50,75]) + print("%s\tN25\t%d" % (category, nxvalues[25])) + print("%s\tN50\t%d" % (category, nxvalues[50])) + print("%s\tN75\t%d" % (category, nxvalues[75])) + else: + logger.warning("No valid sequences observed.\n") + else: + sizes = [] + for fast5 in Fast5File.Fast5FileSet(args.files): + fas = fast5.get_fastas(args.type) + 
diff --git a/poretools/stats.py b/poretools/stats.py
index b2b9920..9de76af 100644
--- a/poretools/stats.py
+++ b/poretools/stats.py
@@ -1,63 +1,63 @@
-import statistics as stat
-import Fast5File
+from . import statistics as stat
+from . import Fast5File
 import logging
 from collections import defaultdict
 logger = logging.getLogger('poretools')
 
 def run(parser, args):
-    if args.full_tsv:
-        files = 0
-        basecalled_files = 0
-        stats = defaultdict(list)
-        for fast5 in Fast5File.Fast5FileSet(args.files):
-            files += 1
-            fas = fast5.get_fastas_dict()
-            if len(fas) > 0:
-                basecalled_files += 1
-            for category, fa in fas.iteritems():
-                if fa is not None:
-                    stats[category].append(len(fa.seq))
-                    if category == 'twodirections':
-                        if fast5.is_high_quality():
-                            stats['2D_hq'].append(len(fa.seq))
+    if args.full_tsv:
+        files = 0
+        basecalled_files = 0
+        stats = defaultdict(list)
+        for fast5 in Fast5File.Fast5FileSet(args.files):
+            files += 1
+            fas = fast5.get_fastas_dict()
+            if len(fas) > 0:
+                basecalled_files += 1
+            for category, fa in fas.items():
+                if fa is not None:
+                    stats[category].append(len(fa.seq))
+                    if category == 'twodirections':
+                        if fast5.is_high_quality():
+                            stats['2D_hq'].append(len(fa.seq))
 
-            fast5.close()
+            fast5.close()
 
-        print "files\ttotal reads\t%d" % (files)
-        print "files\ttotal base-called reads\t%d" % (basecalled_files)
-        for category in sorted(stats.keys()):
-            sizes = stats[category]
+        print("files\ttotal reads\t%d" % (files))
+        print("files\ttotal base-called reads\t%d" % (basecalled_files))
+        for category in sorted(stats.keys()):
+            sizes = stats[category]
 
-            if len(sizes) > 0:
-                print "%s\ttotal reads\t%d" % (category, len(sizes))
-                print "%s\ttotal base pairs\t%d" % (category, sum(sizes))
-                print "%s\tmean\t%.2f" % (category, stat.mean(sizes))
-                print "%s\tmedian\t%d" % (category, stat.median(sizes))
-                print "%s\tmin\t%d" % (category, min(sizes))
-                print "%s\tmax\t%d" % (category, max(sizes))
-                nxvalues = stat.NX(sizes, [25,50,75])
-                print "%s\tN25\t%d" % (category, nxvalues[25])
-                print "%s\tN50\t%d" % (category, nxvalues[50])
-                print "%s\tN75\t%d" % (category, nxvalues[75])
-            else:
-                logger.warning("No valid sequences observed.\n")
-    else:
-        sizes = []
-        for fast5 in Fast5File.Fast5FileSet(args.files):
-            fas = fast5.get_fastas(args.type)
-            sizes.extend([len(fa.seq) for fa in fas if fa is not None])
-            fast5.close()
+            if len(sizes) > 0:
+                print("%s\ttotal reads\t%d" % (category, len(sizes)))
+                print("%s\ttotal base pairs\t%d" % (category, sum(sizes)))
+                print("%s\tmean\t%.2f" % (category, stat.mean(sizes)))
+                print("%s\tmedian\t%d" % (category, stat.median(sizes)))
+                print("%s\tmin\t%d" % (category, min(sizes)))
+                print("%s\tmax\t%d" % (category, max(sizes)))
+                nxvalues = stat.NX(sizes, [25,50,75])
+                print("%s\tN25\t%d" % (category, nxvalues[25]))
+                print("%s\tN50\t%d" % (category, nxvalues[50]))
+                print("%s\tN75\t%d" % (category, nxvalues[75]))
+            else:
+                logger.warning("No valid sequences observed.\n")
+    else:
+        sizes = []
+        for fast5 in Fast5File.Fast5FileSet(args.files):
+            fas = fast5.get_fastas(args.type)
+            sizes.extend([len(fa.seq) for fa in fas if fa is not None])
+            fast5.close()
 
-        if len(sizes) > 0:
-            print "total reads\t%d" % (len(sizes))
-            print "total base pairs\t%d" % (sum(sizes))
-            print "mean\t%.2f" % (stat.mean(sizes))
-            print "median\t%d" % (stat.median(sizes))
-            print "min\t%d" % (min(sizes))
-            print "max\t%d" % (max(sizes))
+        if len(sizes) > 0:
+            print("total reads\t%d" % (len(sizes)))
+            print("total base pairs\t%d" % (sum(sizes)))
+            print("mean\t%.2f" % (stat.mean(sizes)))
+            print("median\t%d" % (stat.median(sizes)))
+            print("min\t%d" % (min(sizes)))
+            print("max\t%d" % (max(sizes)))
            nxvalues = stat.NX(sizes, [25,50,75])
-            print "N25\t%d" % (nxvalues[25])
-            print "N50\t%d" % (nxvalues[50])
-            print "N75\t%d" % (nxvalues[75])
-        else:
-            logger.warning("No valid sequences observed.\n")
+            print("N25\t%d" % (nxvalues[25]))
+            print("N50\t%d" % (nxvalues[50]))
+            print("N75\t%d" % (nxvalues[75]))
+        else:
+            logger.warning("No valid sequences observed.\n")
diff --git a/poretools/tabular.py b/poretools/tabular.py
index 24a6a2e..ff70648 100644
--- a/poretools/tabular.py
+++ b/poretools/tabular.py
@@ -1,14 +1,14 @@
-import Fast5File
+from . import Fast5File
 
 def run(parser, args):
-
-    print '\t'.join(['length', 'name', 'sequence', 'quals'])
-
-    for fast5 in Fast5File.Fast5FileSet(args.files):
-        fqs = fast5.get_fastqs(args.type)
-        for fq in fqs:
-            if fq is None:
-                fast5.close()
-                continue
-            print '\t'.join([str(len(fq.seq)), fq.name, fq.seq, fq.qual])
-        fast5.close()
\ No newline at end of file
+
+    print('\t'.join(['length', 'name', 'sequence', 'quals']))
+
+    for fast5 in Fast5File.Fast5FileSet(args.files):
+        fqs = fast5.get_fastqs(args.type)
+        for fq in fqs:
+            if fq is None:
+                fast5.close()
+                continue
+            print('\t'.join([str(len(fq.seq)), fq.name, fq.seq, fq.qual]))
+        fast5.close()
\ No newline at end of file
diff --git a/poretools/times.py b/poretools/times.py
index d568cc0..76e3249 100644
--- a/poretools/times.py
+++ b/poretools/times.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 from time import strftime, localtime
 import sys
 
@@ -7,37 +7,37 @@ logger = logging.getLogger('poretools')
 
 def run(parser, args):
 
-    print '\t'.join(['channel', 'filename', 'read_length',
-        'exp_starttime', 'unix_timestamp', 'duration',
-        'unix_timestamp_end', 'iso_timestamp', 'day',
-        'hour', 'minute'])
-
-    for fast5 in Fast5File.Fast5FileSet(args.files):
-        if fast5.is_open:
-
-            fq = fast5.get_fastq()
-
-            start_time = fast5.get_start_time()
-            if start_time is None:
-                logger.warning("No start time for %s!" % (fast5.filename))
-                fast5.close()
-                continue
+    print('\t'.join(['channel', 'filename', 'read_length',
+        'exp_starttime', 'unix_timestamp', 'duration',
+        'unix_timestamp_end', 'iso_timestamp', 'day',
+        'hour', 'minute']))
+
+    for fast5 in Fast5File.Fast5FileSet(args.files):
+        if fast5.is_open:
+
+            fq = fast5.get_fastq()
+
+            start_time = fast5.get_start_time()
+            if start_time is None:
+                logger.warning("No start time for %s!" % (fast5.filename))
+                fast5.close()
+                continue
 
-            if fq is not None:
-                read_length = len(fq.seq)
-            else:
-                read_length = 0
+            if fq is not None:
+                read_length = len(fq.seq)
+            else:
+                read_length = 0
 
-            lt = localtime(start_time)
-            print "\t".join([fast5.get_channel_number(),
-                fast5.filename,
-                str(read_length),
-                str(fast5.get_exp_start_time()),
-                str(start_time), \
-                str(fast5.get_duration()),
-                str(fast5.get_end_time()),
-                strftime('%Y-%m-%dT%H:%M:%S%z', lt),
-                strftime('%d', lt),
-                strftime('%H', lt),
-                strftime('%M', lt)])
-        fast5.close()
+            lt = localtime(start_time)
+            print("\t".join([fast5.get_channel_number(),
+                fast5.filename,
+                str(read_length),
+                str(fast5.get_exp_start_time()),
+                str(start_time), \
+                str(fast5.get_duration()),
+                str(fast5.get_end_time()),
+                strftime('%Y-%m-%dT%H:%M:%S%z', lt),
+                strftime('%d', lt),
+                strftime('%H', lt),
+                strftime('%M', lt)]))
+        fast5.close()
diff --git a/poretools/winner.py b/poretools/winner.py
index a6671ac..5766aed 100644
--- a/poretools/winner.py
+++ b/poretools/winner.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 import sys
 
 #logging
@@ -7,19 +7,19 @@
 
 def run(parser, args):
 
-    longest_size = 0
-    longest_read = None
-
-    for fast5 in Fast5File.Fast5FileSet(args.files):
-        fas = fast5.get_fastas(args.type)
+    longest_size = 0
+    longest_read = None
+
+    for fast5 in Fast5File.Fast5FileSet(args.files):
+        fas = fast5.get_fastas(args.type)
 
-        for fa in fas:
-            if fa and len(fa.seq) > longest_size:
-                longest_size = len(fa.seq)
-                longest_read = fa
+        for fa in fas:
+            if fa and len(fa.seq) > longest_size:
+                longest_size = len(fa.seq)
+                longest_read = fa
 
-        fast5.close()
+        fast5.close()
 
-    logger.info("Wow, it's a whopper: your longest read is %d bases." % (longest_size,))
-    print longest_read
+    logger.info("Wow, it's a whopper: your longest read is %d bases." % (longest_size,))
+    print(longest_read)
diff --git a/poretools/yield_plot.py b/poretools/yield_plot.py
index af4b69b..1ab218e 100644
--- a/poretools/yield_plot.py
+++ b/poretools/yield_plot.py
@@ -1,4 +1,4 @@
-import Fast5File
+from . import Fast5File
 import matplotlib
 #matplotlib.use('Agg') # Must be called before any other matplotlib calls
 from matplotlib import pyplot as plt
@@ -25,16 +25,16 @@ def plot_collectors_curve(args, start_times, read_lengths):
     # compute the cumulative based on reads or total base pairs
     if args.plot_type == 'reads':
         y_label = "Total reads"
-        cumulative = np.cumsum(range(len(start_times)))
+        cumulative = np.cumsum(list(range(len(start_times))))
     elif args.plot_type == 'basepairs':
         y_label = "Total base pairs"
         cumulative = np.cumsum(read_lengths)
 
     step = args.skip
     # make a data frame of the lists
-    d = {'start': [start_times[n] for n in xrange(0, len(start_times), step)],
-        'lengths': [read_lengths[n] for n in xrange(0, len(read_lengths), step)],
-        'cumul': [cumulative[n] for n in xrange(0, len(cumulative), step)]}
+    d = {'start': [start_times[n] for n in range(0, len(start_times), step)],
+        'lengths': [read_lengths[n] for n in range(0, len(read_lengths), step)],
+        'cumul': [cumulative[n] for n in range(0, len(cumulative), step)]}
     df = pd.DataFrame(d)
 
     if args.savedf:
diff --git a/setup.py b/setup.py
index e5479f0..c5978d2 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 version_py = os.path.join(os.path.dirname(__file__), 'poretools', 'version.py')
 version = open(version_py).read().strip().split('=')[-1].replace('"','').strip()
 
-print version
+print(version)
 
 long_description = """
 ``poretools`` is a toolset for working with nanopore sequencing data'
 """