From 77fb4a977204a021bbc63af855111dfa8ad372e8 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Fri, 22 Apr 2016 13:58:57 -0700 Subject: [PATCH 1/9] protect fastq_paths in try / catch also --- poretools/Fast5File.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 62c3fc4..f03d5e4 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -433,8 +433,8 @@ def find_read_number_block(self): return node def find_event_timing_block(self): - path = fastq_paths[self.version]['template'] % (self.group) try: + path = fastq_paths[self.version]['template'] % (self.group) node = self.hdf5file[path] path = node.get('Events') #, getlink=True) From c8ea6c4f32966083a17c8066c76e4357e217c83f Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sun, 4 Dec 2016 20:44:05 -0500 Subject: [PATCH 2/9] added zipfile and zipfile iterator --- poretools/Fast5File.py | 58 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 5421fdf..ad6fadd 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -2,6 +2,7 @@ import os import glob import tarfile +import zipfile import shutil import h5py @@ -34,6 +35,7 @@ FAST5SET_DIRECTORY = 1 FAST5SET_SINGLEFILE = 2 FAST5SET_TARBALL = 3 +FAST5SET_ZIP = 4 PORETOOLS_TMPDIR = '.poretools_tmp' @@ -73,6 +75,7 @@ def next(self): class Fast5FileSet(object): def __init__(self, fileset, group=0): + self.set_type = None if isinstance(fileset, list): self.fileset = fileset elif isinstance(fileset, str): @@ -98,10 +101,19 @@ def next(self): return Fast5File(self.files.next(), self.group) except Exception as e: # cleanup our mess - if self.set_type == FAST5SET_TARBALL: + if self.set_type == FAST5SET_TARBALL or self.set_type == FAST5SET_ZIP: shutil.rmtree(PORETOOLS_TMPDIR) raise StopIteration + def _prep_tmpdir(self, path): + if path is None: + path = PORETOOLS_TMPDIR + else: + PORETOOLS_TMPDIR = path + if os.path.isdir(PORETOOLS_TMPDIR): + shutil.rmtree(PORETOOLS_TMPDIR) + os.mkdir(PORETOOLS_TMPDIR) + def _extract_fast5_files(self): # return as-is if list of files @@ -124,15 +136,21 @@ def _extract_fast5_files(self): # is it a tarball? elif tarfile.is_tarfile(f): - if os.path.isdir(PORETOOLS_TMPDIR): - shutil.rmtree(PORETOOLS_TMPDIR) - os.mkdir(PORETOOLS_TMPDIR) - + self._prep_tmpdir(PORETOOLS_TMPDIR) self.files = TarballFileIterator(f) # set to None to delay initialisation self.num_files_in_set = None self.set_type = FAST5SET_TARBALL + # is it a zipfile? + elif zipfile.is_zipfile(f): + self._prep_tmpdir(PORETOOLS_TMPDIR) + zipfile = zipfile.ZipFile(f, 'r', zipfile.ZIP_STORED, True) + self.files = ZipFileIterator( zipfile ) + # set to None to delay initialisation + self.num_files_in_set = None + self.set_type( FAST5SET_ZIP ) + # just a single FAST5 file. else: self.files = iter([f]) @@ -170,6 +188,36 @@ def __len__(self): with tarfile.open(self._tarball) as tar: return len(tar.getnames()) +class ZipFileIterator: + def _fast5_filename_filter(self, filename): + return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') + + def __init__(self, zipfile): + self._zipfile = zipfile + self._infolist = zipfile.infolist().reverse() + + def __del__(self): + self._zipfile.close() + + def __iter__(self): + return self + + def next(self): + zipinfo = None + while True: + if len(self._infolist) == 0: + break + zipinfo = self._infolist.pop() # returns last entry, hence the reverse at init + if zipinfo and self._fast5_filename_filter( zipinfo.filename ): + break + if zipinfo: + self._zipfile.extract(zipinfo, PORETOOLS_TMPDIR) + return os.path.join(PORETOOLS_TMPDIR, zipinfo.filename ) + else: + raise StopIteration + + def __len__(self): + return len(self._infolist) class Fast5File(object): From 8ead21a3cc57521854816f85a9b2e2a04f573dd8 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Thu, 2 Mar 2017 13:44:12 -0800 Subject: [PATCH 3/9] checkpoint --- poretools/Fast5File.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index ddc866d..2a62267 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -5,6 +5,7 @@ import zipfile import shutil import h5py +import tempfile #logging import logging @@ -295,6 +296,23 @@ def close(self): self.hdf5file.close() self.is_open = False + def repack(self, newfile): + """ + Copy the contents into a new Fast5 file more optimally + """ + if self.is_open: + try: + fcopy = h5py.File(newfile, 'w') + for x in self.hdf5file.items(): + self.hdf5file.copy(x[0], fcopy) + fcopy.close() + except Exception, e: + logger.warning("Can not open a new file %s for writing!\n" % (newfile)) + return False + return True + else: + return False + def has_2D(self): """ Return TRUE if the FAST5 has a 2D base-called sequence. @@ -869,3 +887,45 @@ def _get_metadata(self): except Exception, e: self.keyinfo = None logger.warning("Cannot find keyinfo. Exiting.\n") + +class Fast5ZipArchive(object): + """ + Creates or appends a .zip file with a directory or list of fast5 files + """ + + def __init__(self, filename): + """Opens a new or appends an old zip file""" + self.filename = args[0] + self.zipfile = zipfile.ZipFile(self.filename, 'a', zipfile.ZIP_DEFLATED, True) + self.tmp = tempdir.mkdtemp(prefix=prefix) + + def __del__(self): + self.zipfile.close() + os.rmdir(self.tmp) + + def append_dir(self, path): + for file in os.listdir(path): + fpath = '%s/%s' % (path,file) + if os.path.isdir(fpath): + self.append_dir(fpath) + elif file.endswith('.fast5'): + files.append_file(fpath) + + def append_file(self, filepath): + fast5 = Fast5File(file) + tmppath = "%s/%s" % (self.tmp, filepath) + try: + self.mkdirs(os.path.dirname(tmppath)) + except OSError: + # okay + fast5.repack(tmppath) + self.zipfile.write(tmppath, filepath) + os.unlink(tmppath) + + def append(self, *args): + for input in args: + if os.path.isdir(input): + self.append_dir(input) + elif input.endswith('.fast5): + self.append_file(input) + From c00a8a038221598c27607c50b96619c1762fecc1 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Mon, 3 Apr 2017 01:30:48 -0700 Subject: [PATCH 4/9] fixed some exceptions --- poretools/Fast5File.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 4bafd01..1a2b05f 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -159,11 +159,11 @@ def _extract_fast5_files(self): # is it a zipfile? elif zipfile.is_zipfile(f): self._prep_tmpdir(PORETOOLS_TMPDIR) - zipfile = zipfile.ZipFile(f, 'r', zipfile.ZIP_STORED, True) - self.files = ZipFileIterator( zipfile ) + zip = zipfile.ZipFile(f, 'r', zipfile.ZIP_STORED, True) + self.files = ZipFileIterator( zip ) # set to None to delay initialisation self.num_files_in_set = None - self.set_type( FAST5SET_ZIP ) + self.set_type = FAST5SET_ZIP # just a single FAST5 file. else: @@ -206,12 +206,12 @@ class ZipFileIterator: def _fast5_filename_filter(self, filename): return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') - def __init__(self, zipfile): - self._zipfile = zipfile - self._infolist = zipfile.infolist().reverse() + def __init__(self, zip): + self._zip = zip + self._infolist = zip.infolist().reverse() def __del__(self): - self._zipfile.close() + self._zip.close() def __iter__(self): return self @@ -225,7 +225,7 @@ def next(self): if zipinfo and self._fast5_filename_filter( zipinfo.filename ): break if zipinfo: - self._zipfile.extract(zipinfo, PORETOOLS_TMPDIR) + self._zip.extract(zipinfo, PORETOOLS_TMPDIR) return os.path.join(PORETOOLS_TMPDIR, zipinfo.filename ) else: raise StopIteration @@ -1001,11 +1001,11 @@ class Fast5ZipArchive(object): def __init__(self, filename): """Opens a new or appends an old zip file""" self.filename = args[0] - self.zipfile = zipfile.ZipFile(self.filename, 'a', zipfile.ZIP_DEFLATED, True) + self.zip = zipfile.ZipFile(self.filename, 'a', zipfile.ZIP_DEFLATED, True) self.tmp = tempdir.mkdtemp(prefix=prefix) def __del__(self): - self.zipfile.close() + self.zip.close() os.rmdir(self.tmp) def append_dir(self, path): @@ -1022,15 +1022,15 @@ def append_file(self, filepath): try: self.mkdirs(os.path.dirname(tmppath)) except OSError: - # okay + pass # okay fast5.repack(tmppath) - self.zipfile.write(tmppath, filepath) + self.zip.write(tmppath, filepath) os.unlink(tmppath) def append(self, *args): for input in args: if os.path.isdir(input): self.append_dir(input) - elif input.endswith('.fast5): + elif input.endswith('.fast5'): self.append_file(input) From 07b99041645a13ca86de4feb0960f28a52d9b39b Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Tue, 4 Apr 2017 13:37:21 -0700 Subject: [PATCH 5/9] minor changes --- poretools/Fast5File.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 1a2b05f..5909dad 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -44,7 +44,7 @@ FAST5SET_SINGLEFILE = 2 FAST5SET_TARBALL = 3 FAST5SET_ZIP = 4 -PORETOOLS_TMPDIR = '.poretools_tmp' +PORETOOLS_TMPDIR = '/dev/shm/.poretools_tmp' class Fast5DirHandler(object): @@ -158,6 +158,7 @@ def _extract_fast5_files(self): # is it a zipfile? elif zipfile.is_zipfile(f): + print("Found zipfile %s" % (f)) self._prep_tmpdir(PORETOOLS_TMPDIR) zip = zipfile.ZipFile(f, 'r', zipfile.ZIP_STORED, True) self.files = ZipFileIterator( zip ) From dff9a5464f3c6bb250f780dcf2557376d95057b8 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Wed, 5 Apr 2017 01:14:00 -0700 Subject: [PATCH 6/9] fixed tempfile / tempdir extraction for tarball and zipfile --- poretools/Fast5File.py | 52 +++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 5909dad..9ee9398 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -44,8 +44,11 @@ FAST5SET_SINGLEFILE = 2 FAST5SET_TARBALL = 3 FAST5SET_ZIP = 4 -PORETOOLS_TMPDIR = '/dev/shm/.poretools_tmp' - +PORETOOLS_TMPDIR = None +for testdir in ['/dev/shm/', '/tmp/', '.']: + if os.path.isdir(testdir): + PORETOOLS_TMPDIR = testdir + break class Fast5DirHandler(object): @@ -92,8 +95,12 @@ def __init__(self, fileset, group=0): self.set_type = None self.num_files_in_set = None self.group = group + self._tmp = tempfile.mkdtemp(prefix=PORETOOLS_TMPDIR) self._extract_fast5_files() + def __del__(self): + os.rmdir(self._tmp) + def get_num_files(self): """ Return the number of files in the FAST5 set. @@ -107,11 +114,11 @@ def __iter__(self): def next(self): try: - return Fast5File(self.files.next(), self.group) + nextFile = next(self.files) + autoremove = isinstance(self.files, ZipFileIterator) or isinstance(self.files, TarballFileIterator) + nextFast5 = Fast5File(nextFile, self.group, autoremove) + return nextFast5 except Exception as e: - # cleanup our mess - if self.set_type == FAST5SET_TARBALL or self.set_type == FAST5SET_ZIP: - shutil.rmtree(PORETOOLS_TMPDIR) raise StopIteration def _prep_tmpdir(self, path): @@ -150,18 +157,15 @@ def _extract_fast5_files(self): # is it a tarball? elif tarfile.is_tarfile(f): - self._prep_tmpdir(PORETOOLS_TMPDIR) - self.files = TarballFileIterator(f) + self.files = TarballFileIterator(f, self._tmp) # set to None to delay initialisation self.num_files_in_set = None self.set_type = FAST5SET_TARBALL # is it a zipfile? elif zipfile.is_zipfile(f): - print("Found zipfile %s" % (f)) - self._prep_tmpdir(PORETOOLS_TMPDIR) zip = zipfile.ZipFile(f, 'r', zipfile.ZIP_STORED, True) - self.files = ZipFileIterator( zip ) + self.files = ZipFileIterator( zip, self._tmp ) # set to None to delay initialisation self.num_files_in_set = None self.set_type = FAST5SET_ZIP @@ -179,9 +183,10 @@ class TarballFileIterator: def _fast5_filename_filter(self, filename): return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') - def __init__(self, tarball): + def __init__(self, tarball, tempdir): self._tarball = tarball self._tarfile = tarfile.open(tarball) + self._tmp = tempdir def __del__(self): self._tarfile.close() @@ -196,8 +201,8 @@ def next(self): raise StopIteration elif self._fast5_filename_filter(tarinfo.name): break - self._tarfile.extract(tarinfo, path=PORETOOLS_TMPDIR) - return os.path.join(PORETOOLS_TMPDIR, tarinfo.name) + self._tarfile.extract(tarinfo, path=self._tmp) + return os.path.join(self._tmp, tarinfo.name) def __len__(self): with tarfile.open(self._tarball) as tar: @@ -207,9 +212,10 @@ class ZipFileIterator: def _fast5_filename_filter(self, filename): return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.') - def __init__(self, zip): + def __init__(self, zip, tempdir): self._zip = zip - self._infolist = zip.infolist().reverse() + self._infolist = iter(zip.infolist()) + self._tmp = tempdir def __del__(self): self._zip.close() @@ -220,14 +226,12 @@ def __iter__(self): def next(self): zipinfo = None while True: - if len(self._infolist) == 0: - break - zipinfo = self._infolist.pop() # returns last entry, hence the reverse at init + zipinfo = next(self._infolist) if zipinfo and self._fast5_filename_filter( zipinfo.filename ): break if zipinfo: - self._zip.extract(zipinfo, PORETOOLS_TMPDIR) - return os.path.join(PORETOOLS_TMPDIR, zipinfo.filename ) + self._zip.extract(zipinfo, self._tmp) + return os.path.join(self._tmp, zipinfo.filename ) else: raise StopIteration @@ -236,7 +240,7 @@ def __len__(self): class Fast5File(object): - def __init__(self, filename, group=0): + def __init__(self, filename, group=0, autoremove=False): self.filename = filename self.group = group self.is_open = self.open() @@ -258,6 +262,8 @@ def __init__(self, filename, group=0): self.have_complements = False self.have_pre_basecalled = False self.have_metadata = False + if autoremove: + os.unlink(self.filename) def __del__(self): @@ -1003,7 +1009,7 @@ def __init__(self, filename): """Opens a new or appends an old zip file""" self.filename = args[0] self.zip = zipfile.ZipFile(self.filename, 'a', zipfile.ZIP_DEFLATED, True) - self.tmp = tempdir.mkdtemp(prefix=prefix) + self.tmp = tempfile.mkdtemp(prefix=prefix) def __del__(self): self.zip.close() From 15fa30771cad985223620fe6c193fa012802c5e5 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Wed, 5 Apr 2017 01:14:15 -0700 Subject: [PATCH 7/9] fixed print --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e5479f0..c5978d2 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ version_py = os.path.join(os.path.dirname(__file__), 'poretools', 'version.py') version = open(version_py).read().strip().split('=')[-1].replace('"','').strip() -print version +print(version) long_description = """ ``poretools`` is a toolset for working with nanopore sequencing data' """ From b66ad6e4ec521fef5312ef601a3b4760e0323ea7 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Wed, 5 Apr 2017 01:55:43 -0700 Subject: [PATCH 8/9] allow multiple tar or zip files as arguments in addition to a list of fast5 files --- poretools/Fast5File.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index 9ee9398..bab9b6b 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -96,6 +96,7 @@ def __init__(self, fileset, group=0): self.num_files_in_set = None self.group = group self._tmp = tempfile.mkdtemp(prefix=PORETOOLS_TMPDIR) + self.oldfiles = None self._extract_fast5_files() def __del__(self): @@ -114,22 +115,34 @@ def __iter__(self): def next(self): try: - nextFile = next(self.files) + # allow multiple tarball or zip files to expand + try: + nextFile = next(self.files) + except StopIteration as e: + if self.oldfiles: + self.files = self.oldfiles; + self.oldfiles = None; + return self.next() + raise e + + if tarfile.is_tarfile(nextFile): + self.set_type = FAST5SET_TARBALL + self.oldfiles = self.files + self.files = TarballFileIterator(nextFile, self._tmp) + return self.next() + elif zipfile.is_zipfile(nextFile): + self.set_type = FAST5SET_ZIP + zip = zipfile.ZipFile(nextFile, 'r', zipfile.ZIP_STORED, True) + self.oldfiles = self.files + self.files = ZipFileIterator( zip, self._tmp ) + return self.next() + autoremove = isinstance(self.files, ZipFileIterator) or isinstance(self.files, TarballFileIterator) nextFast5 = Fast5File(nextFile, self.group, autoremove) return nextFast5 except Exception as e: raise StopIteration - def _prep_tmpdir(self, path): - if path is None: - path = PORETOOLS_TMPDIR - else: - PORETOOLS_TMPDIR = path - if os.path.isdir(PORETOOLS_TMPDIR): - shutil.rmtree(PORETOOLS_TMPDIR) - os.mkdir(PORETOOLS_TMPDIR) - def _extract_fast5_files(self): # return as-is if list of files From 8ff017bbf9be9a552c4885bbc922e5f53ebb60ba Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Thu, 6 Apr 2017 15:21:22 -0700 Subject: [PATCH 9/9] fixes for iterator and zipfile --- poretools/Fast5File.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py index bab9b6b..e55f374 100644 --- a/poretools/Fast5File.py +++ b/poretools/Fast5File.py @@ -92,6 +92,8 @@ def __init__(self, fileset, group=0): self.fileset = fileset elif isinstance(fileset, str): self.fileset = [fileset] + else: + raise Exception('unknown fileset - should be a string file path or list: %s'%(fileset)) self.set_type = None self.num_files_in_set = None self.group = group @@ -100,7 +102,8 @@ def __init__(self, fileset, group=0): self._extract_fast5_files() def __del__(self): - os.rmdir(self._tmp) + if self._tmp: + os.rmdir(self._tmp) def get_num_files(self): """ @@ -122,23 +125,33 @@ def next(self): if self.oldfiles: self.files = self.oldfiles; self.oldfiles = None; - return self.next() - raise e + return self.next() # recurse + else: + raise e - if tarfile.is_tarfile(nextFile): + nextFast5 = None + (f, ext) = os.path.splitext(nextFile) + ext = ext.lower() + autoremove = isinstance(self.files, ZipFileIterator) or isinstance(self.files, TarballFileIterator) + + if ext == '.fast5': + nextFast5 = Fast5File(nextFile, self.group, autoremove) + elif ext == '.tar' and tarfile.is_tarfile(nextFile) and self.oldfiles is None: self.set_type = FAST5SET_TARBALL self.oldfiles = self.files self.files = TarballFileIterator(nextFile, self._tmp) - return self.next() - elif zipfile.is_zipfile(nextFile): + nextFast5 = self.next() + elif ext == '.zip' and zipfile.is_zipfile(nextFile) and self.oldfiles is None: self.set_type = FAST5SET_ZIP zip = zipfile.ZipFile(nextFile, 'r', zipfile.ZIP_STORED, True) self.oldfiles = self.files self.files = ZipFileIterator( zip, self._tmp ) - return self.next() + nextFast5 = self.next() + else: + # fallthrough - hope it is a fast5! + nextFast5 = Fast5File(nextFile, self.group, autoremove) + - autoremove = isinstance(self.files, ZipFileIterator) or isinstance(self.files, TarballFileIterator) - nextFast5 = Fast5File(nextFile, self.group, autoremove) return nextFast5 except Exception as e: raise StopIteration