From 48eaa13a1e98df3456c31696c79d2b6241f066d7 Mon Sep 17 00:00:00 2001 From: Tim LaRock Date: Mon, 6 May 2019 11:44:55 -0400 Subject: [PATCH 1/5] Updated Paths.add_path() to allow removal of selfloops from input paths. Updated Paths.read_file to use add_path. --- pathpy/classes/paths.py | 46 +++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/pathpy/classes/paths.py b/pathpy/classes/paths.py index 5a13504..0774fe2 100644 --- a/pathpy/classes/paths.py +++ b/pathpy/classes/paths.py @@ -450,7 +450,7 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize # Omit empty fields v = fields[i].strip() if v: - path += (v,) + path += (v,) freq = float(fields[len(fields) - 1]) if freq >0: if len(path) <= max_ngram_length: @@ -468,22 +468,19 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize v = field.strip() if v: path += (v,) - if len(path) <= max_ngram_length: - p.paths[len(path) - 1][path] += (0, 1) - max_length = max(max_length, len(path) - 1) - else: # cut path at max_ngram_length - p.paths[max_ngram_length - 1][path[:max_ngram_length]] += (0, 1) - max_length = max(max_length, max_ngram_length - 1) + + if len(path) > max_ngram_length: + path = path[:max_ngram_length] + + p.add_path(path, frequency=(0,1), expand_subpaths=expand_sub_paths) + max_length = max(max_length, len(path) - 1) + line = f.readline() n += 1 # end of with open() Log.add( 'finished. Read ' + str(n - 1) + ' paths with maximum length ' + str(max_length)) - if expand_sub_paths: - p.expand_subpaths() - Log.add('finished.') - return p def write_file(self, filename, separator=','): @@ -576,7 +573,7 @@ def expand_subpaths(self): self.paths[k][path_slice][0] += frequency - def add_path(self, path, frequency=1, expand_subpaths=True, separator=','): + def add_path(self, path, frequency=1, expand_subpaths=True, remove_selfloops=False, separator=','): """Adds a path to this Paths instance. 
The path argument can either be a list, tuple or a string ngram with a customisable node separator. @@ -592,13 +589,16 @@ def add_path(self, path, frequency=1, expand_subpaths=True, separator=','): path as subpath (first component) and as longest path (second component). Integer values x are automatically converted to (0, x). Default value is 1. expand_subpaths: bool - Whether or not to calculate subpath statistics. Default value is True. + Whether or not to calculate subpath statistics. Default value is True. + remove_selfloops: bool + Whether or not to remove selfloops (e.g. repeated nodes) from paths. Default values + is False. separator: str - A string sepcifying the character that separates nodes in the ngram. Default is + A string sepcifying the character that separates nodes in the ngram. Default is ','. Returns - ------- - """ + ------- + """ assert isinstance(path, tuple) or isinstance(path, list) or isinstance(path, str), 'Path must be tuple or ngram string.' # Turn string ngram into tuple @@ -607,10 +607,20 @@ def add_path(self, path, frequency=1, expand_subpaths=True, separator=','): assert path, 'Path must contain at least one element' - for x in path: - if isinstance(x, str) and self.separator in x: + if remove_selfloops: + collapsed_path = [path[0]] + + for x in range(1, len(path)): + # Error check + if isinstance(path[x-1], str) and self.separator in path[x-1]: raise PathpyError('Node name contains separator character. ' 'Choose different separator.') + # Test for selfloop + if remove_selfloops and path[x-1] != path[x]: + collapsed_path.append(path[x]) + + if remove_selfloops: + path = collapsed_path # Convert tuple elements to strings path_str = path if isinstance(path, str) else tuple(map(str, path)) From 3916457265bab1e2ef52412b1f5731a240e9b455 Mon Sep 17 00:00:00 2001 From: Tim LaRock Date: Mon, 6 May 2019 11:53:45 -0400 Subject: [PATCH 2/5] Added option to read_file for remove_selfloops. 
--- pathpy/classes/paths.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pathpy/classes/paths.py b/pathpy/classes/paths.py index 0774fe2..ba32da0 100644 --- a/pathpy/classes/paths.py +++ b/pathpy/classes/paths.py @@ -390,7 +390,7 @@ def read_edges(filename, separator=',', weight=False, undirected=False, @classmethod def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize, - max_ngram_length=sys.maxsize, expand_sub_paths=True, + max_ngram_length=sys.maxsize, expand_sub_paths=True, remove_selfloops=False, max_subpath_length=sys.maxsize): """Reads path data from a file containing multiple lines of n-grams of the form ``a,b,c,d,frequency`` (where frequency is optional). Each n-gram is interpreted @@ -472,7 +472,7 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize if len(path) > max_ngram_length: path = path[:max_ngram_length] - p.add_path(path, frequency=(0,1), expand_subpaths=expand_sub_paths) + p.add_path(path, frequency=(0,1), expand_subpaths=expand_sub_paths, remove_selfloops=remove_selfloops) max_length = max(max_length, len(path) - 1) line = f.readline() From f5384e9dc0d9fda76b2229e3b1f051b06742a73c Mon Sep 17 00:00:00 2001 From: Tim LaRock Date: Mon, 6 May 2019 11:58:08 -0400 Subject: [PATCH 3/5] Changed read_file with frequency to also use add_path. 
--- pathpy/classes/paths.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pathpy/classes/paths.py b/pathpy/classes/paths.py index ba32da0..bc2b43b 100644 --- a/pathpy/classes/paths.py +++ b/pathpy/classes/paths.py @@ -453,13 +453,11 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize path += (v,) freq = float(fields[len(fields) - 1]) if freq >0: - if len(path) <= max_ngram_length: - p.paths[len(path) - 1][path] += (0, freq) - max_length = max(max_length, len(path) - 1) - else: # cut path at max_ngram_length - mnl = max_ngram_length - p.paths[mnl - 1][path[:mnl]] += (0, freq) - max_length = max(max_length, max_ngram_length - 1) + if len(path) > max_ngram_length: + path = path[:max_ngram_length] + p.paths[len(path) - 1][path] += (0, freq) + p.add_path(path, frequency=(0,freq), expand_subpaths=expand_sub_paths, remove_selfloops=remove_selfloops) + max_length = max(max_length, len(path) - 1) else: Log.add('Non-positive path count in line {0}'.format(n), Severity.WARNING) else: From 829734010314347c72c3fe01b881b3e4d4fde920 Mon Sep 17 00:00:00 2001 From: Tim LaRock Date: Mon, 6 May 2019 11:58:32 -0400 Subject: [PATCH 4/5] Removed duplicate line left in by accident. 
--- pathpy/classes/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pathpy/classes/paths.py b/pathpy/classes/paths.py index bc2b43b..5583f7b 100644 --- a/pathpy/classes/paths.py +++ b/pathpy/classes/paths.py @@ -455,7 +455,7 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize if freq >0: if len(path) > max_ngram_length: path = path[:max_ngram_length] - p.paths[len(path) - 1][path] += (0, freq) + p.add_path(path, frequency=(0,freq), expand_subpaths=expand_sub_paths, remove_selfloops=remove_selfloops) max_length = max(max_length, len(path) - 1) else: From 4d1d75e28e9813e24156583ce095776a92390add Mon Sep 17 00:00:00 2001 From: Tim LaRock Date: Mon, 6 May 2019 13:16:43 -0400 Subject: [PATCH 5/5] Minor typo fix in docstring. --- pathpy/classes/paths.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pathpy/classes/paths.py b/pathpy/classes/paths.py index 5583f7b..9672ef3 100644 --- a/pathpy/classes/paths.py +++ b/pathpy/classes/paths.py @@ -453,6 +453,7 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize path += (v,) freq = float(fields[len(fields) - 1]) if freq >0: + # Cut a path if it is longer than the specified maximum if len(path) > max_ngram_length: path = path[:max_ngram_length] @@ -467,6 +468,7 @@ def read_file(cls, filename, separator=',', frequency=True, maxlines=sys.maxsize if v: path += (v,) + # Cut a path if it is longer than the specified maximum if len(path) > max_ngram_length: path = path[:max_ngram_length] @@ -589,7 +591,7 @@ def add_path(self, path, frequency=1, expand_subpaths=True, remove_selfloops=Fal expand_subpaths: bool Whether or not to calculate subpath statistics. Default value is True. remove_selfloops: bool - Whether or not to remove selfloops (e.g. repeated nodes) from paths. Default values + Whether or not to remove selfloops (e.g. repeated nodes) from paths. Default value is False. 
separator: str A string sepcifying the character that separates nodes in the ngram. Default is