diff --git a/bench.py b/bench.py
index 0841030..cad0cac 100755
--- a/bench.py
+++ b/bench.py
@@ -18,6 +18,7 @@ def run1(args, src_name, num_runs):
 runs = [
     [['java', '-jar', './bin/freq01scala.jar'], 'freq01.scala', 3],
     [['python', './src/freq01.py'], 'freq01.py', 3],
+    [['python', './src/freq02.py'], 'freq02.py', 3],
     [['./bin/freq03cpp' + EXE], 'freq03.cpp'],
     [['./bin/freq02cpp' + EXE], 'freq02.cpp'],
     [['./bin/freq01cpp' + EXE], 'freq01.cpp'],
diff --git a/src/freq02.py b/src/freq02.py
new file mode 100644
index 0000000..51775e0
--- /dev/null
+++ b/src/freq02.py
@@ -0,0 +1,63 @@
+"""Count ASCII-letter words in a file and write them sorted by frequency.
+
+Usage: freq02.py INPUT OUTPUT
+
+A word is a maximal run of ASCII letters, compared case-insensitively.
+Each output line is "<count> <word>" followed by CRLF; lines are ordered by
+descending count, then by word.
+"""
+import string
+import sys
+from collections import Counter
+
+from itertools import chain
+
+
+def sort_key(x):
+    """Order (word, count) pairs by descending count, then by word."""
+    word, fq = x
+    return -fq, word
+
+
+# Every byte value that is not an ASCII letter.
+non_alpha = bytes(ch for ch in range(256) if chr(ch) not in string.ascii_letters)
+
+
+# Lower-case the ASCII letters and turn every other byte into a space.
+tab = bytes.maketrans(
+    string.ascii_uppercase.encode() + non_alpha,
+    string.ascii_lowercase.encode() + b' ' * len(non_alpha)
+)
+
+
+def main():
+    """Count the words of sys.argv[1] and write the report to sys.argv[2]."""
+    if len(sys.argv) != 3:
+        print('No args')
+        # Use sys.exit: the bare exit() helper is installed by the site
+        # module and is unavailable when Python runs with -S.
+        sys.exit(1)
+
+    with open(sys.argv[1], 'rb') as in_file:
+        # After translation a line holds only lower-case letters and spaces,
+        # so a no-argument split() yields the words and drops empty fields.
+        words = chain.from_iterable(
+            line.translate(tab).split() for line in in_file
+        )
+
+        counts = sorted(Counter(words).items(), key=sort_key)
+
+    result = b'\r\n'.join(
+        b' '.join(
+            (str(v).encode(), k)
+        )
+        for k, v in counts
+    ) + b'\r\n'
+
+    # Plain write mode is enough; the file is never read back.
+    with open(sys.argv[2], 'wb') as out_file:
+        out_file.write(result)
+
+
+if __name__ == '__main__':
+    main()