diff --git a/bench.py b/bench.py index 0841030..cad0cac 100755 --- a/bench.py +++ b/bench.py @@ -18,6 +18,7 @@ def run1(args, src_name, num_runs): runs = [ [['java', '-jar', './bin/freq01scala.jar'], 'freq01.scala', 3], [['python', './src/freq01.py'], 'freq01.py', 3], + [['python', './src/freq02.py'], 'freq02.py', 3], [['./bin/freq03cpp' + EXE], 'freq03.cpp'], [['./bin/freq02cpp' + EXE], 'freq02.cpp'], [['./bin/freq01cpp' + EXE], 'freq01.cpp'], diff --git a/src/freq02.py b/src/freq02.py new file mode 100644 index 0000000..8b52091 --- /dev/null +++ b/src/freq02.py @@ -0,0 +1,69 @@ +import string +import sys +from collections import Counter + +from itertools import chain + + +CHUNK_SIZE = 20 * 1024 * 1024 + + +non_alpha = bytes(ch for ch in range(256) if chr(ch) not in string.ascii_letters) + +translation_table = bytes.maketrans( + string.ascii_uppercase.encode() + non_alpha, + string.ascii_lowercase.encode() + b' ' * len(non_alpha) +) + +tail = [b''] # variable to store tail of previous chunk of data. + + +def get_words_with_empty_strings(line): + """Replaces any non-alphabetical symbols with spaces and splits the string using space separator. + + Saves the last word into tails[0] to concat it with the first word of the next chunk. + """ + result = line.translate(translation_table).split(b' ') + result[0] = tail[0] + result[0] + tail[0] = result[-1] + result[-1] = b'' + return result + + +def sort_key(x): + word, fq = x + return -fq, word + + +def main(in_file_name, out_file_name): + + with open(in_file_name, 'rb') as in_file: + def read_chunk(): + return in_file.read(CHUNK_SIZE) + + words_and_spaces = chain.from_iterable( + get_words_with_empty_strings(line) for line in iter(read_chunk, b'') + ) + + # Ignore empty strings. + words = filter(None, chain(words_and_spaces, tail[0])) + + counts = sorted(Counter(words).items(), key=sort_key) + + rows = ( + b' '.join((str(v).encode(), k)) + for k, v in counts + ) + + result = b'\r\n'.join(rows) + b'\r\n' + + with open(out_file_name, 'wb+') as out_file: + out_file.write(result) + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print('No args') + exit(1) + + main(sys.argv[1], sys.argv[2])