diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
new file mode 100644
index 00000000..b0af7a69
--- /dev/null
+++ b/editquality/bwds/__init__.py
@@ -0,0 +1,234 @@
+"""
+Code to find bad words automatically.
+
+It collects the words added by each edit, scores the words added in
+reverted edits by tf-idf, and reports the highest-scoring words as
+bad-word candidates (and the lowest-idf words as stop-word candidates).
+
+Some parts are copied from
+https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
+
+>>> from editquality.bwds import Bot, Edit
+>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
+... Edit(3, {'one':5, 'four': 1}, False)]
+>>> bot = Bot()
+>>> bot.parse_edits(edits)
+>>> bot.parse_bad_edits(numbers_to_show=0)
+"""
+import math
+import json
+import sys
+import time
+import traceback
+from collections import OrderedDict, namedtuple
+
+from revscoring.extractors.api import Extractor
+from revscoring.features import wikitext
+
+from mwapi import Session
+import mwreverts
+
+base_file_path = '/data/project/dexbot/pywikibot-core/something_'
+
+
+# Handy for debugging: printing a named tuple shows its field values
+EditNamedTuple = namedtuple('EditNamedTuple', ['id', 'added_words', 'reverted'])
+
+
+class Edit(object):
+    def __init__(self, rev_id, added_words, reverted):
+        self.id = rev_id
+        self.added_words = added_words
+        if not isinstance(self.added_words, dict):
+            # Accept any iterable of words and turn it into word counts.
+            self.fix_added_words()
+        self.reverted = reverted
+
+    def fix_added_words(self):
+        temp = {}
+        for word in self.added_words:
+            temp[word] = temp.get(word, 0) + 1
+        self.added_words = temp
+
+    def as_named_tuple(self):
+        return EditNamedTuple(self.id, self.added_words, self.reverted)
+
+
+class Bot(object):
+
+    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
+        self.bad_edits = Edit(-1, {}, True)
+        self.counter = 0
+        self.words_db = {}
+        self.bad_words_db = {}
+        self.bad_counter = 0
+        if bool(bad_words_cache) != bool(words_cache):
+            raise ValueError(
+                "bad_words_cache should be defined if and only if "
+                "words_cache is defined")
+        if words_cache:
+            self.cache = True
+            self.initiate_cache(words_cache, bad_words_cache, no_docs)
+        else:
+            self.cache = False
+
+    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
+        with open(words_cache, 'r') as f:
+            self.words_db = json.loads(f.read())
+        with open(bad_words_cache, 'r') as f:
+            self.bad_edits.added_words = json.loads(f.read())
+        with open(no_docs, 'r') as f:
+            self.counter = int(f.read())
+
+    def parse_edits(self, edits):
+        for edit in edits:
+            # `edits` may be a generator, so count documents here instead
+            # of relying on len().
+            self.counter += 1
+            if edit.reverted:
+                for word in edit.added_words:
+                    self.bad_edits.added_words[word] = \
+                        self.bad_edits.added_words.get(word, 0) + \
+                        edit.added_words[word]
+                    self.bad_words_db[word] = (
+                        self.bad_words_db.get(word, 0) + 1)
+                self.bad_counter += 1
+                continue
+            for word in edit.added_words:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+
+    def parse_bad_edits(self, numbers_to_show=10):
+        self.possible_bad_words = {}
+        self.stop_words = {}
+        if not self.cache:
+            # Count the aggregated bad edit as one more document.
+            self.counter += 1
+        for word in self.bad_edits.added_words:
+            if not self.cache:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+            # Heuristic filter: skip words containing 'sh' or 'ch'.
+            if 'sh' in word or 'ch' in word:
+                continue
+            self.possible_bad_words[word] = self.tf_idf(word)
+            self.stop_words[word] = self.idf(word)
+        if numbers_to_show:
+            self.show_results(numbers_to_show)
+            self.show_results2(numbers_to_show)
+
+    def tf_idf(self, word):
+        # Log-scaled frequency of the word among reverted additions, times
+        # its inverse document frequency over all parsed edits.
+        tf = math.log(self.bad_edits.added_words[word]) + 1
+        idf = math.log(self.counter / self.words_db[word])
+        return tf * idf
+
+    def idf(self, word):
+        return math.log(self.counter / self.words_db[word])
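+
+    # Worked example of the two scores above, using the toy edits from the
+    # module docstring (values follow directly from the formulas; nothing
+    # here is fetched from a wiki): after parse_edits() and
+    # parse_bad_edits(0), counter == 4, bad_edits.added_words['three'] == 3
+    # and words_db['three'] == 1, so
+    #     tf     = log(3) + 1  ~= 2.099
+    #     idf    = log(4 / 1)  ~= 1.386
+    #     tf_idf = tf * idf    ~= 2.909
+    # Rare words that are added often in reverted edits score highest,
+    # while idf alone is lowest for ubiquitous words, which show_results2
+    # reports as stop-word candidates.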
+
+    def show_results(self, numbers_to_show):
+        print("Showing %d results" % numbers_to_show)
+        # The threshold is the numbers_to_show-th highest score; every word
+        # scoring at or above it is reported.
+        values = sorted(self.possible_bad_words.values())
+        lim = values[max(0, len(values) - numbers_to_show)]
+        res = {}
+        for word in self.possible_bad_words:
+            if self.possible_bad_words[word] >= lim:
+                res[word] = self.possible_bad_words[word]
+        res = OrderedDict(
+            sorted(res.items(), key=lambda t: t[1], reverse=True))
+        res_text = "#" + '\n#'.join(sorted(res))
+        self.bad_words_res_text = res_text
+        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def show_results2(self, numbers_to_show):
+        print("Showing another %d results" % numbers_to_show)
+        # Same as show_results(), but for stop-word candidates: the words
+        # with the numbers_to_show lowest idf values.
+        values = sorted(self.stop_words.values(), reverse=True)
+        lim = values[max(0, len(values) - numbers_to_show)]
+        res = {}
+        for word in self.stop_words:
+            if self.stop_words[word] <= lim:
+                res[word] = self.stop_words[word]
+        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
+        res_text = "#" + '\n#'.join(sorted(res))
+        self.stop_words_res_text = res_text
+        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def dump(self):
+        # Persist only the document frequencies of words seen in bad edits;
+        # the three files match what initiate_cache() expects to read back.
+        new_db = {}
+        for word in self.bad_edits.added_words:
+            new_db[word] = self.words_db[word]
+        with open('words_db.txt', 'w') as f:
+            f.write(json.dumps(new_db))
+        with open('bad_edits_words.txt', 'w') as f:
+            f.write(json.dumps(self.bad_edits.added_words))
+        with open('no_docs.txt', 'w') as f:
+            f.write(json.dumps(self.counter))
+
+
+def read_rev_pages(f):
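+    """Yield (rev_id, page_id) pairs from tab-separated lines.
+
+    A toy doctest (it mirrors the bundled test; no real wiki data
+    is involved):
+
+    >>> list(read_rev_pages(["0", "1\t2"]))
+    [(0, None), (1, 2)]
+    """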
+    for line in f:
+        parts = line.strip().split('\t')
+        if len(parts) == 1:
+            yield int(parts[0]), None
+        elif len(parts) == 2:
+            rev_id, page_id = parts
+            yield int(rev_id), int(page_id)
+
+
+def cache_parse(paths, num_res):
+    if not paths.strip():
+        paths = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
+    paths = paths.split(',')
+    bot = Bot(words_cache=paths[0], bad_words_cache=paths[1],
+              no_docs=paths[2])
+    bot.parse_bad_edits(num_res)
+
+
+def bot_gen(rev_pages, api_url):
+    session = Session(api_url)
+    extractor = Extractor(session)
+
+    for revision_id, page_id in rev_pages:
+        # Fetch the page's recent history so reverts of this revision can
+        # be detected (this assumes page_id is present in the input).
+        api_result = session.get(
+            action='query',
+            pageids=page_id,
+            prop='revisions',
+            rvlimit=500,
+            rvprop='sha1|ids'
+        )
+        revisions = next(iter(api_result['query']['pages'].values()))['revisions']
+        # Drop revisions whose SHA1 is suppressed; they can't be compared.
+        revisions = [
+            revision for revision in revisions if 'sha1hidden' not in revision]
+
+        sys.stderr.write(".")
+        sys.stderr.flush()
+        try:
+            reverted_revision_ids = set()
+            # Detect reverted status
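+            # (mwreverts.detect() flags identity reverts via repeated SHA1
+            # checksums: given a checksum sequence A, B, A, the third
+            # revision restores the first and the middle one is listed in
+            # revert.reverteds. The sequence is made up for illustration.)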
+            for revert in mwreverts.detect(
+                (revision['sha1'], revision) for revision in revisions
+            ):
+                for reverted in revert.reverteds:
+                    reverted_revision_ids.add(reverted['revid'])
+
+            added_words = set(extractor.extract(
+                revision_id, wikitext.revision.diff.datasources.words_added
+            ))
+            yield Edit(
+                revision_id, added_words, revision_id in reverted_revision_ids
+            )
+
+        except KeyboardInterrupt:
+            sys.stderr.write("\n^C Caught. Exiting...")
+            break
+
+        except Exception:
+            sys.stderr.write(traceback.format_exc())
+            sys.stderr.write("\n")
+
+    sys.stderr.write("\n")
diff --git a/editquality/bwds/tests/__init__.py b/editquality/bwds/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/editquality/bwds/tests/bad_edits_words.txt b/editquality/bwds/tests/bad_edits_words.txt
new file mode 100644
index 00000000..3fafbf72
--- /dev/null
+++ b/editquality/bwds/tests/bad_edits_words.txt
@@ -0,0 +1 @@
+{"x": 1}
\ No newline at end of file
diff --git a/editquality/bwds/tests/no_docs.txt b/editquality/bwds/tests/no_docs.txt
new file mode 100644
index 00000000..56a6051c
--- /dev/null
+++ b/editquality/bwds/tests/no_docs.txt
@@ -0,0 +1 @@
+1
\ No newline at end of file
diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
new file mode 100644
index 00000000..3c12b7f6
--- /dev/null
+++ b/editquality/bwds/tests/test_bwds.py
@@ -0,0 +1,74 @@
+from deltas import Token
+
+from editquality.bwds import cache_parse, Edit, Bot, read_rev_pages, bot_gen
+
+EDITS = [
+    Edit(1, {'one': 1, 'two': 2}, False),
+    Edit(2, {'three': 3}, True),
+    Edit(3, {'one': 5, 'four': 1}, False)
+]
+
+
+def test_cache_parse():
+    cache_parse(
+        'editquality/bwds/tests/words_db.txt,'
+        'editquality/bwds/tests/bad_edits_words.txt,'
+        'editquality/bwds/tests/no_docs.txt',
+        0
+    )
+
+
+def test_bot_gen_empty():
+    en_api_url = 'https://en.wikipedia.org/w/api.php'
+    assert list(bot_gen([], en_api_url)) == []
+
+
+def test_bot_gen():
+    # Network test: needs live access to the English Wikipedia API.
+    a_revision_id = 979192243
+    pasta_page_id = 23871
+    en_api_url = 'https://en.wikipedia.org/w/api.php'
+    generated, = bot_gen([(a_revision_id, pasta_page_id)], en_api_url)
+    assert generated.id == a_revision_id
+    assert Token('unleavened', type='word') in generated.added_words
+    assert not generated.reverted
+
+
+def test_read_rev_pages():
+    assert list(read_rev_pages(["0", "1\t2"])) == [(0, None), (1, 2)]
+
+
+def test_parse_bad_edits():
+    bot = Bot()
+    bot.parse_edits(EDITS)
+    bot.parse_bad_edits(numbers_to_show=0)
+
+
+def dump_empty():
+    bot = Bot()
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '0'
+
+
+def dump_toy_data():
+    bot = Bot()
+    bot.parse_edits(EDITS)
+    bot.parse_bad_edits(0)
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{"three": 1}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{"three": 3}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '4'
+
+
+def test_dump():
+    # Calling both helpers from here to ensure they never run concurrently
+    # (they write to the same files in the working directory).
+    dump_empty()
+    dump_toy_data()
diff --git a/editquality/bwds/tests/words_db.txt b/editquality/bwds/tests/words_db.txt
new file mode 100644
index 00000000..8429d4db
--- /dev/null
+++ b/editquality/bwds/tests/words_db.txt
@@ -0,0 +1 @@
+{"x": 2}
\ No newline at end of file
diff --git a/editquality/utilities/__init__.py b/editquality/utilities/__init__.py
index b26cdd75..ca4198f0 100644
--- a/editquality/utilities/__init__.py
+++ b/editquality/utilities/__init__.py
@@ -43,4 +43,8 @@ merge_labels
 ++++++++++++
 .. automodule:: editquality.utilities.merge_labels
+
+bad_words_detection_system
+++++++++++++++++++++++++++
+.. automodule:: editquality.utilities.bad_words_detection_system
 """
diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py
new file mode 100755
index 00000000..7ab74509
--- /dev/null
+++ b/editquality/utilities/bad_words_detection_system.py
@@ -0,0 +1,53 @@
+"""
+WIP: a script to find bad words automatically.
+
+Usage:
+python3 bad_words_detection_system.py --rev-pages:f.txt \
+    --api:https://en.wikipedia.org/w/api.php
+
+Reuse the cache from a previous run (an empty value falls back to the
+default cache file names):
+python3 bad_words_detection_system.py --cache:
+
+--num_res:N limits how many words are reported (default: 10).
+"""
+import sys
+
+from editquality.bwds import Bot, cache_parse, read_rev_pages, bot_gen
+
+
+# TODO: Use argparse
+def handle_args():
+    args = {}
+    for arg in sys.argv[1:]:
+        if arg.startswith('--rev-pages:'):
+            args['--rev-pages'] = arg[len('--rev-pages:'):]
+        elif arg.startswith('--api:'):
+            args['--api'] = arg[len('--api:'):]
+        elif arg.startswith('--cache:'):
+            args['--cache'] = arg[len('--cache:'):]
+        elif arg.startswith('--num_res:'):
+            args['--num_res'] = arg[len('--num_res:'):]
+        else:
+            print('Unknown argument: %s' % arg)
+    return args
+
+
+def main():
+    args = handle_args()
+    if '--num_res' in args:
+        num_res = int(args['--num_res'])
+    else:
+        num_res = 10
+    if '--cache' in args:
+        cache_parse(args['--cache'], num_res)
+        return
+    api_url = args['--api']
+    with open(args['--rev-pages']) as rev_pages_file:
+        gen = bot_gen(read_rev_pages(rev_pages_file), api_url)
+        bot = Bot()
+        bot.parse_edits(gen)
+    bot.parse_bad_edits(num_res)
+    bot.dump()
+
+
+if __name__ == "__main__":
+    main()
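+
+
+# Example --rev-pages input file (illustrative; the revision and page ids
+# come from the test suite). Each line holds a revision id, optionally
+# followed by a tab and the page id (see editquality.bwds.read_rev_pages):
+#
+#   979192243<TAB>23871
+#
+# A line containing just "979192243" yields (979192243, None).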