diff --git a/.travis.yml b/.travis.yml index 77df471..869daaf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: {} install: - pip install -r requirements.txt # - pip install -r docs/requirements.txt - - pip install -r test-requirements.txt + - pip install -r test-requirements.txt --upgrade - pip install twine script: - flake8 . --max-line-length=85 --exclude=.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg,docs diff --git a/deltas/tokenizers/wikitext_split.py b/deltas/tokenizers/wikitext_split.py index e9e6f0f..22ab166 100644 --- a/deltas/tokenizers/wikitext_split.py +++ b/deltas/tokenizers/wikitext_split.py @@ -13,46 +13,21 @@ r'(?:(?:' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' + r')' + ADDRESS ) -# re.compile(url, re.U).match("https://website.gov?param=value") - -# Matches Chinese, Japanese and Korean characters. -cjk = ( - r'[' + - r'\u4E00-\u62FF' + # noqa Unified Ideographs - r'\u6300-\u77FF' + - r'\u7800-\u8CFF' + - r'\u8D00-\u9FCC' + - r'\u3400-\u4DFF' + # Unified Ideographs Ext A - r'\U00020000-\U000215FF' + # Unified Ideographs Ext. B - r'\U00021600-\U000230FF' + - r'\U00023100-\U000245FF' + - r'\U00024600-\U000260FF' + - r'\U00026100-\U000275FF' + - r'\U00027600-\U000290FF' + - r'\U00029100-\U0002A6DF' + - r'\uF900-\uFAFF' + # Compatibility Ideographs - r'\U0002F800-\U0002FA1F' + # Compatibility Ideographs Suppl. - r'\u3041-\u3096' + # Hiragana - r'\u30A0-\u30FF' + # Katakana - r'\u3400-\u4DB5' + # Kanji - r'\u4E00-\u9FCB' + - r'\uF900-\uFA6A' + - r'\u2E80-\u2FD5' + # Kanji radicals - r'\uFF5F-\uFF9F' + # Katakana and Punctuation (Half Width) - r'\u31F0-\u31FF' + # Miscellaneous Japanese Symbols and Characters - r'\u3220-\u3243' + - r'\u3280-\u337F' - r']' -) devangari_word = r'\u0901-\u0963' arabic_word = r'\u0601-\u061A' + \ r'\u061C-\u0669' + \ r'\u06D5-\u06EF' bengali_word = r'\u0980-\u09FF' -combined_word = devangari_word + arabic_word + bengali_word +korean_word = r'\uac00-\ud7a3' + +combined_word = devangari_word + arabic_word + bengali_word + korean_word + +cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF' -word = r'(?:[^\W\d]|[' + combined_word + r'])' + \ +cjk = r'[' + cjk_re + ']' + +word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \ r'[\w' + combined_word + r']*' + \ r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*' @@ -71,8 +46,8 @@ ("bold", r"'''"), ("italic", r"''"), ('japan_punct', r'[\u3000-\u303F]'), - ('cjk', cjk), ('word', word), + ('cjk', cjk), ('tab_open', r'\{\|'), ('tab_close', r'\|\}'), ('dbrack_open', r'\[\['),