diff --git a/topik/tokenizers/entities.py b/topik/tokenizers/entities.py
index 8a9f428..330d745 100644
--- a/topik/tokenizers/entities.py
+++ b/topik/tokenizers/entities.py
@@ -58,8 +58,8 @@ def _collect_entities(raw_corpus, freq_min=2, freq_max=10000):
     return set(np_counts)
 
 
-def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
-    '''
+def _tokenize_entities_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
+    """
     A text tokenizer that passes only terms (a.k.a. 'entities') explicitly
     contained in the entities argument.
 
@@ -73,6 +73,9 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
         Minimum length of any single word
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -82,19 +85,21 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
     >>> tokenized_text == [
     ...     u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
     True
-    '''
+    """
+
     result = []
     for np in TextBlob(text).noun_phrases:
         if np in entities:
             # filter out stop words
-            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
+            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
+                                            stop_regex=stop_regex))
             # if we end up with nothing, don't append an empty string
             if tmp:
                 result.append(tmp)
     return result
 
 
-def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
+def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
     """
     A text tokenizer that retrieves entities ('noun phrases') first and
     simple words for the rest of the text.
@@ -108,6 +113,9 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
         Minimum length of any single word
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -119,14 +127,17 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
     ...     u'pastime', u'sassy_unicorns']
     True
     """
+
     result = []
     for np in TextBlob(text).noun_phrases:
         if ' ' in np and np not in entities:
             # break apart the noun phrase; it does not occur often enough in the collection of text to be considered.
-            result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
+            result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords,
+                                           stop_regex=stop_regex))
         else:
             # filter out stop words
-            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
+            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
+                                            stop_regex=stop_regex))
             # if we end up with nothing, don't append an empty string
             if tmp:
                 result.append(tmp)
@@ -134,7 +145,7 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
 
 
 @register
-def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
+def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
     """
     A tokenizer that extracts noun phrases from a corpus, then tokenizes all
     documents using those extracted phrases.
@@ -151,6 +162,9 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
         Maximum occurrence of phrase, beyond which it is ignored
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -162,11 +176,11 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
     entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
     for doc_id, doc_text in corpus:
         yield doc_id, _tokenize_entities_document(doc_text, entities, min_length=min_length,
-                                                  stopwords=stopwords)
+                                                  stopwords=stopwords, stop_regex=stop_regex)
 
 
 @register
-def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
+def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
     """A text tokenizer that retrieves entities ('noun phrases') first and
     simple words for the rest of the text.
 
     Parameters
@@ -181,6 +195,9 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
         Maximum occurrence of phrase, beyond which it is ignored
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -192,5 +209,6 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
     entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
     for doc_id, doc_text in corpus:
         yield doc_id, _tokenize_mixed_document(doc_text, entities,
-                                               min_length=min_length,
-                                               stopwords=stopwords)
+                                               min_length=min_length,
+                                               stopwords=stopwords,
+                                               stop_regex=stop_regex)
diff --git a/topik/tokenizers/filter_regex.py b/topik/tokenizers/filter_regex.py
new file mode 100644
index 0000000..09856b1
--- /dev/null
+++ b/topik/tokenizers/filter_regex.py
@@ -0,0 +1,12 @@
+import re
+
+predefined_filters = {
+    'html': '<\/?[^>]+>'
+}
+
+def filter_regex(text, regex):
+    if regex:
+        if regex in predefined_filters:
+            regex = predefined_filters[regex]
+        text = re.sub(regex, "", text)
+    return text
diff --git a/topik/tokenizers/ngrams.py b/topik/tokenizers/ngrams.py
index 1d5f096..e991d23 100644
--- a/topik/tokenizers/ngrams.py
+++ b/topik/tokenizers/ngrams.py
@@ -15,8 +15,10 @@
                  u"recently been popularized by such retired "
                  u"celebrities as Frank The Swank-Tank."))]
 
+
 # TODO: replace min_freqs with freq_bounds like ngrams takes.  Unify format across the board.
-def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
+def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None,
+                                  stopwords=None, stop_regex=None):
     """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.
 
     bigrams are pairs of words that recur in the collection; trigrams are triplets.
@@ -34,6 +36,9 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
         starting with bigrams.
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -48,7 +53,8 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
     from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
 
     # generator of documents, turn each element to its list of words
-    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
+    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords,
+                                  stop_regex=stop_regex)
                  for doc_id, doc_text in raw_corpus)
     # generator, concatenate (chain) all words into a single sequence, lazily
     words = itertools.chain.from_iterable(doc_texts)
@@ -67,7 +73,7 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
     return bigrams_patterns, trigrams_patterns
 
 
-def _collocation_document(text, patterns, min_length=1, stopwords=None):
+def _collocation_document(text, patterns, min_length=1, stopwords=None, stop_regex=None):
     """A text tokenizer that includes collocations(bigrams and trigrams).
 
     A collocation is sequence of words or terms that co-occur more often
@@ -90,6 +96,9 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
         Minimum length of any single word
     stopwords : None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -102,14 +111,15 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
     >>> tokenized_text = _collocation_document(text,patterns)
     >>> tokenized_text == [
     ...     u'frank', u'swank_tank', u'walked', u'sassy', u'unicorn', u'brony',
     ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
     ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
     True
     """
-    text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords))
+    text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords, stop_regex=stop_regex))
     for pattern in patterns:
         text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
     return text.split()
 
 
 @register
-def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None):
-    '''
+def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None,
+           stop_regex=None):
+    """
     A tokenizer that extracts collocations (bigrams and trigrams) from a corpus
     according to the frequency bounds, then tokenizes all documents
     using those extracted phrases.
@@ -128,6 +138,9 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
         limit results to this many entries
     stopwords: None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -137,11 +150,12 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
     ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
     ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike'])
     True
-    '''
+    """
     if not freq_bounds:
         freq_bounds=[(50, 10000), (20, 10000)]
     min_freqs = [freq[0] for freq in freq_bounds]
     patterns = _collect_bigrams_and_trigrams(raw_corpus, top_n=top_n, min_length=min_length, min_freqs=min_freqs,
-                                             stopwords=stopwords)
+                                             stopwords=stopwords, stop_regex=stop_regex)
     for doc_id, doc_text in raw_corpus:
-        yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length, stopwords=stopwords)
+        yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length,
+                                            stopwords=stopwords, stop_regex=stop_regex)
diff --git a/topik/tokenizers/simple.py b/topik/tokenizers/simple.py
index 1b613ed..629b0b6 100644
--- a/topik/tokenizers/simple.py
+++ b/topik/tokenizers/simple.py
@@ -2,9 +2,10 @@
 
 # imports used only for doctests
 from topik.tokenizers._registry import register
+from topik.tokenizers.filter_regex import filter_regex
 
 
-def _simple_document(text, min_length=1, stopwords=None):
+def _simple_document(text, min_length=1, stopwords=None, stop_regex=None):
     """A text tokenizer that simply lowercases, matches alphabetic
     characters and removes stopwords. For use on individual text documents.
 
@@ -16,6 +17,9 @@ def _simple_document(text, min_length=1, stopwords=None):
         Minimum length of any single word
     stopwords: None or iterable of str
         Collection of words to ignore as tokens
+    stop_regex : str
+        A regular expression of content to remove from text before tokenizing.
+        Potentially useful for ignoring code (HTML tags).
 
     Examples
     --------
@@ -26,12 +30,13 @@ def _simple_document(text, min_length=1, stopwords=None):
     """
     if not stopwords:
         from gensim.parsing.preprocessing import STOPWORDS as stopwords
+    text = filter_regex(text, stop_regex)
     return [word for word in gensim.utils.tokenize(text, lower=True)
             if word not in stopwords and len(word) >= min_length]
 
 
 @register
-def simple(raw_corpus, min_length=1, stopwords=None):
+def simple(raw_corpus, min_length=1, stopwords=None, stop_regex=None):
     """A text tokenizer that simply lowercases, matches alphabetic
     characters and removes stopwords.
 
@@ -54,4 +59,5 @@ def simple(raw_corpus, min_length=1, stopwords=None):
     True
     """
     for doc_id, doc_text in raw_corpus:
-        yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords))
\ No newline at end of file
+        yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords,
+                                       stop_regex=stop_regex))
\ No newline at end of file
diff --git a/topik/tokenizers/tests/test_filter_regex.py b/topik/tokenizers/tests/test_filter_regex.py
new file mode 100644
index 0000000..a01b74a
--- /dev/null
+++ b/topik/tokenizers/tests/test_filter_regex.py
@@ -0,0 +1,17 @@
+from topik.tokenizers.filter_regex import filter_regex
+
+example_text = """
+<p>Yes it is bad. If your task is long running, you will eventually crash with</p>
+
+<pre>
+<code>
+RuntimeError: maximum recursion depth exceeded
+</code>
+</pre>
+
+<p>Also, it is not very efficient for this simple use case.</p>
+"""
+
+
+def test_html_regex_filter():
+    assert "<p>" not in filter_regex(example_text, 'html')
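
A usage sketch (not part of the patch) may help reviewers see how the new stop_regex argument is meant to be used. It exercises the filter_regex helper directly and shows stop_regex being threaded through the simple tokenizer. The sample corpus, doc id, and expected token list are illustrative assumptions, and it presumes the register decorator returns the decorated function unchanged, as the existing doctests imply.

# sketch: expected behaviour of stop_regex, based on the changes above
from topik.tokenizers.filter_regex import filter_regex, predefined_filters
from topik.tokenizers.simple import simple

# 'html' is the only key this patch adds to predefined_filters
assert 'html' in predefined_filters

# a predefined filter name expands to its stored pattern...
assert filter_regex("<p>hello</p>", "html") == "hello"

# ...while any other string is treated as a raw regular expression
assert filter_regex("foo123bar", r"\d+") == "foobar"

# passing stop_regex=None leaves the text untouched
assert filter_regex("<p>hello</p>", None) == "<p>hello</p>"

# tokenizers hand stop_regex down to _simple_document, so markup never
# becomes tokens (corpus and output below are illustrative)
raw_corpus = [("doc1", "<p>Prancercise is a <em>tremendously</em> popular pastime.</p>")]
print(dict(simple(raw_corpus, stop_regex="html")))
# {'doc1': ['prancercise', 'tremendously', 'popular', 'pastime']}

Because any string that is not a key of predefined_filters is compiled as a regular expression, callers can pass either the 'html' shorthand or their own pattern without a separate code path.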