This repository was archived by the owner on Jun 14, 2018. It is now read-only.
42 changes: 30 additions & 12 deletions topik/tokenizers/entities.py
@@ -58,8 +58,8 @@ def _collect_entities(raw_corpus, freq_min=2, freq_max=10000):
return set(np_counts)


def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
'''
def _tokenize_entities_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
"""
A text tokenizer that passes only terms (a.k.a. 'entities') explicitly
contained in the entities argument.

@@ -73,6 +73,9 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -82,19 +85,21 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
>>> tokenized_text == [
... u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
True
'''
"""

result = []
for np in TextBlob(text).noun_phrases:
if np in entities:
# filter out stop words
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
# if we end up with nothing, don't append an empty string
if tmp:
result.append(tmp)
return result


def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
"""
A text tokenizer that retrieves entities ('noun phrases') first and simple words for the rest of the text.

@@ -108,6 +113,9 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -119,22 +127,25 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
... u'pastime', u'sassy_unicorns']
True
"""

result = []
for np in TextBlob(text).noun_phrases:
if ' ' in np and np not in entities:
# break apart the noun phrase; it does not occur often enough in the collection of text to be considered.
result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
else:
# filter out stop words
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
# if we end up with nothing, don't append an empty string
if tmp:
result.append(tmp)
return result


@register
def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
"""
A tokenizer that extracts noun phrases from a corpus, then tokenizes all
documents using those extracted phrases.
@@ -151,6 +162,9 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
Maximum occurrence of phrase, beyond which it is ignored
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -162,11 +176,11 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
for doc_id, doc_text in corpus:
yield doc_id, _tokenize_entities_document(doc_text, entities, min_length=min_length,
stopwords=stopwords)
stopwords=stopwords, stop_regex=stop_regex)


@register
def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
"""A text tokenizer that retrieves entities ('noun phrases') first and simple words for the rest of the text.

Parameters
@@ -181,6 +195,9 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
Maximum occurrence of phrase, beyond which it is ignored
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -192,5 +209,6 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
for doc_id, doc_text in corpus:
yield doc_id, _tokenize_mixed_document(doc_text, entities,
min_length=min_length,
stopwords=stopwords)
min_length=min_length,
stopwords=stopwords,
stop_regex=stop_regex)
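Taken together, these changes let markup be stripped before noun-phrase extraction. A minimal usage sketch follows; the corpus contents and the direct module import are illustrative and not part of the diff, and `mixed` accepts the same argument:

```python
from topik.tokenizers.entities import entities

# Hypothetical corpus: an iterable of (doc_id, text) pairs, as the tokenizers expect.
corpus = [("doc1", "<p>Frank the Swank-Tank walked his sassy unicorn along the beach.</p>"),
          ("doc2", "<p>Frank the Swank-Tank loves sassy unicorns and prancercise.</p>")]

# The predefined 'html' filter removes tags before noun phrases are collected and tokenized.
tokenized = dict(entities(corpus, freq_min=1, stop_regex='html'))
```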
12 changes: 12 additions & 0 deletions topik/tokenizers/filter_regex.py
@@ -0,0 +1,12 @@
import re

# Named patterns that callers can reference by key instead of writing their own regex.
predefined_filters = {
    'html': r'</?[^>]+>'
}


def filter_regex(text, regex):
    """Remove every match of ``regex`` (or of the predefined filter it names) from ``text``."""
    if regex:
        if regex in predefined_filters:
            regex = predefined_filters[regex]
        text = re.sub(regex, "", text)
    return text
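A quick sanity check of the new helper; the outputs shown as comments follow from the pattern above:

```python
from topik.tokenizers.filter_regex import filter_regex

# The predefined 'html' filter strips opening and closing tags.
print(filter_regex("<p>Hello <b>world</b></p>", 'html'))          # Hello world

# Any other string is treated as a regular expression whose matches are removed.
print(filter_regex("version 1.2.3 released", r'\d+\.\d+\.\d+'))   # version  released
```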
32 changes: 23 additions & 9 deletions topik/tokenizers/ngrams.py
@@ -15,8 +15,10 @@
u"recently been popularized by such retired "
u"celebrities as Frank The Swank-Tank."))]


# TODO: replace min_freqs with freq_bounds like ngrams takes. Unify format across the board.
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None,
stopwords=None, stop_regex=None):
"""collects bigrams and trigrams from collection of documents. Input to collocation tokenizer.

bigrams are pairs of words that recur in the collection; trigrams are triplets.
@@ -34,6 +36,9 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
starting with bigrams.
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -48,7 +53,8 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

# generator of documents, turn each element to its list of words
doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex)
for doc_id, doc_text in raw_corpus)
# generator, concatenate (chain) all words into a single sequence, lazily
words = itertools.chain.from_iterable(doc_texts)
@@ -67,7 +73,7 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
return bigrams_patterns, trigrams_patterns


def _collocation_document(text, patterns, min_length=1, stopwords=None):
def _collocation_document(text, patterns, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that includes collocations(bigrams and trigrams).

A collocation is sequence of words or terms that co-occur more often
@@ -90,6 +96,9 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -102,14 +111,15 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
... u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
True
"""
text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords))
text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords, stop_regex=stop_regex))
for pattern in patterns:
text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
return text.split()

@register
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None):
'''
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None,
stop_regex=None):
"""
A tokenizer that extracts collocations (bigrams and trigrams) from a corpus
according to the frequency bounds, then tokenizes all documents using those
extracted phrases.
@@ -128,6 +138,9 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
limit results to this many entries
stopwords: None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -137,11 +150,12 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
... u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
... u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike'])
True
'''
"""
if not freq_bounds:
freq_bounds=[(50, 10000), (20, 10000)]
min_freqs = [freq[0] for freq in freq_bounds]
patterns = _collect_bigrams_and_trigrams(raw_corpus, top_n=top_n, min_length=min_length, min_freqs=min_freqs,
stopwords=stopwords)
stopwords=stopwords, stop_regex=stop_regex)
for doc_id, doc_text in raw_corpus:
yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length, stopwords=stopwords)
yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length,
stopwords=stopwords, stop_regex=stop_regex)
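The same parameter threads through the n-gram tokenizer. A hedged sketch, with a made-up corpus and illustrative frequency bounds (neither appears in the diff):

```python
from topik.tokenizers.ngrams import ngrams

# Repeated phrasing so that the bigram/trigram frequency thresholds can be met.
corpus = [(str(i), "<p>Retired celebrities love sassy unicorns.</p>") for i in range(3)]

# Tags are stripped via stop_regex='html' before n-gram collection and tokenization.
tokenized = dict(ngrams(corpus, freq_bounds=[(2, 10000), (2, 10000)], stop_regex='html'))
```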
12 changes: 9 additions & 3 deletions topik/tokenizers/simple.py
@@ -2,9 +2,10 @@

# imports used only for doctests
from topik.tokenizers._registry import register
from topik.tokenizers.filter_regex import filter_regex


def _simple_document(text, min_length=1, stopwords=None):
def _simple_document(text, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords. For use on individual text documents.

@@ -16,6 +17,9 @@ def _simple_document(text, min_length=1, stopwords=None):
Minimum length of any single word
stopwords: None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -26,12 +30,13 @@ def _simple_document(text, min_length=1, stopwords=None):
"""
if not stopwords:
from gensim.parsing.preprocessing import STOPWORDS as stopwords
text = filter_regex(text, stop_regex)
return [word for word in gensim.utils.tokenize(text, lower=True)
if word not in stopwords and len(word) >= min_length]


@register
def simple(raw_corpus, min_length=1, stopwords=None):
def simple(raw_corpus, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords.

@@ -54,4 +59,5 @@ def simple(raw_corpus, min_length=1, stopwords=None):
True
"""
for doc_id, doc_text in raw_corpus:
yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords))
yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
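With the new argument, the simple tokenizer can drop markup before lowercasing and stopword removal. A small sketch; the document text is made up:

```python
from topik.tokenizers.simple import simple

corpus = [("doc1", "<p>This is a <em>very</em> simple document about unicorns.</p>")]

# With stop_regex='html' the tag text is removed first, so tag names such as
# 'p' or 'em' never reach the tokenizer.
print(list(simple(corpus, stop_regex='html')))
```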
17 changes: 17 additions & 0 deletions topik/tokenizers/tests/test_filter_regex.py
@@ -0,0 +1,17 @@
from topik.tokenizers.filter_regex import filter_regex

example_text = """
<div class="md"><p>Yes it is bad. If your task is long running, you will eventually crash with</p>

<blockquote>
<p>RuntimeError: maximum recursion depth exceeded</p>
</blockquote>

<p>Also, it is not very efficient for this simple use case.</p>
</div>
"""


def test_html_regex_filter():
    assert "</div>" not in filter_regex(example_text, 'html')