This repository was archived by the owner on Jun 14, 2018. It is now read-only.
42 changes: 30 additions & 12 deletions topik/tokenizers/entities.py
@@ -58,8 +58,8 @@ def _collect_entities(raw_corpus, freq_min=2, freq_max=10000):
return set(np_counts)


def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
'''
def _tokenize_entities_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
"""
A text tokenizer that passes only terms (a.k.a. 'entities') explicitly
contained in the entities argument.

@@ -73,6 +73,9 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -82,19 +85,21 @@ def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
>>> tokenized_text == [
... u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
True
'''
"""

result = []
for np in TextBlob(text).noun_phrases:
if np in entities:
# filter out stop words
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
# if we end up with nothing, don't append an empty string
if tmp:
result.append(tmp)
return result


def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None, stop_regex=None):
"""
A text tokenizer that retrieves entities ('noun phrases') first and simple words for the rest of the text.

@@ -108,6 +113,9 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -119,22 +127,25 @@ def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
... u'pastime', u'sassy_unicorns']
True
"""

result = []
for np in TextBlob(text).noun_phrases:
if ' ' in np and np not in entities:
# break apart the noun phrase; it does not occur often enough in the collection of text to be considered.
result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
else:
# filter out stop words
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
# if we end up with nothing, don't append an empty string
if tmp:
result.append(tmp)
return result


@register
def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
"""
A tokenizer that extracts noun phrases from a corpus, then tokenizes all
documents using those extracted phrases.
@@ -151,6 +162,9 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
Maximum occurrence of phrase, beyond which it is ignored
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -162,11 +176,11 @@ def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
for doc_id, doc_text in corpus:
yield doc_id, _tokenize_entities_document(doc_text, entities, min_length=min_length,
stopwords=stopwords)
stopwords=stopwords, stop_regex=stop_regex)


@register
def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None, stop_regex=None):
"""A text tokenizer that retrieves entities ('noun phrases') first and simple words for the rest of the text.

Parameters
@@ -181,6 +195,9 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
Maximum occurrence of phrase, beyond which it is ignored
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -192,5 +209,6 @@ def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
for doc_id, doc_text in corpus:
yield doc_id, _tokenize_mixed_document(doc_text, entities,
min_length=min_length,
stopwords=stopwords)
min_length=min_length,
stopwords=stopwords,
stop_regex=stop_regex)
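Taken together, these changes let markup be stripped before noun-phrase extraction. A minimal usage sketch follows; the corpus contents and the direct module import are illustrative and not part of the diff, and `mixed` accepts the same argument:

```python
from topik.tokenizers.entities import entities

# Hypothetical corpus: an iterable of (doc_id, text) pairs, as the tokenizers expect.
corpus = [("doc1", "<p>Frank the Swank-Tank walked his sassy unicorn along the beach.</p>"),
          ("doc2", "<p>Frank the Swank-Tank loves sassy unicorns and prancercise.</p>")]

# The predefined 'html' filter removes tags before noun phrases are collected and tokenized.
tokenized = dict(entities(corpus, freq_min=1, stop_regex='html'))
```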
12 changes: 12 additions & 0 deletions topik/tokenizers/filter_regex.py
@@ -0,0 +1,12 @@
import re

# Named patterns that callers can reference by key instead of writing their own regex.
predefined_filters = {
    'html': r'</?[^>]+>'
}


def filter_regex(text, regex):
    """Remove every match of ``regex`` (or of the predefined filter it names) from ``text``."""
    if regex:
        if regex in predefined_filters:
            regex = predefined_filters[regex]
        text = re.sub(regex, "", text)
    return text
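A quick sanity check of the new helper; the outputs shown as comments follow from the pattern above:

```python
from topik.tokenizers.filter_regex import filter_regex

# The predefined 'html' filter strips opening and closing tags.
print(filter_regex("<p>Hello <b>world</b></p>", 'html'))          # Hello world

# Any other string is treated as a regular expression whose matches are removed.
print(filter_regex("version 1.2.3 released", r'\d+\.\d+\.\d+'))   # version  released
```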
32 changes: 23 additions & 9 deletions topik/tokenizers/ngrams.py
@@ -15,8 +15,10 @@
u"recently been popularized by such retired "
u"celebrities as Frank The Swank-Tank."))]


# TODO: replace min_freqs with freq_bounds like ngrams takes. Unify format across the board.
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None,
stopwords=None, stop_regex=None):
"""collects bigrams and trigrams from collection of documents. Input to collocation tokenizer.

bigrams are pairs of words that recur in the collection; trigrams are triplets.
@@ -34,6 +36,9 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
starting with bigrams.
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -48,7 +53,8 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

# generator of documents, turn each element to its list of words
doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex)
for doc_id, doc_text in raw_corpus)
# generator, concatenate (chain) all words into a single sequence, lazily
words = itertools.chain.from_iterable(doc_texts)
@@ -67,7 +73,7 @@ def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_fre
return bigrams_patterns, trigrams_patterns


def _collocation_document(text, patterns, min_length=1, stopwords=None):
def _collocation_document(text, patterns, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that includes collocations(bigrams and trigrams).

A collocation is sequence of words or terms that co-occur more often
@@ -90,6 +96,9 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
Minimum length of any single word
stopwords : None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -102,14 +111,15 @@ def _collocation_document(text, patterns, min_length=1, stopwords=None):
... u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
True
"""
text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords))
text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords, stop_regex=stop_regex))
for pattern in patterns:
text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
return text.split()

@register
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None):
'''
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None,
stop_regex=None):
"""
A tokenizer that extracts collocations (bigrams and trigrams) from a corpus
according to the frequency bounds, then tokenizes all documents using those
extracted phrases.
@@ -128,6 +138,9 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
limit results to this many entries
stopwords: None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -137,11 +150,12 @@ def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=No
... u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
... u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike'])
True
'''
"""
if not freq_bounds:
freq_bounds=[(50, 10000), (20, 10000)]
min_freqs = [freq[0] for freq in freq_bounds]
patterns = _collect_bigrams_and_trigrams(raw_corpus, top_n=top_n, min_length=min_length, min_freqs=min_freqs,
stopwords=stopwords)
stopwords=stopwords, stop_regex=stop_regex)
for doc_id, doc_text in raw_corpus:
yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length, stopwords=stopwords)
yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length,
stopwords=stopwords, stop_regex=stop_regex)
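The same parameter threads through the n-gram tokenizer. A hedged sketch, with a made-up corpus and illustrative frequency bounds (neither appears in the diff):

```python
from topik.tokenizers.ngrams import ngrams

# Repeated phrasing so that the bigram/trigram frequency thresholds can be met.
corpus = [(str(i), "<p>Retired celebrities love sassy unicorns.</p>") for i in range(3)]

# Tags are stripped via stop_regex='html' before n-gram collection and tokenization.
tokenized = dict(ngrams(corpus, freq_bounds=[(2, 10000), (2, 10000)], stop_regex='html'))
```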
12 changes: 9 additions & 3 deletions topik/tokenizers/simple.py
@@ -2,9 +2,10 @@

# imports used only for doctests
from topik.tokenizers._registry import register
from topik.tokenizers.filter_regex import filter_regex


def _simple_document(text, min_length=1, stopwords=None):
def _simple_document(text, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords. For use on individual text documents.

@@ -16,6 +17,9 @@ def _simple_document(text, min_length=1, stopwords=None):
Minimum length of any single word
stopwords: None or iterable of str
Collection of words to ignore as tokens
stop_regex : None or str
A regular expression (or the name of a predefined filter such as 'html')
matching content to remove from the text before tokenizing. Useful for
stripping markup such as HTML tags.

Examples
--------
@@ -26,12 +30,13 @@ def _simple_document(text, min_length=1, stopwords=None):
"""
if not stopwords:
from gensim.parsing.preprocessing import STOPWORDS as stopwords
text = filter_regex(text, stop_regex)
return [word for word in gensim.utils.tokenize(text, lower=True)
if word not in stopwords and len(word) >= min_length]


@register
def simple(raw_corpus, min_length=1, stopwords=None):
def simple(raw_corpus, min_length=1, stopwords=None, stop_regex=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords.

@@ -54,4 +59,5 @@ def simple(raw_corpus, min_length=1, stopwords=None):
True
"""
for doc_id, doc_text in raw_corpus:
yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords))
yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords,
stop_regex=stop_regex))
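With the new argument, the simple tokenizer can drop markup before lowercasing and stopword removal. A small sketch; the document text is made up:

```python
from topik.tokenizers.simple import simple

corpus = [("doc1", "<p>This is a <em>very</em> simple document about unicorns.</p>")]

# With stop_regex='html' the tag text is removed first, so tag names such as
# 'p' or 'em' never reach the tokenizer.
print(list(simple(corpus, stop_regex='html')))
```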
17 changes: 17 additions & 0 deletions topik/tokenizers/tests/test_filter_regex.py
@@ -0,0 +1,17 @@
from topik.tokenizers.filter_regex import filter_regex

example_text = """
<div class="md"><p>Yes it is bad. If your task is long running, you will eventually crash with</p>

<blockquote>
<p>RuntimeError: maximum recursion depth exceeded</p>
</blockquote>

<p>Also, it is not very efficient for this simple use case.</p>
</div>
"""


def test_html_regex_filter():
    assert "</div>" not in filter_regex(example_text, 'html')