diff --git a/.gitattributes b/.gitattributes index 412eeda..2431c40 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,13 +10,13 @@ *.dbproj merge=union # Standard to msysgit -*.doc diff=astextplain -*.DOC diff=astextplain +*.doc diff=astextplain +*.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain -*.PDF diff=astextplain -*.rtf diff=astextplain -*.RTF diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/README.txt b/README.txt index bf0b259..5e06b9b 100644 --- a/README.txt +++ b/README.txt @@ -5,7 +5,7 @@ BoilerPy About --------------------------------------- -BoilerPy is a native Python port of Christian Kohlschütter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ +BoilerPy is a native Python port of Christian Kohlschutter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ ) I created this port since I don't have access to Java on my webhost and I wanted to create a pure Python version. Another Python version which consists of Python hooks to the original Java library can be found here : (https://github.com/misja/python-boilerpipe @@ -20,19 +20,19 @@ Installation BoilerPy was packaged with distutils. In can be installed from the command-line with the following line: - ``>python setup.py install`` + ``>python setup.py install`` Usage --------------------------------------- - ``import boilerpy`` + ``import boilerpy`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` - ``htmlText='

Example

'`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContent(htmlText)`` + ``htmlText='

Example

'`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContent(htmlText)`` @@ -83,4 +83,4 @@ A full-text extractor which is tuned towards extracting sentences from news arti Version --------------------------------------- -1.0 - Created 14 Feb 2013 \ No newline at end of file +1.0 - Created 14 Feb 2013 diff --git a/boilerpy/__init__.py b/boilerpy/__init__.py index a796300..6d36c52 100644 --- a/boilerpy/__init__.py +++ b/boilerpy/__init__.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,5 +16,4 @@ # * See the License for the specific language governing permissions and # * limitations under the License. # - -import extractors,filters,parser,document \ No newline at end of file +from . import extractors, filters, parser, document diff --git a/boilerpy/document.py b/boilerpy/document.py index 8c6852d..9f953ff 100644 --- a/boilerpy/document.py +++ b/boilerpy/document.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,7 +17,8 @@ # * limitations under the License. # # package: de.l3s.boilerpipe.document -import copy,sys +import copy +import sys # # * Some pre-defined labels which can be used in conjunction with @@ -26,14 +27,14 @@ # * @author Christian Kohlschtter # class DefaultLabels(object): - """ generated source for class DefaultLabels """ - TITLE = "de.l3s.boilerpipe/TITLE" - ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA" - INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT" - MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT" - STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT" - HR = "de.l3s.boilerpipe/HR" - MARKUP_PREFIX = "<" + """ generated source for class DefaultLabels """ + TITLE = "de.l3s.boilerpipe/TITLE" + ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA" + INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT" + MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT" + STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT" + HR = "de.l3s.boilerpipe/HR" + MARKUP_PREFIX = "<" # # * A text document, consisting of one or more {@link TextBlock}s. @@ -41,77 +42,77 @@ class DefaultLabels(object): # * @author Christian Kohlschtter # class TextDocument(object): - # * Creates a new {@link TextDocument} with given {@link TextBlock}s and - # * given title. - # * - # * @param title - # * The "main" title for this text document. - # * @param textBlocks - # * The text blocks of this document. - def __init__(self, textBlocks, title=None): - self.title = title - self.textBlocks = textBlocks - - # * Returns the {@link TextBlock}s of this document. - # * - # * @return A list of {@link TextBlock}s, in sequential order of appearance. - # - def getTextBlocks(self): - """ generated source for method getTextBlocks """ - return self.textBlocks - - def setTextBlocks(self,textBlocks): self.textBlocks=textBlocks - - # - # * Returns the "main" title for this document, or null if no - # * such title has ben set. - # * - # * @return The "main" title. - def getTitle(self): - """ generated source for method getTitle """ - return self.title - - # - # * Updates the "main" title for this document. - # * - # * @param title - def setTitle(self, title): - """ generated source for method setTitle """ - self.title = title - - # - # * Returns the {@link TextDocument}'s content. - # * - # * @return The content text. - def getContent(self): - """ generated source for method getContent """ - return self.getText(True, False) - - # - # * Returns the {@link TextDocument}'s content, non-content or both - # * - # * @param includeContent Whether to include TextBlocks marked as "content". - # * @param includeNonContent Whether to include TextBlocks marked as "non-content". - # * @return The text. - def getText(self, includeContent, includeNonContent): - sb = "" - for block in self.getTextBlocks(): - if block.isContent(): - if not includeContent: - continue - else: - if not includeNonContent: - continue - sb+=block.getText()+'\n' - return sb - - # * Returns detailed debugging information about the contained {@link TextBlock}s. - # * @return Debug information. - def debugString(self): - sb = "" - for tb in self.getTextBlocks(): - sb+=str(tb)+"\n" - return sb + # * Creates a new {@link TextDocument} with given {@link TextBlock}s and + # * given title. + # * + # * @param title + # * The "main" title for this text document. + # * @param textBlocks + # * The text blocks of this document. + def __init__(self, textBlocks, title=None): + self.title = title + self.textBlocks = textBlocks + + # * Returns the {@link TextBlock}s of this document. + # * + # * @return A list of {@link TextBlock}s, in sequential order of appearance. + # + def getTextBlocks(self): + """ generated source for method getTextBlocks """ + return self.textBlocks + + def setTextBlocks(self,textBlocks): self.textBlocks=textBlocks + + # + # * Returns the "main" title for this document, or null if no + # * such title has ben set. + # * + # * @return The "main" title. + def getTitle(self): + """ generated source for method getTitle """ + return self.title + + # + # * Updates the "main" title for this document. + # * + # * @param title + def setTitle(self, title): + """ generated source for method setTitle """ + self.title = title + + # + # * Returns the {@link TextDocument}'s content. + # * + # * @return The content text. + def getContent(self): + """ generated source for method getContent """ + return self.getText(True, False) + + # + # * Returns the {@link TextDocument}'s content, non-content or both + # * + # * @param includeContent Whether to include TextBlocks marked as "content". + # * @param includeNonContent Whether to include TextBlocks marked as "non-content". + # * @return The text. + def getText(self, includeContent, includeNonContent): + sb = "" + for block in self.getTextBlocks(): + if block.isContent(): + if not includeContent: + continue + else: + if not includeNonContent: + continue + sb+=block.getText()+'\n' + return sb + + # * Returns detailed debugging information about the contained {@link TextBlock}s. + # * @return Debug information. + def debugString(self): + sb = "" + for tb in self.getTextBlocks(): + sb+=str(tb)+"\n" + return sb @@ -127,174 +128,174 @@ def debugString(self): # class TextBlock(object): - """ generated source for class TextBlock """ - - def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0): - self._isContent = False - self.labels = set() - self.numFullTextWords = 0 - self.tagLevel = 0 - - self.text = text - self.containedTextElements = containedTextElements - self.numWords = numWords - self.numWordsInAnchorText = numWordsInAnchorText - self.numWordsInWrappedLines = numWordsInWrappedLines - self.numWrappedLines = numWrappedLines - self.offsetBlocksStart = offsetBlocks - self.offsetBlocksEnd = offsetBlocks - self.initDensities() - - def initDensities(self): - """ generated source for method initDensities """ - if self.numWordsInWrappedLines == 0: - self.numWordsInWrappedLines = self.numWords - self.numWrappedLines = 1 - self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines) - self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords) - - def isContent(self): - """ generated source for method isContent """ - return self._isContent - - def setIsContent(self, isContent): - """ generated source for method setIsContent """ - if isContent != self._isContent: - self._isContent = isContent - return True - else: - return False - - def getText(self): - """ generated source for method getText """ - return self.text - - def getNumWords(self): - """ generated source for method getNumWords """ - return self.numWords - - def getNumWordsInAnchorText(self): - """ generated source for method getNumWordsInAnchorText """ - return self.numWordsInAnchorText - - def getTextDensity(self): - """ generated source for method getTextDensity """ - return self.textDensity - - def getLinkDensity(self): - """ generated source for method getLinkDensity """ - return self.linkDensity - - def mergeNext(self, nextTextBlock): - """ generated source for method mergeNext """ - if self.text==None: self.text="" - self.text+='\n'+nextTextBlock.text - self.numWords += nextTextBlock.numWords - self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText - self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines - self.numWrappedLines += nextTextBlock.numWrappedLines - self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart) - self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd) - self.initDensities() - self._isContent |= nextTextBlock.isContent() - self.containedTextElements|=nextTextBlock.containedTextElements - self.numFullTextWords += nextTextBlock.numFullTextWords - self.labels|=nextTextBlock.labels - self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel) - - def getOffsetBlocksStart(self): - """ generated source for method getOffsetBlocksStart """ - return self.offsetBlocksStart - - def getOffsetBlocksEnd(self): - """ generated source for method getOffsetBlocksEnd """ - return self.offsetBlocksEnd - - def __repr__(self): - """ generated source for method toString """ - return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText()) - - # - # * Adds an arbitrary String label to this {@link TextBlock}. - # * - # * @param label The label - # - def addLabel(self, label): - """ generated source for method addLabel """ - self.labels.add(label) - - # - # * Checks whether this TextBlock has the given label. - # * - # * @param label The label - # * @return true if this block is marked by the given label. - # - def hasLabel(self, label): - """ generated source for method hasLabel """ - return label in self.labels - - def removeLabel(self, label): - """ generated source for method removeLabel """ - try: - self.labels.remove(label) - return True - except KeyError: - return False - - # - # * Returns the labels associated to this TextBlock, or null if no such labels - # * exist. - # * - # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} - # * whenever possible. - # * - # * @return Returns the set of labels, or null if no labels was added yet. - # - def getLabels(self): - """ generated source for method getLabels """ - return self.labels - - # - # * Adds a set of labels to this {@link TextBlock}. - # * null-references are silently ignored. - # * - # * @param labels The labels to be added. - # - def addLabels(self, *labels): - """ generated source for method addLabels """ - if len(labels)==0 or labels[0] == None: return - if self.labels == None: self.labels = set() - elif len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0]) - else: self.labels|=set(labels) - - - # - # * Returns the containedTextElements BitSet, or null. - # * @return - # - def getContainedTextElements(self): - """ generated source for method getContainedTextElements """ - return self.containedTextElements - - def clone(self): - try: - clone = copy.copy(self) - except copy.error: - raise copy.error - if self.labels != None: clone.labels = self.labels.copy() - if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy() - return clone - - def getTagLevel(self): - """ generated source for method getTagLevel """ - return self.tagLevel - - def setTagLevel(self, tagLevel): - """ generated source for method setTagLevel """ - self.tagLevel = tagLevel + """ generated source for class TextBlock """ + + def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0): + self._isContent = False + self.labels = set() + self.numFullTextWords = 0 + self.tagLevel = 0 + + self.text = text + self.containedTextElements = containedTextElements + self.numWords = numWords + self.numWordsInAnchorText = numWordsInAnchorText + self.numWordsInWrappedLines = numWordsInWrappedLines + self.numWrappedLines = numWrappedLines + self.offsetBlocksStart = offsetBlocks + self.offsetBlocksEnd = offsetBlocks + self.initDensities() + + def initDensities(self): + """ generated source for method initDensities """ + if self.numWordsInWrappedLines == 0: + self.numWordsInWrappedLines = self.numWords + self.numWrappedLines = 1 + self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines + self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords + + def isContent(self): + """ generated source for method isContent """ + return self._isContent + + def setIsContent(self, isContent): + """ generated source for method setIsContent """ + if isContent != self._isContent: + self._isContent = isContent + return True + else: + return False + + def getText(self): + """ generated source for method getText """ + return self.text + + def getNumWords(self): + """ generated source for method getNumWords """ + return self.numWords + + def getNumWordsInAnchorText(self): + """ generated source for method getNumWordsInAnchorText """ + return self.numWordsInAnchorText + + def getTextDensity(self): + """ generated source for method getTextDensity """ + return self.textDensity + + def getLinkDensity(self): + """ generated source for method getLinkDensity """ + return self.linkDensity + + def mergeNext(self, nextTextBlock): + """ generated source for method mergeNext """ + if self.text==None: self.text="" + self.text+='\n'+nextTextBlock.text + self.numWords += nextTextBlock.numWords + self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText + self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines + self.numWrappedLines += nextTextBlock.numWrappedLines + self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart) + self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd) + self.initDensities() + self._isContent |= nextTextBlock.isContent() + self.containedTextElements|=nextTextBlock.containedTextElements + self.numFullTextWords += nextTextBlock.numFullTextWords + self.labels|=nextTextBlock.labels + self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel) + + def getOffsetBlocksStart(self): + """ generated source for method getOffsetBlocksStart """ + return self.offsetBlocksStart + + def getOffsetBlocksEnd(self): + """ generated source for method getOffsetBlocksEnd """ + return self.offsetBlocksEnd + + def __repr__(self): + """ generated source for method toString """ + return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText()) + + # + # * Adds an arbitrary String label to this {@link TextBlock}. + # * + # * @param label The label + # + def addLabel(self, label): + """ generated source for method addLabel """ + self.labels.add(label) + + # + # * Checks whether this TextBlock has the given label. + # * + # * @param label The label + # * @return true if this block is marked by the given label. + # + def hasLabel(self, label): + """ generated source for method hasLabel """ + return label in self.labels + + def removeLabel(self, label): + """ generated source for method removeLabel """ + try: + self.labels.remove(label) + return True + except KeyError: + return False + + # + # * Returns the labels associated to this TextBlock, or null if no such labels + # * exist. + # * + # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} + # * whenever possible. + # * + # * @return Returns the set of labels, or null if no labels was added yet. + # + def getLabels(self): + """ generated source for method getLabels """ + return self.labels + + # + # * Adds a set of labels to this {@link TextBlock}. + # * null-references are silently ignored. + # * + # * @param labels The labels to be added. + # + def addLabels(self, *labels): + """ generated source for method addLabels """ + if len(labels)==0 or labels[0] == None: return + if self.labels == None: self.labels = set() + elif len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0]) + else: self.labels|=set(labels) + + + # + # * Returns the containedTextElements BitSet, or null. + # * @return + # + def getContainedTextElements(self): + """ generated source for method getContainedTextElements """ + return self.containedTextElements + + def clone(self): + try: + clone = copy.copy(self) + except copy.error: + raise copy.error + if self.labels != None: clone.labels = self.labels.copy() + if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy() + return clone + + def getTagLevel(self): + """ generated source for method getTagLevel """ + return self.tagLevel + + def setTagLevel(self, tagLevel): + """ generated source for method setTagLevel """ + self.tagLevel = tagLevel TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1) -TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint) +TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize) @@ -303,35 +304,35 @@ def setTagLevel(self, tagLevel): # * @author Christian Kohlschuetter # class TextDocumentStatistics(object): - # - # * Computes statistics on a given {@link TextDocument}. - # * - # * @param doc The {@link TextDocument}. - # * @param contentOnly if true then o - # - def __init__(self, doc, contentOnly): - self.numWords=0 - self.numBlocks=0 - for tb in doc.getTextBlocks(): - if contentOnly and not tb.isContent(): continue - self.numWords += tb.getNumWords() - self.numBlocks += 1 - - - # * Returns the average number of words at block-level (= overall number of words divided by - # * the number of blocks). - # * - # * @return Average - # - def avgNumWords(self): - """ generated source for method avgNumWords """ - return self.numWords / float(self.numBlocks) - - # - # * Returns the overall number of words in all blocks. - # * - # * @return Sum - # - def getNumWords(self): - """ generated source for method getNumWords """ - return self.numWords + # + # * Computes statistics on a given {@link TextDocument}. + # * + # * @param doc The {@link TextDocument}. + # * @param contentOnly if true then o + # + def __init__(self, doc, contentOnly): + self.numWords=0 + self.numBlocks=0 + for tb in doc.getTextBlocks(): + if contentOnly and not tb.isContent(): continue + self.numWords += tb.getNumWords() + self.numBlocks += 1 + + + # * Returns the average number of words at block-level (= overall number of words divided by + # * the number of blocks). + # * + # * @return Average + # + def avgNumWords(self): + """ generated source for method avgNumWords """ + return self.numWords / self.numBlocks + + # + # * Returns the overall number of words in all blocks. + # * + # * @return Sum + # + def getNumWords(self): + """ generated source for method getNumWords """ + return self.numWords diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py index 462c2f0..d3e95e7 100644 --- a/boilerpy/extractors.py +++ b/boilerpy/extractors.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -27,75 +27,77 @@ from xml.sax import parseString, SAXException -import HTMLParser +import html.parser from . import filters from . import parser -import urllib2 +import urllib.request +import urllib.error +import urllib.parse import re class Extractor(object): - def __init__(self,filtr): - self.filter=filtr - - def getContent(self, text): - return self.getDoc(text).getContent() - - def getContentFromUrl(self, url): - return self.getDocFromUrl(url).getContent() - - def getContentFromFile(self, filename): - return self.getDocFromFile(filename).getContent() - - def getDocFromFile(self,filename): - return self.getDoc(self.readFromFile(filename)) - - def getDocFromUrl(self,url): - return self.getDoc(self.readFromUrl(filename)) - - def getDoc(self,text): - doc=self.parseDoc(text) - self.filter.process(doc) - return doc - - def readFromFile(self,filename): - f=open(filename,'r') - text=f.read() - f.close() - try: - text=text.decode('utf8') - except UnicodeDecodeError: pass - return text - - def readFromUrl(self,url): - f=urllib2.urlopen(url) - text=f.read() - encoding=self.getUrlEncoding(f) - f.close() - try: - text=text.decode(encoding) - except UnicodeDecodeError: pass - return text - - def getUrlEncoding(self,f): - try: - return f.headers['content-type'].split('charset=')[1].split(';')[0] - except: return 'utf8' - - def parseDoc(self,inputStr): - bpParser=parser.BoilerpipeHTMLParser() - try: - bpParser.feed(inputStr) - except: - #in case of error, try again, first removing script tag content - bpParser=parser.BoilerpipeHTMLParser() - inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?','',inputStr,0,re.DOTALL) - try: - bpParser.feed(inputStr) - except: - print "Error parsing HTML : "+str(e) - return None - doc=bpParser.toTextDocument() - return doc + def __init__(self,filtr): + self.filter=filtr + + def getContent(self, text): + return self.getDoc(text).getContent() + + def getContentFromUrl(self, url): + return self.getDocFromUrl(url).getContent() + + def getContentFromFile(self, filename): + return self.getDocFromFile(filename).getContent() + + def getDocFromFile(self,filename): + return self.getDoc(self.readFromFile(filename)) + + def getDocFromUrl(self,url): + return self.getDoc(self.readFromUrl(url)) + + def getDoc(self,text): + doc=self.parseDoc(text) + self.filter.process(doc) + return doc + + def readFromFile(self,filename): + f=open(filename,'r') + text=f.read() + f.close() + try: + text=text.decode('utf8') + except UnicodeDecodeError: pass + return text + + def readFromUrl(self,url): + f = urllib.request.urlopen(url) + text=f.read() + encoding=self.getUrlEncoding(f) + f.close() + try: + text=text.decode(encoding) + except UnicodeDecodeError: pass + return text + + def getUrlEncoding(self,f): + try: + return f.headers['content-type'].split('charset=')[1].split(';')[0] + except: return 'utf8' + + def parseDoc(self,inputStr): + bpParser=parser.BoilerpipeHTMLParser() + try: + bpParser.feed(inputStr) + except Exception as exc: + #in case of error, try again, first removing script tag content + bpParser=parser.BoilerpipeHTMLParser() + inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?','',inputStr,0,re.DOTALL) + try: + bpParser.feed(inputStr) + except Exception as e: + print("Error parsing HTML : " + str(e)) + return None + doc=bpParser.toTextDocument() + return doc @@ -103,28 +105,28 @@ def parseDoc(self,inputStr): # * A full-text extractor which is tuned towards news articles. In this scenario # * it achieves higher accuracy than {@link DefaultExtractor}. articleFilterChain=filters.FilterChain([ - filters.TerminatingBlocksFinder(), - filters.DocumentTitleMatchClassifier(None,True), - filters.NumWordsRulesClassifier(), - filters.IgnoreBlocksAfterContentFilter(), - filters.BlockProximityFusion(1,False,False), - filters.BoilerplateBlockFilter(), - filters.BlockProximityFusion(1,True,False), - filters.KeepLargestBlockFilter(), - filters.ExpandTitleToContentFilter() + filters.TerminatingBlocksFinder(), + filters.DocumentTitleMatchClassifier(None,True), + filters.NumWordsRulesClassifier(), + filters.IgnoreBlocksAfterContentFilter(), + filters.BlockProximityFusion(1,False,False), + filters.BoilerplateBlockFilter(), + filters.BlockProximityFusion(1,True,False), + filters.KeepLargestBlockFilter(), + filters.ExpandTitleToContentFilter() ]) -# * Works very well for most types of Article-like HTML. +# * Works very well for most types of Article-like HTML. ARTICLE_EXTRACTOR = Extractor(articleFilterChain) # class DefaultExtractor -# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. +# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. # * A quite generic full-text extractor. defaultFilterChain=filters.FilterChain([ - filters.SimpleBlockFusionProcessor(), - filters.BlockProximityFusion(1,False,False), - filters.DensityRulesClassifier() + filters.SimpleBlockFusionProcessor(), + filters.BlockProximityFusion(1,False,False), + filters.DensityRulesClassifier() ]) DEFAULT_EXTRACTOR = Extractor(defaultFilterChain) @@ -135,19 +137,19 @@ def parseDoc(self,inputStr): # * For news articles, it may perform better than the {@link DefaultExtractor}, # * but usually worse than {@link ArticleExtractor}. largestContentFilterChain=filters.FilterChain([ - filters.NumWordsRulesClassifier(), - filters.BlockProximityFusion(1,False,False), - filters.KeepLargestBlockFilter() + filters.NumWordsRulesClassifier(), + filters.BlockProximityFusion(1,False,False), + filters.KeepLargestBlockFilter() ]) -# * Like {@link DefaultExtractor}, but keeps the largest text block only. +# * Like {@link DefaultExtractor}, but keeps the largest text block only. LARGEST_CONTENT_EXTRACTOR = Extractor(largestContentFilterChain) # class CanolaExtractor -# * Trained on krdwrd Canola (different definition of "boilerplate"). You may -# * give it a try. +# * Trained on krdwrd Canola (different definition of "boilerplate"). You may +# * give it a try. CANOLA_EXTRACTOR = Extractor(filters.CanolaFilter()) @@ -155,9 +157,9 @@ def parseDoc(self,inputStr): # class KeepEverythingExtractor # * Marks everything as content. -# * Dummy Extractor; should return the input text. Use this to double-check -# * that your problem is within a particular {@link BoilerpipeExtractor}, or -# * somewhere else. +# * Dummy Extractor; should return the input text. Use this to double-check +# * that your problem is within a particular {@link BoilerpipeExtractor}, or +# * somewhere else. KEEP_EVERYTHING_EXTRACTOR = Extractor(filters.MarkEverythingContentFilter()) @@ -174,9 +176,9 @@ def parseDoc(self,inputStr): # class ArticleSentencesExtractor # * A full-text extractor which is tuned towards extracting sentences from news articles. ARTICLE_SENTENCES_EXTRACTOR=Extractor(filters.FilterChain([ - articleFilterChain, - filters.SplitParagraphBlocksFilter(), - filters.MinClauseWordsFilter() + articleFilterChain, + filters.SplitParagraphBlocksFilter(), + filters.MinClauseWordsFilter() ])) @@ -184,10 +186,10 @@ def parseDoc(self,inputStr): # * For news articles, it may perform better than the {@link DefaultExtractor}, # * but usually worse than {@link ArticleExtractor}. class KeepEverythingWithMinKWordsFilter(filters.FilterChain): - def __init__(self, kMin): - filterArr = [ - filters.SimpleBlockFusionProcessor(), - filters.MarkEverythingContentFilter(), - filters.MinWordsFilter(kMin) - ] - super(KeepEverythingWithMinKWordsFilter, self).__init__(filters) + def __init__(self, kMin): + filterArr = [ + filters.SimpleBlockFusionProcessor(), + filters.MarkEverythingContentFilter(), + filters.MinWordsFilter(kMin) + ] + super(KeepEverythingWithMinKWordsFilter, self).__init__(filters) diff --git a/boilerpy/filters.py b/boilerpy/filters.py index c2885bb..a9714ed 100644 --- a/boilerpy/filters.py +++ b/boilerpy/filters.py @@ -9,7 +9,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -59,42 +59,42 @@ import re from . import document -from document import DefaultLabels +from .document import DefaultLabels # Boilerpipe abstract interface class BoilerpipeFilter(object): - def process(self, doc): pass - - def subtractBlocks(self,blockArr,blocksToRemove): - #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove) - #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr - if len(blocksToRemove)==0: return blockArr - newBlockArr=[] - removeIter=iter(blocksToRemove) - curBlockToRemove=removeIter.next() - for idx,block in enumerate(blockArr): - if block==curBlockToRemove: - try: - curBlockToRemove=removeIter.next() - except StopIteration: - #add the rest - newBlockArr.extend(blockArr[idx+1:]) - break - else: newBlockArr.append(block) - return newBlockArr + def process(self, doc): pass + + def subtractBlocks(self,blockArr,blocksToRemove): + #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove) + #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr + if len(blocksToRemove)==0: return blockArr + newBlockArr=[] + removeIter=iter(blocksToRemove) + curBlockToRemove = next(removeIter) + for idx,block in enumerate(blockArr): + if block==curBlockToRemove: + try: + curBlockToRemove = next(removeIter) + except StopIteration: + #add the rest + newBlockArr.extend(blockArr[idx+1:]) + break + else: newBlockArr.append(block) + return newBlockArr # chain together multiple filters in sequence class FilterChain(BoilerpipeFilter): - def __init__(self,filterArr): - super(FilterChain, self).__init__() - self.filterArr=filterArr - - def process(self,doc): - isUpdated=False - for filtr in self.filterArr: - isUpdated|=filtr.process(doc) - return isUpdated + def __init__(self,filterArr): + super(FilterChain, self).__init__() + self.filterArr=filterArr + + def process(self,doc): + isUpdated=False + for filtr in self.filterArr: + isUpdated|=filtr.process(doc) + return isUpdated #----------------------------------------------------------------------- @@ -109,14 +109,14 @@ def process(self,doc): # * @author Christian Kohlschtter # class MarkEverythingContentFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - changes = False - for tb in doc.getTextBlocks(): - if not tb.isContent(): - tb.setIsContent(True) - changes = True - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if not tb.isContent(): + tb.setIsContent(True) + changes = True + return changes # @@ -126,12 +126,12 @@ def process(self, doc): # class InvertedFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - tbs = doc.getTextBlocks() - if len(tbs)==0: return False - for tb in tbs: tb.setIsContent(not tb.isContent()) - return True + def process(self, doc): + """ generated source for method process """ + tbs = doc.getTextBlocks() + if len(tbs)==0: return False + for tb in tbs: tb.setIsContent(not tb.isContent()) + return True # @@ -140,14 +140,14 @@ def process(self, doc): # * @author Christian Kohlschtter # class BoilerplateBlockFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - newBlocks=[tb for tb in textBlocks if tb.isContent()] - hasChanges = len(newBlocks)= self.minWords: return True - return n >= self.minWords + def __init__(self, minWords=5, acceptClausesWithoutDelimiter=False): + super(MinClauseWordsFilter, self).__init__() + self.minWords = minWords + self.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter + + PAT_CLAUSE_DELIMITER = re.compile(r"\b[\,\.\:\;\!\?]+(?:\s+|\Z)",re.UNICODE) + PAT_WHITESPACE = re.compile("\s+") + + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if not tb.isContent(): continue + hasClause = False + possibleClauseArr=self.PAT_CLAUSE_DELIMITER.split(tb.getText()) + for possibleClause in possibleClauseArr[:-1]: + hasClause = self.isClauseAccepted(possibleClause) + if hasClause: break + + # since clauses should *always end* with a delimiter, we normally + # don't consider text without one + if self.acceptClausesWithoutDelimiter: + hasClause |= self.isClauseAccepted(possibleClauseArr[-1]) + if not hasClause: + tb.setIsContent(False) + changes = True + # System.err.println("IS NOT CONTENT: " + text); + return changes + + def isClauseAccepted(self, text): + """ generated source for method isClause """ + n = 1 + for match in self.PAT_WHITESPACE.finditer(text): + n += 1 + if n >= self.minWords: return True + return n >= self.minWords # @@ -230,56 +230,56 @@ def isClauseAccepted(self, text): # * @see MinClauseWordsFilter # class SplitParagraphBlocksFilter(BoilerpipeFilter): - def process(self, doc): - changes = False - blocks = doc.getTextBlocks() - blocksNew = [] - for tb in blocks: - text = tb.getText(); - paragraphs = re.split(r"[\n\r]+",text) - if len(paragraphs)<2: - blocksNew.append(tb) - continue - isContent = tb.isContent() - labels = tb.getLabels() - for p in paragraphs: - tbP=document.TextBlock(p) - tbP.setIsContent(isContent) - tbP.addLabels(labels) - blocksNew.append(tbP) - changes = True - - if changes: doc.setTextBlocks(blocksNew) - return changes - + def process(self, doc): + changes = False + blocks = doc.getTextBlocks() + blocksNew = [] + for tb in blocks: + text = tb.getText(); + paragraphs = re.split(r"[\n\r]+",text) + if len(paragraphs)<2: + blocksNew.append(tb) + continue + isContent = tb.isContent() + labels = tb.getLabels() + for p in paragraphs: + tbP=document.TextBlock(p) + tbP.setIsContent(isContent) + tbP.addLabels(labels) + blocksNew.append(tbP) + changes = True + + if changes: doc.setTextBlocks(blocksNew) + return changes + class SurroundingToContentFilter(BoilerpipeFilter): - # this is now default when no arguments are passed - #INSTANCE_TEXT = SurroundingToContentFilter(TextBlockCondition()) - - #ctor - condition is an function for an additional condition to determine if it can be made content - def __init__(self, condition=lambda tb:tb.getLinkDensity()==0 and tb.getNumWords()>6): - super(SurroundingToContentFilter, self).__init__() - self.cond=condition - - def process(self, doc): - """ generated source for method process """ - tbs = doc.getTextBlocks() - n=len(tbs) - hasChanges=False - i=1 - while i6): + super(SurroundingToContentFilter, self).__init__() + self.cond=condition + + def process(self, doc): + """ generated source for method process """ + tbs = doc.getTextBlocks() + n=len(tbs) + hasChanges=False + i=1 + while i0: - newBlocks=self.subtractBlocks(textBlocks,blocksToRemove) - doc.setTextBlocks(newBlocks) - changes=True - - return changes + """ generated source for class BlockProximityFusion """ + #MAX_DISTANCE_1 = BlockProximityFusion(1, False, False) + #MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion(1, False, True) + #MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion(1, True, False) + #MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion(1, True, True) + + # + # * Creates a new {@link BlockProximityFusion} instance. + # * + # * @param maxBlocksDistance The maximum distance in blocks. + # * @param contentOnly + # + def __init__(self, maxBlocksDistance=1, contentOnly=False, sameTagLevelOnly=False): + """ generated source for method __init__ """ + super(BlockProximityFusion, self).__init__() + self.maxBlocksDistance = maxBlocksDistance + self.contentOnly = contentOnly + self.sameTagLevelOnly = sameTagLevelOnly + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + changes = False + + if self.contentOnly: + startIdx=None + for idx,block in enumerate(textBlocks): + if block.isContent(): + startIdx=idx + break + if startIdx == None: return False + else: + startIdx=0 + + prevBlock=textBlocks[startIdx] + blocksToRemove=[] + for block in textBlocks[startIdx+1:]: + if not block.isContent(): + prevBlock = block + continue + diffBlocks = block.getOffsetBlocksStart() - prevBlock.getOffsetBlocksEnd() - 1; + if diffBlocks <= self.maxBlocksDistance: + ok=True + if self.contentOnly: + if not prevBlock.isContent() or not block.isContent(): + ok = False + if self.sameTagLevelOnly and prevBlock.getTagLevel() != block.getTagLevel(): + ok = False + if ok: + prevBlock.mergeNext(block) + #remove current block + blocksToRemove.append(block) + changes = True + else: + prevBlock = block + else: + prevBlock = block + + if len(blocksToRemove)>0: + newBlocks=self.subtractBlocks(textBlocks,blocksToRemove) + doc.setTextBlocks(newBlocks) + changes=True + + return changes @@ -521,49 +521,49 @@ def process(self, doc): # * @author Christian Kohlschtter # class KeepLargestBlockFilter(BoilerpipeFilter): - """ generated source for class KeepLargestBlockFilter """ - #INSTANCE = KeepLargestBlockFilter(False) - #INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter(True) - - def __init__(self, expandToSameLevelText=False): - """ generated source for method __init__ """ - super(KeepLargestBlockFilter, self).__init__() - self.expandToSameLevelText = expandToSameLevelText - - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - - try: - contentBlockIter=(tb for tb in textBlocks if tb.isContent()) - largestBlock=max(contentBlockIter,key=lambda tb:tb.getNumWords()) - except ValueError: - #no content blocks exist / largest block not found - largestBlock=None - - for tb in textBlocks: - if tb == largestBlock: - tb.setIsContent(True) - else: - tb.setIsContent(False) - tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) - - if self.expandToSameLevelText and largestBlock!=None: - level = largestBlock.getTagLevel() - largestBlockIdx=textBlocks.index(largestBlock) - - for tb in textBlocks[largestBlockIdx::-1]: - tl=tb.getTagLevel() - if tl < level: break - elif tl == level: tb.setIsContent(True) - - for tb in textBlocks[largestBlockIdx:]: - tl=tb.getTagLevel() - if tl < level: break - elif tl == level: tb.setIsContent(True) - - return True + """ generated source for class KeepLargestBlockFilter """ + #INSTANCE = KeepLargestBlockFilter(False) + #INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter(True) + + def __init__(self, expandToSameLevelText=False): + """ generated source for method __init__ """ + super(KeepLargestBlockFilter, self).__init__() + self.expandToSameLevelText = expandToSameLevelText + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + + try: + contentBlockIter=(tb for tb in textBlocks if tb.isContent()) + largestBlock=max(contentBlockIter,key=lambda tb:tb.getNumWords()) + except ValueError: + #no content blocks exist / largest block not found + largestBlock=None + + for tb in textBlocks: + if tb == largestBlock: + tb.setIsContent(True) + else: + tb.setIsContent(False) + tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) + + if self.expandToSameLevelText and largestBlock!=None: + level = largestBlock.getTagLevel() + largestBlockIdx=textBlocks.index(largestBlock) + + for tb in textBlocks[largestBlockIdx::-1]: + tl=tb.getTagLevel() + if tl < level: break + elif tl == level: tb.setIsContent(True) + + for tb in textBlocks[largestBlockIdx:]: + tl=tb.getTagLevel() + if tl < level: break + elif tl == level: tb.setIsContent(True) + + return True # * Marks all {@link TextBlock}s "content" which are between the headline and the part that @@ -574,44 +574,44 @@ def process(self, doc): # * @author Christian Kohlschtter # class ExpandTitleToContentFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - i = 0 - titleIdx = -1 - contentStart = -1 - for tb in doc.getTextBlocks(): - if contentStart == -1 and tb.hasLabel(DefaultLabels.TITLE): - titleIdx = i - if contentStart == -1 and tb.isContent(): - contentStart = i - i += 1 - - if contentStart <= titleIdx or titleIdx == -1: return False - - changes = False - for tb in doc.getTextBlocks()[titleIdx:contentStart]: - if tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT): - changes |= tb.setIsContent(True) - return changes + def process(self, doc): + """ generated source for method process """ + i = 0 + titleIdx = -1 + contentStart = -1 + for tb in doc.getTextBlocks(): + if contentStart == -1 and tb.hasLabel(DefaultLabels.TITLE): + titleIdx = i + if contentStart == -1 and tb.isContent(): + contentStart = i + i += 1 + + if contentStart <= titleIdx or titleIdx == -1: return False + + changes = False + for tb in doc.getTextBlocks()[titleIdx:contentStart]: + if tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT): + changes |= tb.setIsContent(True) + return changes class ArticleMetadataFilter(BoilerpipeFilter): - #checks for date/time/author blocks - PATTERNS_SHORT = [re.compile(r"^[0-9 \,\./]*\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\b[0-9 \,\:apm\./]*(?:[CPSDMGET]{2,3})?$"), re.compile("^[Bb]y ")]; - - def process(self, doc): - """ generated source for method process """ - changed = False - for tb in doc.getTextBlocks(): - if tb.getNumWords() > 10: continue - for p in self.PATTERNS_SHORT: - text = tb.getText() - if p.search(text): - changed = True - tb.setIsContent(True) - tb.addLabel(DefaultLabels.ARTICLE_METADATA) - break - return changed + #checks for date/time/author blocks + PATTERNS_SHORT = [re.compile(r"^[0-9 \,\./]*\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\b[0-9 \,\:apm\./]*(?:[CPSDMGET]{2,3})?$"), re.compile("^[Bb]y ")]; + + def process(self, doc): + """ generated source for method process """ + changed = False + for tb in doc.getTextBlocks(): + if tb.getNumWords() > 10: continue + for p in self.PATTERNS_SHORT: + text = tb.getText() + if p.search(text): + changed = True + tb.setIsContent(True) + tb.addLabel(DefaultLabels.ARTICLE_METADATA) + break + return changed # @@ -620,36 +620,36 @@ def process(self, doc): # * @author Christian Kohlschtter # class AddPrecedingLabelsFilter(BoilerpipeFilter): - #INSTANCE = AddPrecedingLabelsFilter("") - #INSTANCE_PRE = AddPrecedingLabelsFilter("^") - - # - # * Creates a new {@link AddPrecedingLabelsFilter} instance. - # * - # * @param maxBlocksDistance The maximum distance in blocks. - # * @param contentOnly - # - def __init__(self, labelPrefix=""): - """ generated source for method __init__ """ - super(AddPrecedingLabelsFilter, self).__init__() - self.labelPrefix = labelPrefix - - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - changes = False - blockBelow = None - - for block in textBlocks[::-1]: - if blockBelow != None: - labels=block.getLabels() - if labels != None and len(labels)>0: - for l in labels: blockBelow.addLabel(self.labelPrefix + l) - changes = True - blockBelow = block - - return changes + #INSTANCE = AddPrecedingLabelsFilter("") + #INSTANCE_PRE = AddPrecedingLabelsFilter("^") + + # + # * Creates a new {@link AddPrecedingLabelsFilter} instance. + # * + # * @param maxBlocksDistance The maximum distance in blocks. + # * @param contentOnly + # + def __init__(self, labelPrefix=""): + """ generated source for method __init__ """ + super(AddPrecedingLabelsFilter, self).__init__() + self.labelPrefix = labelPrefix + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + changes = False + blockBelow = None + + for block in textBlocks[::-1]: + if blockBelow != None: + labels=block.getLabels() + if labels != None and len(labels)>0: + for l in labels: blockBelow.addLabel(self.labelPrefix + l) + changes = True + blockBelow = block + + return changes # @@ -661,67 +661,67 @@ def process(self, doc): # class DocumentTitleMatchClassifier(BoilerpipeFilter): - """ generated source for class DocumentTitleMatchClassifier """ - - def __init__(self, title, useDocTitle=False): - """ generated source for method __init__ """ - super(DocumentTitleMatchClassifier, self).__init__() - self.useDocTitle=useDocTitle - if useDocTitle: self.potentialTitles=None - else: self.potentialTitles=self.findPotentialTitles(title) - - def findPotentialTitles(self,title): - if title == None: return None - title = title.strip() - if len(title)==0: - return None - else: - potentialTitles = set() - potentialTitles.add(title) - p = self.getLongestPart(title, "[ ]*[\||:][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||:\(\)][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||:\(\)\-][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||,|:\(\)\-][ ]*") - if p != None: potentialTitles.add(p) - return potentialTitles - - def getPotentialTitles(self): - """ generated source for method getPotentialTitles """ - return self.potentialTitles - - def getLongestPart(self, title, pattern): - """ generated source for method getLongestPart """ - parts = re.split(pattern,title) - if len(parts)==1: return None - - longestNumWords = 0 - longestPart = "" - for p in parts: - if ".com" in p: continue - numWords=self.getNumWords(p) - if numWords > longestNumWords or len(p)>len(longestPart): - longestNumWords = numWords - longestPart = p - if len(longestPart)==0: return None - else: return longestPart.strip() - - def getNumWords(self,text): - return len(re.findall("\w+",text,re.UNICODE)) - - def process(self, doc): - """ generated source for method process """ - if self.useDocTitle: self.potentialTitles=self.findPotentialTitles(doc.getTitle()) - if self.potentialTitles == None: return False - changes = False - for tb in doc.getTextBlocks(): - text=tb.getText().strip().lower() - if any(candidate.lower()==text for candidate in self.potentialTitles): - tb.addLabel(DefaultLabels.TITLE) - changes = True - return changes + """ generated source for class DocumentTitleMatchClassifier """ + + def __init__(self, title, useDocTitle=False): + """ generated source for method __init__ """ + super(DocumentTitleMatchClassifier, self).__init__() + self.useDocTitle=useDocTitle + if useDocTitle: self.potentialTitles=None + else: self.potentialTitles=self.findPotentialTitles(title) + + def findPotentialTitles(self,title): + if title == None: return None + title = title.strip() + if len(title)==0: + return None + else: + potentialTitles = set() + potentialTitles.add(title) + p = self.getLongestPart(title, "[ ]*[\||:][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ ]*[\||:\(\)][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ ]*[\||:\(\)\-][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ ]*[\||,|:\(\)\-][ ]*") + if p != None: potentialTitles.add(p) + return potentialTitles + + def getPotentialTitles(self): + """ generated source for method getPotentialTitles """ + return self.potentialTitles + + def getLongestPart(self, title, pattern): + """ generated source for method getLongestPart """ + parts = re.split(pattern,title) + if len(parts)==1: return None + + longestNumWords = 0 + longestPart = "" + for p in parts: + if ".com" in p: continue + numWords=self.getNumWords(p) + if numWords > longestNumWords or len(p)>len(longestPart): + longestNumWords = numWords + longestPart = p + if len(longestPart)==0: return None + else: return longestPart.strip() + + def getNumWords(self,text): + return len(re.findall("\w+",text,re.UNICODE)) + + def process(self, doc): + """ generated source for method process """ + if self.useDocTitle: self.potentialTitles=self.findPotentialTitles(doc.getTitle()) + if self.potentialTitles == None: return False + changes = False + for tb in doc.getTextBlocks(): + text=tb.getText().strip().lower() + if any(candidate.lower()==text for candidate in self.potentialTitles): + tb.addLabel(DefaultLabels.TITLE) + changes = True + return changes @@ -743,9 +743,9 @@ def process(self, doc): # * @author Christian Kohlschtter # class HeuristicFilterBase(BoilerpipeFilter): - def getNumFullTextWords(self, tb, minTextDensity=9): - if tb.getTextDensity() >= minTextDensity: return tb.getNumWords() - else: return 0 + def getNumFullTextWords(self, tb, minTextDensity=9): + if tb.getTextDensity() >= minTextDensity: return tb.getNumWords() + else: return 0 # # * Keeps only those content blocks which contain at least k full-text words @@ -754,17 +754,17 @@ def getNumFullTextWords(self, tb, minTextDensity=9): # * @author Christian Kohlschtter # class MinFulltextWordsFilter(HeuristicFilterBase): - def __init__(self, minWords=30): - self.minWords = minWords + def __init__(self, minWords=30): + self.minWords = minWords - def process(self, doc): - """ generated source for method process """ - changes = False - for tb in doc.getTextBlocks(): - if tb.isContent() and self.getNumFullTextWords(tb) < self.minWords: - tb.setIsContent(False) - changes = True - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if tb.isContent() and self.getNumFullTextWords(tb) < self.minWords: + tb.setIsContent(False) + changes = True + return changes # @@ -785,21 +785,21 @@ def process(self, doc): # class KeepLargestFulltextBlockFilter(HeuristicFilterBase): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - contentBlocks=[block for block in textBlocks if block.isContent()] - if len(contentBlocks)==0: return False - largestBlock=max(contentBlocks,key=self.getNumFullTextWords) - - for tb in textBlocks: - if tb == largestBlock: - tb.setIsContent(True) - else: - tb.setIsContent(False) - tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) - return True + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + contentBlocks=[block for block in textBlocks if block.isContent()] + if len(contentBlocks)==0: return False + largestBlock=max(contentBlocks,key=self.getNumFullTextWords) + + for tb in textBlocks: + if tb == largestBlock: + tb.setIsContent(True) + else: + tb.setIsContent(False) + tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) + return True # # * Marks all blocks as "non-content" that occur after blocks that have been @@ -811,28 +811,28 @@ def process(self, doc): # * @see TerminatingBlocksFinder # class IgnoreBlocksAfterContentFilter(HeuristicFilterBase): - """ generated source for class IgnoreBlocksAfterContentFilter """ - #DEFAULT_INSTANCE = IgnoreBlocksAfterContentFilter(60) - #INSTANCE_200 = IgnoreBlocksAfterContentFilter(200) - - def __init__(self, minNumWords=60): - self.minNumWords = minNumWords - - def process(self, doc): - """ generated source for method process """ - changes = False - numWords = 0 - foundEndOfText = False - for block in doc.getTextBlocks(): - if block.isContent(): - numWords += self.getNumFullTextWords(block) - if block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT) and numWords >= self.minNumWords: - foundEndOfText = True - if foundEndOfText: - changes = True - block.setIsContent(False) - - return changes + """ generated source for class IgnoreBlocksAfterContentFilter """ + #DEFAULT_INSTANCE = IgnoreBlocksAfterContentFilter(60) + #INSTANCE_200 = IgnoreBlocksAfterContentFilter(200) + + def __init__(self, minNumWords=60): + self.minNumWords = minNumWords + + def process(self, doc): + """ generated source for method process """ + changes = False + numWords = 0 + foundEndOfText = False + for block in doc.getTextBlocks(): + if block.isContent(): + numWords += self.getNumFullTextWords(block) + if block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT) and numWords >= self.minNumWords: + foundEndOfText = True + if foundEndOfText: + changes = True + block.setIsContent(False) + + return changes # # * Marks all blocks as "non-content" that occur after blocks that have been # * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}, and after any content block. @@ -843,22 +843,22 @@ def process(self, doc): # class IgnoreBlocksAfterContentFromEndFilter(HeuristicFilterBase): - def process(self, doc): - """ generated source for method process """ - changes = False - words = 0 - blocks = doc.getTextBlocks() - if len(blocks)==0: return False - for tb in blocks[::-1]: - if tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT): - tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT) - tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT) - tb.setIsContent(False) - changes = True - elif tb.isContent(): - words += tb.getNumWords() - if words > 200: break - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + words = 0 + blocks = doc.getTextBlocks() + if len(blocks)==0: return False + for tb in blocks[::-1]: + if tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT): + tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT) + tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT) + tb.setIsContent(False) + changes = True + elif tb.isContent(): + words += tb.getNumWords() + if words > 200: break + return changes # @@ -871,46 +871,46 @@ def process(self, doc): # class TerminatingBlocksFinder(BoilerpipeFilter): - # public static long timeSpent = 0; - def process(self, doc): - """ generated source for method process """ - changes = False - - for tb in doc.getTextBlocks(): - if tb.getNumWords() >=15: continue - text=tb.getText().strip() - if len(text)<8: continue - textLC = text.lower() - - startmatches=(" reuters","please rate this","post a comment") - inmatches=("what you think...","add your comment","add comment","reader views","have your say","reader comments","rtta artikeln") - eqmatch="thanks for your comments - this feedback is now closed" - - if textLC.startswith("comments") or self.startsWithNumber(textLC, " comments", " users responded in") or any(textLC.startswith(matchStr) for matchStr in startmatches) or any(matchStr in textLC for matchStr in inmatches) or textLC == eqmatch: - tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT) - changes = True - # timeSpent += System.currentTimeMillis() - t; - return changes - - # - # * Checks whether the given text t starts with a sequence of digits, - # * followed by one of the given strings. - # * - # * @param t - # * The text to examine - # * @param len - # * The length of the text to examine - # * @param str - # * Any strings that may follow the digits. - # * @return true if at least one combination matches - # - def startsWithNumber(self, text, *matchStrArr): - """ generated source for method startsWithNumber """ - numberMatch=re.search('\D',text) - if numberMatch==None: pos=len(text) - else: pos=numberMatch.start() - if pos==0: return False - else: return any(text.startswith(matchStr,pos) for matchStr in matchStrArr) + # public static long timeSpent = 0; + def process(self, doc): + """ generated source for method process """ + changes = False + + for tb in doc.getTextBlocks(): + if tb.getNumWords() >=15: continue + text=tb.getText().strip() + if len(text)<8: continue + textLC = text.lower() + + startmatches=(" reuters","please rate this","post a comment") + inmatches=("what you think...","add your comment","add comment","reader views","have your say","reader comments","rtta artikeln") + eqmatch="thanks for your comments - this feedback is now closed" + + if textLC.startswith("comments") or self.startsWithNumber(textLC, " comments", " users responded in") or any(textLC.startswith(matchStr) for matchStr in startmatches) or any(matchStr in textLC for matchStr in inmatches) or textLC == eqmatch: + tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT) + changes = True + # timeSpent += System.currentTimeMillis() - t; + return changes + + # + # * Checks whether the given text t starts with a sequence of digits, + # * followed by one of the given strings. + # * + # * @param t + # * The text to examine + # * @param len + # * The length of the text to examine + # * @param str + # * Any strings that may follow the digits. + # * @return true if at least one combination matches + # + def startsWithNumber(self, text, *matchStrArr): + """ generated source for method startsWithNumber """ + numberMatch=re.search('\D',text) + if numberMatch==None: pos=len(text) + else: pos=numberMatch.start() + if pos==0: return False + else: return any(text.startswith(matchStr,pos) for matchStr in matchStrArr) # @@ -923,46 +923,46 @@ def startsWithNumber(self, text, *matchStrArr): # class NumWordsRulesClassifier(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - hasChanges = False - - n=len(textBlocks) - for i,currentBlock in enumerate(textBlocks): - if i>0: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+10: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+10: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+10: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+1krdwrd 0: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+1 0 and next.getNumWords() > 11 - cond2=curr.getNumWords() > 19 - cond3=next.getNumWords() > 6 and next.getLinkDensity() == 0 and prev.getLinkDensity() == 0 and (curr.getNumWords() > 6 or prev.getNumWords() > 7 or next.getNumWords() > 19) - isContent = cond1 or cond2 or cond3 - return curr.setIsContent(isContent) + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + hasChanges = False + + n=len(textBlocks) + for i,currentBlock in enumerate(textBlocks): + if i>0: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+1 0 and next.getNumWords() > 11 + cond2=curr.getNumWords() > 19 + cond3=next.getNumWords() > 6 and next.getLinkDensity() == 0 and prev.getLinkDensity() == 0 and (curr.getNumWords() > 6 or prev.getNumWords() > 7 or next.getNumWords() > 19) + isContent = cond1 or cond2 or cond3 + return curr.setIsContent(isContent) diff --git a/boilerpy/parser.py b/boilerpy/parser.py index 5f07449..fc0a835 100644 --- a/boilerpy/parser.py +++ b/boilerpy/parser.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -17,10 +17,10 @@ # * limitations under the License. # -from HTMLParser import HTMLParser +from html.parser import HTMLParser from xml.sax import ContentHandler from . import document -from document import DefaultLabels +from .document import DefaultLabels import re @@ -31,25 +31,25 @@ class TagAction(object): - def start(self, contentHandler, tagName, attrs): return False - def end(self, contentHandler, tagName): return False - def changesTagLevel(self): return False + def start(self, contentHandler, tagName, attrs): return False + def end(self, contentHandler, tagName): return False + def changesTagLevel(self): return False # # * Marks this tag as "ignorable", i.e. all its inner content is silently skipped. # class IgnorableElementTagAction(TagAction): - """ generated source for class TA_IGNORABLE_ELEMENT """ - def start(self, contentHandler, tagName, attrs): - contentHandler.inIgnorableElement += 1 - return True + """ generated source for class TA_IGNORABLE_ELEMENT """ + def start(self, contentHandler, tagName, attrs): + contentHandler.inIgnorableElement += 1 + return True - def end(self, contentHandler, tagName): - contentHandler.inIgnorableElement -= 1 - return True + def end(self, contentHandler, tagName): + contentHandler.inIgnorableElement -= 1 + return True - def changesTagLevel(self): - return True + def changesTagLevel(self): + return True # # * Marks this tag as "anchor" (this should usually only be set for the <A> tag). @@ -59,286 +59,286 @@ def changesTagLevel(self): # * If boilerpipe encounters such nestings, a SAXException is thrown. # class AnchorTextTagAction(TagAction): - """ generated source for class TA_ANCHOR_TEXT """ - def start(self, contentHandler, tagName, attrs): - contentHandler.inAnchor += 1 - if contentHandler.inAnchor > 1: - # as nested A elements are not allowed per specification, we - # are probably reaching this branch due to a bug in the XML - # parser - print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...") - self.end(contentHandler, tagName) - if contentHandler.inIgnorableElement == 0: - contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START) - return False - - def end(self, contentHandler, tagName): - contentHandler.inAnchor -= 1 - if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0: - contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END) - return False - - def changesTagLevel(self): - return True + """ generated source for class TA_ANCHOR_TEXT """ + def start(self, contentHandler, tagName, attrs): + contentHandler.inAnchor += 1 + if contentHandler.inAnchor > 1: + # as nested A elements are not allowed per specification, we + # are probably reaching this branch due to a bug in the XML + # parser + print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...") + self.end(contentHandler, tagName) + if contentHandler.inIgnorableElement == 0: + contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START) + return False + + def end(self, contentHandler, tagName): + contentHandler.inAnchor -= 1 + if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0: + contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END) + return False + + def changesTagLevel(self): + return True # # * Marks this tag the body element (this should usually only be set for the <BODY> tag). # class BodyTagAction(TagAction): - """ generated source for class TA_BODY """ - def start(self, contentHandler, tagName, attrs): - contentHandler.flushBlock() - contentHandler.inBody += 1 - return False + """ generated source for class TA_BODY """ + def start(self, contentHandler, tagName, attrs): + contentHandler.flushBlock() + contentHandler.inBody += 1 + return False - def end(self, contentHandler, tagName): - contentHandler.flushBlock() - contentHandler.inBody -= 1 - return False + def end(self, contentHandler, tagName): + contentHandler.flushBlock() + contentHandler.inBody -= 1 + return False - def changesTagLevel(self): - return True + def changesTagLevel(self): + return True # # * Marks this tag a simple "inline" element, which generates whitespace, but no new block. # class InlineWhitespaceTagAction(TagAction): - """ generated source for class TA_INLINE_WHITESPACE """ - def start(self, contentHandler, tagName, attrs): - contentHandler.addWhitespaceIfNecessary() - return False + """ generated source for class TA_INLINE_WHITESPACE """ + def start(self, contentHandler, tagName, attrs): + contentHandler.addWhitespaceIfNecessary() + return False - def end(self, contentHandler, tagName): - contentHandler.addWhitespaceIfNecessary() - return False + def end(self, contentHandler, tagName): + contentHandler.addWhitespaceIfNecessary() + return False - def changesTagLevel(self): return False + def changesTagLevel(self): return False # # * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block. # class InlineTagAction(TagAction): - """ generated source for class TA_INLINE_NO_WHITESPACE """ - def start(self, contentHandler, tagName, attrs): return False - def end(self, contentHandler, tagName): return False - def changesTagLevel(self): return False + """ generated source for class TA_INLINE_NO_WHITESPACE """ + def start(self, contentHandler, tagName, attrs): return False + def end(self, contentHandler, tagName): return False + def changesTagLevel(self): return False # # * Explicitly marks this tag a simple "block-level" element, which always generates whitespace # class BlockTagAction(TagAction): - """ generated source for class TA_BLOCK_LEVEL """ - def start(self, contentHandler, tagName, attrs): return True - def end(self, contentHandler, tagName): return True - def changesTagLevel(self): return True + """ generated source for class TA_BLOCK_LEVEL """ + def start(self, contentHandler, tagName, attrs): return True + def end(self, contentHandler, tagName): return True + def changesTagLevel(self): return True # # * Special TagAction for the <FONT> tag, which keeps track of the # * absolute and relative font size. # class FontTagAction(TagAction): - """ generated source for class TA_FONT """ - #WARNING: POSSIBLE BUG -- used to be [0-9] without + - PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)") - - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - sizeAttr = attrs.getValue("size") - size=None - if sizeAttr != None: - match=PAT_FONT_SIZE.match(sizeAttr) - if match!=None: - rel=match.group(0) - val=match.group(1) - if len(rel)==0: - # absolute - size = val - else: - # relative - #last non-none element from stack, default 3 - lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None) - prevSize=next(lastNonNone,3) - if rel[0] == '+': size = prevSize + val - else: size = prevSize - val - contentHandler.fontSizeStack.append(size) - return False - - def end(self, contentHandler, tagName): - contentHandler.fontSizeStack.pop() - return False - - def changesTagLevel(self): return False + """ generated source for class TA_FONT """ + #WARNING: POSSIBLE BUG -- used to be [0-9] without + + PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)") + + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + sizeAttr = attrs.getValue("size") + size=None + if sizeAttr != None: + match = self.PAT_FONT_SIZE.match(sizeAttr) + if match!=None: + rel=match.group(0) + val=match.group(1) + if len(rel)==0: + # absolute + size = val + else: + # relative + #last non-none element from stack, default 3 + lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None) + prevSize=next(lastNonNone,3) + if rel[0] == '+': size = prevSize + val + else: size = prevSize - val + contentHandler.fontSizeStack.append(size) + return False + + def end(self, contentHandler, tagName): + contentHandler.fontSizeStack.pop() + return False + + def changesTagLevel(self): return False # # * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated # * {@link TextBlock}. # class InlineTagLabelAction(TagAction): - """ generated source for class InlineTagLabelAction """ + """ generated source for class InlineTagLabelAction """ - def __init__(self, action): - """ generated source for method __init__ """ - super(InlineTagLabelAction, self).__init__() - self.action = action + def __init__(self, action): + """ generated source for method __init__ """ + super(InlineTagLabelAction, self).__init__() + self.action = action - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - contentHandler.addWhitespaceIfNecessary() - contentHandler.addLabelAction(self.action) - return False + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + contentHandler.addWhitespaceIfNecessary() + contentHandler.addLabelAction(self.action) + return False - def end(self, contentHandler, tagName): - """ generated source for method end """ - contentHandler.addWhitespaceIfNecessary() - return False + def end(self, contentHandler, tagName): + """ generated source for method end """ + contentHandler.addWhitespaceIfNecessary() + return False - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return False + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return False # # * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated # * {@link TextBlock}. # class BlockTagLabelAction(TagAction): - """ generated source for class BlockTagLabelAction """ + """ generated source for class BlockTagLabelAction """ - def __init__(self, action): - """ generated source for method __init__ """ - super(BlockTagLabelAction, self).__init__() - self.action = action + def __init__(self, action): + """ generated source for method __init__ """ + super(BlockTagLabelAction, self).__init__() + self.action = action - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - contentHandler.addLabelAction(self.action) - return True + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + contentHandler.addLabelAction(self.action) + return True - def end(self, contentHandler, tagName): - """ generated source for method end """ - return True + def end(self, contentHandler, tagName): + """ generated source for method end """ + return True - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return True + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return True class Chained(TagAction): - def __init__(self, tagAction1, tagAction2): - """ generated source for method __init__ """ - super(Chained, self).__init__() - self.tagAction1 = tagAction1 - self.tagAction2 = tagAction2 + def __init__(self, tagAction1, tagAction2): + """ generated source for method __init__ """ + super(Chained, self).__init__() + self.tagAction1 = tagAction1 + self.tagAction2 = tagAction2 - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs) + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs) - def end(self, contentHandler, tagName): - """ generated source for method end """ - return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName) + def end(self, contentHandler, tagName): + """ generated source for method end """ + return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName) - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel() + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel() class MarkupTagAction(TagAction): - """ generated source for class MarkupTagAction """ - - def __init__(self, isBlockLevel): - """ generated source for method __init__ """ - super(MarkupTagAction, self).__init__() - self.isBlockLevel = isBlockLevel - self.labelStack = [] - - PAT_NUM = re.compile("[0-9]+") - - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - labels = [] - labels.append(DefaultLabels.MARKUP_PREFIX + tagName) - classVal = attrs.getValue("class") - if classVal != None and len(classVal)>0: - classVal = self.PAT_NUM.sub("#",classVal).strip() - vals = classVal.split(r"[ ]+") - labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')) - if len(vals)>1: - for s in vals: - labels.append(DefaultLabels.MARKUP_PREFIX + "." + s) - id = attrs.get("id") - if id != None and len(id)<0: - id = self.PAT_NUM.sub("#",id) - labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id) - ancestors = self.getAncestorLabels() - labelsWithAncestors = [] - for l in labels: - for an in ancestors: - labelsWithAncestors.append(an) - labelsWithAncestors.append(an + " " + l) - labelsWithAncestors.append(l) - contentHandler.addLabelAction(LabelAction(labelsWithAncestors)) - self.labelStack.append(labels) - return self.isBlockLevel - - def end(self, contentHandler, tagName): - """ generated source for method end """ - self.labelStack.pop() - return self.isBlockLevel - - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return self.isBlockLevel - - def getAncestorLabels(self): - """ generated source for method getAncestorLabels """ - labelSet = set() - for labels in labelStack: - if labels == None:continue - labelSet.update(labels) - return labelSet - - -class CommonTagActions: - TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() - TA_ANCHOR_TEXT=AnchorTextTagAction() - TA_BODY=BodyTagAction() - TA_INLINE_WHITESPACE=InlineWhitespaceTagAction() - TA_INLINE_NO_WHITESPACE=InlineTagAction() - TA_BLOCK_LEVEL=BlockTagAction() - TA_FONT=FontTagAction() + """ generated source for class MarkupTagAction """ + + def __init__(self, isBlockLevel): + """ generated source for method __init__ """ + super(MarkupTagAction, self).__init__() + self.isBlockLevel = isBlockLevel + self.labelStack = [] + + PAT_NUM = re.compile("[0-9]+") + + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + labels = [] + labels.append(DefaultLabels.MARKUP_PREFIX + tagName) + classVal = attrs.getValue("class") + if classVal != None and len(classVal)>0: + classVal = self.PAT_NUM.sub("#",classVal).strip() + vals = classVal.split(r"[ ]+") + labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')) + if len(vals)>1: + for s in vals: + labels.append(DefaultLabels.MARKUP_PREFIX + "." + s) + id = attrs.get("id") + if id != None and len(id)<0: + id = self.PAT_NUM.sub("#",id) + labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id) + ancestors = self.getAncestorLabels() + labelsWithAncestors = [] + for l in labels: + for an in ancestors: + labelsWithAncestors.append(an) + labelsWithAncestors.append(an + " " + l) + labelsWithAncestors.append(l) + contentHandler.addLabelAction(LabelAction(labelsWithAncestors)) + self.labelStack.append(labels) + return self.isBlockLevel + + def end(self, contentHandler, tagName): + """ generated source for method end """ + self.labelStack.pop() + return self.isBlockLevel + + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return self.isBlockLevel + + def getAncestorLabels(self): + """ generated source for method getAncestorLabels """ + labelSet = set() + for labels in self.labelStack: + if labels == None:continue + labelSet.update(labels) + return labelSet + + +class CommonTagActions(object): + TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() + TA_ANCHOR_TEXT=AnchorTextTagAction() + TA_BODY=BodyTagAction() + TA_INLINE_WHITESPACE=InlineWhitespaceTagAction() + TA_INLINE_NO_WHITESPACE=InlineTagAction() + TA_BLOCK_LEVEL=BlockTagAction() + TA_FONT=FontTagAction() defaultTagActionMap={ - "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "APPLET" : CommonTagActions.TA_IGNORABLE_ELEMENT, - #Note: link removed because it can be self-closing in HTML5 - #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "A" : CommonTagActions.TA_ANCHOR_TEXT, - "BODY" : CommonTagActions.TA_BODY, - "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # New in 1.1 (especially to improve extraction quality from Wikipedia etc., - "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # New in 1.2 - "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE, - "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE, - "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # could also use TA_FONT - # added in 1.1.1 - "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT + "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "APPLET" : CommonTagActions.TA_IGNORABLE_ELEMENT, + #Note: link removed because it can be self-closing in HTML5 + #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "A" : CommonTagActions.TA_ANCHOR_TEXT, + "BODY" : CommonTagActions.TA_BODY, + "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # New in 1.1 (especially to improve extraction quality from Wikipedia etc., + "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # New in 1.2 + "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE, + "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE, + "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # could also use TA_FONT + # added in 1.1.1 + "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT } @@ -353,30 +353,30 @@ class CommonTagActions: # * @author Christian Kohlschtter # class LabelAction(object): - def __init__(self, *labels): - self.labels = labels + def __init__(self, *labels): + self.labels = labels - def addTo(self, textBlock): - self.addLabelsTo(textBlock) + def addTo(self, textBlock): + self.addLabelsTo(textBlock) - def addLabelsTo(self, textBlock): - textBlock.addLabels(self.labels) + def addLabelsTo(self, textBlock): + textBlock.addLabels(self.labels) - def __str__(self): - return str(self.labels) + def __str__(self): + return str(self.labels) class ConditionalLabelAction(LabelAction): - def __init__(self, condition, *labels): - super(ConditionalLabelAction, self).__init__(*labels) - self.condition = condition + def __init__(self, condition, *labels): + super(ConditionalLabelAction, self).__init__(*labels) + self.condition = condition - def addTo(self, textBlock): - if self.condition(textBlock): self.addLabelsTo(textBlock) + def addTo(self, textBlock): + if self.condition(textBlock): self.addLabelsTo(textBlock) -class SpecialTokens: - ANCHOR_TEXT_START = u'\ue00astart' - ANCHOR_TEXT_END = u'\ue00aend' +class SpecialTokens(object): + ANCHOR_TEXT_START = u'\ue00astart' + ANCHOR_TEXT_END = u'\ue00aend' #---------------------------------------------------------------------------- @@ -392,297 +392,296 @@ class SpecialTokens: class BoilerpipeBaseParser(object): - EVENT_START_TAG=0 - EVENT_END_TAG=1 - EVENT_CHARACTERS=2 - EVENT_WHITESPACE=3 - #all word characters except underscore -- i.e. not (not word or underscore) - PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE) -# PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE) - PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE) - - """ generated source for class BoilerpipeHTMLContentHandler """ - # - # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given - # * {@link TagActionMap}. - # * - # * @param tagActions - # * The {@link TagActionMap} to use, e.g. - # * {@link DefaultTagActionMap}. - # - def __init__(self, tagActions=None): - """ generated source for method __init___0 """ - #super(BoilerpipeHTMLContentHandler, self).__init__() - if tagActions==None: self.tagActions=defaultTagActionMap - else: self.tagActions = tagActions - - - self.clearTextBuffer() - self.inBody = 0 - self.inAnchor = 0 - self.inIgnorableElement = 0 - self.textElementIdx = 0 - self.lastStartTag = None - self.lastEndTag = None - self.lastEvent = None - self.offsetBlocks = 0 - self.currentContainedTextElements=set() - self.flush = False - self.inAnchorText = False - - self.title = None - self.tagLevel = 0 - self.blockTagLevel = -1 - self.textBlocks = [] - self.labelStacks = [] - self.fontSizeStack = [] - - # - # * Recycles this instance. - # - def recycle(self): - """ generated source for method recycle """ - self.clearTextBuffer() - self.inBody = 0 - self.inAnchor = 0 - self.inIgnorableElement = 0 - self.textElementIdx = 0 - self.lastStartTag = None - self.lastEndTag = None - self.lastEvent = None - self.offsetBlocks = 0 - self.currentContainedTextElements=set() - self.flush = False - self.inAnchorText = False - self.textBlocks=[] - - #--------- added ------- - self.title = None - self.tagLevel = 0 - self.blockTagLevel = -1 - self.labelStacks = [] - self.fontSizeStack = [] + EVENT_START_TAG=0 + EVENT_END_TAG=1 + EVENT_CHARACTERS=2 + EVENT_WHITESPACE=3 + #all word characters except underscore -- i.e. not (not word or underscore) + PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE) + PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE) + + """ generated source for class BoilerpipeHTMLContentHandler """ + # + # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given + # * {@link TagActionMap}. + # * + # * @param tagActions + # * The {@link TagActionMap} to use, e.g. + # * {@link DefaultTagActionMap}. + # + def __init__(self, tagActions=None): + """ generated source for method __init___0 """ + #super(BoilerpipeHTMLContentHandler, self).__init__() + if tagActions==None: self.tagActions=defaultTagActionMap + else: self.tagActions = tagActions + + + self.clearTextBuffer() + self.inBody = 0 + self.inAnchor = 0 + self.inIgnorableElement = 0 + self.textElementIdx = 0 + self.lastStartTag = None + self.lastEndTag = None + self.lastEvent = None + self.offsetBlocks = 0 + self.currentContainedTextElements=set() + self.flush = False + self.inAnchorText = False + + self.title = None + self.tagLevel = 0 + self.blockTagLevel = -1 + self.textBlocks = [] + self.labelStacks = [] + self.fontSizeStack = [] + + # + # * Recycles this instance. + # + def recycle(self): + """ generated source for method recycle """ + self.clearTextBuffer() + self.inBody = 0 + self.inAnchor = 0 + self.inIgnorableElement = 0 + self.textElementIdx = 0 + self.lastStartTag = None + self.lastEndTag = None + self.lastEvent = None + self.offsetBlocks = 0 + self.currentContainedTextElements=set() + self.flush = False + self.inAnchorText = False + self.textBlocks=[] + + #--------- added ------- + self.title = None + self.tagLevel = 0 + self.blockTagLevel = -1 + self.labelStacks = [] + self.fontSizeStack = [] #------------------------------- SAX Parser methods ---------------------------------------- - # @Override - def endDocument(self): - """ generated source for method endDocument """ - self.flushBlock() - - # @Override - def startDocument(self): pass - - # @Override - def startElement(self, name,attrs): - self.labelStacks.append([]) - - tagAction = self.tagActions.get(name.strip().upper()) - - if tagAction != None: - self.flush |= tagAction.start(self, name, attrs) - if tagAction.changesTagLevel(): self.tagLevel += 1 - else: - self.tagLevel += 1 - self.flush = True - self.lastEvent = self.EVENT_START_TAG - self.lastStartTag = name - - # @Override - def endElement(self, name): - tagAction = self.tagActions.get(name.strip().upper()) - - - if tagAction != None: - self.flush |= tagAction.end(self, name) - if tagAction.changesTagLevel(): self.tagLevel -= 1 - else: - self.flush = True - self.tagLevel -= 1 - - if self.flush: self.flushBlock() - self.lastEvent = self.EVENT_END_TAG - self.lastEndTag = name - self.labelStacks.pop() - - # @Override - def characters(self, content): - self.textElementIdx += 1 - if self.flush: - self.flushBlock() - self.flush = False - if self.inIgnorableElement != 0: return - - if len(content) == 0: return - - strippedContent=content.strip() - - if len(strippedContent) == 0: - self.addWhitespaceIfNecessary() - self.lastEvent = self.EVENT_WHITESPACE - return - - startWhitespace=content[0].isspace() - if startWhitespace: self.addWhitespaceIfNecessary() - - if self.blockTagLevel == -1: - self.blockTagLevel = self.tagLevel - self.textBuffer+=strippedContent - self.tokenBuffer+=strippedContent - - endWhitespace=content[-1].isspace() - if endWhitespace: self.addWhitespaceIfNecessary() - - self.lastEvent = self.EVENT_CHARACTERS - self.currentContainedTextElements.add(self.textElementIdx) - - # @Override - def ignorableWhitespace(self, whitespace): - self.addWhitespaceIfNecessary() + # @Override + def endDocument(self): + """ generated source for method endDocument """ + self.flushBlock() + + # @Override + def startDocument(self): pass + + # @Override + def startElement(self, name,attrs): + self.labelStacks.append([]) + + tagAction = self.tagActions.get(name.strip().upper()) + + if tagAction != None: + self.flush |= tagAction.start(self, name, attrs) + if tagAction.changesTagLevel(): self.tagLevel += 1 + else: + self.tagLevel += 1 + self.flush = True + self.lastEvent = self.EVENT_START_TAG + self.lastStartTag = name + + # @Override + def endElement(self, name): + tagAction = self.tagActions.get(name.strip().upper()) + + + if tagAction != None: + self.flush |= tagAction.end(self, name) + if tagAction.changesTagLevel(): self.tagLevel -= 1 + else: + self.flush = True + self.tagLevel -= 1 + + if self.flush: self.flushBlock() + self.lastEvent = self.EVENT_END_TAG + self.lastEndTag = name + self.labelStacks.pop() + + # @Override + def characters(self, content): + self.textElementIdx += 1 + if self.flush: + self.flushBlock() + self.flush = False + if self.inIgnorableElement != 0: return + + if len(content) == 0: return + + strippedContent=content.strip() + + if len(strippedContent) == 0: + self.addWhitespaceIfNecessary() + self.lastEvent = self.EVENT_WHITESPACE + return + + startWhitespace=content[0].isspace() + if startWhitespace: self.addWhitespaceIfNecessary() + + if self.blockTagLevel == -1: + self.blockTagLevel = self.tagLevel + self.textBuffer+=strippedContent + self.tokenBuffer+=strippedContent + + endWhitespace=content[-1].isspace() + if endWhitespace: self.addWhitespaceIfNecessary() + + self.lastEvent = self.EVENT_CHARACTERS + self.currentContainedTextElements.add(self.textElementIdx) + + # @Override + def ignorableWhitespace(self, whitespace): + self.addWhitespaceIfNecessary() #------------------------------- utility methods ---------------------------------------- - def flushBlock(self): - """ generated source for method flushBlock """ - if self.inBody == 0: - if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip()) - self.clearTextBuffer() - return - if len(self.tokenBuffer.strip())==0: - self.clearTextBuffer() - return - - tokens = self.tokenize(self.tokenBuffer) - numWords = 0 - numLinkedWords = 0 - numWrappedLines = 0 - currentLineLength = -1 - # don't count the first space - maxLineLength = 80 - numTokens = 0 - numWordsCurrentLine = 0 - - for token in tokens: - if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True - elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False - elif self.isWord(token): - numTokens += 1 - numWords += 1 - numWordsCurrentLine += 1 - if self.inAnchorText: - numLinkedWords += 1 - currentLineLength += len(token) + 1 - if currentLineLength > maxLineLength: - numWrappedLines += 1 - currentLineLength = len(token) - numWordsCurrentLine = 1 - else: - numTokens += 1 - - #if only special tokens (numTokens excludes special tokens) - if numTokens == 0: - self.clearTextBuffer() - return - - if numWrappedLines == 0: - numWordsInWrappedLines = numWords - numWrappedLines = 1 - else: - numWordsInWrappedLines = numWords - numWordsCurrentLine - - tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks) - self.currentContainedTextElements = set() - self.offsetBlocks += 1 - self.clearTextBuffer() - tb.setTagLevel(self.blockTagLevel) - self.addTextBlock(tb) - self.blockTagLevel = -1 - - def addTextBlock(self, tb): - """ generated source for method addTextBlock """ - for fontSize in self.fontSizeStack[::-1]: - if fontSize != None: - tb.addLabel("font-" + str(fontSize)) - break - for labelStack in self.labelStacks: - for labels in labelStack: - labels.addTo(tb) - self.textBlocks.append(tb) - - - def isWord(self, token): - """ generated source for method isWord """ - return self.PAT_VALID_WORD_CHARACTER.search(token)!=None - - def tokenize(self,text): - return self.PAT_WORD.findall(text) - - def getTextBlocks(self): - """ generated source for method getTextBlocks """ - return self.textBlocks - - def getTitle(self): - """ generated source for method getTitle """ - return self.title - - def setTitle(self, s): - """ generated source for method setTitle """ - if s == None or len(s)==0: return - self.title = s - - # - # * Returns a {@link TextDocument} containing the extracted {@link TextBlock} - # * s. NOTE: Only call this after parsing. - # * - # * @return The {@link TextDocument} - # - def toTextDocument(self): - """ generated source for method toTextDocument """ - # just to be sure - self.flushBlock() - return document.TextDocument(self.getTextBlocks(), self.getTitle()) - - def addWhitespaceIfNecessary(self): - """ generated source for method addWhitespaceIfNecessary """ - if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace(): - self.textBuffer+=' ' - if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace(): - self.tokenBuffer+=' ' - - def clearTextBuffer(self): - self.textBuffer='' - self.tokenBuffer='' - - def addToken(self,token): - self.addWhitespaceIfNecessary() - self.tokenBuffer+=token - self.addWhitespaceIfNecessary() - - def addLabelAction(self, la): - """ generated source for method addLabelAction """ - if len(self.labelStacks)==0: self.labelStacks.append([]) - self.labelStacks[-1].append(la) + def flushBlock(self): + """ generated source for method flushBlock """ + if self.inBody == 0: + if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip()) + self.clearTextBuffer() + return + if len(self.tokenBuffer.strip())==0: + self.clearTextBuffer() + return + + tokens = self.tokenize(self.tokenBuffer) + numWords = 0 + numLinkedWords = 0 + numWrappedLines = 0 + currentLineLength = -1 + # don't count the first space + maxLineLength = 80 + numTokens = 0 + numWordsCurrentLine = 0 + + for token in tokens: + if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True + elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False + elif self.isWord(token): + numTokens += 1 + numWords += 1 + numWordsCurrentLine += 1 + if self.inAnchorText: + numLinkedWords += 1 + currentLineLength += len(token) + 1 + if currentLineLength > maxLineLength: + numWrappedLines += 1 + currentLineLength = len(token) + numWordsCurrentLine = 1 + else: + numTokens += 1 + + #if only special tokens (numTokens excludes special tokens) + if numTokens == 0: + self.clearTextBuffer() + return + + if numWrappedLines == 0: + numWordsInWrappedLines = numWords + numWrappedLines = 1 + else: + numWordsInWrappedLines = numWords - numWordsCurrentLine + + tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks) + self.currentContainedTextElements = set() + self.offsetBlocks += 1 + self.clearTextBuffer() + tb.setTagLevel(self.blockTagLevel) + self.addTextBlock(tb) + self.blockTagLevel = -1 + + def addTextBlock(self, tb): + """ generated source for method addTextBlock """ + for fontSize in self.fontSizeStack[::-1]: + if fontSize != None: + tb.addLabel("font-" + str(fontSize)) + break + for labelStack in self.labelStacks: + for labels in labelStack: + labels.addTo(tb) + self.textBlocks.append(tb) + + + def isWord(self, token): + """ generated source for method isWord """ + return self.PAT_VALID_WORD_CHARACTER.search(token)!=None + + def tokenize(self,text): + return self.PAT_WORD.findall(text) + + def getTextBlocks(self): + """ generated source for method getTextBlocks """ + return self.textBlocks + + def getTitle(self): + """ generated source for method getTitle """ + return self.title + + def setTitle(self, s): + """ generated source for method setTitle """ + if s == None or len(s)==0: return + self.title = s + + # + # * Returns a {@link TextDocument} containing the extracted {@link TextBlock} + # * s. NOTE: Only call this after parsing. + # * + # * @return The {@link TextDocument} + # + def toTextDocument(self): + """ generated source for method toTextDocument """ + # just to be sure + self.flushBlock() + return document.TextDocument(self.getTextBlocks(), self.getTitle()) + + def addWhitespaceIfNecessary(self): + """ generated source for method addWhitespaceIfNecessary """ + if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace(): + self.textBuffer+=' ' + if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace(): + self.tokenBuffer+=' ' + + def clearTextBuffer(self): + self.textBuffer='' + self.tokenBuffer='' + + def addToken(self,token): + self.addWhitespaceIfNecessary() + self.tokenBuffer+=token + self.addWhitespaceIfNecessary() + + def addLabelAction(self, la): + """ generated source for method addLabelAction """ + if len(self.labelStacks)==0: self.labelStacks.append([]) + self.labelStacks[-1].append(la) class BoilerpipeHTMLParser(HTMLParser,BoilerpipeBaseParser): - def __init__(self): - HTMLParser.__init__(self) - BoilerpipeBaseParser.__init__(self) - - def feed(self,data): - self.startDocument() - HTMLParser.feed(self,data) - self.endDocument() - - def handle_starttag(self, tag, attrs): self.startElement(tag,attrs) - def handle_endtag(self, tag): self.endElement(tag) - def handle_data(self, data): self.characters(data) + def __init__(self): + HTMLParser.__init__(self) + BoilerpipeBaseParser.__init__(self) + + def feed(self,data): + self.startDocument() + HTMLParser.feed(self,data) + self.endDocument() + + def handle_starttag(self, tag, attrs): self.startElement(tag,attrs) + def handle_endtag(self, tag): self.endElement(tag) + def handle_data(self, data): self.characters(data) class BoilerpipeSAXContentHandler(ContentHandler,BoilerpipeBaseParser): - def __init__(self): - ContentHandler.__init__(self) - BoilerpipeBaseParser.__init__(self) + def __init__(self): + ContentHandler.__init__(self) + BoilerpipeBaseParser.__init__(self) diff --git a/dist/boilerpy-1.0.zip b/dist/boilerpy-1.0.zip deleted file mode 100644 index a849c7c..0000000 Binary files a/dist/boilerpy-1.0.zip and /dev/null differ diff --git a/setup.py b/setup.py index 6088060..aebbf74 100644 --- a/setup.py +++ b/setup.py @@ -6,22 +6,22 @@ # README file and 2) it's easier to type in the README file than to put a raw # string in below ... def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() + return open(os.path.join(os.path.dirname(__file__), fname)).read() setup( - name = "boilerpy", - version = "1.0", - author = "Sam Myer", - author_email = "mail@frozencavemanmedia.com", - description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages", - license = "Apache 2.0", - keywords = "boilerpipe fulltext extraction", - url = "https://github.com/sammyer/BoilerPy", - packages=['boilerpy'], - long_description=read('README.txt'), - classifiers=[ - "Development Status :: 4 - Beta", - "Topic :: Utilities", - "License :: OSI Approved :: Apache License", - ] -) \ No newline at end of file + name = "boilerpy", + version = "1.0", + author = "Sam Myer", + author_email = "mail@frozencavemanmedia.com", + description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages", + license = "Apache 2.0", + keywords = "boilerpipe fulltext extraction", + url = "https://github.com/sammyer/BoilerPy", + packages=['boilerpy'], + long_description=read('README.txt'), + classifiers=[ + "Development Status :: 4 - Beta", + "Topic :: Utilities", + "License :: OSI Approved :: Apache License", + ] +) diff --git a/tests/unittests.py b/tests/unittests.py index ac0a97e..f12ab6e 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -1,420 +1,434 @@ import unittest import sys +from unittest import mock + from boilerpy.document import TextDocument,TextBlock from boilerpy.filters import * from boilerpy.extractors import Extractor def runTests(): - suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters) - unittest.TextTestRunner(verbosity=2).run(suite) - suite = unittest.TestLoader().loadTestsFromTestCase(TestParser) - unittest.TextTestRunner(verbosity=2).run(suite) + suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters) + unittest.TextTestRunner(verbosity=2).run(suite) + suite = unittest.TestLoader().loadTestsFromTestCase(TestParser) + unittest.TextTestRunner(verbosity=2).run(suite) def runOneTest(): - testName='test_anchor' - suite = unittest.TestSuite() - suite.addTest(TestParser(testName)) - unittest.TextTestRunner(verbosity=2).run(suite) + testName='test_anchor' + suite = unittest.TestSuite() + suite.addTest(TestParser(testName)) + unittest.TextTestRunner(verbosity=2).run(suite) class TestFilters(unittest.TestCase): - defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') - - def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None): - textBlocks=[] - for idx,words in enumerate(wordsArr): - if type(words)==int: - numWords=words - text=' '.join(self.defaultWords[:numWords]) - else: - text=words - numWords=text.count(' ') - try: - numAnchorWords=numAnchorWordsArr[idx] - except TypeError,IndexError: - numAnchorWords=0 - block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) - try: - block.setIsContent(isContentArr[idx]) - except TypeError,IndexError: - pass - try: - label=labelArr[idx] - if label==None: pass - elif type(label)==list: - for l in label: block.addLabel(l) - else: block.addLabel(label) - except TypeError,IndexError: - pass - - textBlocks.append(block) - - return TextDocument(textBlocks) - - def verifyContent(self,filtr,doc,contentArr,show=False): - isContentBefore=[block.isContent() for block in doc.getTextBlocks()] - isChanged=filtr.process(doc) - isContent=[block.isContent() for block in doc.getTextBlocks()] - self.assertEqual(isContent,contentArr) - self.assertEqual(isChanged,isContent!=isContentBefore) - - def test_markEveryhingContent(self): - doc=self.makedoc([5,100,80],None,[False,True,False]) - self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True]) - - def test_inverted(self): - doc=self.makedoc([5,100,80],None,[False,True,False]) - self.verifyContent(InvertedFilter(),doc,[True,False,True]) - - def test_boilerplateBlock(self): - #keeps if isContent - doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False]) - initBlocks=doc.getTextBlocks() - finalBlocks=[initBlocks[1],initBlocks[3]] - filtr=BoilerplateBlockFilter() - isChanged=filtr.process(doc) - isContent=[block.isContent() for block in doc.getTextBlocks()] - self.assertEqual(doc.getTextBlocks(),finalBlocks) - self.assertEqual(isContent,[True,True]) - self.assertEqual(isChanged,True) - - def test_minWords(self): - #rejects if #words6 - self.verifyContent(SurroundingToContentFilter(defaultCondition),doc,[True,True,True,False,True,False,False,True]) - - def test_labelToBoilerplate(self): - #reject block if it has a particular label - lb_not=DefaultLabels.STRICTLY_NOT_CONTENT - lb_maybe=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[True,True,True,True],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) - self.verifyContent(LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT),doc,[False,True,False,True]) - - def test_labelToContent(self): - #accept block if it has a particular label - lb_not=DefaultLabels.STRICTLY_NOT_CONTENT - lb_maybe=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[False,False,False,False],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) - self.verifyContent(LabelToContentFilter(DefaultLabels.MIGHT_BE_CONTENT),doc,[False,True,True,False]) - - - def test_simpleBlockFusion(self): - #join blocks with the same number of words per line - doc=self.makedoc(["two words","three fucking words","another three words"],None,[False,False,False]) - filtr=SimpleBlockFusionProcessor() - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,0),(1,2)]) - self.assertEqual(isChanged,True) - - def test_contentFusion(self): - #join blocks with low link density - filtr=ContentFusion() - - #merge - doc=self.makedoc([10,10],[0,0],[True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),1) - self.assertEqual(isChanged,True) - - #dont merge if tagged not content - doc=self.makedoc([10,10],[0,0],[True,False],[None,DefaultLabels.STRICTLY_NOT_CONTENT]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),2) - self.assertEqual(isChanged,False) - - #dont merge if link density is high - doc=self.makedoc([10,10],[0,8],[True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),2) - self.assertEqual(isChanged,False) - - #multiple pass merging - doc=self.makedoc([10,10,10,10],[0,0,0,0],[True,False,True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),1) - self.assertEqual(isChanged,True) - - def test_labelFusion(self): - #fuse blocks with identical labels - ONLY LOOKS AT LABELS with markup prefix - - lb1=DefaultLabels.MARKUP_PREFIX+".title" - lb2=DefaultLabels.MARKUP_PREFIX+".menu" - doc=self.makedoc([10,10,10,10,10,10,10],None,None,[None,None,lb1,lb1,lb2,lb2,[lb1,lb2]]) - filtr=LabelFusion() - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,1),(2,3),(4,5),(6,6)]) - self.assertEqual(isChanged,True) - - def test_blockProximity(self): - #fuse blocks close to each other - doc=self.makedoc([10,10,10,10,10,10,10],None,[False,True,True,True,True,True,False]) - filtr=BlockProximityFusion(1,True,False) - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,0),(1,5),(6,6)]) - self.assertEqual(isChanged,True) - - def test_largestBlock(self): - #choose largest block - doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) - self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) - - def test_expandTitleToContent(self): - #marks all between title and content start - lb1=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[False,False,False,True],[lb1,[lb1,DefaultLabels.TITLE],lb1,lb1]) - self.verifyContent(ExpandTitleToContentFilter(),doc,[False,True,True,True]) - - def test_articleMetadata(self): - #marks as content and tags blocks with date/time data - doc=self.makedoc([" May 1, 2009 8:00pm EST","May not be date 1","By Frank Sinatra","By looking at this sentence, you can see there is no author"],None,[False,False,False,False]) - self.verifyContent(ArticleMetadataFilter(),doc,[True,False,True,False]) - labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertIn(DefaultLabels.ARTICLE_METADATA,labels[0]) - - def test_largestBlock(self): - #accept largest block and reject all others - doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) - self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) - - def test_addPrecedingLabels(self): - #add prefix+preceding label to each block - lb1=DefaultLabels.TITLE - lb2=DefaultLabels.MIGHT_BE_CONTENT - prefix="^" - doc=self.makedoc([10,10,10],None,None,[lb1,lb2,None]) - filtr=AddPrecedingLabelsFilter(prefix) - isChanged=filtr.process(doc) - labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertEqual(labels,[set([lb1]),set([prefix+lb1,lb2]),set([prefix+lb2])]) - self.assertEqual(isChanged,True) - - def test_documentTitleMatch(self): - #add title label to blocks matching sections of the title - doc=self.makedoc(["News","This is the real title","Red herring"]) - doc.setTitle("News - This is the real title") - filtr=DocumentTitleMatchClassifier(None,True) - isChanged=filtr.process(doc) - labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertEqual(labels,[set(),set([DefaultLabels.TITLE]),set()]) - self.assertEqual(isChanged,True) - - def test_minFulltextWords(self): - #choose largest block - doc=self.makedoc([10,50],None,[True,True]) - self.verifyContent(MinFulltextWordsFilter(30),doc,[False,True]) - - def test_largestFulltextBlock(self): - #accept largest block that has been marked as content and reject all others - doc=self.makedoc([10,50,80,10],None,[True,True,False,False]) - self.verifyContent(KeepLargestFulltextBlockFilter(),doc,[False,True,False,False]) - - def test_ignoreBlocksAfterContent(self): - #rejects all blocks after(&including) first block with ENDOFTEXT label - #Also: ENDOFTEXT labels are ignored until the total number of words in content blocks reaches a certain number - lb=DefaultLabels.INDICATES_END_OF_TEXT - doc=self.makedoc([10,30,50,80,20],None,[False,True,True,True,True],[lb,None,None,lb,None]) - self.verifyContent(IgnoreBlocksAfterContentFilter(60),doc,[False,True,True,False,False]) - - def test_ignoreBlocksAfterContentFromEnd(self): - #rejects all blocks with ENDOFTEXT label - #works backwards until the total number of words in content blocks reaches 200 and then halts - lb=DefaultLabels.INDICATES_END_OF_TEXT - doc=self.makedoc([80,80,80,80,80],None,[True,True,True,True,True],[lb,None,None,lb,None]) - self.verifyContent(IgnoreBlocksAfterContentFromEndFilter(),doc,[True,True,True,False,True]) - - def test_terminatingBlocks(self): - #add ENDOFTEXT label at detected beginning of comments section - lb=DefaultLabels.INDICATES_END_OF_TEXT - s1="Comments can be the first word of article text. If there are many words in the block, it is not comments" - s2="Thanks for your comments - this feedback is now closed" - doc=self.makedoc(["Comments","Please have your say","48 Comments today",s1,s2]) - filtr=TerminatingBlocksFinder() - isChanged=filtr.process(doc) - hasLabel=[(lb in block.getLabels()) for block in doc.getTextBlocks()] - self.assertEqual(hasLabel,[True,True,True,False,True]) - self.assertEqual(isChanged,True) - - def test_numWordsClassifier(self): - #accepts or rejects block based on machine-trained decision tree rules - #using features from previous, current and next block - filtr=NumWordsRulesClassifier() - - doc=self.makedoc([2,10,10],[0,0,0],[True,True,True]) - isChanged=filtr.process(doc) - #test middle block only - self.assertEqual(doc.getTextBlocks()[1].isContent(),False) - - doc=self.makedoc([10,10,10],[0,0,0],[True,True,True]) - isChanged=filtr.process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),True) - - def test_densityClassifier(self): - #accepts or rejects block based on a different set of machine-trained decision tree rules - #using features from previous, current and next block - doc=self.makedoc([10,10,5],[10,0,0],[True,True,True]) - isChanged=DensityRulesClassifier().process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),False) - - def test_canolaClassifier(self): - #accepts or rejects block based on a different set of machine-trained decision tree rules - #using features from previous, current and next block - doc=self.makedoc([5,10,30],[5,10,0],[True,False,True]) - isChanged=CanolaFilter().process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),True) + defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') + + def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None): + textBlocks=[] + for idx,words in enumerate(wordsArr): + if type(words)==int: + numWords=words + text=' '.join(self.defaultWords[:numWords]) + else: + text=words + numWords=text.count(' ') + try: + numAnchorWords=numAnchorWordsArr[idx] + except (TypeError, IndexError): + numAnchorWords=0 + block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) + try: + block.setIsContent(isContentArr[idx]) + except (TypeError, IndexError): + pass + try: + label=labelArr[idx] + if label==None: pass + elif type(label)==list: + for l in label: block.addLabel(l) + else: block.addLabel(label) + except (TypeError, IndexError): + pass + + textBlocks.append(block) + + return TextDocument(textBlocks) + + def verifyContent(self,filtr,doc,contentArr,show=False): + isContentBefore=[block.isContent() for block in doc.getTextBlocks()] + isChanged=filtr.process(doc) + isContent=[block.isContent() for block in doc.getTextBlocks()] + self.assertEqual(isContent,contentArr) + self.assertEqual(isChanged,isContent!=isContentBefore) + + def test_markEveryhingContent(self): + doc=self.makedoc([5,100,80],None,[False,True,False]) + self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True]) + + def test_inverted(self): + doc=self.makedoc([5,100,80],None,[False,True,False]) + self.verifyContent(InvertedFilter(),doc,[True,False,True]) + + def test_boilerplateBlock(self): + #keeps if isContent + doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False]) + initBlocks=doc.getTextBlocks() + finalBlocks=[initBlocks[1],initBlocks[3]] + filtr=BoilerplateBlockFilter() + isChanged=filtr.process(doc) + isContent=[block.isContent() for block in doc.getTextBlocks()] + self.assertEqual(doc.getTextBlocks(),finalBlocks) + self.assertEqual(isContent,[True,True]) + self.assertEqual(isChanged,True) + + def test_minWords(self): + #rejects if #words6 + self.verifyContent(SurroundingToContentFilter(defaultCondition),doc,[True,True,True,False,True,False,False,True]) + + def test_labelToBoilerplate(self): + #reject block if it has a particular label + lb_not=DefaultLabels.STRICTLY_NOT_CONTENT + lb_maybe=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[True,True,True,True],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) + self.verifyContent(LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT),doc,[False,True,False,True]) + + def test_labelToContent(self): + #accept block if it has a particular label + lb_not=DefaultLabels.STRICTLY_NOT_CONTENT + lb_maybe=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[False,False,False,False],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) + self.verifyContent(LabelToContentFilter(DefaultLabels.MIGHT_BE_CONTENT),doc,[False,True,True,False]) + + + def test_simpleBlockFusion(self): + #join blocks with the same number of words per line + doc=self.makedoc(["two words","three fucking words","another three words"],None,[False,False,False]) + filtr=SimpleBlockFusionProcessor() + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,0),(1,2)]) + self.assertEqual(isChanged,True) + + def test_contentFusion(self): + #join blocks with low link density + filtr=ContentFusion() + + #merge + doc=self.makedoc([10,10],[0,0],[True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),1) + self.assertEqual(isChanged,True) + + #dont merge if tagged not content + doc=self.makedoc([10,10],[0,0],[True,False],[None,DefaultLabels.STRICTLY_NOT_CONTENT]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),2) + self.assertEqual(isChanged,False) + + #dont merge if link density is high + doc=self.makedoc([10,10],[0,8],[True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),2) + self.assertEqual(isChanged,False) + + #multiple pass merging + doc=self.makedoc([10,10,10,10],[0,0,0,0],[True,False,True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),1) + self.assertEqual(isChanged,True) + + def test_labelFusion(self): + #fuse blocks with identical labels - ONLY LOOKS AT LABELS with markup prefix + + lb1=DefaultLabels.MARKUP_PREFIX+".title" + lb2=DefaultLabels.MARKUP_PREFIX+".menu" + doc=self.makedoc([10,10,10,10,10,10,10],None,None,[None,None,lb1,lb1,lb2,lb2,[lb1,lb2]]) + filtr=LabelFusion() + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,1),(2,3),(4,5),(6,6)]) + self.assertEqual(isChanged,True) + + def test_blockProximity(self): + #fuse blocks close to each other + doc=self.makedoc([10,10,10,10,10,10,10],None,[False,True,True,True,True,True,False]) + filtr=BlockProximityFusion(1,True,False) + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,0),(1,5),(6,6)]) + self.assertEqual(isChanged,True) + + def test_largestBlock(self): + #choose largest block + doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) + self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) + + def test_expandTitleToContent(self): + #marks all between title and content start + lb1=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[False,False,False,True],[lb1,[lb1,DefaultLabels.TITLE],lb1,lb1]) + self.verifyContent(ExpandTitleToContentFilter(),doc,[False,True,True,True]) + + def test_articleMetadata(self): + #marks as content and tags blocks with date/time data + doc=self.makedoc([" May 1, 2009 8:00pm EST","May not be date 1","By Frank Sinatra","By looking at this sentence, you can see there is no author"],None,[False,False,False,False]) + self.verifyContent(ArticleMetadataFilter(),doc,[True,False,True,False]) + labels=[block.getLabels() for block in doc.getTextBlocks()] + self.assertIn(DefaultLabels.ARTICLE_METADATA,labels[0]) + + def test_largestBlock(self): + #accept largest block and reject all others + doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) + self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) + + def test_addPrecedingLabels(self): + #add prefix+preceding label to each block + lb1=DefaultLabels.TITLE + lb2=DefaultLabels.MIGHT_BE_CONTENT + prefix="^" + doc=self.makedoc([10,10,10],None,None,[lb1,lb2,None]) + filtr=AddPrecedingLabelsFilter(prefix) + isChanged=filtr.process(doc) + labels=[block.getLabels() for block in doc.getTextBlocks()] + self.assertEqual(labels,[set([lb1]),set([prefix+lb1,lb2]),set([prefix+lb2])]) + self.assertEqual(isChanged,True) + + def test_documentTitleMatch(self): + #add title label to blocks matching sections of the title + doc=self.makedoc(["News","This is the real title","Red herring"]) + doc.setTitle("News - This is the real title") + filtr=DocumentTitleMatchClassifier(None,True) + isChanged=filtr.process(doc) + labels=[block.getLabels() for block in doc.getTextBlocks()] + self.assertEqual(labels,[set(),set([DefaultLabels.TITLE]),set()]) + self.assertEqual(isChanged,True) + + def test_minFulltextWords(self): + #choose largest block + doc=self.makedoc([10,50],None,[True,True]) + self.verifyContent(MinFulltextWordsFilter(30),doc,[False,True]) + + def test_largestFulltextBlock(self): + #accept largest block that has been marked as content and reject all others + doc=self.makedoc([10,50,80,10],None,[True,True,False,False]) + self.verifyContent(KeepLargestFulltextBlockFilter(),doc,[False,True,False,False]) + + def test_ignoreBlocksAfterContent(self): + #rejects all blocks after(&including) first block with ENDOFTEXT label + #Also: ENDOFTEXT labels are ignored until the total number of words in content blocks reaches a certain number + lb=DefaultLabels.INDICATES_END_OF_TEXT + doc=self.makedoc([10,30,50,80,20],None,[False,True,True,True,True],[lb,None,None,lb,None]) + self.verifyContent(IgnoreBlocksAfterContentFilter(60),doc,[False,True,True,False,False]) + + def test_ignoreBlocksAfterContentFromEnd(self): + #rejects all blocks with ENDOFTEXT label + #works backwards until the total number of words in content blocks reaches 200 and then halts + lb=DefaultLabels.INDICATES_END_OF_TEXT + doc=self.makedoc([80,80,80,80,80],None,[True,True,True,True,True],[lb,None,None,lb,None]) + self.verifyContent(IgnoreBlocksAfterContentFromEndFilter(),doc,[True,True,True,False,True]) + + def test_terminatingBlocks(self): + #add ENDOFTEXT label at detected beginning of comments section + lb=DefaultLabels.INDICATES_END_OF_TEXT + s1="Comments can be the first word of article text. If there are many words in the block, it is not comments" + s2="Thanks for your comments - this feedback is now closed" + doc=self.makedoc(["Comments","Please have your say","48 Comments today",s1,s2]) + filtr=TerminatingBlocksFinder() + isChanged=filtr.process(doc) + hasLabel=[(lb in block.getLabels()) for block in doc.getTextBlocks()] + self.assertEqual(hasLabel,[True,True,True,False,True]) + self.assertEqual(isChanged,True) + + def test_numWordsClassifier(self): + #accepts or rejects block based on machine-trained decision tree rules + #using features from previous, current and next block + filtr=NumWordsRulesClassifier() + + doc=self.makedoc([2,10,10],[0,0,0],[True,True,True]) + isChanged=filtr.process(doc) + #test middle block only + self.assertEqual(doc.getTextBlocks()[1].isContent(),False) + + doc=self.makedoc([10,10,10],[0,0,0],[True,True,True]) + isChanged=filtr.process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),True) + + def test_densityClassifier(self): + #accepts or rejects block based on a different set of machine-trained decision tree rules + #using features from previous, current and next block + doc=self.makedoc([10,10,5],[10,0,0],[True,True,True]) + isChanged=DensityRulesClassifier().process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),False) + + def test_canolaClassifier(self): + #accepts or rejects block based on a different set of machine-trained decision tree rules + #using features from previous, current and next block + doc=self.makedoc([5,10,30],[5,10,0],[True,False,True]) + isChanged=CanolaFilter().process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),True) class TestParser(unittest.TestCase): - extractor=Extractor(None) - defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') - - def contentitem(self,s): - if type(s)==int: - return ' '.join(self.defaultWords[:s]) - else: return s - - def makecontent(self,strArr): - return [self.contentitem(s) for s in strArr] - - def makedoc(self,template,contentArr): - templateArr=template.split('*') - s="" - for i,j in zip(templateArr[:-1],contentArr): - s+=i+j - s+=templateArr[-1] - doc=self.extractor.parseDoc(s) - return doc - - def test_blocks(self): - template="

*

*

*

*
" - content=self.makecontent([4,5,6,7]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - numWords=[block.getNumWords() for block in blocks] - self.assertEqual(textArr,content) - self.assertEqual(numWords,[4,5,6,7]) - - def test_anchor(self): - template="

*

*

" - content=self.makecontent([6,"end with space ",3,6]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - densityArr=[block.getLinkDensity() for block in blocks] - numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] - self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) - self.assertEqual(numAnchorWords,[0,3,6]) - self.assertEqual(densityArr,[0.0,0.5,1.0]) - - def test_title(self): - titleText="THIS IS TITLE" - s=""+titleText+"

THIS IS CONTENT

" - doc=self.extractor.parseDoc(s) - self.assertEqual(doc.getTitle(),titleText) - - def test_body(self): - bodyText="THIS IS CONTENT" - s="

NOT IN BODY

"+bodyText+"

" - doc=self.extractor.parseDoc(s) - textArr=[block.getText() for block in doc.getTextBlocks()] - self.assertEqual(textArr,[bodyText]) - - def test_inline(self): - template="

*

*

**
" - content=['AA','BB','CC','DD'] - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - numWords=[block.getNumWords() for block in blocks] - self.assertEqual(textArr,[content[0],content[1],content[2]+content[3]]) - - def test_ignorable(self): - template="

*

" - content=self.makecontent([10,12]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - self.assertEqual(textArr,[content[0]]) - - def assertRange(self,val,minval,maxval): - self.assertTrue(val>=minval and val<=maxval) - - def test_textDensity(self): - template="

*

*

" - content=self.makecontent([80,"one, !!! two"]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] - - #exact values are unknown, approximate value range to check - self.assertEqual(blocks[0].getNumWords(),80) - self.assertRange(blocks[0].numWordsInWrappedLines,60,80) - self.assertRange(blocks[0].numWrappedLines,4,7) - self.assertRange(blocks[0].getTextDensity(),8,16) - - self.assertEqual(numArr[1],[2,2,1,2]) - - def test_blockIdxs(self): - template="

*

*

*

*

" - content=self.makecontent([11,12,13,14]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] - self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) - - def test_tagLevel(self): - template="

*

*
" - content=self.makecontent([5,6]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - levelArr=[block.getTagLevel() for block in blocks] - self.assertEqual(levelArr,[5,3]) - - def test_merge(self): - block1=TextBlock("AA BB CC ",set([0]),3,3,3,1,0) - block2=TextBlock("DD EE FF GG HH II JJ .",set([1]),6,0,6,2,1) - block1.addLabels(DefaultLabels.MIGHT_BE_CONTENT) - block2.addLabels(DefaultLabels.ARTICLE_METADATA) - block1.mergeNext(block2) - self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") - self.assertEqual(block1.getNumWords(),9) - self.assertEqual(block1.getNumWordsInAnchorText(),3) - self.assertAlmostEqual(block1.getLinkDensity(),1.0/3.0) - self.assertEqual(block1.getTextDensity(),3) - self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) - self.assertEqual(block1.getOffsetBlocksStart(),0) - self.assertEqual(block1.getOffsetBlocksEnd(),1) + extractor=Extractor(None) + defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') + + def contentitem(self,s): + if type(s)==int: + return ' '.join(self.defaultWords[:s]) + else: return s + + def makecontent(self,strArr): + return [self.contentitem(s) for s in strArr] + + def makedoc(self,template,contentArr): + templateArr=template.split('*') + s="" + for i,j in zip(templateArr[:-1],contentArr): + s+=i+j + s+=templateArr[-1] + doc=self.extractor.parseDoc(s) + return doc + + def test_blocks(self): + template="

*

*

*

*
" + content=self.makecontent([4,5,6,7]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + numWords=[block.getNumWords() for block in blocks] + self.assertEqual(textArr,content) + self.assertEqual(numWords,[4,5,6,7]) + + def test_anchor(self): + template="

*

**

*

" + content=self.makecontent([6,"end with space ",3,6]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + densityArr=[block.getLinkDensity() for block in blocks] + numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] + self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) + self.assertEqual(numAnchorWords,[0,3,6]) + self.assertEqual(densityArr,[0.0,0.5,1.0]) + + def test_title(self): + titleText="THIS IS TITLE" + s=""+titleText+"

THIS IS CONTENT

" + doc=self.extractor.parseDoc(s) + self.assertEqual(doc.getTitle(),titleText) + + def test_body(self): + bodyText="THIS IS CONTENT" + s="

NOT IN BODY

"+bodyText+"

" + doc=self.extractor.parseDoc(s) + textArr=[block.getText() for block in doc.getTextBlocks()] + self.assertEqual(textArr,[bodyText]) + + def test_inline(self): + template="

*

*

**
" + content=['AA','BB','CC','DD'] + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + numWords=[block.getNumWords() for block in blocks] + self.assertEqual(textArr,[content[0],content[1],content[2]+content[3]]) + + def test_ignorable(self): + template="

*

" + content=self.makecontent([10,12]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + self.assertEqual(textArr,[content[0]]) + + def assertRange(self,val,minval,maxval): + self.assertTrue(val>=minval and val<=maxval) + + def test_textDensity(self): + template="

*

*

" + content=self.makecontent([80,"one, !!! two"]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] + + #exact values are unknown, approximate value range to check + self.assertEqual(blocks[0].getNumWords(),80) + self.assertRange(blocks[0].numWordsInWrappedLines,60,80) + self.assertRange(blocks[0].numWrappedLines,4,7) + self.assertRange(blocks[0].getTextDensity(),8,16) + + self.assertEqual(numArr[1],[2,2,1,2]) + + def test_blockIdxs(self): + template="

*

*

*

*

" + content=self.makecontent([11,12,13,14]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] + self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) + + def test_tagLevel(self): + template="

*

*
" + content=self.makecontent([5,6]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + levelArr=[block.getTagLevel() for block in blocks] + self.assertEqual(levelArr,[5,3]) + + def test_merge(self): + block1=TextBlock("AA BB CC ",set([0]),3,3,3,1,0) + block2=TextBlock("DD EE FF GG HH II JJ .",set([1]),6,0,6,2,1) + block1.addLabels(DefaultLabels.MIGHT_BE_CONTENT) + block2.addLabels(DefaultLabels.ARTICLE_METADATA) + block1.mergeNext(block2) + self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") + self.assertEqual(block1.getNumWords(),9) + self.assertEqual(block1.getNumWordsInAnchorText(),3) + self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0) + self.assertEqual(block1.getTextDensity(),3) + self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) + self.assertEqual(block1.getOffsetBlocksStart(),0) + self.assertEqual(block1.getOffsetBlocksEnd(),1) + + + def test_getDocFromUrl(self): + """getDocFromUrl() should run (was dying because of undefined 'filename')""" + url = "http://www.example.com/" + fake_readFromUrl = mock.Mock(return_value=u"

Example

") + tmp_filter = MarkEverythingContentFilter() + + with mock.patch.object(self.extractor, "readFromUrl", fake_readFromUrl): + with mock.patch.object(self.extractor, "filter", tmp_filter): + self.assertIsInstance(self.extractor.getDocFromUrl(url), TextDocument) + runTests()