diff --git a/.gitattributes b/.gitattributes index 412eeda..2431c40 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,13 +10,13 @@ *.dbproj merge=union # Standard to msysgit -*.doc diff=astextplain -*.DOC diff=astextplain +*.doc diff=astextplain +*.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain -*.PDF diff=astextplain -*.rtf diff=astextplain -*.RTF diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/README.txt b/README.txt index bf0b259..5e06b9b 100644 --- a/README.txt +++ b/README.txt @@ -5,7 +5,7 @@ BoilerPy About --------------------------------------- -BoilerPy is a native Python port of Christian Kohlschütter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ +BoilerPy is a native Python port of Christian Kohlschutter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ ) I created this port since I don't have access to Java on my webhost and I wanted to create a pure Python version. Another Python version which consists of Python hooks to the original Java library can be found here : (https://github.com/misja/python-boilerpipe @@ -20,19 +20,19 @@ Installation BoilerPy was packaged with distutils. In can be installed from the command-line with the following line: - ``>python setup.py install`` + ``>python setup.py install`` Usage --------------------------------------- - ``import boilerpy`` + ``import boilerpy`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` - ``htmlText='
null if no
- # * such title has ben set.
- # *
- # * @return The "main" title.
- def getTitle(self):
- """ generated source for method getTitle """
- return self.title
-
- #
- # * Updates the "main" title for this document.
- # *
- # * @param title
- def setTitle(self, title):
- """ generated source for method setTitle """
- self.title = title
-
- #
- # * Returns the {@link TextDocument}'s content.
- # *
- # * @return The content text.
- def getContent(self):
- """ generated source for method getContent """
- return self.getText(True, False)
-
- #
- # * Returns the {@link TextDocument}'s content, non-content or both
- # *
- # * @param includeContent Whether to include TextBlocks marked as "content".
- # * @param includeNonContent Whether to include TextBlocks marked as "non-content".
- # * @return The text.
- def getText(self, includeContent, includeNonContent):
- sb = ""
- for block in self.getTextBlocks():
- if block.isContent():
- if not includeContent:
- continue
- else:
- if not includeNonContent:
- continue
- sb+=block.getText()+'\n'
- return sb
-
- # * Returns detailed debugging information about the contained {@link TextBlock}s.
- # * @return Debug information.
- def debugString(self):
- sb = ""
- for tb in self.getTextBlocks():
- sb+=str(tb)+"\n"
- return sb
+ # * Creates a new {@link TextDocument} with given {@link TextBlock}s and
+ # * given title.
+ # *
+ # * @param title
+ # * The "main" title for this text document.
+ # * @param textBlocks
+ # * The text blocks of this document.
+ def __init__(self, textBlocks, title=None):
+ self.title = title
+ self.textBlocks = textBlocks
+
+ # * Returns the {@link TextBlock}s of this document.
+ # *
+ # * @return A list of {@link TextBlock}s, in sequential order of appearance.
+ #
+ def getTextBlocks(self):
+ """ generated source for method getTextBlocks """
+ return self.textBlocks
+
+ def setTextBlocks(self,textBlocks): self.textBlocks=textBlocks
+
+ #
+ # * Returns the "main" title for this document, or null if no
+ # * such title has been set.
+ # *
+ # * @return The "main" title.
+ def getTitle(self):
+ """ generated source for method getTitle """
+ return self.title
+
+ #
+ # * Updates the "main" title for this document.
+ # *
+ # * @param title
+ def setTitle(self, title):
+ """ generated source for method setTitle """
+ self.title = title
+
+ #
+ # * Returns the {@link TextDocument}'s content.
+ # *
+ # * @return The content text.
+ def getContent(self):
+ """ generated source for method getContent """
+ return self.getText(True, False)
+
+ #
+ # * Returns the {@link TextDocument}'s content, non-content or both
+ # *
+ # * @param includeContent Whether to include TextBlocks marked as "content".
+ # * @param includeNonContent Whether to include TextBlocks marked as "non-content".
+ # * @return The text.
+ def getText(self, includeContent, includeNonContent):
+ sb = ""
+ for block in self.getTextBlocks():
+ if block.isContent():
+ if not includeContent:
+ continue
+ else:
+ if not includeNonContent:
+ continue
+ sb+=block.getText()+'\n'
+ return sb
+
+ # * Returns detailed debugging information about the contained {@link TextBlock}s.
+ # * @return Debug information.
+ def debugString(self):
+ sb = ""
+ for tb in self.getTextBlocks():
+ sb+=str(tb)+"\n"
+ return sb
@@ -127,174 +128,174 @@ def debugString(self):
#
class TextBlock(object):
- """ generated source for class TextBlock """
-
- def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0):
- self._isContent = False
- self.labels = set()
- self.numFullTextWords = 0
- self.tagLevel = 0
-
- self.text = text
- self.containedTextElements = containedTextElements
- self.numWords = numWords
- self.numWordsInAnchorText = numWordsInAnchorText
- self.numWordsInWrappedLines = numWordsInWrappedLines
- self.numWrappedLines = numWrappedLines
- self.offsetBlocksStart = offsetBlocks
- self.offsetBlocksEnd = offsetBlocks
- self.initDensities()
-
- def initDensities(self):
- """ generated source for method initDensities """
- if self.numWordsInWrappedLines == 0:
- self.numWordsInWrappedLines = self.numWords
- self.numWrappedLines = 1
- self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines)
- self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords)
-
- def isContent(self):
- """ generated source for method isContent """
- return self._isContent
-
- def setIsContent(self, isContent):
- """ generated source for method setIsContent """
- if isContent != self._isContent:
- self._isContent = isContent
- return True
- else:
- return False
-
- def getText(self):
- """ generated source for method getText """
- return self.text
-
- def getNumWords(self):
- """ generated source for method getNumWords """
- return self.numWords
-
- def getNumWordsInAnchorText(self):
- """ generated source for method getNumWordsInAnchorText """
- return self.numWordsInAnchorText
-
- def getTextDensity(self):
- """ generated source for method getTextDensity """
- return self.textDensity
-
- def getLinkDensity(self):
- """ generated source for method getLinkDensity """
- return self.linkDensity
-
- def mergeNext(self, nextTextBlock):
- """ generated source for method mergeNext """
- if self.text==None: self.text=""
- self.text+='\n'+nextTextBlock.text
- self.numWords += nextTextBlock.numWords
- self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText
- self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines
- self.numWrappedLines += nextTextBlock.numWrappedLines
- self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart)
- self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd)
- self.initDensities()
- self._isContent |= nextTextBlock.isContent()
- self.containedTextElements|=nextTextBlock.containedTextElements
- self.numFullTextWords += nextTextBlock.numFullTextWords
- self.labels|=nextTextBlock.labels
- self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel)
-
- def getOffsetBlocksStart(self):
- """ generated source for method getOffsetBlocksStart """
- return self.offsetBlocksStart
-
- def getOffsetBlocksEnd(self):
- """ generated source for method getOffsetBlocksEnd """
- return self.offsetBlocksEnd
-
- def __repr__(self):
- """ generated source for method toString """
- return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText())
-
- #
- # * Adds an arbitrary String label to this {@link TextBlock}.
- # *
- # * @param label The label
- #
- def addLabel(self, label):
- """ generated source for method addLabel """
- self.labels.add(label)
-
- #
- # * Checks whether this TextBlock has the given label.
- # *
- # * @param label The label
- # * @return true if this block is marked by the given label.
- #
- def hasLabel(self, label):
- """ generated source for method hasLabel """
- return label in self.labels
-
- def removeLabel(self, label):
- """ generated source for method removeLabel """
- try:
- self.labels.remove(label)
- return True
- except KeyError:
- return False
-
- #
- # * Returns the labels associated to this TextBlock, or null if no such labels
- # * exist.
- # *
- # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock}
- # * whenever possible.
- # *
- # * @return Returns the set of labels, or null if no labels was added yet.
- #
- def getLabels(self):
- """ generated source for method getLabels """
- return self.labels
-
- #
- # * Adds a set of labels to this {@link TextBlock}.
- # * null-references are silently ignored.
- # *
- # * @param labels The labels to be added.
- #
- def addLabels(self, *labels):
- """ generated source for method addLabels """
- if len(labels)==0 or labels[0] == None: return
- if self.labels == None: self.labels = set()
- elif len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0])
- else: self.labels|=set(labels)
-
-
- #
- # * Returns the containedTextElements BitSet, or null.
- # * @return
- #
- def getContainedTextElements(self):
- """ generated source for method getContainedTextElements """
- return self.containedTextElements
-
- def clone(self):
- try:
- clone = copy.copy(self)
- except copy.error:
- raise copy.error
- if self.labels != None: clone.labels = self.labels.copy()
- if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy()
- return clone
-
- def getTagLevel(self):
- """ generated source for method getTagLevel """
- return self.tagLevel
-
- def setTagLevel(self, tagLevel):
- """ generated source for method setTagLevel """
- self.tagLevel = tagLevel
+ """ generated source for class TextBlock """
+
+ def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0):
+ self._isContent = False
+ self.labels = set()
+ self.numFullTextWords = 0
+ self.tagLevel = 0
+
+ self.text = text
+ self.containedTextElements = containedTextElements
+ self.numWords = numWords
+ self.numWordsInAnchorText = numWordsInAnchorText
+ self.numWordsInWrappedLines = numWordsInWrappedLines
+ self.numWrappedLines = numWrappedLines
+ self.offsetBlocksStart = offsetBlocks
+ self.offsetBlocksEnd = offsetBlocks
+ self.initDensities()
+
+ def initDensities(self):
+ """ generated source for method initDensities """
+ if self.numWordsInWrappedLines == 0:
+ self.numWordsInWrappedLines = self.numWords
+ self.numWrappedLines = 1
+ self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines
+ self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords
+
+ def isContent(self):
+ """ generated source for method isContent """
+ return self._isContent
+
+ def setIsContent(self, isContent):
+ """ generated source for method setIsContent """
+ if isContent != self._isContent:
+ self._isContent = isContent
+ return True
+ else:
+ return False
+
+ def getText(self):
+ """ generated source for method getText """
+ return self.text
+
+ def getNumWords(self):
+ """ generated source for method getNumWords """
+ return self.numWords
+
+ def getNumWordsInAnchorText(self):
+ """ generated source for method getNumWordsInAnchorText """
+ return self.numWordsInAnchorText
+
+ def getTextDensity(self):
+ """ generated source for method getTextDensity """
+ return self.textDensity
+
+ def getLinkDensity(self):
+ """ generated source for method getLinkDensity """
+ return self.linkDensity
+
+ def mergeNext(self, nextTextBlock):
+ """ generated source for method mergeNext """
+ if self.text==None: self.text=""
+ self.text+='\n'+nextTextBlock.text
+ self.numWords += nextTextBlock.numWords
+ self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText
+ self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines
+ self.numWrappedLines += nextTextBlock.numWrappedLines
+ self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart)
+ self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd)
+ self.initDensities()
+ self._isContent |= nextTextBlock.isContent()
+ self.containedTextElements|=nextTextBlock.containedTextElements
+ self.numFullTextWords += nextTextBlock.numFullTextWords
+ self.labels|=nextTextBlock.labels
+ self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel)
+
+ def getOffsetBlocksStart(self):
+ """ generated source for method getOffsetBlocksStart """
+ return self.offsetBlocksStart
+
+ def getOffsetBlocksEnd(self):
+ """ generated source for method getOffsetBlocksEnd """
+ return self.offsetBlocksEnd
+
+ def __repr__(self):
+ """ Debug string: block offsets, tag level, word/line counts, link density, CONTENT/boilerplate flag (isContent() must be called; the bare method object is always truthy), labels, then the block text """
+ return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent() else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText())
+
+ #
+ # * Adds an arbitrary String label to this {@link TextBlock}.
+ # *
+ # * @param label The label
+ #
+ def addLabel(self, label):
+ """ generated source for method addLabel """
+ self.labels.add(label)
+
+ #
+ # * Checks whether this TextBlock has the given label.
+ # *
+ # * @param label The label
+ # * @return true if this block is marked by the given label.
+ #
+ def hasLabel(self, label):
+ """ generated source for method hasLabel """
+ return label in self.labels
+
+ def removeLabel(self, label):
+ """ generated source for method removeLabel """
+ try:
+ self.labels.remove(label)
+ return True
+ except KeyError:
+ return False
+
+ #
+ # * Returns the labels associated to this TextBlock, or null if no such labels
+ # * exist.
+ # *
+ # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock}
+ # * whenever possible.
+ # *
+ # * @return Returns the set of labels, or null if no labels was added yet.
+ #
+ def getLabels(self):
+ """ generated source for method getLabels """
+ return self.labels
+
+ #
+ # * Adds a set of labels to this {@link TextBlock}.
+ # * null-references are silently ignored.
+ # *
+ # * @param labels The labels to be added.
+ #
+ def addLabels(self, *labels):
+ """ Add labels given as varargs or as one set/list; returns early when no labels are given or the first is None. A missing label set is recreated first and the labels are then still added (plain if, not elif) """
+ if len(labels)==0 or labels[0] == None: return
+ if self.labels == None: self.labels = set()
+ if len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0])
+ else: self.labels|=set(labels)
+
+
+ #
+ # * Returns the containedTextElements BitSet, or null.
+ # * @return
+ #
+ def getContainedTextElements(self):
+ """ generated source for method getContainedTextElements """
+ return self.containedTextElements
+
+ def clone(self):
+ try:
+ clone = copy.copy(self)
+ except copy.error:
+ raise copy.error
+ if self.labels != None: clone.labels = self.labels.copy()
+ if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy()
+ return clone
+
+ def getTagLevel(self):
+ """ generated source for method getTagLevel """
+ return self.tagLevel
+
+ def setTagLevel(self, tagLevel):
+ """ generated source for method setTagLevel """
+ self.tagLevel = tagLevel
TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1)
-TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint)
+TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize)
@@ -303,35 +304,35 @@ def setTagLevel(self, tagLevel):
# * @author Christian Kohlschuetter
#
class TextDocumentStatistics(object):
- #
- # * Computes statistics on a given {@link TextDocument}.
- # *
- # * @param doc The {@link TextDocument}.
- # * @param contentOnly if true then o
- #
- def __init__(self, doc, contentOnly):
- self.numWords=0
- self.numBlocks=0
- for tb in doc.getTextBlocks():
- if contentOnly and not tb.isContent(): continue
- self.numWords += tb.getNumWords()
- self.numBlocks += 1
-
-
- # * Returns the average number of words at block-level (= overall number of words divided by
- # * the number of blocks).
- # *
- # * @return Average
- #
- def avgNumWords(self):
- """ generated source for method avgNumWords """
- return self.numWords / float(self.numBlocks)
-
- #
- # * Returns the overall number of words in all blocks.
- # *
- # * @return Sum
- #
- def getNumWords(self):
- """ generated source for method getNumWords """
- return self.numWords
+ #
+ # * Computes statistics on a given {@link TextDocument}.
+ # *
+ # * @param doc The {@link TextDocument}.
+ # * @param contentOnly if true then only blocks marked as content are counted
+ #
+ def __init__(self, doc, contentOnly):
+ self.numWords=0
+ self.numBlocks=0
+ for tb in doc.getTextBlocks():
+ if contentOnly and not tb.isContent(): continue
+ self.numWords += tb.getNumWords()
+ self.numBlocks += 1
+
+
+ # * Returns the average number of words at block-level (= overall number of words divided by
+ # * the number of blocks).
+ # *
+ # * @return Average
+ #
+ def avgNumWords(self):
+ """ generated source for method avgNumWords """
+ return self.numWords / self.numBlocks
+
+ #
+ # * Returns the overall number of words in all blocks.
+ # *
+ # * @return Sum
+ #
+ def getNumWords(self):
+ """ generated source for method getNumWords """
+ return self.numWords
diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py
index 462c2f0..d3e95e7 100644
--- a/boilerpy/extractors.py
+++ b/boilerpy/extractors.py
@@ -8,7 +8,7 @@
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
-# * http://www.apache.org/licenses/LICENSE-2.0
+# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
@@ -27,75 +27,77 @@
from xml.sax import parseString, SAXException
-import HTMLParser
+import html.parser
from . import filters
from . import parser
-import urllib2
+import urllib.request
+import urllib.error
+import urllib.parse
import re
class Extractor(object):
- def __init__(self,filtr):
- self.filter=filtr
-
- def getContent(self, text):
- return self.getDoc(text).getContent()
-
- def getContentFromUrl(self, url):
- return self.getDocFromUrl(url).getContent()
-
- def getContentFromFile(self, filename):
- return self.getDocFromFile(filename).getContent()
-
- def getDocFromFile(self,filename):
- return self.getDoc(self.readFromFile(filename))
-
- def getDocFromUrl(self,url):
- return self.getDoc(self.readFromUrl(filename))
-
- def getDoc(self,text):
- doc=self.parseDoc(text)
- self.filter.process(doc)
- return doc
-
- def readFromFile(self,filename):
- f=open(filename,'r')
- text=f.read()
- f.close()
- try:
- text=text.decode('utf8')
- except UnicodeDecodeError: pass
- return text
-
- def readFromUrl(self,url):
- f=urllib2.urlopen(url)
- text=f.read()
- encoding=self.getUrlEncoding(f)
- f.close()
- try:
- text=text.decode(encoding)
- except UnicodeDecodeError: pass
- return text
-
- def getUrlEncoding(self,f):
- try:
- return f.headers['content-type'].split('charset=')[1].split(';')[0]
- except: return 'utf8'
-
- def parseDoc(self,inputStr):
- bpParser=parser.BoilerpipeHTMLParser()
- try:
- bpParser.feed(inputStr)
- except:
- #in case of error, try again, first removing script tag content
- bpParser=parser.BoilerpipeHTMLParser()
- inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?(?:script|SCRIPT)>','',inputStr,0,re.DOTALL)
- try:
- bpParser.feed(inputStr)
- except:
- print "Error parsing HTML : "+str(e)
- return None
- doc=bpParser.toTextDocument()
- return doc
+ def __init__(self,filtr):
+ self.filter=filtr
+
+ def getContent(self, text):
+ return self.getDoc(text).getContent()
+
+ def getContentFromUrl(self, url):
+ return self.getDocFromUrl(url).getContent()
+
+ def getContentFromFile(self, filename):
+ return self.getDocFromFile(filename).getContent()
+
+ def getDocFromFile(self,filename):
+ return self.getDoc(self.readFromFile(filename))
+
+ def getDocFromUrl(self,url):
+ return self.getDoc(self.readFromUrl(url))
+
+ def getDoc(self,text):
+ doc=self.parseDoc(text)
+ self.filter.process(doc)
+ return doc
+
+ def readFromFile(self,filename):
+ f=open(filename,'rb') # read bytes: a text-mode str has no .decode() on Python 3
+ text=f.read()
+ f.close()
+ try:
+ text=text.decode('utf8')
+ except UnicodeDecodeError: pass # best effort: fall back to the undecoded bytes
+ return text
+
+ def readFromUrl(self,url):
+ f = urllib.request.urlopen(url)
+ text=f.read()
+ encoding=self.getUrlEncoding(f)
+ f.close()
+ try:
+ text=text.decode(encoding)
+ except UnicodeDecodeError: pass
+ return text
+
+ def getUrlEncoding(self,f):
+ try:
+ return f.headers['content-type'].split('charset=')[1].split(';')[0]
+ except: return 'utf8'
+
+ def parseDoc(self,inputStr):
+ bpParser=parser.BoilerpipeHTMLParser()
+ try:
+ bpParser.feed(inputStr)
+ except Exception as exc:
+ #in case of error, try again, first removing script tag content
+ bpParser=parser.BoilerpipeHTMLParser()
+ inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?(?:script|SCRIPT)>','',inputStr,0,re.DOTALL)
+ try:
+ bpParser.feed(inputStr)
+ except Exception as e:
+ print("Error parsing HTML : " + str(e))
+ return None
+ doc=bpParser.toTextDocument()
+ return doc
@@ -103,28 +105,28 @@ def parseDoc(self,inputStr):
# * A full-text extractor which is tuned towards news articles. In this scenario
# * it achieves higher accuracy than {@link DefaultExtractor}.
articleFilterChain=filters.FilterChain([
- filters.TerminatingBlocksFinder(),
- filters.DocumentTitleMatchClassifier(None,True),
- filters.NumWordsRulesClassifier(),
- filters.IgnoreBlocksAfterContentFilter(),
- filters.BlockProximityFusion(1,False,False),
- filters.BoilerplateBlockFilter(),
- filters.BlockProximityFusion(1,True,False),
- filters.KeepLargestBlockFilter(),
- filters.ExpandTitleToContentFilter()
+ filters.TerminatingBlocksFinder(),
+ filters.DocumentTitleMatchClassifier(None,True),
+ filters.NumWordsRulesClassifier(),
+ filters.IgnoreBlocksAfterContentFilter(),
+ filters.BlockProximityFusion(1,False,False),
+ filters.BoilerplateBlockFilter(),
+ filters.BlockProximityFusion(1,True,False),
+ filters.KeepLargestBlockFilter(),
+ filters.ExpandTitleToContentFilter()
])
-# * Works very well for most types of Article-like HTML.
+# * Works very well for most types of Article-like HTML.
ARTICLE_EXTRACTOR = Extractor(articleFilterChain)
# class DefaultExtractor
-# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics.
+# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics.
# * A quite generic full-text extractor.
defaultFilterChain=filters.FilterChain([
- filters.SimpleBlockFusionProcessor(),
- filters.BlockProximityFusion(1,False,False),
- filters.DensityRulesClassifier()
+ filters.SimpleBlockFusionProcessor(),
+ filters.BlockProximityFusion(1,False,False),
+ filters.DensityRulesClassifier()
])
DEFAULT_EXTRACTOR = Extractor(defaultFilterChain)
@@ -135,19 +137,19 @@ def parseDoc(self,inputStr):
# * For news articles, it may perform better than the {@link DefaultExtractor},
# * but usually worse than {@link ArticleExtractor}.
largestContentFilterChain=filters.FilterChain([
- filters.NumWordsRulesClassifier(),
- filters.BlockProximityFusion(1,False,False),
- filters.KeepLargestBlockFilter()
+ filters.NumWordsRulesClassifier(),
+ filters.BlockProximityFusion(1,False,False),
+ filters.KeepLargestBlockFilter()
])
-# * Like {@link DefaultExtractor}, but keeps the largest text block only.
+# * Like {@link DefaultExtractor}, but keeps the largest text block only.
LARGEST_CONTENT_EXTRACTOR = Extractor(largestContentFilterChain)
# class CanolaExtractor
-# * Trained on krdwrd Canola (different definition of "boilerplate"). You may
-# * give it a try.
+# * Trained on krdwrd Canola (different definition of "boilerplate"). You may
+# * give it a try.
CANOLA_EXTRACTOR = Extractor(filters.CanolaFilter())
@@ -155,9 +157,9 @@ def parseDoc(self,inputStr):
# class KeepEverythingExtractor
# * Marks everything as content.
-# * Dummy Extractor; should return the input text. Use this to double-check
-# * that your problem is within a particular {@link BoilerpipeExtractor}, or
-# * somewhere else.
+# * Dummy Extractor; should return the input text. Use this to double-check
+# * that your problem is within a particular {@link BoilerpipeExtractor}, or
+# * somewhere else.
KEEP_EVERYTHING_EXTRACTOR = Extractor(filters.MarkEverythingContentFilter())
@@ -174,9 +176,9 @@ def parseDoc(self,inputStr):
# class ArticleSentencesExtractor
# * A full-text extractor which is tuned towards extracting sentences from news articles.
ARTICLE_SENTENCES_EXTRACTOR=Extractor(filters.FilterChain([
- articleFilterChain,
- filters.SplitParagraphBlocksFilter(),
- filters.MinClauseWordsFilter()
+ articleFilterChain,
+ filters.SplitParagraphBlocksFilter(),
+ filters.MinClauseWordsFilter()
]))
@@ -184,10 +186,10 @@ def parseDoc(self,inputStr):
# * For news articles, it may perform better than the {@link DefaultExtractor},
# * but usually worse than {@link ArticleExtractor}.
class KeepEverythingWithMinKWordsFilter(filters.FilterChain):
- def __init__(self, kMin):
- filterArr = [
- filters.SimpleBlockFusionProcessor(),
- filters.MarkEverythingContentFilter(),
- filters.MinWordsFilter(kMin)
- ]
- super(KeepEverythingWithMinKWordsFilter, self).__init__(filters)
+ def __init__(self, kMin):
+ filterArr = [
+ filters.SimpleBlockFusionProcessor(),
+ filters.MarkEverythingContentFilter(),
+ filters.MinWordsFilter(kMin)
+ ]
+ super(KeepEverythingWithMinKWordsFilter, self).__init__(filterArr) # pass the filter list built above, not the filters module
diff --git a/boilerpy/filters.py b/boilerpy/filters.py
index c2885bb..a9714ed 100644
--- a/boilerpy/filters.py
+++ b/boilerpy/filters.py
@@ -9,7 +9,7 @@
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
-# * http://www.apache.org/licenses/LICENSE-2.0
+# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
@@ -59,42 +59,42 @@
import re
from . import document
-from document import DefaultLabels
+from .document import DefaultLabels
# Boilerpipe abstract interface
class BoilerpipeFilter(object):
- def process(self, doc): pass
-
- def subtractBlocks(self,blockArr,blocksToRemove):
- #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove)
- #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr
- if len(blocksToRemove)==0: return blockArr
- newBlockArr=[]
- removeIter=iter(blocksToRemove)
- curBlockToRemove=removeIter.next()
- for idx,block in enumerate(blockArr):
- if block==curBlockToRemove:
- try:
- curBlockToRemove=removeIter.next()
- except StopIteration:
- #add the rest
- newBlockArr.extend(blockArr[idx+1:])
- break
- else: newBlockArr.append(block)
- return newBlockArr
+ def process(self, doc): pass
+
+ def subtractBlocks(self,blockArr,blocksToRemove):
+ #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove)
+ #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr
+ if len(blocksToRemove)==0: return blockArr
+ newBlockArr=[]
+ removeIter=iter(blocksToRemove)
+ curBlockToRemove = next(removeIter)
+ for idx,block in enumerate(blockArr):
+ if block==curBlockToRemove:
+ try:
+ curBlockToRemove = next(removeIter)
+ except StopIteration:
+ #add the rest
+ newBlockArr.extend(blockArr[idx+1:])
+ break
+ else: newBlockArr.append(block)
+ return newBlockArr
# chain together multiple filters in sequence
class FilterChain(BoilerpipeFilter):
- def __init__(self,filterArr):
- super(FilterChain, self).__init__()
- self.filterArr=filterArr
-
- def process(self,doc):
- isUpdated=False
- for filtr in self.filterArr:
- isUpdated|=filtr.process(doc)
- return isUpdated
+ def __init__(self,filterArr):
+ super(FilterChain, self).__init__()
+ self.filterArr=filterArr
+
+ def process(self,doc):
+ isUpdated=False
+ for filtr in self.filterArr:
+ isUpdated|=filtr.process(doc)
+ return isUpdated
#-----------------------------------------------------------------------
@@ -109,14 +109,14 @@ def process(self,doc):
# * @author Christian Kohlschtter
#
class MarkEverythingContentFilter(BoilerpipeFilter):
- def process(self, doc):
- """ generated source for method process """
- changes = False
- for tb in doc.getTextBlocks():
- if not tb.isContent():
- tb.setIsContent(True)
- changes = True
- return changes
+ def process(self, doc):
+ """ generated source for method process """
+ changes = False
+ for tb in doc.getTextBlocks():
+ if not tb.isContent():
+ tb.setIsContent(True)
+ changes = True
+ return changes
#
@@ -126,12 +126,12 @@ def process(self, doc):
#
class InvertedFilter(BoilerpipeFilter):
- def process(self, doc):
- """ generated source for method process """
- tbs = doc.getTextBlocks()
- if len(tbs)==0: return False
- for tb in tbs: tb.setIsContent(not tb.isContent())
- return True
+ def process(self, doc):
+ """ generated source for method process """
+ tbs = doc.getTextBlocks()
+ if len(tbs)==0: return False
+ for tb in tbs: tb.setIsContent(not tb.isContent())
+ return True
#
@@ -140,14 +140,14 @@ def process(self, doc):
# * @author Christian Kohlschtter
#
class BoilerplateBlockFilter(BoilerpipeFilter):
- def process(self, doc):
- """ generated source for method process """
- textBlocks = doc.getTextBlocks()
- newBlocks=[tb for tb in textBlocks if tb.isContent()]
- hasChanges = len(newBlocks)<A> tag).
@@ -59,286 +59,286 @@ def changesTagLevel(self):
# * If boilerpipe encounters such nestings, a SAXException is thrown.
#
class AnchorTextTagAction(TagAction):
- """ generated source for class TA_ANCHOR_TEXT """
- def start(self, contentHandler, tagName, attrs):
- contentHandler.inAnchor += 1
- if contentHandler.inAnchor > 1:
- # as nested A elements are not allowed per specification, we
- # are probably reaching this branch due to a bug in the XML
- # parser
- print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...")
- self.end(contentHandler, tagName)
- if contentHandler.inIgnorableElement == 0:
- contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START)
- return False
-
- def end(self, contentHandler, tagName):
- contentHandler.inAnchor -= 1
- if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0:
- contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END)
- return False
-
- def changesTagLevel(self):
- return True
+ """ generated source for class TA_ANCHOR_TEXT """
+ def start(self, contentHandler, tagName, attrs):
+ contentHandler.inAnchor += 1
+ if contentHandler.inAnchor > 1:
+ # as nested A elements are not allowed per specification, we
+ # are probably reaching this branch due to a bug in the XML
+ # parser
+ print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...")
+ self.end(contentHandler, tagName)
+ if contentHandler.inIgnorableElement == 0:
+ contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START)
+ return False
+
+ def end(self, contentHandler, tagName):
+ contentHandler.inAnchor -= 1
+ if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0:
+ contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END)
+ return False
+
+ def changesTagLevel(self):
+ return True
#
# * Marks this tag the body element (this should usually only be set for the <BODY> tag).
#
class BodyTagAction(TagAction):
- """ generated source for class TA_BODY """
- def start(self, contentHandler, tagName, attrs):
- contentHandler.flushBlock()
- contentHandler.inBody += 1
- return False
+ """ generated source for class TA_BODY """
+ def start(self, contentHandler, tagName, attrs):
+ contentHandler.flushBlock()
+ contentHandler.inBody += 1
+ return False
- def end(self, contentHandler, tagName):
- contentHandler.flushBlock()
- contentHandler.inBody -= 1
- return False
+ def end(self, contentHandler, tagName):
+ contentHandler.flushBlock()
+ contentHandler.inBody -= 1
+ return False
- def changesTagLevel(self):
- return True
+ def changesTagLevel(self):
+ return True
#
# * Marks this tag a simple "inline" element, which generates whitespace, but no new block.
#
class InlineWhitespaceTagAction(TagAction):
- """ generated source for class TA_INLINE_WHITESPACE """
- def start(self, contentHandler, tagName, attrs):
- contentHandler.addWhitespaceIfNecessary()
- return False
+ """ generated source for class TA_INLINE_WHITESPACE """
+ def start(self, contentHandler, tagName, attrs):
+ contentHandler.addWhitespaceIfNecessary()
+ return False
- def end(self, contentHandler, tagName):
- contentHandler.addWhitespaceIfNecessary()
- return False
+ def end(self, contentHandler, tagName):
+ contentHandler.addWhitespaceIfNecessary()
+ return False
- def changesTagLevel(self): return False
+ def changesTagLevel(self): return False
#
# * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
#
class InlineTagAction(TagAction):
- """ generated source for class TA_INLINE_NO_WHITESPACE """
- def start(self, contentHandler, tagName, attrs): return False
- def end(self, contentHandler, tagName): return False
- def changesTagLevel(self): return False
+ """ generated source for class TA_INLINE_NO_WHITESPACE """
+ def start(self, contentHandler, tagName, attrs): return False
+ def end(self, contentHandler, tagName): return False
+ def changesTagLevel(self): return False
#
# * Explicitly marks this tag a simple "block-level" element, which always generates whitespace
#
class BlockTagAction(TagAction):
- """ generated source for class TA_BLOCK_LEVEL """
- def start(self, contentHandler, tagName, attrs): return True
- def end(self, contentHandler, tagName): return True
- def changesTagLevel(self): return True
+ """ generated source for class TA_BLOCK_LEVEL """
+ def start(self, contentHandler, tagName, attrs): return True
+ def end(self, contentHandler, tagName): return True
+ def changesTagLevel(self): return True
#
# * Special TagAction for the <FONT> tag, which keeps track of the
# * absolute and relative font size.
#
class FontTagAction(TagAction):
- """ generated source for class TA_FONT """
- #WARNING: POSSIBLE BUG -- used to be [0-9] without +
- PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)")
-
- def start(self, contentHandler, tagName, attrs):
- """ generated source for method start """
- sizeAttr = attrs.getValue("size")
- size=None
- if sizeAttr != None:
- match=PAT_FONT_SIZE.match(sizeAttr)
- if match!=None:
- rel=match.group(0)
- val=match.group(1)
- if len(rel)==0:
- # absolute
- size = val
- else:
- # relative
- #last non-none element from stack, default 3
- lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None)
- prevSize=next(lastNonNone,3)
- if rel[0] == '+': size = prevSize + val
- else: size = prevSize - val
- contentHandler.fontSizeStack.append(size)
- return False
-
- def end(self, contentHandler, tagName):
- contentHandler.fontSizeStack.pop()
- return False
-
- def changesTagLevel(self): return False
+ """ generated source for class TA_FONT """
+ #WARNING: POSSIBLE BUG -- used to be [0-9] without +
+ PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)")
+
+ def start(self, contentHandler, tagName, attrs):
+ """ generated source for method start """
+ sizeAttr = attrs.getValue("size")
+ size=None
+ if sizeAttr != None:
+ match = self.PAT_FONT_SIZE.match(sizeAttr)
+ if match!=None:
+ rel=match.group(0)
+ val=match.group(1)
+ if len(rel)==0:
+ # absolute
+ size = val
+ else:
+ # relative
+ #last non-none element from stack, default 3
+ lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None)
+ prevSize=next(lastNonNone,3)
+ if rel[0] == '+': size = prevSize + val
+ else: size = prevSize - val
+ contentHandler.fontSizeStack.append(size)
+ return False
+
+ def end(self, contentHandler, tagName):
+ contentHandler.fontSizeStack.pop()
+ return False
+
+ def changesTagLevel(self): return False
#
# * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated
# * {@link TextBlock}.
#
class InlineTagLabelAction(TagAction):
- """ generated source for class InlineTagLabelAction """
+ """ generated source for class InlineTagLabelAction """
- def __init__(self, action):
- """ generated source for method __init__ """
- super(InlineTagLabelAction, self).__init__()
- self.action = action
+ def __init__(self, action):
+ """ generated source for method __init__ """
+ super(InlineTagLabelAction, self).__init__()
+ self.action = action
- def start(self, contentHandler, tagName, attrs):
- """ generated source for method start """
- contentHandler.addWhitespaceIfNecessary()
- contentHandler.addLabelAction(self.action)
- return False
+ def start(self, contentHandler, tagName, attrs):
+ """ generated source for method start """
+ contentHandler.addWhitespaceIfNecessary()
+ contentHandler.addLabelAction(self.action)
+ return False
- def end(self, contentHandler, tagName):
- """ generated source for method end """
- contentHandler.addWhitespaceIfNecessary()
- return False
+ def end(self, contentHandler, tagName):
+ """ generated source for method end """
+ contentHandler.addWhitespaceIfNecessary()
+ return False
- def changesTagLevel(self):
- """ generated source for method changesTagLevel """
- return False
+ def changesTagLevel(self):
+ """ generated source for method changesTagLevel """
+ return False
#
# * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated
# * {@link TextBlock}.
#
class BlockTagLabelAction(TagAction):
- """ generated source for class BlockTagLabelAction """
+ """ generated source for class BlockTagLabelAction """
- def __init__(self, action):
- """ generated source for method __init__ """
- super(BlockTagLabelAction, self).__init__()
- self.action = action
+ def __init__(self, action):
+ """ generated source for method __init__ """
+ super(BlockTagLabelAction, self).__init__()
+ self.action = action
- def start(self, contentHandler, tagName, attrs):
- """ generated source for method start """
- contentHandler.addLabelAction(self.action)
- return True
+ def start(self, contentHandler, tagName, attrs):
+ """ generated source for method start """
+ contentHandler.addLabelAction(self.action)
+ return True
- def end(self, contentHandler, tagName):
- """ generated source for method end """
- return True
+ def end(self, contentHandler, tagName):
+ """ generated source for method end """
+ return True
- def changesTagLevel(self):
- """ generated source for method changesTagLevel """
- return True
+ def changesTagLevel(self):
+ """ generated source for method changesTagLevel """
+ return True
class Chained(TagAction):
- def __init__(self, tagAction1, tagAction2):
- """ generated source for method __init__ """
- super(Chained, self).__init__()
- self.tagAction1 = tagAction1
- self.tagAction2 = tagAction2
+ def __init__(self, tagAction1, tagAction2):
+ """ generated source for method __init__ """
+ super(Chained, self).__init__()
+ self.tagAction1 = tagAction1
+ self.tagAction2 = tagAction2
- def start(self, contentHandler, tagName, attrs):
- """ generated source for method start """
- return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs)
+ def start(self, contentHandler, tagName, attrs):
+ """ generated source for method start """
+ return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs)
- def end(self, contentHandler, tagName):
- """ generated source for method end """
- return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName)
+ def end(self, contentHandler, tagName):
+ """ generated source for method end """
+ return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName)
- def changesTagLevel(self):
- """ generated source for method changesTagLevel """
- return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel()
+ def changesTagLevel(self):
+ """ generated source for method changesTagLevel """
+ return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel()
class MarkupTagAction(TagAction):
- """ generated source for class MarkupTagAction """
-
- def __init__(self, isBlockLevel):
- """ generated source for method __init__ """
- super(MarkupTagAction, self).__init__()
- self.isBlockLevel = isBlockLevel
- self.labelStack = []
-
- PAT_NUM = re.compile("[0-9]+")
-
- def start(self, contentHandler, tagName, attrs):
- """ generated source for method start """
- labels = []
- labels.append(DefaultLabels.MARKUP_PREFIX + tagName)
- classVal = attrs.getValue("class")
- if classVal != None and len(classVal)>0:
- classVal = self.PAT_NUM.sub("#",classVal).strip()
- vals = classVal.split(r"[ ]+")
- labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.'))
- if len(vals)>1:
- for s in vals:
- labels.append(DefaultLabels.MARKUP_PREFIX + "." + s)
- id = attrs.get("id")
- if id != None and len(id)<0:
- id = self.PAT_NUM.sub("#",id)
- labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id)
- ancestors = self.getAncestorLabels()
- labelsWithAncestors = []
- for l in labels:
- for an in ancestors:
- labelsWithAncestors.append(an)
- labelsWithAncestors.append(an + " " + l)
- labelsWithAncestors.append(l)
- contentHandler.addLabelAction(LabelAction(labelsWithAncestors))
- self.labelStack.append(labels)
- return self.isBlockLevel
-
- def end(self, contentHandler, tagName):
- """ generated source for method end """
- self.labelStack.pop()
- return self.isBlockLevel
-
- def changesTagLevel(self):
- """ generated source for method changesTagLevel """
- return self.isBlockLevel
-
- def getAncestorLabels(self):
- """ generated source for method getAncestorLabels """
- labelSet = set()
- for labels in labelStack:
- if labels == None:continue
- labelSet.update(labels)
- return labelSet
-
-
-class CommonTagActions:
- TA_IGNORABLE_ELEMENT=IgnorableElementTagAction()
- TA_ANCHOR_TEXT=AnchorTextTagAction()
- TA_BODY=BodyTagAction()
- TA_INLINE_WHITESPACE=InlineWhitespaceTagAction()
- TA_INLINE_NO_WHITESPACE=InlineTagAction()
- TA_BLOCK_LEVEL=BlockTagAction()
- TA_FONT=FontTagAction()
+ """ generated source for class MarkupTagAction """
+
+ def __init__(self, isBlockLevel):
+ """ generated source for method __init__ """
+ super(MarkupTagAction, self).__init__()
+ self.isBlockLevel = isBlockLevel
+ self.labelStack = []
+
+ PAT_NUM = re.compile("[0-9]+")
+
+ def start(self, contentHandler, tagName, attrs):
+ """ generated source for method start """
+ labels = []
+ labels.append(DefaultLabels.MARKUP_PREFIX + tagName)
+ classVal = attrs.getValue("class")
+ if classVal != None and len(classVal)>0:
+ classVal = self.PAT_NUM.sub("#",classVal).strip()
+ vals = classVal.split(r"[ ]+")
+ labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.'))
+ if len(vals)>1:
+ for s in vals:
+ labels.append(DefaultLabels.MARKUP_PREFIX + "." + s)
+ id = attrs.get("id")
+ if id != None and len(id)<0:
+ id = self.PAT_NUM.sub("#",id)
+ labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id)
+ ancestors = self.getAncestorLabels()
+ labelsWithAncestors = []
+ for l in labels:
+ for an in ancestors:
+ labelsWithAncestors.append(an)
+ labelsWithAncestors.append(an + " " + l)
+ labelsWithAncestors.append(l)
+ contentHandler.addLabelAction(LabelAction(labelsWithAncestors))
+ self.labelStack.append(labels)
+ return self.isBlockLevel
+
+ def end(self, contentHandler, tagName):
+ """ generated source for method end """
+ self.labelStack.pop()
+ return self.isBlockLevel
+
+ def changesTagLevel(self):
+ """ generated source for method changesTagLevel """
+ return self.isBlockLevel
+
+ def getAncestorLabels(self):
+ """ generated source for method getAncestorLabels """
+ labelSet = set()
+ for labels in self.labelStack:
+ if labels == None:continue
+ labelSet.update(labels)
+ return labelSet
+
+
+class CommonTagActions(object):
+ TA_IGNORABLE_ELEMENT=IgnorableElementTagAction()
+ TA_ANCHOR_TEXT=AnchorTextTagAction()
+ TA_BODY=BodyTagAction()
+ TA_INLINE_WHITESPACE=InlineWhitespaceTagAction()
+ TA_INLINE_NO_WHITESPACE=InlineTagAction()
+ TA_BLOCK_LEVEL=BlockTagAction()
+ TA_FONT=FontTagAction()
defaultTagActionMap={
- "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "APPLET" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- #Note: link removed because it can be self-closing in HTML5
- #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT,
- "A" : CommonTagActions.TA_ANCHOR_TEXT,
- "BODY" : CommonTagActions.TA_BODY,
- "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- # New in 1.1 (especially to improve extraction quality from Wikipedia etc.,
- "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- # New in 1.2
- "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE,
- "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE,
- "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
- # could also use TA_FONT
- # added in 1.1.1
- "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT
+ "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "APPLET" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ #Note: link removed because it can be self-closing in HTML5
+ #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT,
+ "A" : CommonTagActions.TA_ANCHOR_TEXT,
+ "BODY" : CommonTagActions.TA_BODY,
+ "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ # New in 1.1 (especially to improve extraction quality from Wikipedia etc.,
+ "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ # New in 1.2
+ "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE,
+ "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE,
+ "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE,
+ # could also use TA_FONT
+ # added in 1.1.1
+ "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT
}
@@ -353,30 +353,30 @@ class CommonTagActions:
# * @author Christian Kohlschtter
#
class LabelAction(object):
- def __init__(self, *labels):
- self.labels = labels
+ def __init__(self, *labels):
+ self.labels = labels
- def addTo(self, textBlock):
- self.addLabelsTo(textBlock)
+ def addTo(self, textBlock):
+ self.addLabelsTo(textBlock)
- def addLabelsTo(self, textBlock):
- textBlock.addLabels(self.labels)
+ def addLabelsTo(self, textBlock):
+ textBlock.addLabels(self.labels)
- def __str__(self):
- return str(self.labels)
+ def __str__(self):
+ return str(self.labels)
class ConditionalLabelAction(LabelAction):
- def __init__(self, condition, *labels):
- super(ConditionalLabelAction, self).__init__(*labels)
- self.condition = condition
+ def __init__(self, condition, *labels):
+ super(ConditionalLabelAction, self).__init__(*labels)
+ self.condition = condition
- def addTo(self, textBlock):
- if self.condition(textBlock): self.addLabelsTo(textBlock)
+ def addTo(self, textBlock):
+ if self.condition(textBlock): self.addLabelsTo(textBlock)
-class SpecialTokens:
- ANCHOR_TEXT_START = u'\ue00astart'
- ANCHOR_TEXT_END = u'\ue00aend'
+class SpecialTokens(object):
+ ANCHOR_TEXT_START = u'\ue00astart'
+ ANCHOR_TEXT_END = u'\ue00aend'
#----------------------------------------------------------------------------
@@ -392,297 +392,296 @@ class SpecialTokens:
class BoilerpipeBaseParser(object):
- EVENT_START_TAG=0
- EVENT_END_TAG=1
- EVENT_CHARACTERS=2
- EVENT_WHITESPACE=3
- #all word characters except underscore -- i.e. not (not word or underscore)
- PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE)
-# PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE)
- PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE)
-
- """ generated source for class BoilerpipeHTMLContentHandler """
- #
- # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
- # * {@link TagActionMap}.
- # *
- # * @param tagActions
- # * The {@link TagActionMap} to use, e.g.
- # * {@link DefaultTagActionMap}.
- #
- def __init__(self, tagActions=None):
- """ generated source for method __init___0 """
- #super(BoilerpipeHTMLContentHandler, self).__init__()
- if tagActions==None: self.tagActions=defaultTagActionMap
- else: self.tagActions = tagActions
-
-
- self.clearTextBuffer()
- self.inBody = 0
- self.inAnchor = 0
- self.inIgnorableElement = 0
- self.textElementIdx = 0
- self.lastStartTag = None
- self.lastEndTag = None
- self.lastEvent = None
- self.offsetBlocks = 0
- self.currentContainedTextElements=set()
- self.flush = False
- self.inAnchorText = False
-
- self.title = None
- self.tagLevel = 0
- self.blockTagLevel = -1
- self.textBlocks = []
- self.labelStacks = []
- self.fontSizeStack = []
-
- #
- # * Recycles this instance.
- #
- def recycle(self):
- """ generated source for method recycle """
- self.clearTextBuffer()
- self.inBody = 0
- self.inAnchor = 0
- self.inIgnorableElement = 0
- self.textElementIdx = 0
- self.lastStartTag = None
- self.lastEndTag = None
- self.lastEvent = None
- self.offsetBlocks = 0
- self.currentContainedTextElements=set()
- self.flush = False
- self.inAnchorText = False
- self.textBlocks=[]
-
- #--------- added -------
- self.title = None
- self.tagLevel = 0
- self.blockTagLevel = -1
- self.labelStacks = []
- self.fontSizeStack = []
+ EVENT_START_TAG=0
+ EVENT_END_TAG=1
+ EVENT_CHARACTERS=2
+ EVENT_WHITESPACE=3
+ #all word characters except underscore -- i.e. not (not word or underscore)
+ PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE)
+ PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE)
+
+ """ generated source for class BoilerpipeHTMLContentHandler """
+ #
+ # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
+ # * {@link TagActionMap}.
+ # *
+ # * @param tagActions
+ # * The {@link TagActionMap} to use, e.g.
+ # * {@link DefaultTagActionMap}.
+ #
+ def __init__(self, tagActions=None):
+ """ generated source for method __init___0 """
+ #super(BoilerpipeHTMLContentHandler, self).__init__()
+ if tagActions==None: self.tagActions=defaultTagActionMap
+ else: self.tagActions = tagActions
+
+
+ self.clearTextBuffer()
+ self.inBody = 0
+ self.inAnchor = 0
+ self.inIgnorableElement = 0
+ self.textElementIdx = 0
+ self.lastStartTag = None
+ self.lastEndTag = None
+ self.lastEvent = None
+ self.offsetBlocks = 0
+ self.currentContainedTextElements=set()
+ self.flush = False
+ self.inAnchorText = False
+
+ self.title = None
+ self.tagLevel = 0
+ self.blockTagLevel = -1
+ self.textBlocks = []
+ self.labelStacks = []
+ self.fontSizeStack = []
+
+ #
+ # * Recycles this instance.
+ #
+ def recycle(self):
+ """ generated source for method recycle """
+ self.clearTextBuffer()
+ self.inBody = 0
+ self.inAnchor = 0
+ self.inIgnorableElement = 0
+ self.textElementIdx = 0
+ self.lastStartTag = None
+ self.lastEndTag = None
+ self.lastEvent = None
+ self.offsetBlocks = 0
+ self.currentContainedTextElements=set()
+ self.flush = False
+ self.inAnchorText = False
+ self.textBlocks=[]
+
+ #--------- added -------
+ self.title = None
+ self.tagLevel = 0
+ self.blockTagLevel = -1
+ self.labelStacks = []
+ self.fontSizeStack = []
#------------------------------- SAX Parser methods ----------------------------------------
- # @Override
- def endDocument(self):
- """ generated source for method endDocument """
- self.flushBlock()
-
- # @Override
- def startDocument(self): pass
-
- # @Override
- def startElement(self, name,attrs):
- self.labelStacks.append([])
-
- tagAction = self.tagActions.get(name.strip().upper())
-
- if tagAction != None:
- self.flush |= tagAction.start(self, name, attrs)
- if tagAction.changesTagLevel(): self.tagLevel += 1
- else:
- self.tagLevel += 1
- self.flush = True
- self.lastEvent = self.EVENT_START_TAG
- self.lastStartTag = name
-
- # @Override
- def endElement(self, name):
- tagAction = self.tagActions.get(name.strip().upper())
-
-
- if tagAction != None:
- self.flush |= tagAction.end(self, name)
- if tagAction.changesTagLevel(): self.tagLevel -= 1
- else:
- self.flush = True
- self.tagLevel -= 1
-
- if self.flush: self.flushBlock()
- self.lastEvent = self.EVENT_END_TAG
- self.lastEndTag = name
- self.labelStacks.pop()
-
- # @Override
- def characters(self, content):
- self.textElementIdx += 1
- if self.flush:
- self.flushBlock()
- self.flush = False
- if self.inIgnorableElement != 0: return
-
- if len(content) == 0: return
-
- strippedContent=content.strip()
-
- if len(strippedContent) == 0:
- self.addWhitespaceIfNecessary()
- self.lastEvent = self.EVENT_WHITESPACE
- return
-
- startWhitespace=content[0].isspace()
- if startWhitespace: self.addWhitespaceIfNecessary()
-
- if self.blockTagLevel == -1:
- self.blockTagLevel = self.tagLevel
- self.textBuffer+=strippedContent
- self.tokenBuffer+=strippedContent
-
- endWhitespace=content[-1].isspace()
- if endWhitespace: self.addWhitespaceIfNecessary()
-
- self.lastEvent = self.EVENT_CHARACTERS
- self.currentContainedTextElements.add(self.textElementIdx)
-
- # @Override
- def ignorableWhitespace(self, whitespace):
- self.addWhitespaceIfNecessary()
+ # @Override
+ def endDocument(self):
+ """ generated source for method endDocument """
+ self.flushBlock()
+
+ # @Override
+ def startDocument(self): pass
+
+ # @Override
+ def startElement(self, name,attrs):
+ self.labelStacks.append([])
+
+ tagAction = self.tagActions.get(name.strip().upper())
+
+ if tagAction != None:
+ self.flush |= tagAction.start(self, name, attrs)
+ if tagAction.changesTagLevel(): self.tagLevel += 1
+ else:
+ self.tagLevel += 1
+ self.flush = True
+ self.lastEvent = self.EVENT_START_TAG
+ self.lastStartTag = name
+
+ # @Override
+ def endElement(self, name):
+ tagAction = self.tagActions.get(name.strip().upper())
+
+
+ if tagAction != None:
+ self.flush |= tagAction.end(self, name)
+ if tagAction.changesTagLevel(): self.tagLevel -= 1
+ else:
+ self.flush = True
+ self.tagLevel -= 1
+
+ if self.flush: self.flushBlock()
+ self.lastEvent = self.EVENT_END_TAG
+ self.lastEndTag = name
+ self.labelStacks.pop()
+
+ # @Override
+ def characters(self, content):
+ self.textElementIdx += 1
+ if self.flush:
+ self.flushBlock()
+ self.flush = False
+ if self.inIgnorableElement != 0: return
+
+ if len(content) == 0: return
+
+ strippedContent=content.strip()
+
+ if len(strippedContent) == 0:
+ self.addWhitespaceIfNecessary()
+ self.lastEvent = self.EVENT_WHITESPACE
+ return
+
+ startWhitespace=content[0].isspace()
+ if startWhitespace: self.addWhitespaceIfNecessary()
+
+ if self.blockTagLevel == -1:
+ self.blockTagLevel = self.tagLevel
+ self.textBuffer+=strippedContent
+ self.tokenBuffer+=strippedContent
+
+ endWhitespace=content[-1].isspace()
+ if endWhitespace: self.addWhitespaceIfNecessary()
+
+ self.lastEvent = self.EVENT_CHARACTERS
+ self.currentContainedTextElements.add(self.textElementIdx)
+
+ # @Override
+ def ignorableWhitespace(self, whitespace):
+ self.addWhitespaceIfNecessary()
#------------------------------- utility methods ----------------------------------------
- def flushBlock(self):
- """ generated source for method flushBlock """
- if self.inBody == 0:
- if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip())
- self.clearTextBuffer()
- return
- if len(self.tokenBuffer.strip())==0:
- self.clearTextBuffer()
- return
-
- tokens = self.tokenize(self.tokenBuffer)
- numWords = 0
- numLinkedWords = 0
- numWrappedLines = 0
- currentLineLength = -1
- # don't count the first space
- maxLineLength = 80
- numTokens = 0
- numWordsCurrentLine = 0
-
- for token in tokens:
- if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True
- elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False
- elif self.isWord(token):
- numTokens += 1
- numWords += 1
- numWordsCurrentLine += 1
- if self.inAnchorText:
- numLinkedWords += 1
- currentLineLength += len(token) + 1
- if currentLineLength > maxLineLength:
- numWrappedLines += 1
- currentLineLength = len(token)
- numWordsCurrentLine = 1
- else:
- numTokens += 1
-
- #if only special tokens (numTokens excludes special tokens)
- if numTokens == 0:
- self.clearTextBuffer()
- return
-
- if numWrappedLines == 0:
- numWordsInWrappedLines = numWords
- numWrappedLines = 1
- else:
- numWordsInWrappedLines = numWords - numWordsCurrentLine
-
- tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks)
- self.currentContainedTextElements = set()
- self.offsetBlocks += 1
- self.clearTextBuffer()
- tb.setTagLevel(self.blockTagLevel)
- self.addTextBlock(tb)
- self.blockTagLevel = -1
-
- def addTextBlock(self, tb):
- """ generated source for method addTextBlock """
- for fontSize in self.fontSizeStack[::-1]:
- if fontSize != None:
- tb.addLabel("font-" + str(fontSize))
- break
- for labelStack in self.labelStacks:
- for labels in labelStack:
- labels.addTo(tb)
- self.textBlocks.append(tb)
-
-
- def isWord(self, token):
- """ generated source for method isWord """
- return self.PAT_VALID_WORD_CHARACTER.search(token)!=None
-
- def tokenize(self,text):
- return self.PAT_WORD.findall(text)
-
- def getTextBlocks(self):
- """ generated source for method getTextBlocks """
- return self.textBlocks
-
- def getTitle(self):
- """ generated source for method getTitle """
- return self.title
-
- def setTitle(self, s):
- """ generated source for method setTitle """
- if s == None or len(s)==0: return
- self.title = s
-
- #
- # * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
- # * s. NOTE: Only call this after parsing.
- # *
- # * @return The {@link TextDocument}
- #
- def toTextDocument(self):
- """ generated source for method toTextDocument """
- # just to be sure
- self.flushBlock()
- return document.TextDocument(self.getTextBlocks(), self.getTitle())
-
- def addWhitespaceIfNecessary(self):
- """ generated source for method addWhitespaceIfNecessary """
- if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace():
- self.textBuffer+=' '
- if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace():
- self.tokenBuffer+=' '
-
- def clearTextBuffer(self):
- self.textBuffer=''
- self.tokenBuffer=''
-
- def addToken(self,token):
- self.addWhitespaceIfNecessary()
- self.tokenBuffer+=token
- self.addWhitespaceIfNecessary()
-
- def addLabelAction(self, la):
- """ generated source for method addLabelAction """
- if len(self.labelStacks)==0: self.labelStacks.append([])
- self.labelStacks[-1].append(la)
+ def flushBlock(self):
+ """ generated source for method flushBlock """
+ if self.inBody == 0:
+ if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip())
+ self.clearTextBuffer()
+ return
+ if len(self.tokenBuffer.strip())==0:
+ self.clearTextBuffer()
+ return
+
+ tokens = self.tokenize(self.tokenBuffer)
+ numWords = 0
+ numLinkedWords = 0
+ numWrappedLines = 0
+ currentLineLength = -1
+ # don't count the first space
+ maxLineLength = 80
+ numTokens = 0
+ numWordsCurrentLine = 0
+
+ for token in tokens:
+ if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True
+ elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False
+ elif self.isWord(token):
+ numTokens += 1
+ numWords += 1
+ numWordsCurrentLine += 1
+ if self.inAnchorText:
+ numLinkedWords += 1
+ currentLineLength += len(token) + 1
+ if currentLineLength > maxLineLength:
+ numWrappedLines += 1
+ currentLineLength = len(token)
+ numWordsCurrentLine = 1
+ else:
+ numTokens += 1
+
+ #if only special tokens (numTokens excludes special tokens)
+ if numTokens == 0:
+ self.clearTextBuffer()
+ return
+
+ if numWrappedLines == 0:
+ numWordsInWrappedLines = numWords
+ numWrappedLines = 1
+ else:
+ numWordsInWrappedLines = numWords - numWordsCurrentLine
+
+ tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks)
+ self.currentContainedTextElements = set()
+ self.offsetBlocks += 1
+ self.clearTextBuffer()
+ tb.setTagLevel(self.blockTagLevel)
+ self.addTextBlock(tb)
+ self.blockTagLevel = -1
+
+ def addTextBlock(self, tb):
+ """ generated source for method addTextBlock """
+ for fontSize in self.fontSizeStack[::-1]:
+ if fontSize != None:
+ tb.addLabel("font-" + str(fontSize))
+ break
+ for labelStack in self.labelStacks:
+ for labels in labelStack:
+ labels.addTo(tb)
+ self.textBlocks.append(tb)
+
+
+ def isWord(self, token):
+ """ generated source for method isWord """
+ return self.PAT_VALID_WORD_CHARACTER.search(token)!=None
+
+ def tokenize(self,text):
+ return self.PAT_WORD.findall(text)
+
+ def getTextBlocks(self):
+ """ generated source for method getTextBlocks """
+ return self.textBlocks
+
+ def getTitle(self):
+ """ generated source for method getTitle """
+ return self.title
+
+ def setTitle(self, s):
+ """ generated source for method setTitle """
+ if s == None or len(s)==0: return
+ self.title = s
+
+ #
+ # * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
+ # * s. NOTE: Only call this after parsing.
+ # *
+ # * @return The {@link TextDocument}
+ #
+ def toTextDocument(self):
+ """ generated source for method toTextDocument """
+ # just to be sure
+ self.flushBlock()
+ return document.TextDocument(self.getTextBlocks(), self.getTitle())
+
+ def addWhitespaceIfNecessary(self):
+ """ generated source for method addWhitespaceIfNecessary """
+ if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace():
+ self.textBuffer+=' '
+ if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace():
+ self.tokenBuffer+=' '
+
+ def clearTextBuffer(self):
+ self.textBuffer=''
+ self.tokenBuffer=''
+
+ def addToken(self,token):
+ self.addWhitespaceIfNecessary()
+ self.tokenBuffer+=token
+ self.addWhitespaceIfNecessary()
+
+ def addLabelAction(self, la):
+ """ generated source for method addLabelAction """
+ if len(self.labelStacks)==0: self.labelStacks.append([])
+ self.labelStacks[-1].append(la)
class BoilerpipeHTMLParser(HTMLParser,BoilerpipeBaseParser):
- def __init__(self):
- HTMLParser.__init__(self)
- BoilerpipeBaseParser.__init__(self)
-
- def feed(self,data):
- self.startDocument()
- HTMLParser.feed(self,data)
- self.endDocument()
-
- def handle_starttag(self, tag, attrs): self.startElement(tag,attrs)
- def handle_endtag(self, tag): self.endElement(tag)
- def handle_data(self, data): self.characters(data)
+ def __init__(self):
+ HTMLParser.__init__(self)
+ BoilerpipeBaseParser.__init__(self)
+
+ def feed(self,data):
+ self.startDocument()
+ HTMLParser.feed(self,data)
+ self.endDocument()
+
+ def handle_starttag(self, tag, attrs): self.startElement(tag,attrs)
+ def handle_endtag(self, tag): self.endElement(tag)
+ def handle_data(self, data): self.characters(data)
class BoilerpipeSAXContentHandler(ContentHandler,BoilerpipeBaseParser):
- def __init__(self):
- ContentHandler.__init__(self)
- BoilerpipeBaseParser.__init__(self)
+ def __init__(self):
+ ContentHandler.__init__(self)
+ BoilerpipeBaseParser.__init__(self)
diff --git a/dist/boilerpy-1.0.zip b/dist/boilerpy-1.0.zip
deleted file mode 100644
index a849c7c..0000000
Binary files a/dist/boilerpy-1.0.zip and /dev/null differ
diff --git a/setup.py b/setup.py
index 6088060..aebbf74 100644
--- a/setup.py
+++ b/setup.py
@@ -6,22 +6,22 @@
# README file and 2) it's easier to type in the README file than to put a raw
# string in below ...
def read(fname):
- return open(os.path.join(os.path.dirname(__file__), fname)).read()
+ return open(os.path.join(os.path.dirname(__file__), fname)).read()
setup(
- name = "boilerpy",
- version = "1.0",
- author = "Sam Myer",
- author_email = "mail@frozencavemanmedia.com",
- description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages",
- license = "Apache 2.0",
- keywords = "boilerpipe fulltext extraction",
- url = "https://github.com/sammyer/BoilerPy",
- packages=['boilerpy'],
- long_description=read('README.txt'),
- classifiers=[
- "Development Status :: 4 - Beta",
- "Topic :: Utilities",
- "License :: OSI Approved :: Apache License",
- ]
-)
\ No newline at end of file
+ name = "boilerpy",
+ version = "1.0",
+ author = "Sam Myer",
+ author_email = "mail@frozencavemanmedia.com",
+ description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages",
+ license = "Apache 2.0",
+ keywords = "boilerpipe fulltext extraction",
+ url = "https://github.com/sammyer/BoilerPy",
+ packages=['boilerpy'],
+ long_description=read('README.txt'),
+ classifiers=[
+ "Development Status :: 4 - Beta",
+ "Topic :: Utilities",
+ "License :: OSI Approved :: Apache License",
+ ]
+)
diff --git a/tests/unittests.py b/tests/unittests.py
index ac0a97e..f12ab6e 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -1,420 +1,434 @@
import unittest
import sys
+from unittest import mock
+
from boilerpy.document import TextDocument,TextBlock
from boilerpy.filters import *
from boilerpy.extractors import Extractor
def runTests():
- suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters)
- unittest.TextTestRunner(verbosity=2).run(suite)
- suite = unittest.TestLoader().loadTestsFromTestCase(TestParser)
- unittest.TextTestRunner(verbosity=2).run(suite)
+ suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters)
+ unittest.TextTestRunner(verbosity=2).run(suite)
+ suite = unittest.TestLoader().loadTestsFromTestCase(TestParser)
+ unittest.TextTestRunner(verbosity=2).run(suite)
def runOneTest():
- testName='test_anchor'
- suite = unittest.TestSuite()
- suite.addTest(TestParser(testName))
- unittest.TextTestRunner(verbosity=2).run(suite)
+ testName='test_anchor'
+ suite = unittest.TestSuite()
+ suite.addTest(TestParser(testName))
+ unittest.TextTestRunner(verbosity=2).run(suite)
class TestFilters(unittest.TestCase):
- defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ')
-
- def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None):
- textBlocks=[]
- for idx,words in enumerate(wordsArr):
- if type(words)==int:
- numWords=words
- text=' '.join(self.defaultWords[:numWords])
- else:
- text=words
- numWords=text.count(' ')
- try:
- numAnchorWords=numAnchorWordsArr[idx]
- except TypeError,IndexError:
- numAnchorWords=0
- block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx)
- try:
- block.setIsContent(isContentArr[idx])
- except TypeError,IndexError:
- pass
- try:
- label=labelArr[idx]
- if label==None: pass
- elif type(label)==list:
- for l in label: block.addLabel(l)
- else: block.addLabel(label)
- except TypeError,IndexError:
- pass
-
- textBlocks.append(block)
-
- return TextDocument(textBlocks)
-
- def verifyContent(self,filtr,doc,contentArr,show=False):
- isContentBefore=[block.isContent() for block in doc.getTextBlocks()]
- isChanged=filtr.process(doc)
- isContent=[block.isContent() for block in doc.getTextBlocks()]
- self.assertEqual(isContent,contentArr)
- self.assertEqual(isChanged,isContent!=isContentBefore)
-
- def test_markEveryhingContent(self):
- doc=self.makedoc([5,100,80],None,[False,True,False])
- self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True])
-
- def test_inverted(self):
- doc=self.makedoc([5,100,80],None,[False,True,False])
- self.verifyContent(InvertedFilter(),doc,[True,False,True])
-
- def test_boilerplateBlock(self):
- #keeps if isContent
- doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False])
- initBlocks=doc.getTextBlocks()
- finalBlocks=[initBlocks[1],initBlocks[3]]
- filtr=BoilerplateBlockFilter()
- isChanged=filtr.process(doc)
- isContent=[block.isContent() for block in doc.getTextBlocks()]
- self.assertEqual(doc.getTextBlocks(),finalBlocks)
- self.assertEqual(isContent,[True,True])
- self.assertEqual(isChanged,True)
-
- def test_minWords(self):
- #rejects if #words*
*
**
*
" - content=self.makecontent([6,"end with space ",3,6]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - densityArr=[block.getLinkDensity() for block in blocks] - numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] - self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) - self.assertEqual(numAnchorWords,[0,3,6]) - self.assertEqual(densityArr,[0.0,0.5,1.0]) - - def test_title(self): - titleText="THIS IS TITLE" - s="THIS IS CONTENT
" - doc=self.extractor.parseDoc(s) - self.assertEqual(doc.getTitle(),titleText) - - def test_body(self): - bodyText="THIS IS CONTENT" - s="NOT IN BODY
"+bodyText+"
" - doc=self.extractor.parseDoc(s) - textArr=[block.getText() for block in doc.getTextBlocks()] - self.assertEqual(textArr,[bodyText]) - - def test_inline(self): - template="*
" - content=self.makecontent([10,12]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - self.assertEqual(textArr,[content[0]]) - - def assertRange(self,val,minval,maxval): - self.assertTrue(val>=minval and val<=maxval) - - def test_textDensity(self): - template="*
*
" - content=self.makecontent([80,"one, !!! two"]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] - - #exact values are unknown, approximate value range to check - self.assertEqual(blocks[0].getNumWords(),80) - self.assertRange(blocks[0].numWordsInWrappedLines,60,80) - self.assertRange(blocks[0].numWrappedLines,4,7) - self.assertRange(blocks[0].getTextDensity(),8,16) - - self.assertEqual(numArr[1],[2,2,1,2]) - - def test_blockIdxs(self): - template="*
*
*
*
" - content=self.makecontent([11,12,13,14]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] - self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) - - def test_tagLevel(self): - template="*
*
**
*
" + content=self.makecontent([6,"end with space ",3,6]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + densityArr=[block.getLinkDensity() for block in blocks] + numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] + self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) + self.assertEqual(numAnchorWords,[0,3,6]) + self.assertEqual(densityArr,[0.0,0.5,1.0]) + + def test_title(self): + titleText="THIS IS TITLE" + s="THIS IS CONTENT
" + doc=self.extractor.parseDoc(s) + self.assertEqual(doc.getTitle(),titleText) + + def test_body(self): + bodyText="THIS IS CONTENT" + s="NOT IN BODY
"+bodyText+"
" + doc=self.extractor.parseDoc(s) + textArr=[block.getText() for block in doc.getTextBlocks()] + self.assertEqual(textArr,[bodyText]) + + def test_inline(self): + template="*
" + content=self.makecontent([10,12]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + self.assertEqual(textArr,[content[0]]) + + def assertRange(self,val,minval,maxval): + self.assertTrue(val>=minval and val<=maxval) + + def test_textDensity(self): + template="*
*
" + content=self.makecontent([80,"one, !!! two"]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] + + #exact values are unknown, approximate value range to check + self.assertEqual(blocks[0].getNumWords(),80) + self.assertRange(blocks[0].numWordsInWrappedLines,60,80) + self.assertRange(blocks[0].numWrappedLines,4,7) + self.assertRange(blocks[0].getTextDensity(),8,16) + + self.assertEqual(numArr[1],[2,2,1,2]) + + def test_blockIdxs(self): + template="*
*
*
*
" + content=self.makecontent([11,12,13,14]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] + self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) + + def test_tagLevel(self): + template="