Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion boilerpipe-common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,42 @@
<!-- Exposes the parent project's base directory under a boilerpipe-specific property name. -->
<properties>
<boilerpipe.parent.base.directory>${project.parent.basedir}</boilerpipe.parent.base.directory>
</properties>

<dependencies>

  <!-- Xerces XML parser implementation. -->
  <dependency>
    <groupId>xerces</groupId>
    <artifactId>xercesImpl</artifactId>
    <version>2.9.1</version>
  </dependency>

  <!-- jsoup HTML parser. -->
  <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
  </dependency>

  <!-- NekoHTML under its original coordinates; currently disabled. -->
  <!--
  <dependency>
    <groupId>net.sourceforge.nekohtml</groupId>
    <artifactId>nekohtml</artifactId>
    <version>1.9.13</version>
  </dependency>
  -->

  <!-- NekoHTML as published under the boilerpipe group id. -->
  <dependency>
    <groupId>com.kohlschutter.boilerpipe</groupId>
    <artifactId>nekohtml-relocated</artifactId>
    <version>1.9.13</version>
  </dependency>

  <!-- Shared test helpers; only on the test classpath. -->
  <dependency>
    <groupId>com.kohlschutter.boilerpipe</groupId>
    <artifactId>boilerpipe-test-framework</artifactId>
    <version>2.0-SNAPSHOT</version>
    <scope>test</scope>
  </dependency>

</dependencies>

</project>
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,24 @@ public int getOffsetBlocksEnd() {
return offsetBlocksEnd;
}

/**
 * Returns a debugging representation of this block.
 *
 * <p>The result has the form {@code TextBlock{name=value, ...}} and lists the
 * content flag, the text, the labels, the block offsets and the word, line,
 * density and tag-level statistics held by this instance.
 *
 * @return a single string containing every field listed above
 */
@Override
public String toString() {
    return "TextBlock{" +
        "isContent=" + isContent +
        ", text=" + text +
        ", labels=" + labels +
        ", offsetBlocksStart=" + offsetBlocksStart +
        ", offsetBlocksEnd=" + offsetBlocksEnd +
        ", numWords=" + numWords +
        ", numWordsInAnchorText=" + numWordsInAnchorText +
        ", numWordsInWrappedLines=" + numWordsInWrappedLines +
        ", numWrappedLines=" + numWrappedLines +
        ", textDensity=" + textDensity +
        ", linkDensity=" + linkDensity +
        ", containedTextElements=" + containedTextElements +
        ", numFullTextWords=" + numFullTextWords +
        ", tagLevel=" + tagLevel +
        '}';
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,13 @@ public TextDocument clone() {
}
return new TextDocument(title, list);
}

/**
 * Returns a debugging representation of this document.
 *
 * @return a string of the form {@code TextDocument{textBlocks=..., title='...'}}
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder("TextDocument{");
    out.append("textBlocks=").append(textBlocks);
    out.append(", title='").append(title).append('\'');
    out.append('}');
    return out.toString();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import com.kohlschutter.boilerpipe.BoilerpipeDocumentSource;
import com.kohlschutter.boilerpipe.document.TextBlock;
import com.kohlschutter.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLConfiguration;

/**
* A simple SAX Parser, used by {@link BoilerpipeSAXInput}. The parser uses <a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.util.regex.Pattern;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
Expand All @@ -38,7 +39,6 @@
import com.kohlschutter.boilerpipe.BoilerpipeProcessingException;
import com.kohlschutter.boilerpipe.document.TextBlock;
import com.kohlschutter.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.org.cyberneko.html.HTMLConfiguration;

/**
* Highlights text blocks in an HTML document that have been marked as "content" in the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.util.Map;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
Expand All @@ -38,7 +39,6 @@
import com.kohlschutter.boilerpipe.document.Image;
import com.kohlschutter.boilerpipe.document.TextBlock;
import com.kohlschutter.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.org.cyberneko.html.HTMLConfiguration;

/**
* Extracts the images that are enclosed by extracted content.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package com.kohlschutter.boilerpipe;

import com.kohlschutter.boilerpipe.corpora.CorporaAsserter;
import com.kohlschutter.boilerpipe.corpora.CorporaCache;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.io.IOException;
import java.net.URL;

/**
 * Tests using real world documents to look at real world accuracy.
 *
 * <p>Each page is run through {@link ArticleExtractor} and the extracted text
 * is handed to {@link CorporaAsserter} under a per-page key. Page HTML is
 * taken from a {@link CorporaCache} when it has an entry for the link;
 * otherwise the page is downloaded and stored in the cache first.
 */
public class CorporaTests {

    CorporaCache corporaCache = new CorporaCache(getClass());
    CorporaAsserter corporaAsserter = new CorporaAsserter(getClass());

    /**
     * Checks every known page, in order. The first failing page aborts the run.
     */
    @Test
    public void testAll() throws Exception {
        // Each row is { link, key passed to the asserter }.
        final String[][] pages = {
            {"http://www.bbc.com/news/world-europe-31669061", "testBbc1"},
            {"http://www.cnn.com/2015/02/26/us/arizona-llamas-escape/index.html", "testCnn1"},
            {"http://www.cnn.com/videos/us/2015/02/26/pkg-woman-loses-over-800-pounds.ktrk", "testCnn2"},
            {"http://www.cnn.com/2015/02/27/world/mexico-knights-templar-leader-detained/index.html", "testCnn3"},
            {"http://techcrunch.com/2015/02/27/nsa-bullk-telephony-metadata-program-reupped-until-parts-of-the-patriot-act-potentially-sunset/", "testTechcrunch1"},
        };

        for (String[] page : pages) {
            test(page[0], page[1]);
        }
    }

    /**
     * Extracts the article text of one page and asserts it under {@code key}.
     *
     * @param link address of the page to test
     * @param key  name under which the extracted text is asserted
     */
    private void test(String link, String key) throws Exception {
        final String page = read(link);
        final String extracted = ArticleExtractor.getInstance().getText(page);
        corporaAsserter.assertCorpora(key, extracted);
    }

    /**
     * Derives a cache key from a link by replacing each of the characters
     * {@code : / ? = & %} with an underscore.
     */
    private String key(String link) {
        return link.replaceAll("[:/?=&%]", "_");
    }

    /**
     * Returns the HTML for a link, consulting the cache before the network.
     *
     * @param link address of the page
     * @return cached HTML when present, otherwise freshly fetched HTML (which
     *         is written to the cache before being returned)
     */
    private String read(String link) throws IOException {
        final String cacheKey = key(link);

        if (!corporaCache.contains(cacheKey)) {
            final String fetched = fetch(link);
            corporaCache.write(cacheKey, fetched);
            return fetched;
        }

        return corporaCache.read(cacheKey);
    }

    /**
     * Downloads the given link over the network and returns the parsed
     * document's outer HTML as produced by jsoup.
     */
    private String fetch(String link) throws IOException {
        // TODO: should we strip any charset in the <meta> since after this we
        // always save as UTF8 ?
        final Document parsed = Jsoup.parse(new URL(link), 30000);
        return parsed.outerHtml();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package com.kohlschutter.boilerpipe;

import com.kohlschutter.boilerpipe.corpora.CorporaAsserter;
import com.kohlschutter.boilerpipe.corpora.Formatter;
import com.kohlschutter.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput;
import com.kohlschutter.boilerpipe.sax.HTMLDocument;
import com.kohlschutter.boilerpipe.sax.HTMLFetcher;
import org.junit.Test;

import java.net.URL;

import static com.kohlschutter.boilerpipe.corpora.Formatter.*;

/**
 * Parses HTML fixtures loaded from the classpath into {@link TextDocument}s
 * and checks the resulting text blocks through {@link CorporaAsserter}.
 */
public class TextDocumentParserTest {

    CorporaAsserter corporaAsserter = new CorporaAsserter( getClass() );

    /** Asserts the string form of the text blocks parsed from {@code /test1.html}. */
    @Test
    public void testBasicDocument1() throws Exception {

        TextDocument doc = parse( "/test1.html" );

        corporaAsserter.assertCorpora( "testBasicDocument1", doc.getTextBlocks().toString() );

    }

    /** Asserts the string form of the text blocks parsed from {@code /test2.html}. */
    @Test
    public void testBasicDocument2() throws Exception {

        TextDocument doc = parse( "/test2.html" );

        corporaAsserter.assertCorpora( "testBasicDocument2", doc.getTextBlocks().toString() );

    }

    /**
     * Asserts the text blocks parsed from {@code /test3.html}, rendered with
     * {@code table(...)} (presumably supplied by the static import of
     * {@code Formatter} -- confirm) rather than {@code toString()}.
     */
    @Test
    public void testBasicDocument3() throws Exception {

        TextDocument doc = parse( "/test3.html" );

        // I can use jsoup for this by taking the code from flushBlock in
        // boilerpipe.sax.BoilerpipeHTMLContentHandler and doing a jsoup query
        // for div,p ... in jsoup and then taking the text from the elements to
        // build a TextBlock

        corporaAsserter.assertCorpora( "testBasicDocument3", table( doc.getTextBlocks() ) );

    }

    /**
     * Loads a classpath resource and parses it into a {@link TextDocument}.
     *
     * @param path resource path, resolved with {@link Class#getResource(String)}
     * @return the document produced by {@link BoilerpipeSAXInput}
     * @throws IllegalArgumentException if no resource exists at {@code path}
     * @throws Exception if fetching or parsing the resource fails
     */
    private TextDocument parse( String path ) throws Exception {

        URL url = getClass().getResource( path );

        // getResource signals a missing resource by returning null; fail here
        // with the offending path instead of passing null on to the fetcher.
        if ( url == null ) {
            throw new IllegalArgumentException( "Test resource not found on classpath: " + path );
        }

        final HTMLDocument htmlDoc = HTMLFetcher.fetch( url );

        return new BoilerpipeSAXInput(htmlDoc.toInputSource())
            .getTextDocument();

    }

}
Loading