diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index f1fb2c4768..627f5160eb 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1072,6 +1072,12 @@
Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)
+<property>
+  <name>any23.content_types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>Comma-separated list of content-types to which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description>
+</property>
+
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index 4fe651c7eb..e64131046d 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -25,6 +25,7 @@
import java.util.Set;
import java.util.TreeSet;
import java.util.Collections;
+import java.util.Arrays;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
@@ -77,15 +78,16 @@ public class Any23ParseFilter implements HtmlParseFilter {
public final static String ANY23_TRIPLES = "Any23-Triples";
public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
+ public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";
private static class Any23Parser {
Set triples = null;
- Any23Parser(String url, String htmlContent, String... extractorNames) throws TripleHandlerException {
+ Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
triples = new TreeSet();
try {
- parse(url, htmlContent, extractorNames);
+ parse(url, htmlContent, contentType, extractorNames);
} catch (URISyntaxException e) {
throw new RuntimeException(e.getReason());
} catch (IOException e) {
@@ -101,7 +103,7 @@ private Set getTriples() {
return triples;
}
- private void parse(String url, String htmlContent, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
+ private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
Any23 any23 = new Any23(extractorNames);
any23.setMIMETypeDetector(null);
@@ -118,7 +120,7 @@ private void parse(String url, String htmlContent, String... extractorNames) thr
TripleHandler tHandler = new NTriplesWriter(baos);
BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
try {
- any23.extract(input, url, "text/html","UTF-8", bHandler);
+ any23.extract(input, url, contentType, "UTF-8", bHandler);
} catch (IOException e) {
LOG.error("Error while reading the source", e);
} catch (ExtractionException e) {
@@ -154,12 +156,18 @@ public void setConf(Configuration conf) {
*/
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
- String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,"html-head-meta");
+ String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
+ String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
+ String contentType = content.getContentType();
+ if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
+ LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
+ return parseResult;
+ }
Any23Parser parser;
try {
String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
- parser = new Any23Parser(content.getUrl(), htmlContent, extractorNames);
+ parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
} catch (TripleHandlerException e) {
throw new RuntimeException("Error running Any23 parser: " + e.getMessage());
}
@@ -175,4 +183,3 @@ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags
return parseResult;
}
}
-
diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
index dfc0928624..571a3d5a55 100644
--- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
+++ b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
@@ -72,16 +72,12 @@ public void setUp() {
conf.set("file.content.limit", "-1");
conf.set("parser.timeout", "-1");
conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
+ conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
}
@Test
public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {
-
- String urlString = "file:" + sampleDir + fileSeparator + file1;
-
- File file = new File(sampleDir + fileSeparator + file1);
-
- String[] triplesArray = extract(urlString, file);
+ String[] triplesArray = getTriples(file1);
Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter",
EXPECTED_TRIPLES_1, triplesArray.length);
@@ -89,22 +85,27 @@ public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, Par
@Test
public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
- String urlString = "file:" + sampleDir + fileSeparator + file2;
-
- File file = new File(sampleDir + fileSeparator + file2);
-
- String[] triplesArray = extract(urlString, file);
+ String[] triplesArray = getTriples(file2);
Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter",
EXPECTED_TRIPLES_2, triplesArray.length);
}
+
+ @Test
+ public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
+ String[] triplesArray = getTriples(file1, "application/pdf");
+ // setUp() sets any23.content_types to "text/html" only, so "application/pdf" is outside the configured list.
+ Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
+ 0, triplesArray.length);
+ }
- public String[] extract(String urlString, File file) {
+ public String[] extract(String urlString, File file, String contentType) {
try {
System.out.println(urlString);
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
+ content.setContentType(contentType);
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
} catch (Exception e) {
@@ -113,4 +114,16 @@ public String[] extract(String urlString, File file) {
}
return null;
}
+
+ /** Extracts triples from the named sample file, passing "text/html" as the content type. */
+ private String[] getTriples(String fileName) {
+ return getTriples(fileName, "text/html");
+ }
+
+ /** Builds the file: URL and File for a sample under sampleDir and hands both to extract(). */
+ private String[] getTriples(String fileName, String contentType) {
+ String samplePath = sampleDir + fileSeparator + fileName;
+ File sampleFile = new File(samplePath);
+ return extract("file:" + samplePath, sampleFile, contentType);
+ }
}