diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index f1fb2c4768..627f5160eb 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1072,6 +1072,12 @@ Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html) + + any23.content_types + text/html,application/xhtml+xml + Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported. + + diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java index 4fe651c7eb..e64131046d 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java @@ -25,6 +25,7 @@ import java.util.Set; import java.util.TreeSet; import java.util.Collections; +import java.util.Arrays; import org.apache.any23.Any23; import org.apache.any23.extractor.ExtractionException; @@ -77,15 +78,16 @@ public class Any23ParseFilter implements HtmlParseFilter { public final static String ANY23_TRIPLES = "Any23-Triples"; public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors"; + public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types"; private static class Any23Parser { Set triples = null; - Any23Parser(String url, String htmlContent, String... extractorNames) throws TripleHandlerException { + Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException { triples = new TreeSet(); try { - parse(url, htmlContent, extractorNames); + parse(url, htmlContent, contentType, extractorNames); } catch (URISyntaxException e) { throw new RuntimeException(e.getReason()); } catch (IOException e) { @@ -101,7 +103,7 @@ private Set getTriples() { return triples; } - private void parse(String url, String htmlContent, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException { + private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException { Any23 any23 = new Any23(extractorNames); any23.setMIMETypeDetector(null); @@ -118,7 +120,7 @@ private void parse(String url, String htmlContent, String... extractorNames) thr TripleHandler tHandler = new NTriplesWriter(baos); BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler); try { - any23.extract(input, url, "text/html","UTF-8", bHandler); + any23.extract(input, url, contentType, "UTF-8", bHandler); } catch (IOException e) { LOG.error("Error while reading the source", e); } catch (ExtractionException e) { @@ -154,12 +156,18 @@ public void setConf(Configuration conf) { */ @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,"html-head-meta"); + String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); + String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); + String contentType = content.getContentType(); + if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) { + LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType); + return parseResult; + } Any23Parser parser; try { String htmlContent = new String(content.getContent(), Charset.forName("UTF-8")); - parser = new Any23Parser(content.getUrl(), htmlContent, extractorNames); + parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames); } catch (TripleHandlerException e) { throw new RuntimeException("Error running Any23 parser: " + e.getMessage()); } @@ -175,4 +183,3 @@ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags return parseResult; } } - diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java index dfc0928624..571a3d5a55 100644 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java +++ b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java @@ -72,16 +72,12 @@ public void setUp() { conf.set("file.content.limit", "-1"); conf.set("parser.timeout", "-1"); conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath"); + conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html"); } @Test public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException { - - String urlString = "file:" + sampleDir + fileSeparator + file1; - - File file = new File(sampleDir + fileSeparator + file1); - - String[] triplesArray = extract(urlString, file); + String[] triplesArray = getTriples(file1); Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter", EXPECTED_TRIPLES_1, triplesArray.length); @@ -89,22 +85,27 @@ public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, Par @Test public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException { - String urlString = "file:" + sampleDir + fileSeparator + file2; - - File file = new File(sampleDir + fileSeparator + file2); - - String[] triplesArray = extract(urlString, file); + String[] triplesArray = getTriples(file2); Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter", EXPECTED_TRIPLES_2, triplesArray.length); } + + @Test + public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException { + String[] triplesArray = getTriples(file1, "application/pdf"); + + Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored", + 0, triplesArray.length); + } - public String[] extract(String urlString, File file) { + public String[] extract(String urlString, File file, String contentType) { try { System.out.println(urlString); Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + content.setContentType(contentType); Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); } catch (Exception e) { @@ -113,4 +114,16 @@ public String[] extract(String urlString, File file) { } return null; } + + private String[] getTriples(String fileName) { + return getTriples(fileName, "text/html"); + } + + private String[] getTriples(String fileName, String contentType) { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + + File file = new File(sampleDir + fileSeparator + fileName); + + return extract(urlString, file, contentType); + } }