Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,12 @@
<description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
</property>

<property>
<name>any23.content_types</name>
<value>text/html,application/xhtml+xml</value>
<description>Comma-separated list of content types (media types) to which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content types are supported.</description>
</property>

<!-- moreindexingfilter plugin properties -->

<property>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Set;
import java.util.TreeSet;
import java.util.Collections;
import java.util.Arrays;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
Expand Down Expand Up @@ -77,15 +78,16 @@ public class Any23ParseFilter implements HtmlParseFilter {
public final static String ANY23_TRIPLES = "Any23-Triples";

public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";

private static class Any23Parser {

Set<String> triples = null;

Any23Parser(String url, String htmlContent, String... extractorNames) throws TripleHandlerException {
Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
triples = new TreeSet<String>();
try {
parse(url, htmlContent, extractorNames);
parse(url, htmlContent, contentType, extractorNames);
} catch (URISyntaxException e) {
throw new RuntimeException(e.getReason());
} catch (IOException e) {
Expand All @@ -101,7 +103,7 @@ private Set<String> getTriples() {
return triples;
}

private void parse(String url, String htmlContent, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
Any23 any23 = new Any23(extractorNames);
any23.setMIMETypeDetector(null);

Expand All @@ -118,7 +120,7 @@ private void parse(String url, String htmlContent, String... extractorNames) thr
TripleHandler tHandler = new NTriplesWriter(baos);
BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
try {
any23.extract(input, url, "text/html","UTF-8", bHandler);
any23.extract(input, url, contentType, "UTF-8", bHandler);
} catch (IOException e) {
LOG.error("Error while reading the source", e);
} catch (ExtractionException e) {
Expand Down Expand Up @@ -154,12 +156,18 @@ public void setConf(Configuration conf) {
*/
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,"html-head-meta");
String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
String contentType = content.getContentType();
if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
return parseResult;
}

Any23Parser parser;
try {
String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
parser = new Any23Parser(content.getUrl(), htmlContent, extractorNames);
parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
} catch (TripleHandlerException e) {
throw new RuntimeException("Error running Any23 parser: " + e.getMessage());
}
Expand All @@ -175,4 +183,3 @@ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags
return parseResult;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -72,39 +72,40 @@ public void setUp() {
conf.set("file.content.limit", "-1");
conf.set("parser.timeout", "-1");
conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
}

@Test
public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {

String urlString = "file:" + sampleDir + fileSeparator + file1;

File file = new File(sampleDir + fileSeparator + file1);

String[] triplesArray = extract(urlString, file);
String[] triplesArray = getTriples(file1);

Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter",
EXPECTED_TRIPLES_1, triplesArray.length);
}

@Test
public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
String urlString = "file:" + sampleDir + fileSeparator + file2;

File file = new File(sampleDir + fileSeparator + file2);

String[] triplesArray = extract(urlString, file);
String[] triplesArray = getTriples(file2);

Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter",
EXPECTED_TRIPLES_2, triplesArray.length);
}

/**
 * Runs the first sample document through the parse chain while labelling it
 * with a content type other than the "text/html" configured for this test,
 * and expects that no Any23 triples are reported for it.
 */
@Test
public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
  final String unsupportedContentType = "application/pdf";
  String[] extractedTriples = getTriples(file1, unsupportedContentType);
  int tripleCount = extractedTriples.length;

  Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
      0, tripleCount);
}

public String[] extract(String urlString, File file) {
public String[] extract(String urlString, File file, String contentType) {
try {
System.out.println(urlString);
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
content.setContentType(contentType);
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
} catch (Exception e) {
Expand All @@ -113,4 +114,16 @@ public String[] extract(String urlString, File file) {
}
return null;
}

/**
 * Extracts the triples for a sample file, presenting it to the parser with
 * the "text/html" content type.
 *
 * @param fileName name of the sample file, relative to the sample directory
 * @return whatever {@link #getTriples(String, String)} returns for that file
 */
private String[] getTriples(String fileName) {
  final String htmlContentType = "text/html";
  return getTriples(fileName, htmlContentType);
}

/**
 * Resolves a sample file against the sample directory and hands it to
 * {@code extract} as both a "file:" URL and a {@link File}.
 *
 * @param fileName    name of the sample file, relative to the sample directory
 * @param contentType content type passed through to {@code extract}
 * @return the value returned by {@code extract} for this file
 */
private String[] getTriples(String fileName, String contentType) {
  // Build the location once; the URL form is the same path with a "file:" prefix.
  String samplePath = sampleDir + fileSeparator + fileName;
  File sampleFile = new File(samplePath);
  String sampleUrl = "file:" + samplePath;

  return extract(sampleUrl, sampleFile, contentType);
}
}