-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathRobotExclusionUtil.java
More file actions
140 lines (124 loc) · 4.36 KB
/
Copy pathRobotExclusionUtil.java
File metadata and controls
140 lines (124 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal robots.txt checker with an in-memory, per-origin cache.
 *
 * <p>Only {@code Disallow} rules of groups whose {@code User-Agent} value
 * contains {@code *} are honoured. Rules are treated as plain path prefixes;
 * {@code Allow} lines and the {@code *} / {@code $} wildcards inside rule
 * paths are not interpreted.
 */
public class RobotExclusionUtil {

    /**
     * Cache of parsed rules: robots.txt origin (e.g. {@code http://host} or
     * {@code https://host:8443}) mapped to the set of disallowed path
     * prefixes. The inner map is used as a concurrent set; only its keys
     * carry information.
     */
    private static final ConcurrentHashMap<String, ConcurrentHashMap<String, Boolean>> map
            = new ConcurrentHashMap<String, ConcurrentHashMap<String, Boolean>>();

    // The patterns are anchored to the start of a line (MULTILINE) so that
    // commented-out directives such as "# Disallow: /x" are not matched, and
    // they use [ \t]* instead of \s* so that an empty value cannot swallow the
    // line break and capture the following line.
    private static final Pattern USER_AGENT_LINE
            = Pattern.compile("(?im)^[ \\t]*User-Agent:[ \\t]*(.*)");
    private static final Pattern DISALLOW_LINE
            = Pattern.compile("(?im)^[ \\t]*Disallow:[ \\t]*(.*)");
    private static final Pattern BLANK_LINE = Pattern.compile("\n\\s*\n");

    /**
     * Extracts the disallowed path prefixes that apply to every robot.
     *
     * <p>This method is based on code in the WWW::RobotRules module in the
     * libwww-perl5 library, available from www.cpan.org, and on public code
     * made available by Ted Wild &amp; Ray Mooney.
     *
     * <p>A group starts at a {@code User-Agent} line whose value contains
     * {@code *} and ends at the next blank line (or the end of the text).
     * Trailing {@code #} comments are removed from values, and an empty
     * {@code Disallow:} value (which means "nothing is disallowed") is
     * ignored.
     *
     * @param robotsFile
     *            the robots.txt content as a string; {@code null} is treated
     *            as an empty file
     * @return a newly created set (map keys) of disallowed path prefixes,
     *         exactly as written in the file; never {@code null}
     */
    static ConcurrentHashMap<String, Boolean> parseRobotsFileString(String robotsFile) {
        ConcurrentHashMap<String, Boolean> rules = new ConcurrentHashMap<String, Boolean>();
        if (robotsFile == null) {
            return rules;
        }
        Matcher userAgentLine = USER_AGENT_LINE.matcher(robotsFile);
        Matcher disallowLine = DISALLOW_LINE.matcher(robotsFile);
        Matcher blankLine = BLANK_LINE.matcher(robotsFile);

        // Find each user-agent portion of the file.
        while (userAgentLine.find()) {
            if (stripComment(userAgentLine.group(1)).indexOf('*') == -1) {
                continue; // this group does not apply to every robot
            }
            // The group runs from the end of the User-Agent line to the next
            // blank line, or to the end of the file when there is none.
            int groupStart = userAgentLine.end();
            int groupEnd = robotsFile.length();
            blankLine.region(groupStart, robotsFile.length());
            if (blankLine.find()) {
                groupEnd = blankLine.start();
            }
            disallowLine.region(groupStart, groupEnd);
            while (disallowLine.find()) {
                String prefix = stripComment(disallowLine.group(1));
                if (prefix.length() > 0) {
                    rules.put(prefix, Boolean.TRUE);
                }
            }
        }
        return rules;
    }

    /** Removes a trailing {@code # comment} and surrounding whitespace from a directive value. */
    private static String stripComment(String value) {
        int hash = value.indexOf('#');
        String withoutComment = hash >= 0 ? value.substring(0, hash) : value;
        return withoutComment.trim();
    }

    /**
     * Decides whether a URL path is permitted by a set of disallow rules.
     *
     * <p>A path is blocked when it starts with any rule. In addition, a rule
     * ending in {@code /} also blocks the same path written without the
     * trailing slash (rule {@code /dir/} blocks {@code /dir}); this keeps the
     * behaviour of the earlier exact-match implementation, which compared
     * paths with trailing slashes removed.
     *
     * @param rules
     *            disallowed path prefixes as produced by
     *            {@link #parseRobotsFileString(String)}; {@code null} or empty
     *            means nothing is disallowed
     * @param path
     *            the URL path; {@code null} or empty is treated as {@code /}
     * @return {@code true} if no rule blocks the path
     */
    static boolean isPathAllowed(ConcurrentHashMap<String, Boolean> rules, String path) {
        if (rules == null || rules.isEmpty()) {
            return true;
        }
        String normalized = (path == null || path.length() == 0) ? "/" : path;
        for (String rule : rules.keySet()) {
            if (normalized.startsWith(rule)) {
                return false;
            }
            if (rule.endsWith("/")
                    && normalized.equals(rule.substring(0, rule.length() - 1))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a robot may fetch the given URL according to the
     * robots.txt of the URL's site.
     *
     * <p>The robots.txt file is downloaded on the first request for an origin
     * and its rules are cached for later calls. This method is best-effort:
     * if the URL is malformed, has no host, or robots.txt cannot be read, it
     * returns {@code true}. A failed download is not cached, so it is retried
     * on the next call for that origin.
     *
     * @param url
     *            the absolute URL to check
     * @return {@code false} only when a cached or freshly fetched rule
     *         disallows the URL's path; {@code true} otherwise
     */
    public static boolean robotsShouldFollow(String url) {
        try {
            URL u = url(url);
            String host = u.getHost();
            if (host == null || host.length() == 0) {
                // No host means there is no robots.txt to consult.
                return true;
            }
            String origin = robotsOrigin(u);
            ConcurrentHashMap<String, Boolean> rules = map.get(origin);
            if (rules == null) {
                String robotText = readRobotsFile(url(origin + "/robots.txt"));
                // Parse into a local set and publish it only when complete, so
                // other threads never observe a half-filled rule set.
                ConcurrentHashMap<String, Boolean> parsed = parseRobotsFileString(robotText);
                ConcurrentHashMap<String, Boolean> existing = map.putIfAbsent(origin, parsed);
                rules = existing != null ? existing : parsed;
            }
            return isPathAllowed(rules, u.getPath());
        } catch (IOException ignored) {
            // Malformed URL or unreadable robots.txt: deliberately allow.
            return true;
        } catch (RuntimeException ignored) {
            // Keep the historical contract that this method never throws.
            return true;
        }
    }

    /**
     * Builds the origin ({@code scheme://host[:port]}) whose robots.txt governs
     * the given URL. For http and https URLs the URL's own scheme and explicit
     * port are used; for any other protocol this falls back to
     * {@code http://host}.
     */
    private static String robotsOrigin(URL u) {
        String protocol = u.getProtocol();
        boolean web = "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
        if (!web) {
            return "http://" + u.getHost();
        }
        StringBuilder origin = new StringBuilder(protocol).append("://").append(u.getHost());
        if (u.getPort() != -1) {
            origin.append(':').append(u.getPort());
        }
        return origin.toString();
    }

    /** Creates a {@link URL} from its string form. */
    private static URL url(String url) throws MalformedURLException {
        return new URL(url);
    }

    /**
     * Downloads the content behind a URL as text.
     *
     * <p>The content is decoded as UTF-8 and every line is terminated with a
     * single {@code \n}, regardless of the line endings used by the server.
     *
     * @param urlObj
     *            the location of the robots.txt file
     * @return the complete content, one {@code \n} after each line
     * @throws IOException
     *             if the connection cannot be opened or reading fails
     */
    public static String readRobotsFile(URL urlObj) throws IOException {
        StringBuilder page = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                urlObj.openConnection().getInputStream(), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append('\n');
            }
        } finally {
            // Close the stream even when reading fails part-way through.
            reader.close();
        }
        return page.toString();
    }

    /**
     * For testing only. Checks one hard-coded URL against its site's
     * robots.txt file and prints the result.
     */
    public static void main(String[] args) {
        System.out.println(robotsShouldFollow("http://academicpersonnel.ucr.edu/employment/"));
    }
}