XSS HTMLFilter 是一个采用 Java 实现的开源类库，用于分析用户提交的输入，消除潜在的跨站点脚本攻击（XSS）、恶意的 HTML 以及简单的 HTML 格式错误。
示例代码:
// retrieve input from user...
String input = ...
String clean = new HTMLFilter().filter( input );
该项目主页:http://xss-html-filter.sourceforge.net/
package net.sf.xsshtmlfilter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * HTML filtering utility for protecting against XSS (Cross Site Scripting).
 *
 * This code is licensed LGPLv3
 *
 * This code is a Java port of the original work in PHP by Cal Hendersen.
 * http://code.iamcal.com/php/lib_filter/
 *
 * The trickiest part of the translation was handling the differences in regex handling
 * between PHP and Java. These resources were helpful in the process:
 *
 * http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
 * http://us2.php.net/manual/en/reference.pcre.pattern.modifiers.php
 * http://www.regular-expressions.info/modifiers.html
 *
 * A note on naming conventions: instance variables are prefixed with a "v"; global
 * constants are in all caps.
 *
 * Sample use:
 * String input = ...
 * String clean = new HTMLFilter().filter( input );
 *
 * The class is not thread safe (it tallies open tags in an instance field while
 * filtering). Create a new instance if in doubt.
 *
 * If you find bugs or have suggestions on improvement (especially regarding
 * performance), please contact us. The latest version of this
 * source, and our contact details, can be found at http://xss-html-filter.sf.net
 *
 * @author Joseph O'Connell
 * @author Cal Hendersen
 * @author Michael Semb Wever
 */
public final class HTMLFilter {

    /** regex flag union representing /si modifiers in php **/
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
    private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?");
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
    private static final Pattern P_END_ARROW = Pattern.compile("^>");
    private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_AMP = Pattern.compile("&");
    private static final Pattern P_QUOTE = Pattern.compile("\"");
    private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
    private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
    private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");

    // @xxx could grow large... maybe use sesat's ReferenceMap
    /** per-tag cache of the "empty pair" pattern, e.g. {@code <b></b>}, shared by all instances **/
    private static final ConcurrentMap<String, Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<String, Pattern>();
    /** per-tag cache of the "empty self-closing" pattern, e.g. {@code <b />}, shared by all instances **/
    private static final ConcurrentMap<String, Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<String, Pattern>();

    /** set of allowed html elements, along with allowed attributes for each element **/
    private final Map<String, List<String>> vAllowed;
    /** counts of open tags for each (allowable) html element **/
    private final Map<String, Integer> vTagCounts = new HashMap<String, Integer>();

    /** html elements which must always be self-closing (e.g. "<img />") **/
    private final String[] vSelfClosingTags;
    /** html elements which must always have separate opening and closing tags (e.g. "<b></b>") **/
    private final String[] vNeedClosingTags;
    /** set of disallowed html elements **/
    private final String[] vDisallowed;
    /** attributes which should be checked for valid protocols **/
    private final String[] vProtocolAtts;
    /** allowed protocols **/
    private final String[] vAllowedProtocols;
    /** tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />") **/
    private final String[] vRemoveBlanks;
    /** entities allowed within html markup **/
    private final String[] vAllowedEntities;
    /** flag determining whether comments are stripped from the input String. */
    private final boolean stripComment;
    /** when true, every filtering stage is logged through an anonymous logger. */
    private boolean vDebug = false;
    /**
     * flag determining whether to try to make tags when presented with "unbalanced"
     * angle brackets (e.g. "<b text </b>" becomes "<b> text </b>"). If set to false,
     * unbalanced angle brackets will be html escaped.
     */
    private final boolean alwaysMakeTags;

    /**
     * Default constructor: allows {@code a}, {@code img}, {@code b}, {@code strong}, {@code i}
     * and {@code em}, the {@code http} and {@code mailto} protocols, strips comments and
     * always tries to balance tags.
     */
    public HTMLFilter() {
        vAllowed = new HashMap<String, List<String>>();

        final List<String> anchorAtts = new ArrayList<String>();
        anchorAtts.add("href");
        anchorAtts.add("target");
        vAllowed.put("a", anchorAtts);

        final List<String> imageAtts = new ArrayList<String>();
        imageAtts.add("src");
        imageAtts.add("width");
        imageAtts.add("height");
        imageAtts.add("alt");
        vAllowed.put("img", imageAtts);

        final List<String> noAtts = new ArrayList<String>();
        vAllowed.put("b", noAtts);
        vAllowed.put("strong", noAtts);
        vAllowed.put("i", noAtts);
        vAllowed.put("em", noAtts);

        vSelfClosingTags = new String[]{"img"};
        vNeedClosingTags = new String[]{"a", "b", "strong", "i", "em"};
        vDisallowed = new String[]{};
        vAllowedProtocols = new String[]{"http", "mailto"}; // no ftp.
        vProtocolAtts = new String[]{"src", "href"};
        vRemoveBlanks = new String[]{"a", "b", "strong", "i", "em"};
        vAllowedEntities = new String[]{"amp", "gt", "lt", "quot"};
        stripComment = true;
        alwaysMakeTags = true;
    }

    /**
     * Set debug flag to true. Otherwise use default settings. See the default constructor.
     *
     * @param debug turn debug on with a true argument
     */
    public HTMLFilter(final boolean debug) {
        this();
        vDebug = debug;
    }

    /**
     * Map-parameter configurable constructor.
     *
     * The presence of each key is only checked with {@code assert}, so it is verified
     * only when assertions are enabled.
     *
     * @param configuration map containing configuration. keys match field names.
     *        {@code vAllowed} may be any {@code Map<String, List<String>>}; the array-valued
     *        settings must be {@code String[]} and the two flags {@code Boolean}.
     */
    @SuppressWarnings("unchecked") // the configuration contract fixes the value type of "vAllowed"
    public HTMLFilter(final Map<String, Object> configuration) {

        assert configuration.containsKey("vAllowed") : "configuration requires vAllowed";
        assert configuration.containsKey("vSelfClosingTags") : "configuration requires vSelfClosingTags";
        assert configuration.containsKey("vNeedClosingTags") : "configuration requires vNeedClosingTags";
        assert configuration.containsKey("vDisallowed") : "configuration requires vDisallowed";
        assert configuration.containsKey("vAllowedProtocols") : "configuration requires vAllowedProtocols";
        assert configuration.containsKey("vProtocolAtts") : "configuration requires vProtocolAtts";
        assert configuration.containsKey("vRemoveBlanks") : "configuration requires vRemoveBlanks";
        assert configuration.containsKey("vAllowedEntities") : "configuration requires vAllowedEntities";
        assert configuration.containsKey("stripComment") : "configuration requires stripComment";
        assert configuration.containsKey("alwaysMakeTags") : "configuration requires alwaysMakeTags";

        // cast to the interface so callers are not forced to pass a HashMap
        vAllowed = Collections.unmodifiableMap((Map<String, List<String>>) configuration.get("vAllowed"));
        vSelfClosingTags = (String[]) configuration.get("vSelfClosingTags");
        vNeedClosingTags = (String[]) configuration.get("vNeedClosingTags");
        vDisallowed = (String[]) configuration.get("vDisallowed");
        vAllowedProtocols = (String[]) configuration.get("vAllowedProtocols");
        vProtocolAtts = (String[]) configuration.get("vProtocolAtts");
        vRemoveBlanks = (String[]) configuration.get("vRemoveBlanks");
        vAllowedEntities = (String[]) configuration.get("vAllowedEntities");
        stripComment = (Boolean) configuration.get("stripComment");
        alwaysMakeTags = (Boolean) configuration.get("alwaysMakeTags");
    }

    /** forgets the open-tag tally of the previous {@link #filter(String)} call. */
    private void reset() {
        vTagCounts.clear();
    }

    /** logs {@code msg} at INFO level when the debug flag is set. */
    private void debug(final String msg) {
        if (vDebug) {
            Logger.getAnonymousLogger().info(msg);
        }
    }

    //---------------------------------------------------------------
    // my versions of some PHP library functions

    /**
     * Returns the one-character string for the given code; the value is narrowed to a
     * {@code char}, so only the low 16 bits are used.
     *
     * @param decimal character code
     * @return single-character String
     */
    public static String chr(final int decimal) {
        return String.valueOf((char) decimal);
    }

    /**
     * Escapes the four characters that are significant in html markup.
     *
     * @param s text to escape
     * @return {@code s} with {@code &}, {@code "}, {@code <} and {@code >} replaced by their entities
     */
    public static String htmlSpecialChars(final String s) {
        String result = s;
        // ampersand first, otherwise the entities produced below would be escaped again
        result = regexReplace(P_AMP, "&amp;", result);
        result = regexReplace(P_QUOTE, "&quot;", result);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    //---------------------------------------------------------------
    /**
     * given a user submitted input String, filter out any invalid or restricted
     * html.
     *
     * @param input text (i.e. submitted by a user) than may contain html
     * @return "clean" version of input, with only valid, whitelisted html elements allowed
     */
    public String filter(final String input) {
        reset();
        String s = input;

        debug("************************************************");
        debug(" INPUT: " + input);

        s = escapeComments(s);
        debug(" escapeComments: " + s);

        s = balanceHTML(s);
        debug(" balanceHTML: " + s);

        s = checkTags(s);
        debug(" checkTags: " + s);

        s = processRemoveBlanks(s);
        debug("processRemoveBlanks: " + s);

        s = validateEntities(s);
        debug(" validateEntites: " + s);

        debug("************************************************\n\n");
        return s;
    }

    /** @return whether unbalanced angle brackets are turned into tags rather than escaped */
    public boolean isAlwaysMakeTags() {
        return alwaysMakeTags;
    }

    /** @return whether html comments are stripped */
    public boolean isStripComments() {
        return stripComment;
    }

    /**
     * Html-escapes the body of every comment so that markup hidden inside a comment
     * cannot be picked up as a tag by the later stages.
     */
    private String escapeComments(final String s) {
        final Matcher m = P_COMMENTS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        // every comment must be handled, not just the first one
        while (m.find()) {
            final String match = m.group(1); //(.*?)
            m.appendReplacement(buf, Matcher.quoteReplacement("<!--" + htmlSpecialChars(match) + "-->"));
        }
        m.appendTail(buf);

        return buf.toString();
    }

    /** Either completes unbalanced angle brackets into tags or escapes them, per {@code alwaysMakeTags}. */
    private String balanceHTML(String s) {
        if (alwaysMakeTags) {
            //
            // try and form html
            //
            s = regexReplace(P_END_ARROW, "", s);
            s = regexReplace(P_BODY_TO_END, "<$1>", s);
            s = regexReplace(P_XML_CONTENT, "$1<$2", s);

        } else {
            //
            // escape stray brackets
            //
            s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
            s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);

            //
            // the last regexp causes '<>' entities to appear
            // (we need to do a lookahead assertion so that the last bracket can
            // be used in the next pass of the regexp)
            //
            s = regexReplace(P_BOTH_ARROWS, "", s);
        }

        return s;
    }

    /** Runs every tag through {@link #processTag(String)} and closes the tags left open. */
    private String checkTags(final String s) {
        final Matcher m = P_TAGS.matcher(s);

        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(buf, Matcher.quoteReplacement(processTag(m.group(1))));
        }
        m.appendTail(buf);

        // these get tallied in processTag
        // (remember to reset before subsequent calls to filter method)
        for (Map.Entry<String, Integer> open : vTagCounts.entrySet()) {
            for (int ii = 0; ii < open.getValue(); ii++) {
                buf.append("</").append(open.getKey()).append('>');
            }
        }

        return buf.toString();
    }

    /** Removes content-less occurrences ({@code <b></b>}, {@code <b />}) of the configured tags. */
    private String processRemoveBlanks(final String s) {
        String result = s;
        for (String tag : vRemoveBlanks) {
            result = regexReplace(
                    cachedPattern(P_REMOVE_PAIR_BLANKS, tag, "<" + tag + "(\\s[^>]*)?></" + tag + ">"), "", result);
            result = regexReplace(
                    cachedPattern(P_REMOVE_SELF_BLANKS, tag, "<" + tag + "(\\s[^>]*)?/>"), "", result);
        }

        return result;
    }

    /** Returns the pattern cached under {@code tag}, compiling {@code regex} on first use. */
    private static Pattern cachedPattern(final ConcurrentMap<String, Pattern> cache, final String tag, final String regex) {
        Pattern pattern = cache.get(tag);
        if (pattern == null) {
            final Pattern compiled = Pattern.compile(regex);
            final Pattern raced = cache.putIfAbsent(tag, compiled);
            pattern = raced != null ? raced : compiled;
        }
        return pattern;
    }

    /** Replaces every match; {@code replacement} may use group references such as {@code $1}. */
    private static String regexReplace(final Pattern regex_pattern, final String replacement, final String s) {
        return regex_pattern.matcher(s).replaceAll(replacement);
    }

    /**
     * Rewrites the inside of one tag (the text between {@code <} and {@code >}).
     *
     * @return the sanitized tag including its angle brackets, or "" when the tag must be dropped
     */
    private String processTag(final String s) {
        // ending tags
        Matcher m = P_END_TAG.matcher(s);
        if (m.find()) {
            // fixed locale: the default locale must not influence tag matching (e.g. Turkish "I")
            final String name = m.group(1).toLowerCase(Locale.ENGLISH);
            if (allowed(name) && !inArray(name, vSelfClosingTags)) {
                final Integer open = vTagCounts.get(name);
                // only close what is actually open, otherwise stray closing tags get through
                if (open != null && open > 0) {
                    vTagCounts.put(name, open - 1);
                    return "</" + name + ">";
                }
            }
        }

        // starting tags
        m = P_START_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(Locale.ENGLISH);
            final String body = m.group(2);
            String ending = m.group(3);

            if (!allowed(name)) {
                return "";
            }

            final Matcher quoted = P_QUOTED_ATTRIBUTES.matcher(body);
            final Matcher unquoted = P_UNQUOTED_ATTRIBUTES.matcher(body);
            final List<String> paramNames = new ArrayList<String>();
            final List<String> paramValues = new ArrayList<String>();
            while (quoted.find()) {
                paramNames.add(quoted.group(1)); //([a-z0-9]+)
                paramValues.add(quoted.group(3)); //(.*?)
            }
            while (unquoted.find()) {
                paramNames.add(unquoted.group(1)); //([a-z0-9]+)
                paramValues.add(unquoted.group(3)); //([^\"\\s']+)
            }

            final StringBuilder params = new StringBuilder();
            for (int ii = 0; ii < paramNames.size(); ii++) {
                final String paramName = paramNames.get(ii).toLowerCase(Locale.ENGLISH);
                String paramValue = paramValues.get(ii);

                if (allowedAttribute(name, paramName)) {
                    if (inArray(paramName, vProtocolAtts)) {
                        paramValue = processParamProtocol(paramValue);
                    }
                    // the value is re-emitted inside double quotes; a quote taken from a
                    // single-quoted attribute would otherwise end it and inject attributes
                    paramValue = regexReplace(P_QUOTE, "&quot;", paramValue);
                    params.append(' ').append(paramName).append("=\"").append(paramValue).append('"');
                }
            }

            if (inArray(name, vSelfClosingTags)) {
                ending = " /";
            }

            if (inArray(name, vNeedClosingTags)) {
                ending = "";
            }

            if (ending == null || ending.length() < 1) {
                // an opening tag: remember it so that checkTags can close it if the input does not
                final Integer open = vTagCounts.get(name);
                vTagCounts.put(name, open == null ? 1 : open + 1);
            } else {
                ending = " /";
            }
            return "<" + name + params + ending + ">";
        }

        // comments
        m = P_COMMENT.matcher(s);
        if (!stripComment && m.find()) {
            return "<" + m.group() + ">";
        }

        return "";
    }

    /** Turns a url with a non-whitelisted protocol into a local anchor ("#...") link. */
    private String processParamProtocol(String s) {
        s = decodeEntities(s);
        final Matcher m = P_PROTOCOL.matcher(s);
        if (m.find()) {
            final String protocol = m.group(1);
            if (!inArray(protocol, vAllowedProtocols)) {
                // bad protocol, turn into local anchor link instead
                s = "#" + s.substring(protocol.length() + 1);
                if (s.startsWith("#//")) {
                    s = "#" + s.substring(3);
                }
            }
        }

        return s;
    }

    /**
     * Decodes decimal ({@code &#58;}) and hexadecimal ({@code &#x3a;}) character references
     * and percent escapes ({@code %3a}), then re-validates the entities of the result.
     */
    private String decodeEntities(String s) {
        s = decodeReferences(P_ENTITY, 10, s);
        s = decodeReferences(P_ENTITY_UNICODE, 16, s);
        s = decodeReferences(P_ENCODE, 16, s);

        return validateEntities(s);
    }

    /** Replaces each match of {@code pattern} by the character its first group denotes in {@code radix}. */
    private static String decodeReferences(final Pattern pattern, final int radix, final String s) {
        final Matcher m = pattern.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            final String decoded = decodeNumericReference(m.group(1), radix);
            // references that do not denote a character are left exactly as they were written
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded != null ? decoded : m.group()));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Converts the digits of a numeric reference into the character they denote.
     * Digits are always read in the given radix (a leading zero does not mean octal).
     *
     * @return the decoded text, or {@code null} when the digits overflow an int or are
     *         not a valid Unicode code point
     */
    private static String decodeNumericReference(final String digits, final int radix) {
        try {
            final int codePoint = Integer.parseInt(digits, radix);
            if (Character.isValidCodePoint(codePoint)) {
                return new String(Character.toChars(codePoint));
            }
        } catch (NumberFormatException ignored) {
            // more digits than an int can hold: cannot be a character, so it is not decoded
        }
        return null;
    }

    /** Escapes ampersands that do not start a whitelisted entity, and quotes outside of tags. */
    private String validateEntities(String s) {
        StringBuffer buf = new StringBuffer();

        // validate entities throughout the string
        Matcher m = P_VALID_ENTITIES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //([^&;]*)
            final String two = m.group(2); //(?=(;|&|$))
            m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(one, two)));
        }
        m.appendTail(buf);
        s = buf.toString();

        // validate quotes outside of tags
        buf = new StringBuffer();
        m = P_VALID_QUOTES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //(>|^)
            final String two = m.group(2); //([^<]+?)
            final String three = m.group(3); //(<|$)
            m.appendReplacement(buf, Matcher.quoteReplacement(one + regexReplace(P_QUOTE, "&quot;", two) + three));
        }
        m.appendTail(buf);
        s = buf.toString();

        return s;
    }

    /**
     * Keeps {@code &preamble} when it is a whitelisted entity terminated by ';',
     * otherwise escapes the ampersand. The terminator itself is not part of the result.
     */
    private String checkEntity(final String preamble, final String term) {

        return ";".equals(term) && isValidEntity(preamble)
                ? '&' + preamble
                : "&amp;" + preamble;
    }

    private boolean isValidEntity(final String entity) {
        return inArray(entity, vAllowedEntities);
    }

    /** case-sensitive membership test; null array items are skipped. */
    private static boolean inArray(final String s, final String[] array) {
        for (String item : array) {
            if (item != null && item.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /** an empty whitelist allows every element that is not explicitly disallowed. */
    private boolean allowed(final String name) {
        return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
    }

    private boolean allowedAttribute(final String name, final String paramName) {
        return allowed(name) && (vAllowed.isEmpty() || vAllowed.get(name).contains(paramName));
    }
}