跨站点脚本攻击(XSS)防护 XSS HTMLFilter

凌波峻
2023-12-01

XSS HTMLFilter 是一个采用 Java 实现的开源类库,用于分析用户提交的输入,消除潜在的跨站点脚本攻击(XSS)、恶意的 HTML 以及简单的 HTML 格式错误。

示例代码:

// retrieve input from user...
String input = ...
String clean = new HTMLFilter().filter( input );

 

该项目主页:http://xss-html-filter.sourceforge.net/ 

package net.sf.xsshtmlfilter;
 
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 *
 * HTML filtering utility for protecting against XSS (Cross Site Scripting).
 *
 * This code is licensed LGPLv3
 *
 * This code is a Java port of the original work in PHP by Cal Hendersen.
 * http://code.iamcal.com/php/lib_filter/
 *
 * The trickiest part of the translation was handling the differences in regex handling
 * between PHP and Java.  These resources were helpful in the process:
 *
 * http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
 * http://us2.php.net/manual/en/reference.pcre.pattern.modifiers.php
 * http://www.regular-expressions.info/modifiers.html
 *
 * A note on naming conventions: instance variables are prefixed with a "v"; global
 * constants are in all caps.
 *
 * Sample use:
 * String input = ...
 * String clean = new HTMLFilter().filter( input );
 *
 * The class is not thread safe (per-call tag counts are kept in an instance field).
 * Create a new instance if in doubt.
 *
 * If you find bugs or have suggestions on improvement (especially regarding
 * performance), please contact us.  The latest version of this
 * source, and our contact details, can be found at http://xss-html-filter.sf.net
 *
 * @author Joseph O'Connell
 * @author Cal Hendersen
 * @author Michael Semb Wever
 */
public final class HTMLFilter {

    private static final Logger LOGGER = Logger.getLogger(HTMLFilter.class.getName());

    /** regex flag union representing /si modifiers in php **/
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
    private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
    // hex digits may be written in either case ("&#x6A;", "%3C"), so match case-insensitively
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
    private static final Pattern P_END_ARROW = Pattern.compile("^>");
    private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
    private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
    private static final Pattern P_AMP = Pattern.compile("&");
    private static final Pattern P_QUOTE = Pattern.compile("\"");
    private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
    private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
    private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");

    // @xxx could grow large... maybe use sesat's ReferenceMap
    private static final ConcurrentMap<String,Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<String, Pattern>();
    private static final ConcurrentMap<String,Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<String, Pattern>();

    /** set of allowed html elements, along with allowed attributes for each element **/
    private final Map<String, List<String>> vAllowed;
    /** counts of open tags for each (allowable) html element **/
    private final Map<String, Integer> vTagCounts = new HashMap<String, Integer>();

    /** html elements which must always be self-closing (e.g. "<img />") **/
    private final String[] vSelfClosingTags;
    /** html elements which must always have separate opening and closing tags (e.g. "<b></b>") **/
    private final String[] vNeedClosingTags;
    /** set of disallowed html elements **/
    private final String[] vDisallowed;
    /** attributes which should be checked for valid protocols **/
    private final String[] vProtocolAtts;
    /** allowed protocols **/
    private final String[] vAllowedProtocols;
    /** tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />") **/
    private final String[] vRemoveBlanks;
    /** entities allowed within html markup **/
    private final String[] vAllowedEntities;
    /** flag determining whether comments are stripped from the input String. */
    private final boolean stripComment;
    private boolean vDebug = false;
    /**
     * flag determining whether to try to make tags when presented with "unbalanced"
     * angle brackets (e.g. "<b text </b>" becomes "<b> text </b>").  If set to false,
     * unbalanced angle brackets will be html escaped.
     */
    private final boolean alwaysMakeTags;

    /**
     * Default constructor.
     *
     * Allows a (href, target), img (src, width, height, alt), b, strong, i and em;
     * http and mailto protocols; strips comments; always tries to make tags.
     */
    public HTMLFilter() {
        vAllowed = new HashMap<String, List<String>>();

        final List<String> aAtts = new ArrayList<String>();
        aAtts.add("href");
        aAtts.add("target");
        vAllowed.put("a", aAtts);

        final List<String> imgAtts = new ArrayList<String>();
        imgAtts.add("src");
        imgAtts.add("width");
        imgAtts.add("height");
        imgAtts.add("alt");
        vAllowed.put("img", imgAtts);

        final List<String> noAtts = new ArrayList<String>();
        vAllowed.put("b", noAtts);
        vAllowed.put("strong", noAtts);
        vAllowed.put("i", noAtts);
        vAllowed.put("em", noAtts);

        vSelfClosingTags = new String[]{"img"};
        vNeedClosingTags = new String[]{"a", "b", "strong", "i", "em"};
        vDisallowed = new String[]{};
        vAllowedProtocols = new String[]{"http", "mailto"}; // no ftp.
        vProtocolAtts = new String[]{"src", "href"};
        vRemoveBlanks = new String[]{"a", "b", "strong", "i", "em"};
        vAllowedEntities = new String[]{"amp", "gt", "lt", "quot"};
        stripComment = true;
        alwaysMakeTags = true;
    }

    /**
     * Set debug flag to true. Otherwise use default settings. See the default constructor.
     *
     * @param debug turn debug on with a true argument
     */
    public HTMLFilter(final boolean debug) {
        this();
        vDebug = debug;
    }

    /**
     * Map-parameter configurable constructor.
     *
     * Every key listed below is mandatory and must map to a non-null value of the
     * stated type: vAllowed (Map&lt;String, List&lt;String&gt;&gt;), vSelfClosingTags,
     * vNeedClosingTags, vDisallowed, vAllowedProtocols, vProtocolAtts, vRemoveBlanks,
     * vAllowedEntities (all String[]), stripComment and alwaysMakeTags (Boolean).
     *
     * @param configuration map containing configuration. keys match field names.
     * @throws IllegalArgumentException if the map is null or a required key is missing
     */
    public HTMLFilter(final Map<String,Object> configuration) {
        if (configuration == null) {
            throw new IllegalArgumentException("configuration must not be null");
        }

        // Explicit checks instead of assert: assertions are disabled by default, which
        // previously turned a missing key into a late NullPointerException.
        @SuppressWarnings("unchecked") // documented contract of the "vAllowed" key
        final Map<String, List<String>> allowed = (Map<String, List<String>>) required(configuration, "vAllowed");
        vAllowed = Collections.unmodifiableMap(allowed);
        vSelfClosingTags = ((String[]) required(configuration, "vSelfClosingTags")).clone();
        vNeedClosingTags = ((String[]) required(configuration, "vNeedClosingTags")).clone();
        vDisallowed = ((String[]) required(configuration, "vDisallowed")).clone();
        vAllowedProtocols = ((String[]) required(configuration, "vAllowedProtocols")).clone();
        vProtocolAtts = ((String[]) required(configuration, "vProtocolAtts")).clone();
        vRemoveBlanks = ((String[]) required(configuration, "vRemoveBlanks")).clone();
        vAllowedEntities = ((String[]) required(configuration, "vAllowedEntities")).clone();
        stripComment = ((Boolean) required(configuration, "stripComment")).booleanValue();
        alwaysMakeTags = ((Boolean) required(configuration, "alwaysMakeTags")).booleanValue();
    }

    /** Returns the non-null value stored under key, or fails with a descriptive message. */
    private static Object required(final Map<String,Object> configuration, final String key) {
        final Object value = configuration.get(key);
        if (value == null) {
            throw new IllegalArgumentException("configuration requires " + key);
        }
        return value;
    }

    /** Forgets the open-tag tallies of the previous filter call. */
    private void reset() {
        vTagCounts.clear();
    }

    private void debug(final String msg) {
        if (vDebug) {
            LOGGER.info(msg);
        }
    }

    //---------------------------------------------------------------
    // my versions of some PHP library functions

    /**
     * Converts a numeric value to a one-character String.
     *
     * @param decimal character value; it is narrowed to a 16-bit char, so values
     *                above 0xFFFF are truncated
     * @return String holding the single resulting char
     */
    public static String chr(final int decimal) {
        return String.valueOf((char) decimal);
    }

    /**
     * Escapes the characters that are significant in html markup.
     *
     * @param s text to escape
     * @return s with &amp;, ", &lt; and &gt; replaced by their named entities
     */
    public static String htmlSpecialChars(final String s) {
        String result = s;
        // ampersand first, otherwise the entities produced below would be escaped again
        result = regexReplace(P_AMP, "&amp;", result);
        result = regexReplace(P_QUOTE, "&quot;", result);
        result = regexReplace(P_LEFT_ARROW, "&lt;", result);
        result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
        return result;
    }

    //---------------------------------------------------------------
    /**
     * given a user submitted input String, filter out any invalid or restricted
     * html.
     *
     * @param input text (i.e. submitted by a user) than may contain html; may be null
     * @return "clean" version of input, with only valid, whitelisted html elements allowed;
     *         null if input was null
     */
    public String filter(final String input) {
        if (input == null) {
            return null;
        }
        reset();
        String s = input;

        debug("************************************************");
        debug("              INPUT: " + input);

        s = escapeComments(s);
        debug("     escapeComments: " + s);

        s = balanceHTML(s);
        debug("        balanceHTML: " + s);

        s = checkTags(s);
        debug("          checkTags: " + s);

        s = processRemoveBlanks(s);
        debug("processRemoveBlanks: " + s);

        s = validateEntities(s);
        debug("    validateEntites: " + s);

        debug("************************************************\n\n");
        return s;
    }

    public boolean isAlwaysMakeTags(){
        return alwaysMakeTags;
    }

    public boolean isStripComments(){
        return stripComment;
    }

    /** Html-escapes the content of every comment so it cannot be mistaken for markup later on. */
    private String escapeComments(final String s) {
        final Matcher m = P_COMMENTS.matcher(s);
        final StringBuffer buf = new StringBuffer();
        // loop: every comment must be escaped, not only the first one
        while (m.find()) {
            final String match = m.group(1); //(.*?)
            m.appendReplacement(buf, Matcher.quoteReplacement("<!--" + htmlSpecialChars(match) + "-->"));
        }
        m.appendTail(buf);

        return buf.toString();
    }

    /** Either completes unbalanced angle brackets into tags or escapes them, depending on alwaysMakeTags. */
    private String balanceHTML(String s) {
        if (alwaysMakeTags) {
            //
            // try and form html
            //
            s = regexReplace(P_END_ARROW, "", s);
            s = regexReplace(P_BODY_TO_END, "<$1>", s);
            s = regexReplace(P_XML_CONTENT, "$1<$2", s);

        } else {
            //
            // escape stray brackets
            //
            s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
            s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);

            //
            // the last regexp causes '<>' entities to appear
            // (we need to do a lookahead assertion so that the last bracket can
            // be used in the next pass of the regexp)
            //
            s = regexReplace(P_BOTH_ARROWS, "", s);
        }

        return s;
    }

    /** Runs every tag through processTag and appends closing tags for elements left open. */
    private String checkTags(final String s) {
        final Matcher m = P_TAGS.matcher(s);

        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(buf, Matcher.quoteReplacement(processTag(m.group(1))));
        }
        m.appendTail(buf);

        // these get tallied in processTag
        // (remember to reset before subsequent calls to filter method)
        final StringBuilder result = new StringBuilder(buf);
        for (Map.Entry<String, Integer> open : vTagCounts.entrySet()) {
            for (int ii = 0; ii < open.getValue(); ii++) {
                result.append("</").append(open.getKey()).append('>');
            }
        }

        return result.toString();
    }

    /** Removes empty pairs ("<b></b>") and empty self-closed forms ("<b />") of the vRemoveBlanks tags. */
    private String processRemoveBlanks(final String s) {
        String result = s;
        for (String tag : vRemoveBlanks) {
            // Pattern.quote: the tag name comes from configuration and must be matched literally
            final String quoted = Pattern.quote(tag);
            result = regexReplace(
                    cachedPattern(P_REMOVE_PAIR_BLANKS, tag, "<" + quoted + "(\\s[^>]*)?></" + quoted + ">"), "", result);
            result = regexReplace(
                    cachedPattern(P_REMOVE_SELF_BLANKS, tag, "<" + quoted + "(\\s[^>]*)?/>"), "", result);
        }

        return result;
    }

    /** Returns the pattern cached under key, compiling and caching regex on first use. */
    private static Pattern cachedPattern(final ConcurrentMap<String,Pattern> cache, final String key, final String regex) {
        Pattern pattern = cache.get(key);
        if (pattern == null) {
            cache.putIfAbsent(key, Pattern.compile(regex));
            pattern = cache.get(key);
        }
        return pattern;
    }

    private static String regexReplace(final Pattern regex_pattern, final String replacement, final String s) {
        return regex_pattern.matcher(s).replaceAll(replacement);
    }

    /**
     * Rewrites the inside of one tag (the text between "<" and ">").
     *
     * @param s tag content without the surrounding angle brackets
     * @return the sanitised tag including angle brackets, or "" if the tag is dropped
     */
    private String processTag(final String s) {
        // ending tags
        Matcher m = P_END_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(Locale.ROOT);
            if (allowed(name) && !inArray(name, vSelfClosingTags)) {
                final Integer open = vTagCounts.get(name);
                // only close what is actually open; a stray "</b>" would unbalance the output
                if (open != null && open > 0) {
                    vTagCounts.put(name, open - 1);
                    return "</" + name + ">";
                }
            }
        }

        // starting tags
        m = P_START_TAG.matcher(s);
        if (m.find()) {
            final String name = m.group(1).toLowerCase(Locale.ROOT);
            final String body = m.group(2);
            String ending = m.group(3);

            if (!allowed(name)) {
                return "";
            }

            final List<String> paramNames = new ArrayList<String>();
            final List<String> paramValues = new ArrayList<String>();
            final Matcher quoted = P_QUOTED_ATTRIBUTES.matcher(body);
            while (quoted.find()) {
                paramNames.add(quoted.group(1)); //([a-z0-9]+)
                paramValues.add(quoted.group(3)); //(.*?)
            }
            final Matcher unquoted = P_UNQUOTED_ATTRIBUTES.matcher(body);
            while (unquoted.find()) {
                paramNames.add(unquoted.group(1)); //([a-z0-9]+)
                paramValues.add(unquoted.group(3)); //([^\"\\s']+)
            }

            final StringBuilder params = new StringBuilder();
            for (int ii = 0; ii < paramNames.size(); ii++) {
                final String paramName = paramNames.get(ii).toLowerCase(Locale.ROOT);
                String paramValue = paramValues.get(ii);

                if (allowedAttribute(name, paramName)) {
                    if (inArray(paramName, vProtocolAtts)) {
                        paramValue = processParamProtocol(paramValue);
                    }
                    // The value is always re-emitted inside double quotes. A single-quoted
                    // input value may itself contain '"', which would otherwise end the
                    // attribute early and let the rest be parsed as new attributes.
                    paramValue = regexReplace(P_QUOTE, "&quot;", paramValue);
                    params.append(' ').append(paramName).append("=\"").append(paramValue).append('"');
                }
            }

            if (inArray(name, vSelfClosingTags)) {
                ending = " /";
            }

            if (inArray(name, vNeedClosingTags)) {
                ending = "";
            }

            if (ending == null || ending.length() < 1) {
                // tag stays open: tally it so checkTags can balance it later
                ending = "";
                final Integer open = vTagCounts.get(name);
                vTagCounts.put(name, open == null ? 1 : open + 1);
            } else {
                ending = " /";
            }
            return "<" + name + params + ending + ">";
        }

        // comments
        m = P_COMMENT.matcher(s);
        if (!stripComment && m.find()) {
            return  "<" + m.group() + ">";
        }

        return "";
    }

    /** Decodes the attribute value and, if its protocol is not allowed, turns it into a local anchor. */
    private String processParamProtocol(String s) {
        s = decodeEntities(s);
        final Matcher m = P_PROTOCOL.matcher(s);
        if (m.find()) {
            final String protocol = m.group(1);
            if (!inArray(protocol, vAllowedProtocols)) {
                // bad protocol, turn into local anchor link instead
                s = "#" + s.substring(protocol.length() + 1);
                if (s.startsWith("#//")) {
                    s = "#" + s.substring(3);
                }
            }
        }

        return s;
    }

    /** Decodes decimal entities, hex entities and %XX escapes, then re-validates the entities. */
    private String decodeEntities(String s) {
        s = decodeNumeric(P_ENTITY, 10, s);
        s = decodeNumeric(P_ENTITY_UNICODE, 16, s);
        s = decodeNumeric(P_ENCODE, 16, s);
        return validateEntities(s);
    }

    /** Replaces every match of pattern by the character whose number is group 1 read in the given radix. */
    private static String decodeNumeric(final Pattern pattern, final int radix, final String s) {
        final Matcher m = pattern.matcher(s);
        final StringBuffer buf = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(buf, Matcher.quoteReplacement(decodeCodePoint(m.group(1), radix, m.group())));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Converts digits to the character they denote. Uses parseInt with an explicit radix:
     * Integer.decode would read a leading zero as octal ("&#0106;") and throw on "&#09;".
     * Returns fallback when the number overflows int or is not a Unicode code point.
     */
    private static String decodeCodePoint(final String digits, final int radix, final String fallback) {
        try {
            final int codePoint = Integer.parseInt(digits, radix);
            if (Character.isValidCodePoint(codePoint)) {
                return new String(Character.toChars(codePoint));
            }
        } catch (NumberFormatException ignored) {
            // user input may carry arbitrarily long digit runs; keep the text undecoded
        }
        return fallback;
    }

    /** Escapes ampersands that do not start an allowed entity, and quotes that sit outside of tags. */
    private String validateEntities(String s) {
        StringBuffer buf = new StringBuffer();

        // validate entities throughout the string
        Matcher m = P_VALID_ENTITIES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //([^&;]*)
            final String two = m.group(2); //(?=(;|&|$))
            m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(one, two)));
        }
        m.appendTail(buf);
        s = buf.toString();

        // validate quotes outside of tags
        buf = new StringBuffer();
        m = P_VALID_QUOTES.matcher(s);
        while (m.find()) {
            final String one = m.group(1); //(>|^)
            final String two = m.group(2); //([^<]+?)
            final String three = m.group(3); //(<|$)
            m.appendReplacement(buf, Matcher.quoteReplacement(one + regexReplace(P_QUOTE, "&quot;", two) + three));
        }
        m.appendTail(buf);
        s = buf.toString();

        return s;
    }

    /** Keeps "&" for an allowed, ";"-terminated entity; otherwise escapes it to "&amp;". */
    private String checkEntity(final String preamble, final String term) {
        return ";".equals(term) && isValidEntity(preamble)
                ? "&" + preamble
                : "&amp;" + preamble;
    }

    private boolean isValidEntity(final String entity) {
        return inArray(entity, vAllowedEntities);
    }

    private static boolean inArray(final String s, final String[] array) {
        for (String item : array) {
            if (item != null && item.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /** An element is allowed if the whitelist is empty or contains it, and it is not blacklisted. */
    private boolean allowed(final String name) {
        return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
    }

    private boolean allowedAttribute(final String name, final String paramName) {
        if (!allowed(name)) {
            return false;
        }
        if (vAllowed.isEmpty()) {
            return true;
        }
        final List<String> atts = vAllowed.get(name);
        return atts != null && atts.contains(paramName);
    }
}
 


 类似资料: