import net.htmlparser.jericho.*; import java.util.*; /** * Provides facilities to sanitise HTML containing unwanted or invalid tags into clean HTML. *

* The sanitation process consists of the following steps: *

*/ public class HTMLSanitiser { private HTMLSanitiser() {} // not instantiable // list of HTML elements that will be retained in the final output: private static final Set VALID_ELEMENT_NAMES=new HashSet(Arrays.asList(new String[] { HTMLElementName.BR, HTMLElementName.P, HTMLElementName.B, HTMLElementName.I, HTMLElementName.OL, HTMLElementName.UL, HTMLElementName.LI, HTMLElementName.A })); // list of HTML attributes that will be retained in the final output: private static final Set VALID_ATTRIBUTE_NAMES=new HashSet(Arrays.asList(new String[] { "id","class","href","target","title" })); private static final Object VALID_MARKER=new Object(); /** * Returns a sanitised version of the specified HTML, encoding any unwanted tags. *

* Calling this method is equivalent to {@link #encodeInvalidMarkup(String,boolean) encodeInvalidMarkup(pseudoHTML,false)}. *

*

*
Example:
*
* * * * *
Method call:
HTMLSanitiser.encodeInvalidMarkup("<P><u>Line   1</u>\n<b>Line   2</b>\n<script>doBadStuff()</script>")
Output:
<p>&lt;u&gt;Line   1&lt;/u&gt;\n<b>Line   2</b>\n&lt;script&gt;doBadStuff()&lt;/script&gt;</p>
Rendered output:

<u>Line 1</u> Line 2 <script>doBadStuff()</script>

* In this example: *
    *
  • The <P> tag is kept and converted to lower case *
  • The optional end tag </p> is added *
  • The <b> element is kept *
  • The unwanted <u> and <script> elements are encoded so that they render verbatim *
*
*
* * @param pseudoHTML The potentially invalid HTML to sanitise. * @return a sanitised version of the specified HTML, encoding any unwanted tags. */ public static String encodeInvalidMarkup(String pseudoHTML) { return encodeInvalidMarkup(pseudoHTML,false); } /** * Returns a sanitised version of the specified HTML, encoding any unwanted tags. *

* Encoding unwanted and invalid tags results in them appearing verbatim in the rendered output, * helping to highlight the problem so that the source HTML can be fixed. *

* Specifying a value of true as an argument to the formatWhiteSpace parameter * results in the formatting of white space as described in the sanitisation process in the class description above. *

*

*
Example:
*
* * * * *
Method call:
HTMLSanitiser.encodeInvalidMarkup("<P><u>Line   1</u>\n<b>Line   2</b>\n<script>doBadStuff()</script>",true)
Output:
<p>&lt;u&gt;Line &nbsp; 1&lt;/u&gt;<br /><b>Line &nbsp; 2</b><br />&lt;script&gt;doBadStuff()&lt;/script&gt;</p>
Rendered output:

<u>Line   1</u>
Line   2
<script>doBadStuff()</script>

* In this example: *
    *
  • The <P> tag is kept and converted to lower case *
  • The optional end tag </p> is added *
  • The <b> element is kept *
  • The unwanted <u> and <script> elements are encoded so that they render verbatim *
  • The line feed characters are converted to <br /> elements *
  • Non-breaking spaces (&nbsp;) are added to ensure the multiple spaces are rendered as they appear in the input. *
*
*
* * @param pseudoHTML The potentially invalid HTML to sanitise. * @param formatWhiteSpace Specifies whether white space should be marked up in the output. * @return a sanitised version of the specified HTML, encoding any unwanted tags. */ public static String encodeInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) { return sanitise(pseudoHTML,formatWhiteSpace,false); } /** * Returns a sanitised version of the specified HTML, stripping any unwanted tags. *

* Calling this method is equivalent to {@link #stripInvalidMarkup(String,boolean) stripInvalidMarkup(pseudoHTML,false)}. *

*

*
Example:
*
* * * * *
Method call:
HTMLSanitiser.stripInvalidMarkup("<P><u>Line   1</u>\n<b>Line   2</b>\n<script>doBadStuff()</script>")
Output:
<p>Line   1\n<b>Line   2</b>\n</p>
Rendered output:

Line 1 Line 2

* In this example: *
    *
  • The <P> tag is kept and converted to lower case *
  • The optional end tag </p> is added *
  • The <b> element is kept *
  • The unwanted <u> and <script> elements are stripped from the output *
*
*
* * @param pseudoHTML The potentially invalid HTML to sanitise. * @return a sanitised version of the specified HTML, stripping any unwanted tags. */ public static String stripInvalidMarkup(String pseudoHTML) { return stripInvalidMarkup(pseudoHTML,false); } /** * Returns a sanitised version of the specified HTML, stripping any unwanted tags. *

* Stripping unwanted and invalid tags is the preferred option if the output is for public consumption. *

* Specifying a value of true as an argument to the formatWhiteSpace parameter * results in the formatting of white space as described in the sanitisation process in the class description above. *

*

*
Example:
*
* * * * *
Method call:
HTMLSanitiser.stripInvalidMarkup("<P><u>Line   1</u>\n<b>Line   2</b>\n<script>doBadStuff()</script>",true)
Output:
<p>Line &nbsp; 1<br /><b>Line &nbsp; 2</b><br /></p>
Rendered output:

Line   1
Line   2

* In this example: *
    *
  • The <P> tag is kept and converted to lower case *
  • The optional end tag </p> is added *
  • The <b> element is kept *
  • The unwanted <u> and <script> elements are stripped from the output *
  • The line feed characters are converted to <br /> elements *
  • Non-breaking spaces (&nbsp;) are added to ensure the multiple spaces are rendered as they appear in the input. *
*
*
* * @param pseudoHTML The potentially invalid HTML to sanitise. * @param formatWhiteSpace Specifies whether white space should be marked up in the output. * @return a sanitised version of the specified HTML, stripping any unwanted tags. */ public static String stripInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) { return sanitise(pseudoHTML,formatWhiteSpace,true); } private static String sanitise(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements) { Source source=new Source(pseudoHTML); source.fullSequentialParse(); OutputDocument outputDocument=new OutputDocument(source); List tags=source.getAllTags(); int pos=0; for (Tag tag : tags) { if (tag.getBegin()=end) return; Segment textSegment=new Segment(source,begin,end); String decodedText=CharacterReference.decode(textSegment); String encodedText=formatWhiteSpace ? CharacterReference.encodeWithWhiteSpaceFormatting(decodedText) : CharacterReference.encode(decodedText); outputDocument.replace(textSegment,encodedText); } private static CharSequence getStartTagHTML(StartTag startTag) { // tidies and filters out non-approved attributes StringBuilder sb=new StringBuilder(); sb.append('<').append(startTag.getName()); for (Attribute attribute : startTag.getAttributes()) { if (VALID_ATTRIBUTE_NAMES.contains(attribute.getKey())) { sb.append(' ').append(attribute.getName()); if (attribute.getValue()!=null) { sb.append("=\""); sb.append(CharacterReference.encode(attribute.getValue())); sb.append('"'); } } } if (startTag.getElement().getEndTag()==null && !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) sb.append(" /"); sb.append('>'); return sb; } private static String getEndTagHTML(String tagName) { return "'; } ////////////////////////////////////////////////////////////////////////////////////// // THE METHODS BELOW ARE USED ONLY FOR DEMONSTRATING THE FUNCTIONALITY OF THE CLASS // ////////////////////////////////////////////////////////////////////////////////////// // See test/src/samples/HTMLSanitiserTest.java for a comprehensive test suite. public static void main(String[] args) throws Exception { System.out.println("Examples of HTMLSanitiser.encodeInvalidMarkup:"); System.out.println("----------------------------------------------\n"); displayEncodeInvalidMarkup("ab & c","encode text"); displayEncodeInvalidMarkup("abc def geh"," element not allowed"); displayEncodeInvalidMarkup("

abc","add optional end tag"); displayEncodeInvalidMarkup("","remove potentially dangerous script"); displayEncodeInvalidMarkup("

abc

","keep approved attributes but strip non-approved attributes"); displayEncodeInvalidMarkup("

abc

","tidy up attributes to make them XHTML compliant"); displayEncodeInvalidMarkup("List:
  • A
  • B
  • C
","inserts optional end tags"); System.out.println("Examples of HTMLSanitiser.stripInvalidMarkup:"); System.out.println("---------------------------------------------\n"); displayStripInvalidMarkup("ab & c","encode text"); displayStripInvalidMarkup("abc def geh"," element not allowed"); displayStripInvalidMarkup("

abc","add optional end tag"); displayStripInvalidMarkup("abcgeh","remove potentially dangerous script"); displayStripInvalidMarkup("

abc

","keep approved attributes but strip non-approved attributes"); displayStripInvalidMarkup("

abc

","tidy up attributes to make them XHTML compliant"); displayStripInvalidMarkup("List:
  • A
  • B
  • C
","inserts optional end tags"); displayStripInvalidMarkup("List:
  • A
  • B
  • C","missing required
      or
        element"); displayStripInvalidMarkup("List:
        • A
        • B
        • C
        ","
      1. is invalid as it is not directly under
          or
            "); System.out.println("Examples of HTMLSanitiser.stripInvalidMarkup with formatWhiteSpace=true:"); System.out.println("------------------------------------------------------------------------\n"); displayStripInvalidMarkup("abc\ndef",true,"convert LF to
            "); displayStripInvalidMarkup(" abc",true,"ensure consecutive spaces are rendered"); displayStripInvalidMarkup("\tabc",true,"convert TAB to equivalent of four spaces"); } private static void displayEncodeInvalidMarkup(String input, String explanation) { display(input,explanation,HTMLSanitiser.encodeInvalidMarkup(input)); } private static void displayStripInvalidMarkup(String input, String explanation) { display(input,explanation,HTMLSanitiser.stripInvalidMarkup(input)); } private static void displayStripInvalidMarkup(String input, boolean formatWhiteSpace, String explanation) { display(input,explanation,HTMLSanitiser.stripInvalidMarkup(input,formatWhiteSpace)); } private static void display(String input, String explanation, String output) { System.out.println(explanation+":\ninput : "+input+"\noutput: "+output+"\n"); } }