(Arrays.asList(new String[] {
"id","class","href","target","title"
}));
// Shared sentinel instance. A bare Object carries no data, so it can only be told apart by reference identity.
// NOTE(review): no use of this field is visible in this part of the file - presumably it flags
// tags that were accepted by the sanitiser; confirm against the code that reads it.
private static final Object VALID_MARKER=new Object();
/**
 * Sanitises the specified HTML, encoding any unwanted tags rather than removing them.
 * <p>
 * This is a convenience overload; calling it is equivalent to
 * {@link #encodeInvalidMarkup(String,boolean) encodeInvalidMarkup(pseudoHTML,false)}.
 * <p>
 * Example:
 * <dl>
 *  <dt>Method call</dt>
 *  <dd>{@code HTMLSanitiser.encodeInvalidMarkup("<P><u>Line 1</u>\n<b>Line 2</b>\n<script>doBadStuff()</script>")}</dd>
 *  <dt>Output</dt>
 *  <dd>{@code <p>&lt;u&gt;Line 1&lt;/u&gt;\n<b>Line 2</b>\n&lt;script&gt;doBadStuff()&lt;/script&gt;</p>}</dd>
 *  <dt>Rendered output</dt>
 *  <dd>{@code <u>Line 1</u>} <b>Line 2</b> {@code <script>doBadStuff()</script>}</dd>
 * </dl>
 * In this example:
 * <ul>
 *  <li>The {@code <P>} tag is kept and converted to lower case</li>
 *  <li>The optional end tag {@code </p>} is added</li>
 *  <li>The {@code <b>} element is kept</li>
 *  <li>The unwanted {@code <u>} and {@code <script>} elements are encoded so that they render verbatim</li>
 * </ul>
 *
 * @param pseudoHTML  the potentially invalid HTML to sanitise.
 * @return a sanitised version of the specified HTML, with any unwanted tags encoded.
 */
public static String encodeInvalidMarkup(String pseudoHTML) {
	// White space formatting is switched off for the single-argument form.
	final boolean formatWhiteSpace=false;
	return encodeInvalidMarkup(pseudoHTML,formatWhiteSpace);
}
/**
 * Sanitises the specified HTML, encoding any unwanted tags rather than removing them.
 * <p>
 * Encoding unwanted and invalid tags makes them appear verbatim in the rendered output,
 * which helps to highlight the problem so that the source HTML can be fixed.
 * <p>
 * Passing {@code true} for the {@code formatWhiteSpace} parameter results in the formatting
 * of white space as described in the sanitisation process in the class description.
 * <p>
 * Example:
 * <dl>
 *  <dt>Method call</dt>
 *  <dd>{@code HTMLSanitiser.encodeInvalidMarkup("<P><u>Line 1</u>\n<b>Line 2</b>\n<script>doBadStuff()</script>",true)}</dd>
 *  <dt>Output</dt>
 *  <dd>{@code <p>&lt;u&gt;Line 1&lt;/u&gt;<br /><b>Line 2</b><br />&lt;script&gt;doBadStuff()&lt;/script&gt;</p>}</dd>
 *  <dt>Rendered output</dt>
 *  <dd>{@code <u>Line 1</u>}<br /><b>Line 2</b><br />{@code <script>doBadStuff()</script>}</dd>
 * </dl>
 * In this example:
 * <ul>
 *  <li>The {@code <P>} tag is kept and converted to lower case</li>
 *  <li>The optional end tag {@code </p>} is added</li>
 *  <li>The {@code <b>} element is kept</li>
 *  <li>The unwanted {@code <u>} and {@code <script>} elements are encoded so that they render verbatim</li>
 *  <li>The line feed characters are converted to {@code <br />} elements</li>
 *  <li>Non-breaking spaces ({@code &nbsp;}) are added to ensure that multiple spaces are rendered as they appear in the input</li>
 * </ul>
 *
 * @param pseudoHTML  the potentially invalid HTML to sanitise.
 * @param formatWhiteSpace  specifies whether white space should be marked up in the output.
 * @return a sanitised version of the specified HTML, with any unwanted tags encoded.
 */
public static String encodeInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
	// "Encode" mode: invalid elements are not stripped.
	final boolean stripInvalidElements=false;
	return sanitise(pseudoHTML,formatWhiteSpace,stripInvalidElements);
}
/**
 * Sanitises the specified HTML, stripping any unwanted tags from the output.
 * <p>
 * This is a convenience overload; calling it is equivalent to
 * {@link #stripInvalidMarkup(String,boolean) stripInvalidMarkup(pseudoHTML,false)}.
 * <p>
 * Example:
 * <dl>
 *  <dt>Method call</dt>
 *  <dd>{@code HTMLSanitiser.stripInvalidMarkup("<P><u>Line 1</u>\n<b>Line 2</b>\n<script>doBadStuff()</script>")}</dd>
 *  <dt>Output</dt>
 *  <dd>{@code <p>Line 1\n<b>Line 2</b>\n</p>}</dd>
 *  <dt>Rendered output</dt>
 *  <dd>Line 1 <b>Line 2</b></dd>
 * </dl>
 * In this example:
 * <ul>
 *  <li>The {@code <P>} tag is kept and converted to lower case</li>
 *  <li>The optional end tag {@code </p>} is added</li>
 *  <li>The {@code <b>} element is kept</li>
 *  <li>The unwanted {@code <u>} and {@code <script>} elements are stripped from the output</li>
 * </ul>
 *
 * @param pseudoHTML  the potentially invalid HTML to sanitise.
 * @return a sanitised version of the specified HTML, with any unwanted tags stripped.
 */
public static String stripInvalidMarkup(String pseudoHTML) {
	// White space formatting is switched off for the single-argument form.
	final boolean formatWhiteSpace=false;
	return stripInvalidMarkup(pseudoHTML,formatWhiteSpace);
}
/**
 * Sanitises the specified HTML, stripping any unwanted tags from the output.
 * <p>
 * Stripping unwanted and invalid tags is the preferred option if the output is for public consumption.
 * <p>
 * Passing {@code true} for the {@code formatWhiteSpace} parameter results in the formatting
 * of white space as described in the sanitisation process in the class description.
 * <p>
 * Example:
 * <dl>
 *  <dt>Method call</dt>
 *  <dd>{@code HTMLSanitiser.stripInvalidMarkup("<P><u>Line 1</u>\n<b>Line 2</b>\n<script>doBadStuff()</script>",true)}</dd>
 *  <dt>Output</dt>
 *  <dd>{@code <p>Line 1<br /><b>Line 2</b><br /></p>}</dd>
 *  <dt>Rendered output</dt>
 *  <dd>Line 1<br /><b>Line 2</b><br /></dd>
 * </dl>
 * In this example:
 * <ul>
 *  <li>The {@code <P>} tag is kept and converted to lower case</li>
 *  <li>The optional end tag {@code </p>} is added</li>
 *  <li>The {@code <b>} element is kept</li>
 *  <li>The unwanted {@code <u>} and {@code <script>} elements are stripped from the output</li>
 *  <li>The line feed characters are converted to {@code <br />} elements</li>
 *  <li>Non-breaking spaces ({@code &nbsp;}) are added to ensure that multiple spaces are rendered as they appear in the input</li>
 * </ul>
 *
 * @param pseudoHTML  the potentially invalid HTML to sanitise.
 * @param formatWhiteSpace  specifies whether white space should be marked up in the output.
 * @return a sanitised version of the specified HTML, with any unwanted tags stripped.
 */
public static String stripInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
	// "Strip" mode: invalid elements are removed instead of being encoded.
	final boolean stripInvalidElements=true;
	return sanitise(pseudoHTML,formatWhiteSpace,stripInvalidElements);
}
// Shared implementation behind encodeInvalidMarkup (stripInvalidElements=false) and
// stripInvalidMarkup (stripInvalidElements=true).
// Parses pseudoHTML with a full sequential parse, wraps the parsed Source in an OutputDocument
// and walks every tag returned by Source.getAllTags().
//
// NOTE(review): the text of this block is damaged and cannot compile as it stands:
//  - "List tags" is a raw type although the loop declares its element as Tag
//    (a type argument seems to have been lost);
//  - "tag.getBegin()=end" assigns to the result of a method call, and neither "begin" nor
//    "end" is declared anywhere in this method;
//  - a bare "return;" appears although the method is declared to return String;
//  - the statements after that "if" read like the tail of a separate helper that re-encodes
//    the text between two offsets, so the remainder of the loop body, the final return and
//    that helper's signature are presumably missing.
// TODO restore this block from a pristine copy of the file before changing anything here.
private static String sanitise(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements) {
Source source=new Source(pseudoHTML);
source.fullSequentialParse();
OutputDocument outputDocument=new OutputDocument(source);
List tags=source.getAllTags();
int pos=0;
for (Tag tag : tags) {
// NOTE(review): corrupted line - see the note above the method.
if (tag.getBegin()=end) return;
// Decode the text between the two offsets, then encode it again (optionally with white
// space formatting) and substitute the result for the original text in the output document.
Segment textSegment=new Segment(source,begin,end);
String decodedText=CharacterReference.decode(textSegment);
String encodedText=formatWhiteSpace ? CharacterReference.encodeWithWhiteSpaceFormatting(decodedText) : CharacterReference.encode(decodedText);
outputDocument.replace(textSegment,encodedText);
}
/**
 * Builds tidied markup for the given start tag, keeping only approved attributes.
 * <p>
 * An attribute is kept when its key is contained in VALID_ATTRIBUTE_NAMES. Kept attributes are
 * written with their name and, if they have a value, that value re-encoded and wrapped in
 * double quotes. A trailing " /" is added when the element has no end tag and its name is not
 * one of the elements whose end tag is optional.
 *
 * @param startTag  the start tag to rebuild.
 * @return the rebuilt start tag markup.
 */
private static CharSequence getStartTagHTML(StartTag startTag) {
	final String elementName=startTag.getName();
	final StringBuilder html=new StringBuilder();
	html.append('<');
	html.append(elementName);
	for (Attribute attr : startTag.getAttributes()) {
		// Anything that is not on the approved list is silently dropped.
		if (!VALID_ATTRIBUTE_NAMES.contains(attr.getKey())) {
			continue;
		}
		html.append(' ');
		html.append(attr.getName());
		final String value=attr.getValue();
		if (value==null) {
			// Attribute without a value: only its name is written.
			continue;
		}
		html.append("=\"");
		html.append(CharacterReference.encode(value));
		html.append('"');
	}
	final boolean endTagMissing=startTag.getElement().getEndTag()==null;
	if (endTagMissing && !HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
		html.append(" /");
	}
	html.append('>');
	return html;
}
/**
 * Builds the markup of an end tag for the given element name, e.g. {@code </p>} for {@code "p"}.
 * <p>
 * The name is used exactly as passed in; no case conversion or validation is applied here.
 *
 * @param tagName  the element name to close.
 * @return the end tag markup: {@code "</"}, followed by the name, followed by {@code ">"}.
 */
private static String getEndTagHTML(String tagName) {
	// The "</" prefix is what makes this an end tag; without it only "name>" would be emitted.
	return "</"+tagName+'>';
}
//////////////////////////////////////////////////////////////////////////////////////
// THE METHODS BELOW ARE USED ONLY FOR DEMONSTRATING THE FUNCTIONALITY OF THE CLASS //
//////////////////////////////////////////////////////////////////////////////////////
// See test/src/samples/HTMLSanitiserTest.java for a comprehensive test suite.
// Demo entry point: prints a heading for each group of examples and then passes sample inputs,
// each with a short explanation, to the display helpers defined below. The helpers run the
// corresponding sanitiser method and print the input, the explanation and the output.
//
// NOTE(review): several string literals in this method are split across physical lines, which
// Java does not allow for ordinary string literals, and some explanations begin with a stray
// space or dash (" element not allowed", "- is invalid ..."). It looks as if markup was removed
// from the sample inputs and explanations - TODO restore them from a pristine copy of the file.
public static void main(String[] args) throws Exception {
System.out.println("Examples of HTMLSanitiser.encodeInvalidMarkup:");
System.out.println("----------------------------------------------\n");
displayEncodeInvalidMarkup("ab & c","encode text");
displayEncodeInvalidMarkup("abc def geh"," element not allowed");
displayEncodeInvalidMarkup("abc","add optional end tag");
displayEncodeInvalidMarkup("","remove potentially dangerous script");
displayEncodeInvalidMarkup("
abc
","keep approved attributes but strip non-approved attributes");
displayEncodeInvalidMarkup("abc
","tidy up attributes to make them XHTML compliant");
displayEncodeInvalidMarkup("List:","inserts optional end tags");
System.out.println("Examples of HTMLSanitiser.stripInvalidMarkup:");
System.out.println("---------------------------------------------\n");
displayStripInvalidMarkup("ab & c","encode text");
displayStripInvalidMarkup("abc def geh"," element not allowed");
displayStripInvalidMarkup("abc","add optional end tag");
displayStripInvalidMarkup("abcgeh","remove potentially dangerous script");
displayStripInvalidMarkup("
abc
","keep approved attributes but strip non-approved attributes");
displayStripInvalidMarkup("abc
","tidy up attributes to make them XHTML compliant");
displayStripInvalidMarkup("List:","inserts optional end tags");
displayStripInvalidMarkup("List:ABC","missing required or element");
displayStripInvalidMarkup("List:","- is invalid as it is not directly under
or ");
System.out.println("Examples of HTMLSanitiser.stripInvalidMarkup with formatWhiteSpace=true:");
System.out.println("------------------------------------------------------------------------\n");
displayStripInvalidMarkup("abc\ndef",true,"convert LF to
");
displayStripInvalidMarkup(" abc",true,"ensure consecutive spaces are rendered");
displayStripInvalidMarkup("\tabc",true,"convert TAB to equivalent of four spaces");
}
// Demo helper: sanitises the input with encodeInvalidMarkup and prints it next to the explanation.
private static void displayEncodeInvalidMarkup(String input, String explanation) {
	final String output=HTMLSanitiser.encodeInvalidMarkup(input);
	display(input,explanation,output);
}
// Demo helper: sanitises the input with stripInvalidMarkup and prints it next to the explanation.
private static void displayStripInvalidMarkup(String input, String explanation) {
	final String output=HTMLSanitiser.stripInvalidMarkup(input);
	display(input,explanation,output);
}
// Demo helper: as the two-argument form, but lets the caller choose whether white space is formatted.
private static void displayStripInvalidMarkup(String input, boolean formatWhiteSpace, String explanation) {
	final String output=HTMLSanitiser.stripInvalidMarkup(input,formatWhiteSpace);
	display(input,explanation,output);
}
// Prints one demo record to standard output: the explanation, then the input and the output on
// their own labelled lines, followed by a blank line.
private static void display(String input, String explanation, String output) {
	final StringBuilder report=new StringBuilder();
	report.append(explanation);
	report.append(":\ninput : ");
	report.append(input);
	report.append("\noutput: ");
	report.append(output);
	report.append("\n");
	System.out.println(report.toString());
}
}