import net.htmlparser.jericho.*; import java.util.*; import java.io.*; import java.net.*; public class ExtractText { public static void main(String[] args) throws Exception { String sourceUrlString="data/test.html"; if (args.length==0) System.err.println("Using default argument of \""+sourceUrlString+'"'); else sourceUrlString=args[0]; if (sourceUrlString.indexOf(':')==-1) sourceUrlString="file:"+sourceUrlString; MicrosoftConditionalCommentTagTypes.register(); PHPTagTypes.register(); PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this example otherwise they override processing instructions MasonTagTypes.register(); Source source=new Source(new URL(sourceUrlString)); // Call fullSequentialParse manually as most of the source will be parsed. source.fullSequentialParse(); System.out.println("Document title:"); String title=getTitle(source); System.out.println(title==null ? "(none)" : title); System.out.println("\nDocument description:"); String description=getMetaValue(source,"description"); System.out.println(description==null ? "(none)" : description); System.out.println("\nDocument keywords:"); String keywords=getMetaValue(source,"keywords"); System.out.println(keywords==null ? "(none)" : keywords); System.out.println("\nLinks to other documents:"); List linkElements=source.getAllElements(HTMLElementName.A); for (Element linkElement : linkElements) { String href=linkElement.getAttributeValue("href"); if (href==null) continue; // A element can contain other tags so need to extract the text from it: String label=linkElement.getContent().getTextExtractor().toString(); System.out.println(label+" <"+href+'>'); } System.out.println("\nAll text from file (exluding content inside SCRIPT and STYLE elements):\n"); System.out.println(source.getTextExtractor().setIncludeAttributes(true).toString()); System.out.println("\nSame again but this time extend the TextExtractor class to also exclude text from P elements and any elements with class=\"control\":\n"); TextExtractor textExtractor=new TextExtractor(source) { public boolean excludeElement(StartTag startTag) { return startTag.getName()==HTMLElementName.P || "control".equalsIgnoreCase(startTag.getAttributeValue("class")); } }; System.out.println(textExtractor.setIncludeAttributes(true).toString()); } private static String getTitle(Source source) { Element titleElement=source.getFirstElement(HTMLElementName.TITLE); if (titleElement==null) return null; // TITLE element never contains other tags so just decode it collapsing whitespace: return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent()); } private static String getMetaValue(Source source, String key) { for (int pos=0; pos