SAMPLE
package com.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
/**
 * Small jsoup demonstration.
 *
 * <p>{@link #main(String[])} parses the sample document held in {@link #html},
 * prints the attributes of its {@code img} elements and of the {@code source}
 * children of its {@code video} and {@code audio} elements, and finally calls
 * {@link #safeGuardHtmlSanitize()} to print the document before and after
 * {@code Jsoup.clean}.
 */
public class HtmlParser
{
    /**
     * Sample HTML document used by every method of this class.
     * Left public and non-final so existing code that reads or reassigns it keeps working.
     */
    public static String html = "<html><head><title>Sample Title</title></head>"
            + "<body>"
            + "<p>Sample Content1</p>"
            + "<img name='pic1' id='picid1' src='test.jpg' />"
            + "<p>Sample Content2</p>"
            + "<p>Sample Content3</p>"
            + "<img name='pic2' id='picid2' src='test2.jpg' />"
            + "<video width='320' height='240' controls>"
            + "<source id='1' src='movie.mp4' type='video/mp4'>"
            + " <source id='2' src='movie.ogg' type='video/ogg'>"
            + "<Br/>Your browser does not support the video tag."
            + "</video>"
            + "<audio controls>"
            + "<source id='1' src='horse.ogg' type='audio/ogg'>"
            + "<source id='2' src='horse.mp3' type='audio/mpeg'>"
            + "<Br/>Your browser does not support the audio tag."
            + "</audio>"
            + "<p><a href='http://example.com/' onclick='checkData()'>Link</a></p>"
            + "</body></html>";

    /**
     * Prints {@link #html} unchanged, then prints the result of passing it through
     * {@code Jsoup.clean} with {@code Whitelist.basic()}.
     *
     * <p>Both lines cover the whole sample document. The excerpt below (carried over
     * from the original notes) illustrates the effect on the anchor element only:
     * <pre>
     * Initial HTML: &lt;p&gt;&lt;a href='http://example.com/' onclick='checkData()'&gt;Link&lt;/a&gt;&lt;/p&gt;
     * Cleaned HTML: &lt;p&gt;&lt;a href="http://example.com/" rel="nofollow"&gt;Link&lt;/a&gt;&lt;/p&gt;
     * </pre>
     *
     * <p>NOTE(review): recent jsoup releases replace {@code Whitelist} with
     * {@code Safelist} - confirm against the jsoup version this project builds with.
     */
    public static void safeGuardHtmlSanitize ()
    {
        System.out.println ( "Initial HTML: " + html );
        String safeHtml = Jsoup.clean ( html, Whitelist.basic () );
        System.out.println ( "Cleaned HTML: " + safeHtml );
    }

    /**
     * Prints the id, src and type attributes of every {@code source} element found
     * under the elements matching {@code mediaTag}.
     *
     * @param document parsed document to search
     * @param mediaTag selector for the container elements ("video" or "audio" in this demo)
     */
    private static void printSources ( Document document, String mediaTag )
    {
        Elements sources = document.select ( mediaTag ).select ( "source" );
        for ( Element source : sources )
        {
            System.out.println ( " id: " + source.id () + " src: " + source.attr ( "src" ) + " type: " + source.attr ( "type" ) );
        }
    }

    /**
     * Entry point of the demo: parses {@link #html}, lists the image and media-source
     * attributes, then runs {@link #safeGuardHtmlSanitize()}.
     *
     * @param args command-line arguments; not used
     */
    public static void main ( String [] args )
    {
        Document document = Jsoup.parse ( html );

        // Every img element is selected; no filter on the src extension is applied.
        Elements imgs = document.select ( "img" );
        for ( Element img : imgs )
        {
            System.out.println ( "Name: " + img.attr ( "name" ) + " id: " + img.id () + " src: " + img.attr ( "src" ) );
            // Overwrites src in the parsed document only; this demo never prints the modified value.
            img.attr ( "src", "replacedImage.jpg" );
        }
        System.out.println ( "\n\n" );

        printSources ( document, "video" );
        System.out.println ( "\n\n" );

        printSources ( document, "audio" );
        System.out.println ( "\n\n" );

        // Sanitizes the original html string, not the document modified above.
        safeGuardHtmlSanitize ();
    }
}
OUTPUT
Name: pic1 id: picid1 src: test.jpg
Name: pic2 id: picid2 src: test2.jpg
id: 1 src: movie.mp4 type: video/mp4
id: 2 src: movie.ogg type: video/ogg
id: 1 src: horse.ogg type: audio/ogg
id: 2 src: horse.mp3 type: audio/mpeg
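Initial HTML: <html><head><title>Sample Title</title></head><body><p>Sample Content1</p><img name='pic1' id='picid1' src='test.jpg' /><p>Sample Content2</p><p>Sample Content3</p><img name='pic2' id='picid2' src='test2.jpg' /><video width='320' height='240' controls><source id='1' src='movie.mp4' type='video/mp4'> <source id='2' src='movie.ogg' type='video/ogg'><Br/>Your browser does not support the video tag.</video><audio controls><source id='1' src='horse.ogg' type='audio/ogg'><source id='2' src='horse.mp3' type='audio/mpeg'><Br/>Your browser does not support the audio tag.</audio><p><a href='http://example.com/' onclick='checkData()'>Link</a></p></body></html>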
Cleaned HTML: Sample Title
Sample Content1
Sample Content2
Sample Content3
Your browser does not support the video tag.
Your browser does not support the audio tag.
Link