Search This Blog

Tuesday, September 11, 2018

JSoup - HTML Parser for image, audio, and video attributes

SAMPLE

package com.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

public class HtmlParser
{
public static String html = "<html><head><title>Sample Title</title></head>"
         + "<body>"
         + "<p>Sample Content1</p>"              
         + "<img name='pic1' id='picid1' src='test.jpg' />"  
         + "<p>Sample Content2</p>"
         + "<p>Sample Content3</p>"
         + "<img name='pic2' id='picid2' src='test2.jpg' />"
         + "<video width='320' height='240' controls>"
        + "<source id='1' src='movie.mp4' type='video/mp4'>"
        + " <source  id='2' src='movie.ogg' type='video/ogg'>"
        + "<Br/>Your browser does not support the video tag."
        + "</video>"
        + "<audio controls>"
        + "<source id='1' src='horse.ogg' type='audio/ogg'>"
        + "<source id='2' src='horse.mp3' type='audio/mpeg'>"
        + "<Br/>Your browser does not support the audio tag."
        + "</audio>"
        + "<p><a href='http://example.com/'" + " onclick='checkData()'>Link</a></p>"
         +"</body></html>";


/*output
Initial HTML: <p><a href='http://example.com/' onclick='checkData()'>Link</a></p>
Cleaned HTML: <p><a href="http://example.com/" rel="nofollow">Link</a></p>*/
public static void safeGuardHtmlSanitize ()
{
System.out.println ( "Initial HTML: " + html );
String safeHtml = Jsoup.clean ( html, Whitelist.basic () );
System.out.println ( "Cleaned HTML: " + safeHtml );
}

public static void main ( String [] args )
{

Document document = Jsoup.parse ( html );
// img with src ending .png
Elements imgs = document.select ( "img" );
for ( Element img : imgs )
{
System.out.println ( "Name: " + img.attr ( "name" ) + " id: " + img.id () + " src: " + img.attr ( "src" ) );
//to replace the value existing
img.attr ( "src", "replacedImage.jpg" ) ;
}
System.out.println ( "\n\n" );

Elements videos = document.select ( "video" );
Elements videoSrc = videos.select ( "source" );
for ( Element vSrc : videoSrc )
{
System.out.println ( " id: " + vSrc.id () + " src: " + vSrc.attr ( "src" ) + " type: " + vSrc.attr ( "type" ));
}
System.out.println ( "\n\n" );

Elements audios = document.select ( "audio" );
Elements audioSrc = audios.select ( "source" );
for ( Element aSrc : audioSrc )
{
System.out.println ( " id: " + aSrc.id () + " src: " + aSrc.attr ( "src" ) + " type: " + aSrc.attr ( "type" ));
}
System.out.println ( "\n\n" );

safeGuardHtmlSanitize();
}
}



OUTPUT

Name: pic1 id: picid1 src: test.jpg
Name: pic2 id: picid2 src: test2.jpg



 id: 1 src: movie.mp4 type: video/mp4
 id: 2 src: movie.ogg type: video/ogg



 id: 1 src: horse.ogg type: audio/ogg
 id: 2 src: horse.mp3 type: audio/mpeg



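Initial HTML: <html><head><title>Sample Title</title></head><body><p>Sample Content1</p><img name='pic1' id='picid1' src='test.jpg' /><p>Sample Content2</p><p>Sample Content3</p><img name='pic2' id='picid2' src='test2.jpg' /><video width='320' height='240' controls><source id='1' src='movie.mp4' type='video/mp4'> <source  id='2' src='movie.ogg' type='video/ogg'><Br/>Your browser does not support the video tag.</video><audio controls><source id='1' src='horse.ogg' type='audio/ogg'><source id='2' src='horse.mp3' type='audio/mpeg'><Br/>Your browser does not support the audio tag.</audio><p><a href='http://example.com/' onclick='checkData()'>Link</a></p></body></html>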
Cleaned HTML: Sample Title
Sample Content1
Sample Content2
Sample Content3


Your browser does not support the video tag.

Your browser does not support the audio tag.
Link



No comments:

Hit Counter


View My Stats