2

Scenario/Requirement:

Download html page from some URL

Download images that were mentioned in html tags.

Change tags for images in my file, so I can open it with my browser offline and see them.

I made first 2 points, but am having difficulties with the third one.Tags do not change.What am I doing wrong?

The job is to open a file, find img src tag and replace it by another tag! Can you give me an example?

Code:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;

import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.net.URLConnection;
import javax.imageio.ImageIO;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTMLDocument;


public class ExtractAllImages {
static String result_doc =  "/home/foo/index.html";
static String home_folder = "/home/foo/";
static String start_webURL = "http://www.oracle.com/";


public static void main(String args[]) throws Exception {

    String webUrl = start_webURL;
    URL url = new URL(webUrl);
    URLConnection connection = url.openConnection();
    InputStream is = connection.getInputStream();
    InputStreamReader isr = new InputStreamReader(is);
    BufferedReader br = new BufferedReader(isr);

    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

    HTMLEditorKit.Parser parser = new ParserDelegator();
    HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);



    parser.parse(br, callback, true);
    FileWriter writer = new FileWriter(result_doc);
    htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength());
    writer.close();

    int number_or_images = 0;
    String[] array = new String[4096];



    for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) {
        AttributeSet attributes = iterator.getAttributes();
        String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC);
        System.out.println("img_src = " + imgSrc);


        if (imgSrc != null && (imgSrc.endsWith(".jpg") || (imgSrc.endsWith(".png")) || (imgSrc.endsWith(".jpeg")) || (imgSrc.endsWith(".bmp")) || (imgSrc.endsWith(".ico")))) {
            try {
                downloadImage(webUrl, imgSrc);
            } catch (IOException ex) {
                System.out.println(ex.getMessage());
            }
        }
        array[number_or_images] = imgSrc;
        number_or_images++;
        ///TODO change


    }

    for(int i =0; i < number_or_images; i++)
    {
       System.out.println("before = "+array[i]);
        while(true)
        {
            int count = array[i].indexOf('/');
            if(count == -1) break;
            array[i] = array[i].substring(count+1);
        }
        System.out.println("after = " + array[i]);
    }


    //TODO open file and replace tags
    int i =0;
   File input = new File(result_doc);
    Document doc = Jsoup.parse(input, "UTF-8");
    System.out.println( input.canWrite());
    for( Element img : doc.select("img[src]") )
    {
        String s = img.attr("src");
        System.out.println(s);

        img.attr("src", "/home/foo/"+array[i]); // set attribute 'src' to 'your-source-here'
        s = img.attr("src");
        System.out.println(s);
        ++i;
    }








}
private static void downloadImage(String url, String imgSrc) throws IOException {
    BufferedImage image = null;
    try {
        if (!(imgSrc.startsWith("http"))) {
            url = url + imgSrc;
        } else {
            url = imgSrc;
        }
        imgSrc = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
        String imageFormat = null;
        imageFormat = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
        String imgPath = null;
        imgPath = home_folder + imgSrc + "";
        URL imageUrl = new URL(url);
        image = ImageIO.read(imageUrl);
        if (image != null) {
            File file = new File(imgPath);
            ImageIO.write(image, imageFormat, file);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }

}
}
3

2 Answers 2

3

Solved. I didn't save changes. Need to add code befire "downloadImage()"

    int i = 0;
    File input = new File(result_doc);
    Document doc = Jsoup.parse(input, "UTF-8");
    for( Element img : doc.select("img[src]") ) {
        img.attr("src",array[i]); // set attribute 'src' to 'your-source-here'
        ++i;
    }

    try {
        String strmb = doc.outerHtml();
        bw = new BufferedWriter(new FileWriter(result_doc));
        bw.write(strmb);
        bw.close();
    }
    catch (Exception ex) {
        System.out.println("Program stopped. The problem is " + "\"" +      
     ex.getMessage()+"\"");
    }
Sign up to request clarification or add additional context in comments.

Comments

1

You can go with JSOUP Try something like below


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; 
 public static void getAllTags(){
        try {
            File input=new File("H:\\html pages\\index1.html");
            Document document=Jsoup.parse(input, "UTF-8");
            Document parse=Jsoup.parse(document.html());   

            Elements body=parse.select("body");
            Elements bodyTags=body.select("*");

            for (Element element : bodyTags) {
                  //Do what you want with tag
                System.out.println(element.tagName());

            }   
        } catch (Exception e) {
            e.printStackTrace();
        }

If you want to parse html then try this

public static void parseHTML(){
try {
            File input = new File("H:\\html\\index1.html");
            Document document = Jsoup.parse(input, "UTF-8");
            Document parse = Jsoup.parse(document.html());

            Elements bodyElements = parse.select("div");
            Elements elements = bodyElements.select("*");
            for (Element element : elements) {
                FilterHtml.setHtmlTAG(element.tagName());
                FilterHtml.ParseXml();

                Elements body = bodyElements.select(FilterHtml.getXmlTAG());
                if (body.is(FilterHtml.getXmlTAG())) {
                    Elements tag = parse.select(FilterHtml.getXmlTAG());
                    //Do something meaning full with tag
                    System.out.println(tag.text());
                }                

            }
        } catch (Exception e) {
            e.printStackTrace();
        }
}   

Hope this would help. if yes please mark it green.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.