Saturday 19 March 2016

Reading multiple HTML files from a folder, parsing them, and storing the results in a CSV file

Standard


Image result for jsoup
Jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.

Example :

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class HTMLParser {

public static void main(String[] args) {
Document doc = null;

{
FileWriter writer =null;
System.out.println("--------------------------Program started--------------------------");
try {
writer = new FileWriter("c:\\Temp\\abc_search_data.csv");

       File input = new File("C:\\My Web     Sites\\http___www.abcd.com_\\www.abcd.com\\library\\html\\");

File[] st = input.listFiles();

for (int i = 0; i < st.length; i++) {
if (st[i].isFile()) {// other condition like name ends in
// html

// parse(st[i]);

doc = Jsoup.parse(st[i], null);

// get page title
String title = doc.title();
System.out.println("title : " +"["+i+"]"+ title);
//
String ownText = doc.body().ownText();
String text = doc.body().text();
//
//System.out.println("ownText" + ownText + "\n");
//System.out.println("text" + text);
//
 
     writer.append("title : "+"["+i+"]");
     writer.append(',');
     writer.append(title);
     writer.append('\n');

    /* writer.append("ownText");
     writer.append(',');
     writer.append(ownText);
     writer.append('\n');*/
     
     writer.append("text : "+"["+i+"]");
     writer.append(',');
     writer.append(text);
       writer.append('\n');

}
}
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
writer.flush();
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
System.out.println("--------------------------Program End--------------------------");
}

}

0 comments:

Post a Comment