Jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.
Example:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Extracts the title and body text of every regular file in a directory with jsoup and
 * writes the results to a CSV file, two rows per input file.
 *
 * <p>Each input file produces a {@code title : [i]} row and a {@code text : [i]} row, where
 * {@code i} is the file's index in the directory listing.
 */
public class HTMLParser {

    /** Directory scanned when no input directory is supplied on the command line. */
    private static final String DEFAULT_INPUT_DIR =
            "C:\\My Web Sites\\http___www.abcd.com_\\www.abcd.com\\library\\html\\";

    /** CSV file written when no output path is supplied on the command line. */
    private static final String DEFAULT_OUTPUT_CSV = "c:\\Temp\\abc_search_data.csv";

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} overrides the input directory and
     *             {@code args[1]} overrides the output CSV path. With no arguments the
     *             built-in default paths are used.
     */
    public static void main(String[] args) {
        String inputDir = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_DIR;
        String outputCsv = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_CSV;

        System.out.println("--------------------------Program started--------------------------");

        // listFiles() returns null (not an empty array) when the path is missing or is not
        // a directory. Check before opening the writer so a wrong input path does not
        // truncate an existing output file.
        File[] entries = new File(inputDir).listFiles();
        if (entries == null) {
            System.err.println("Cannot list input directory: " + inputDir);
        } else {
            // try-with-resources closes (and therefore flushes) the writer on every path,
            // and never touches it if the constructor itself failed.
            try (FileWriter writer = new FileWriter(outputCsv)) {
                for (int i = 0; i < entries.length; i++) {
                    if (entries[i].isFile()) {
                        writeRows(writer, i, entries[i]);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        System.out.println("--------------------------Program End--------------------------");
    }

    /**
     * Parses one HTML file and appends its title row and text row to the CSV output.
     *
     * <p>A file that cannot be read or parsed is reported and skipped so the remaining
     * files are still processed; failures while writing the output are propagated.
     *
     * @param writer destination CSV writer
     * @param index  position of the file in the directory listing, used as the row label
     * @param html   the HTML file to parse
     * @throws IOException if writing to {@code writer} fails
     */
    private static void writeRows(FileWriter writer, int index, File html) throws IOException {
        Document doc;
        try {
            // A null charset lets jsoup determine the encoding itself.
            doc = Jsoup.parse(html, null);
        } catch (IOException e) {
            System.err.println("Skipping unreadable file: " + html);
            e.printStackTrace();
            return;
        }

        String title = doc.title();
        String text = doc.body().text();
        System.out.println("title : " + "[" + index + "]" + title);

        writer.append("title : " + "[" + index + "]").append(',').append(csvField(title)).append('\n');
        writer.append("text : " + "[" + index + "]").append(',').append(csvField(text)).append('\n');
    }

    /**
     * Returns {@code value} in a form that is safe to use as a single CSV field.
     *
     * <p>Values containing a comma, double quote, or line break are wrapped in double
     * quotes with embedded quotes doubled; other values are returned unchanged.
     *
     * @param value raw field content; {@code null} is treated as an empty field
     * @return the escaped field
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}