Wednesday, January 30, 2013

Web scraping using Jsoup

Download latest jsoup jar file (Download Link).

Compile and run the code with the jsoup jar on the class path, for example:

javac -cp "C:\jsoup-1.7.1.jar"  "TestClass.java"

java  -cp  ".;C:\jsoup-1.7.1.jar"  TestClass

A simple example that uses Jsoup to log in to a server with login credentials and then retrieve a specific page.

[sourcecode language="java"]

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.Connection;
import java.io.IOException;
import org.jsoup.Connection.Method;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class TestClass
{
public static void main(String args[]) throws IOException
{

Document doc = Jsoup.connect("<URL>").get();

Elements viewState = doc.select("input[name=__VIEWSTATE");
Elements eventValidation = doc.select("input[name=__EVENTVALIDATION]");

Map<String,String> allFields = new HashMap<String,String>();
allFields.put("__VIEWSTATE", viewState.val());
allFields.put("__EVENTVALIDATION", eventValidation.val());
allFields.put("txtLogin", "<USERNAME>");
allFields.put("txtPassword",   "<PASSWORD>");
allFields.put("butSubmit",   "Sign In");

System.out.println(allFields);

Connection.Response res = Jsoup.connect("<URL2>")
.userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21")
.data(allFields)
.method(Method.POST).
execute();

String sessionId = res.cookie("<COOKIENAME>");

System.out.println(sessionId);

Document doc2  = Jsoup.connect("URL3")
.cookie("ASP.NET_SessionId", sessionId)
.timeout(0)
.get();

System.out.println(doc2.html());

}
}

[/sourcecode]

No comments:

Post a Comment