Tuesday, 26 November 2013

Working example with jsoup - parsing actors and the ranking of best series from a site

Here is an example of how to use jsoup to parse some text from a website.
This site also has a small "security bridge": to get what we want, we must first obtain a cookie.

So first we will take the cookie from the home page, and then we will parse some text.

In the URL string I used a placeholder like %1$s: the number after the percent sign is the index of the argument, and the dollar sign followed by s means the argument will be formatted as a string (d is for a decimal integer, etc.).
This way I can reuse the string and insert an additional value into it with String.format.

/**
 * Created with IntelliJ IDEA.
 * Date: 10.11.13
 * Time: 21:44
 * To change this template use File | Settings | File Templates.
 */
public class Seriale {
    private static boolean writeOut = true;
    private static String start = "http://www.filmweb.pl/";
    private static String series = "http://www.filmweb.pl/rankings/series/poland";
    private static String actors = "http://www.filmweb.pl/search/person?q=&sex=0&startBirthYear=&endBirthYear=&professionIds=null&startRate=&endRate=&startCount=&endCount=&sort=COUNT&sortAscending=false&c=portal&page=%1$s";
    private static URL obj;
    private static HttpURLConnection con;
    private static int responseCode;

    public static void main(String[] args) throws IOException {

        CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));

        //we need first cookie, if we won't get it we will see annoying advertistment
        initStart();

        getSeries();
        getActors();

    }

    private static void initStart() throws IOException {
        obj = new URL(start);
        con = (HttpURLConnection) obj.openConnection();
        con.setRequestMethod("GET");
        responseCode = con.getResponseCode();
    }

    private static void getSeries() throws IOException {
        obj = new URL(series);
        con = (HttpURLConnection) obj.openConnection();
        con.setRequestMethod("GET");
        responseCode = con.getResponseCode();

        String str = convertStreamToString(con.getInputStream());
        Document doc = Jsoup.parse(str);
        Element awt = doc.select("table.awT.rankingTable").get(0);

        Elements filmPoster = awt.select("div.ohidden");
        for (Element el : filmPoster) {
            System.out.println(el.select("a[href]").get(0).text());
        }
    }

    private static void getActors() throws IOException {
        for (int i = 0; i < 5; i++) {
            obj = new URL(String.format(actors, i));
            con = (HttpURLConnection) obj.openConnection();
            con.setRequestMethod("GET");
            responseCode = con.getResponseCode();

            String str = convertStreamToString(con.getInputStream());
            Document doc = Jsoup.parse(str);
            Elements awt = doc.select("div.hitDescWrapper>h3");

            for (Element el : awt) {
                System.out.println(el.text());
            }
        }
    }

    static String convertStreamToString(java.io.InputStream is) {
        java.util.Scanner s = new java.util.Scanner(is).useDelimiter("\\A");
        return s.hasNext() ? s.next() : "";
    }
}