Question

My ServletContextListener:

@WebListener
public class RunServlet implements ServletContextListener {

    private ScheduledExecutorService scheduler;

    @Override
    public void contextInitialized(ServletContextEvent event) {
        System.out.println("ready");
        scheduler = Executors.newScheduledThreadPool(10);
        scheduler.execute(new RunThread("http://stackoverflow.com"));
    }

    @Override
    public void contextDestroyed(ServletContextEvent event) {
        scheduler.shutdownNow();
        System.out.println("removed");
    }
}

Class that implements Runnable is RunThread (get all links from webpage, click on links, parse webpages and save words into database using jsoup and hibernate):

public class RunThread implements Runnable{
    private Document html;
    private String url;

    private static final int threads_num = Runtime.getRuntime().availableProcessors()*4;
    private int links = 0;
    private int alinks = 0;

    public RunThread(String url){
        this.url = url;
        try {
            this.html = Jsoup.connect(url).get();
            this.links = html.select("a[href]").size();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    @Override
    public void run() {
        if(alinks != links){
            Elements collectedLinks = html.select("a[href]");
            ExecutorService executor = Executors.newFixedThreadPool(threads_num);
            for(Element link:collectedLinks){
                if(alinks == links) break;
                else{
                    String current = link.attr("abs:href");
                    if(!current.equals(url) && current.startsWith(url)&& !current.contains("#")){
                        executor.execute(new RunThread(current));
                        alinks++;
                    }
                }
            }
        }
        AnalyzePage(html, url);
    }

    private void AnalyzePage(Document doc,String url){
        String text = doc.body().text();
        SaveTextToDB(text,url);
    }

    public void SaveTextToDB(String text, String link){
        TreeMap<String, Integer> frequencyMap = new TreeMap<String, Integer>();
        StringTokenizer parser = 
            new StringTokenizer(text.replaceAll("[0-9]+","").replaceAll("[^a-zA-Zа-яА-Я]-[^a-zA-Zа-яА-Я]", " "), " \t\n\r\f.,;:!?%#+№/<←→↓@'\"—«»©“\\(\\)");
        while (parser.hasMoreTokens()) {
            String currentWord = parser.nextToken();
            Integer frequency = frequencyMap.get(currentWord);
        if (frequency == null) {
            frequency = 0;
        }
        frequencyMap.put(currentWord, frequency + 1);
    }

    for (Map.Entry<String,Integer> entry : frequencyMap.entrySet()){
        Indexation word = new Indexation();
        IndexationPK pk = new IndexationPK();
        pk.setLink(link);
        pk.setWord(entry.getKey());
        word.setFrequency(entry.getValue());
        word.setIndexationPK(pk);
        IndexationDAO indDAO = new IndexationDAOImpl();
        indDAO.AddRecord(word);
    }}
}

I receive next errors:

java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:129)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
    at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:695)
    at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:640)
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1195)
    at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:379)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:381)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:364)
    at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:143)
    at org.jsoup.helper.HttpConnection.get(HttpConnection.java:132)
    at com.mstu.service.RunThread.<init>(RunThread.java:35)
    at com.mstu.service.RunThread.run(RunThread.java:53)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
Exception in thread "pool-9-thread-1" java.lang.NullPointerException
    at com.mstu.service.RunThread.AnalyzePage(RunThread.java:63)
    at com.mstu.service.RunThread.run(RunThread.java:59)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:129)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
    at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:695)
    at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:640)
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1195)
    at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:379)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:381)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:364)
    at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:143)
    at org.jsoup.helper.HttpConnection.get(HttpConnection.java:132)
    at com.mstu.service.RunThread.<init>(RunThread.java:35)
    at com.mstu.service.RunThread.run(RunThread.java:53)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
java.net.SocketTimeoutException: Read timed out

What's wrong? Please, help me.

Was it helpful?

Solution

I suspect it's not to do with your threading, but that

 this.html = Jsoup.connect(url).get();

isn't able to connect to your given URL. Does this run as a standalone component ? It would be worth testing this standalone (pull it out of the Runnable and make it a standalone class that you can easily test), and checking whether you need to set HTTP proxies etc.

(I'd probably run the above within the run() method, rather than getting the connection in the object creation and hanging onto it until the executor is ready to run it)

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top