Question

I have a program that checks whether keywords are present on a web page. But after checking 1,000-3,000 URLs, it hangs: there is no output, it does not exit, and the number of TCP connections is zero. I don't understand why no new connections are being opened.

Could you give me some advice on how to debug it?

type requestReturn struct {    
    url    string    
    status bool
}

var timeout = time.Duration(800 * time.Millisecond)    

func checkUrls(urls []string, kws string, threadLimit int) []string {    
    limitChan := make(chan int, threadLimit)    
    ok := make(chan requestReturn, 1)    
    var result []string    
    i := 0    
    for ; i < threadLimit; i++ {    
        go func(u string) {    
            request(u, limitChan, ok, kws)    
        }(urls[i])    
    }    
    for o := range ok {    
        if o.status {    
            result = append(result, o.url)    
            log.Printf("success %s,remain %d", o.url, len(urls)-i)    
        } else {    
            log.Printf("fail %s,remain %d", o.url, len(urls)-i)    
        }    
        if i < len(urls) {    
            go func(u string) {    
                request(u, limitChan, ok, kws)    
            }(urls[i])    
            i++    
        }    
    }    
    close(limitChan)    
    return result    
}    

func dialTimeout(network, addr string) (net.Conn, error) {    
    return net.DialTimeout(network, addr, timeout)    
}    

func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {    
    threadLimit <- 1    
    log.Printf("%s, start...", url)    
    //startTime := time.Now().UnixNano()    
    rr := requestReturn{url: url}    

    transport := http.Transport{    
        Dial:              dialTimeout,    
        DisableKeepAlives: true,    
    }    

    client := http.Client{    
        Transport: &transport,    
        Timeout:   time.Duration(15 * time.Second),    
    }    

    resp, e := client.Get(url)    
    if e != nil {    
        log.Printf("%q", e)    
        rr.status = false    
        return    
    }    

    if resp.StatusCode == 200 {    
        body, err := ioutil.ReadAll(resp.Body)    
        if err != nil {    
            log.Printf("%q", err)    
            rr.status = false    
            return    
        }    

        content := bytes.NewBuffer(body).String()    

        matched, err1 := regexp.MatchString(kws, content)    
        if err1 != nil {    
            log.Printf("%q", err1)    
            rr.status = false    
        } else if matched {    
            rr.status = true    
            log.Println(rr.url)    
        } else {    
            rr.status = false    
        }    
    } else {    
        rr.status = false    
    }    

    defer (func() {    
        resp.Body.Close()    
        ok <- rr    
        //processed := float32(time.Now().UnixNano()-startTime) / 1e9    
        //log.Printf("%s, status:%t,time:%.3fs", rr.url, rr.status, processed)    
        <-threadLimit    
    })()    
}

Solution

You seem to be using two forms of concurrency control in this code, and both have problems.

You've got limitChan, which looks like it is being used as a semaphore (request sends a value into it at its start, and receives a value from it in a defer in that function). But checkUrls is also trying to ensure that only threadLimit goroutines run at once (by spawning that number up front, and only spawning another when one reports its result on the ok channel). Only one of these mechanisms is needed to limit the concurrency; a sketch of the single-mechanism approach follows.
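For illustration, here is a minimal sketch of limiting concurrency with just the channel-as-semaphore pattern plus a sync.WaitGroup. The checkAll and checkOne names are hypothetical stand-ins, not part of your code:

package main

import (
    "fmt"
    "sync"
)

// checkOne is a hypothetical stand-in for the real per-URL request logic.
func checkOne(url string) bool {
    return len(url) > 0
}

// checkAll limits the number of concurrently running workers with a single
// buffered channel used as a semaphore; a WaitGroup replaces the manual
// re-spawning done in the original checkUrls.
func checkAll(urls []string, limit int) []string {
    sem := make(chan struct{}, limit)      // at most `limit` goroutines hold a slot
    results := make(chan string, len(urls)) // buffered so workers never block sending
    var wg sync.WaitGroup

    for _, u := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            sem <- struct{}{}        // acquire a slot
            defer func() { <-sem }() // always release it, even on early return
            if checkOne(u) {
                results <- u
            }
        }(u)
    }

    wg.Wait()
    close(results)

    var matched []string
    for u := range results {
        matched = append(matched, u)
    }
    return matched
}

func main() {
    fmt.Println(checkAll([]string{"http://example.com", ""}, 2))
}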

Both mechanisms fail because of the way the defer is set up in request. Several return statements can be reached before the defer is ever registered, so it is possible for the function to finish without sending a result on the ok channel and without freeing up its slot in limitChan. After enough errors, checkUrls stops spawning new goroutines and you see your hang.
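As for how to debug a hang like this in general: the usual approach is to dump the stacks of all goroutines once the program has stalled, which shows exactly where each one is blocked (for this program you would expect to see goroutines stuck on a channel send or receive). Sending SIGQUIT to the process (Ctrl-\ in a terminal) prints such a dump, or you can expose it over HTTP via net/http/pprof. A minimal sketch of the pprof route, with localhost:6060 chosen arbitrarily:

package main

import (
    "log"
    "net/http"
    _ "net/http/pprof" // registers /debug/pprof/ handlers on the default mux
)

func main() {
    // Serve the pprof endpoints in the background; while the checker is hung,
    // http://localhost:6060/debug/pprof/goroutine?debug=2 shows a full stack
    // trace for every goroutine and where it is blocked.
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()

    // ... the rest of the program (checkUrls etc.) would run here ...
    select {} // placeholder to keep this sketch alive
}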

The fix is to place the defer statement before any of the return statements so you know it will always be run. Something like this:

func request(url string, threadLimit chan int, ok chan requestReturn, kws string) {
    threadLimit <- 1
    rr := requestReturn{url: url}
    var resp *http.Response
    // Registered before any return, so it always runs: the result is always
    // sent on ok and the slot in threadLimit is always released.
    defer func() {
        if resp != nil {
            resp.Body.Close()
        }
        ok <- rr
        <-threadLimit
    }()
    ...
}
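With the defer registered before any of the return statements, every call to request sends exactly one value on ok and releases its slot in limitChan, even when client.Get or the body read fails, so the receive loop in checkUrls keeps making progress.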
Licensed under: CC-BY-SA with attribution