Question

I tried to scrape some content from a website and ran into a problem which may be trivial, but I can't find a solution. It works for the first page, but when I browse the following pages (with curl) I still get the content of page 1, which is strange. I guess the website has some scraping protections, but I can't find a way to identify them...

<?php 
$i = 1;
$links = array();

while($i < 3) 
{ 
    $ch = curl_init();
    $url = 'http://www.gites-de-france.com/location-vacances-chambre-hotes.html?page=$i&chambre=o&xhtml=O&acc=CHAMBRE,CHAMBRE&order_by=prix&order_by_tri=asc&';
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_REFERER, "http://www.google.fr/");
    curl_setopt($ch, CURLOPT_USERAGENT, "MozillaXYZ/1.0");
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 100);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 

    $html = curl_exec($ch);
    curl_close($ch);

    $doc = phpQuery::newDocument($html);

    foreach($doc['.vignette a'] as $a){
        $url = '';
        $links[] .= pq($a)->attr('href');
    }
    $i++;
}

print_r($links);
?>

Solution

This worked for me. Note that in the original code the URL is in single quotes, so $i is never interpolated and the literal text page=$i is sent on every request; building the query string with http_build_query() fixes that, and the cookie jar keeps the session alive across requests.

// Assumes the phpQuery library has been loaded, e.g. require_once 'phpQuery/phpQuery.php';
$i = 1;
$links = array();
$baseUrl = "http://www.gites-de-france.com/location-vacances-chambre-hotes.html";
$param = array(
    'chambre' => 'o',
    'xhtml' => 'O',
    'acc' => 'CHAMBRE,CHAMBRE',
    'order_by' => 'prix',
    'order_by_tri' => 'asc'
);

while($i < 3) { 
    $ch = curl_init();

    $param['page'] = $i;
    $url = "{$baseUrl}?" . http_build_query($param);

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_REFERER, "http://www.google.fr/");
    curl_setopt($ch, CURLOPT_USERAGENT, "MozillaXYZ/1.0");
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 100);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 
    // Store and resend the session cookie so all page requests share one session
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
    curl_setopt($ch, CURLOPT_COOKIEFILE, 'cookie.txt');

    $html = curl_exec($ch);
    curl_close($ch);

    $doc = phpQuery::newDocument($html);

    foreach($doc['.vignette a'] as $a){
        $links[] = pq($a)->attr('href');
    }
    $i++;
}
print_r($links);
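
For reference, http_build_query() percent-encodes the values (the comma in acc becomes %2C), so for page 1 the generated query string looks like this:

chambre=o&xhtml=O&acc=CHAMBRE%2CCHAMBRE&order_by=prix&order_by_tri=asc&page=1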

Note: I created the cookie.txt file manually before running the script.
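
If you would rather not create it by hand, here is a minimal sketch that prepares the jar before the loop, assuming the same relative cookie.txt path as above:

// Make sure the cookie jar exists and is writable before curl touches it (assumed path)
if (!file_exists('cookie.txt')) {
    touch('cookie.txt');
}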

Other tips

So here is the solution: this website uses cookies to pass a session number, so you must use the following code

curl_setopt($ch, CURLOPT_COOKIEJAR, '/tmp/cookie.txt');
curl_setopt($ch, CURLOPT_COOKIEFILE, '/tmp/cookie.txt');

And it now works!
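
If you want to verify that the site really sets a session cookie, here is a quick check (my own sketch, not part of the original answer); passing an empty string to CURLOPT_COOKIEFILE enables curl's in-memory cookie engine:

$ch = curl_init('http://www.gites-de-france.com/location-vacances-chambre-hotes.html');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_COOKIEFILE, '');           // enable the cookie engine without a file
curl_exec($ch);
print_r(curl_getinfo($ch, CURLINFO_COOKIELIST));    // one Netscape-format line per cookie set
curl_close($ch);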

You will need to do something like the following:

/**
 * Fetch a paginated resource, forcing a fresh connection for every request.
 *
 * int $start     start page number
 * int $limit     maximum number of results
 * int $pgIncrmnt number of results per page
 */
$buffer = NULL;
$start = 1;          // example value
$limit = 100;
$pgIncrmnt = 10;     // example value

for ($j = $start; $j <= $limit; $j += $pgIncrmnt) {
    $chr = curl_init();
    curl_setopt($chr, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6');
    curl_setopt($chr, CURLOPT_HEADER, FALSE);
    curl_setopt($chr, CURLOPT_URL, 'http://www.windowsphone.com/'); // append $j here to request each page of a real paginated URL
    curl_setopt($chr, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($chr, CURLOPT_FRESH_CONNECT, TRUE);  // do not reuse a cached connection
    curl_setopt($chr, CURLOPT_FORBID_REUSE, TRUE);   // close this connection after the request
    curl_setopt($chr, CURLOPT_FOLLOWLOCATION, TRUE);

    $buffer .= curl_exec($chr);
    curl_close($chr);
}
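
CURLOPT_FRESH_CONNECT and CURLOPT_FORBID_REUSE together force a brand-new connection for every request instead of reusing a kept-alive one, which can help when a server ties state to the connection, at the cost of repeating the TCP handshake each time.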
License: CC-BY-SA with attribution. Not affiliated with Stack Overflow.