HTMLスクレイピングとCSSクエリ

https://stackoverflow.com/questions/3603511

25-09-2019
|

質問

次のライブラリの長所と短所は何ですか?

上記のうち、私は QP を使ったことがありますが、不正な HTML の解析に失敗しました。simpleDomParser はうまく動作しますが、オブジェクトモデルのせいでメモリリークが発生します。ただし、オブジェクトが不要になった時点で $object->clear(); unset($object); を呼び出せば、それを抑えられる可能性があります。

他にもスクレーパーはありますか？それらを使ってみた経験はいかがでしたか？この質問はコミュニティ Wiki にするつもりです。スクレイピングに役立つライブラリの便利な一覧を作れればと思います。

バイロンの答えに基づいていくつかのテストを行いました。

    <?php
    /**
     * Benchmark: extract every <a href>, <img src> and <input name> from one
     * downloaded page with three different parsers and report the elapsed time
     * and the number of collected values for each:
     *
     *   1. PHP's built-in DOMDocument + DOMXPath
     *   2. phpQuery
     *   3. simple_html_dom
     *
     * The page is downloaded once, outside of every timed section, so only
     * parsing and querying are measured.
     */

    // require_once instead of include: a missing library should stop the script
    // here rather than fail later with "class not found".
    require_once("lib/simplehtmldom/simple_html_dom.php");
    require_once("lib/phpQuery/phpQuery/phpQuery.php");


    echo "<pre>";

    $html = file_get_contents("http://stackoverflow.com/search?q=favorite+programmer+cartoon");
    if ($html === false || $html === '') {
        // Without a document there is nothing to benchmark.
        echo "Could not download the test page\n</pre>";
        exit(1);
    }

    // One result list per parser.
    $data = array(
        'pq'         => array(),
        'dom'        => array(),
        'simple_dom' => array(),
    );

    // --- 1. Built-in DOM ---------------------------------------------------

    // Real-world HTML is rarely valid; collect libxml's parse warnings internally
    // instead of silencing the whole call with "@".
    $previous_libxml_setting = libxml_use_internal_errors(true);

    $timer_start = microtime(true);

    $dom = new DOMDocument();
    $dom->loadHTML($html);
    $x = new DOMXPath($dom);

    foreach ($x->query("//a") as $node) {
        $data['dom'][] = $node->getAttribute("href");
    }

    foreach ($x->query("//img") as $node) {
        $data['dom'][] = $node->getAttribute("src");
    }

    foreach ($x->query("//input") as $node) {
        $data['dom'][] = $node->getAttribute("name");
    }

    $dom_time =  microtime(true) - $timer_start;
    echo "dom: \t\t $dom_time . Got ".count($data['dom'])." items \n";

    // Drop the buffered parse warnings and restore the previous libxml mode.
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);

    // --- 2. phpQuery -------------------------------------------------------

    $timer_start = microtime(true);
    $doc = phpQuery::newDocument($html);

    // NOTE(review): per phpQuery's documentation, iterating a result yields plain
    // DOM nodes, not phpQuery objects, so attributes are read with getAttribute().
    // Confirm against the bundled phpQuery version.
    foreach ($doc->find("a") as $node) {
        $data['pq'][] = $node->getAttribute("href");
    }

    foreach ($doc->find("img") as $node) {
        $data['pq'][] = $node->getAttribute("src");
    }

    foreach ($doc->find("input") as $node) {
        $data['pq'][] = $node->getAttribute("name");
    }
    $time =  microtime(true) - $timer_start;
    echo "PQ: \t\t $time . Got ".count($data['pq'])." items \n";

    // --- 3. simple_html_dom ------------------------------------------------

    $timer_start = microtime(true);
    $simple_dom = new simple_html_dom();
    $simple_dom->load($html);

    // Attributes are read as properties on the nodes returned by find().
    foreach ($simple_dom->find("a") as $node) {
        $data['simple_dom'][] = $node->href;
    }

    foreach ($simple_dom->find("img") as $node) {
        $data['simple_dom'][] = $node->src;
    }

    foreach ($simple_dom->find("input") as $node) {
        $data['simple_dom'][] = $node->name;
    }
    $simple_dom_time =  microtime(true) - $timer_start;
    echo "simple_dom: \t $simple_dom_time . Got ".count($data['simple_dom'])." items \n";

    // Release the parsed tree explicitly once it is no longer needed
    // (done outside of the timed section).
    $simple_dom->clear();
    unset($simple_dom);


    echo "</pre>";

結果は次のとおりでした。

dom:         0.00359296798706 . Got 115 items 
PQ:          0.010568857193 . Got 115 items 
simple_dom:  0.0770139694214 . Got 115 items

解決

以前はもっぱら Simple HTML DOM だけを使っていましたが、それも優秀な SO ユーザーたちが正しい道を示してくれるまでのことでした。ハレルヤ。

組み込みの DOM 関数を使うだけです。これらは C で書かれており、PHP コアの一部です。どんなサードパーティ製のソリューションよりも高速で効率的です。Firebug を使えば、XPath クエリを取得するのもとても簡単です。この単純な変更によって、私の PHP ベースのスクレーパーは高速に動作するようになり、貴重な時間も節約できました。

以前の私のスクレーパーは、curl で 10 件のサイトを非同期にスクレイピングするのに約 60 メガバイトを消費していました。あなたが言及した Simple HTML DOM のメモリ対策を施していても、です。

今では、私の PHP プロセスが 8 メガバイトを超えることはありません。

強くお勧めします。

編集

さて、いくつかベンチマークを取ってみました。組み込みの DOM のほうが、少なくとも 1 桁は高速です。

Built in php DOM: 0.007061
Simple html  DOM: 0.117781

<?php
/**
 * Benchmark: PHP's built-in DOM extension versus simple_html_dom.
 *
 * Both parsers extract every <a href>, <img src> and <input name> from the
 * same downloaded page. The download happens once, outside of the timed
 * sections, so only parsing and querying are measured.
 */

// require_once instead of include: a missing library should stop the script
// here rather than fail later with "class not found".
require_once("../lib/simple_html_dom.php");

$html = file_get_contents("http://stackoverflow.com/search?q=favorite+programmer+cartoon");
if ($html === false || $html === '') {
    // Without a document there is nothing to benchmark.
    echo "Could not download the test page\n";
    exit(1);
}

// One result list per parser.
$data = array(
    'dom'        => array(),
    'simple_dom' => array(),
);

// --- Built-in DOM ----------------------------------------------------------

// Real-world HTML is rarely valid; collect libxml's parse warnings internally
// instead of silencing the whole call with "@".
$previous_libxml_setting = libxml_use_internal_errors(true);

$timer_start = microtime(true);

$dom = new DOMDocument();
$dom->loadHTML($html);
$x = new DOMXPath($dom);

foreach ($x->query("//a") as $node) {
    $data['dom'][] = $node->getAttribute("href");
}

foreach ($x->query("//img") as $node) {
    $data['dom'][] = $node->getAttribute("src");
}

foreach ($x->query("//input") as $node) {
    $data['dom'][] = $node->getAttribute("name");
}

$dom_time =  microtime(true) - $timer_start;

// Drop the buffered parse warnings and restore the previous libxml mode.
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);

echo "built in php DOM : $dom_time\n";

// --- simple_html_dom -------------------------------------------------------

$timer_start = microtime(true);
$simple_dom = new simple_html_dom();
$simple_dom->load($html);

// Attributes are read as properties on the nodes returned by find().
foreach ($simple_dom->find("a") as $node) {
    $data['simple_dom'][] = $node->href;
}

foreach ($simple_dom->find("img") as $node) {
    $data['simple_dom'][] = $node->src;
}

foreach ($simple_dom->find("input") as $node) {
    $data['simple_dom'][] = $node->name;
}
$simple_dom_time =  microtime(true) - $timer_start;

// Release the parsed tree explicitly once it is no longer needed
// (done outside of the timed section).
$simple_dom->clear();
unset($simple_dom);

echo "simple html  DOM : $simple_dom_time\n";

ライセンス： CC-BY-SA と帰属

StackOverflow とは提携していません