Question

Source

<div class=filmPoster-1><a class="fImg1 entityPoster" href="/Zielona.Mila" title="Zielona mila (1999)"> bla bla bla bla
<div class=filmPoster-1><a class="fImg1 entityPoster" href="/Batman" title="Batman (1999)">

How can I extract only the link values ("/Zielona.Mila" and "/Batman") from this markup with preg_match?

Was it helpful?

Solution

The DOM way (more appropriate):

// Collect the href of every <a class="fImg1 entityPoster"> that is a direct
// child of a <div class="filmPoster-1"> in $html.
// Start from an empty list so print_r() always receives a defined array,
// even when nothing matches.
$links = [];

// An empty document cannot be parsed (loadHTML() rejects an empty source),
// so only build the DOM when there is something to read.
if ($html !== '') {
    $dom = new DOMDocument();

    // Real-world HTML is rarely valid; buffer libxml's parse complaints
    // instead of hiding every possible error with the @ operator.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new DOMXPath($dom);
    $hrefNodes = $xpath->query('//div[@class="filmPoster-1"]/a[contains(@class, "fImg1") and contains(@class, "entityPoster")]/@href');

    foreach ($hrefNodes as $hrefNode) {
        $links[] = $hrefNode->textContent;
    }
}

print_r($links);

The regex way:

// Matches the href value of the <a> tag that immediately follows a
// <div ... class=filmPoster-1 ...> opening tag. Thanks to \K the overall
// match (index 0) is only the href value itself.
$pattern = <<<'LOD'
~
<div\b
(?>              # possible content before the class attribute
    [^c>]++      # all that is not a "c" or a ">"
  |              # OR
    \Bc          # a "c" that is not at the start of a word
  |              # OR
    c(?!lass\b)  # a "c" that does not start the word "class"
)++
class \s*+ = \s*+ ["']?  # the class attribute
(?-i) filmPoster-1 (?i) (?=["'\s>])  # class name is case-sensitive
[^>]*+ >         # end of the div tag
\s*+
<a\b
(?>              # possible content before the href attribute
    [^>h]++      # all that is not an "h" or a ">"
  |
    \Bh          # an "h" that is not at the start of a word
  |
    h(?!ref\b)   # an "h" that does not start the word "href"
)++              # possessive, so the engine cannot back up into e.g. "xhref"
href \s*+ = \s*+ ["\']?
\K            # reset all that have been matched before from match result
[^\s>"\']++
~xi
LOD;

// preg_match_all() returns false on a PCRE failure; surface it instead of
// silently printing an empty result.
if (preg_match_all($pattern, $html, $matches) === false) {
    throw new RuntimeException('preg_match_all() failed with PCRE error code ' . preg_last_error());
}

// Index 0 holds the full matches, i.e. the flat list of href values.
$links = $matches[0];
print_r($links);
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top