A quick and easy way of extracting interesting pieces of information from a DOM document is to make use of XPath. Below is a basic example showing how to collect the text content found anywhere inside a div element, together with the attribute values of the elements nested within it.
<?php
// Set-up: a sample HTML fragment mixing wrapped text, bare text and
// elements that only carry attributes. The interesting part is further down.
$html = <<<'HTML'
<div>
<p>para-1</p>
<p>para-2</p>
<p>
<iframe src="p-iframe-url"></iframe>
</p>
<iframe src="iframe-url"></iframe>
<h1>header-1</h1>
<img src="image-url"/>
<p>
<img src="p-image-url"/>
</p>
content not wrapped within any tags
<h2>header-2</h2>
<p>para-3</p>
<ul>
<li>list-item-1</li>
<li>list-item-2</li>
</ul>
<span>span-content</span>
content not wrapped within any tags
</div>
HTML;

// Parse the fragment and grab the first <div>; it is the context node
// that the XPath query below is evaluated against.
$doc = new DOMDocument();
$doc->loadHTML($html);
$divs = $doc->getElementsByTagName('div');
$div = $divs->item(0);

// The interesting part:
// One XPath union selects every text node at or below the context node
// plus every attribute of the elements below it (attributes on the context
// node itself are not matched by the descendant axis).
// $texts ends up as a DOMNodeList holding DOMText and DOMAttr objects.
//
// To restrict which attributes are picked up, filter on their name, e.g.
//   several paths: .//@src|.//@href
//   whitelist:     descendant::*/@*[name()="src" or name()="href"]
//   blacklist:     descendant::*/@*[not(name()="ignore")]
$xpath = new DOMXPath($doc);
$texts = $xpath->query('descendant-or-self::*/text()|descendant::*/@*', $div);

// Gather the string value of each matched node, dropping entries that
// contain nothing but whitespace.
$results = array();
foreach ($texts as $node) {
    $value = trim($node->nodeValue);
    if ($value === '') {
        // Whitespace-only node (e.g. the line breaks between tags).
        continue;
    }
    $results[] = $value;
}

// Show the collected strings.
var_dump($results);