If you want the contents of the entire page, you should be able to use
// textContent concatenates the text of every node under <body>, including the contents of <script> and <style> elements.
var allText = document.body.textContent;
Internet Explorer before IE9 did not support textContent; it offered the property innerText instead,
which is similar but not identical. The MDN page about textContent
has more detail.
Now one problem here is that textContent
will get you the content of any <style>
or <script>
elements, which may or may not be what you want. If you don't want that, you could use something like this:
/**
 * Collects the text of a DOM subtree as one string, visiting nodes
 * depth-first in document order and skipping the contents of <script>
 * and <style> elements.
 *
 * Only text nodes (nodeType 3) contribute text and only element nodes
 * (nodeType 1) are descended into; every other node type is ignored.
 *
 * @param {Node} startingPoint - Root of the subtree to read, e.g. document.body.
 * @returns {string} The concatenated text node values, or "" when
 *     startingPoint is null or undefined.
 */
function getText(startingPoint) {
  var TEXT_NODE = 3;
  var ELEMENT_NODE = 1;
  var text = "";

  function gt(start) {
    if (start.nodeType === TEXT_NODE) {
      text += start.nodeValue;
      return;
    }
    if (start.nodeType !== ELEMENT_NODE) {
      return;
    }
    // tagName is upper-case only for HTML elements in HTML documents;
    // inline SVG and XHTML report the name as authored, so normalise
    // before comparing or their <style>/<script> text would leak through.
    var tag = start.tagName.toUpperCase();
    if (tag === "SCRIPT" || tag === "STYLE") {
      return;
    }
    for (var i = 0; i < start.childNodes.length; ++i) {
      gt(start.childNodes[i]);
    }
  }

  // document.body is null until <body> has been parsed; treat a missing
  // root as "no text" instead of throwing a TypeError.
  if (startingPoint == null) {
    return text;
  }
  gt(startingPoint);
  return text;
}
Then:
// Gather the text under <body>, leaving out the contents of <script> and <style> elements.
var allText = getText(document.body);
Note: this (or document.body.innerText
) will get you all the text, but in a depth-first order. Getting all the text from a page in the order that a human actually sees it once the page is rendered is a much more difficult problem, because it'd require the code to understand the visual effects (and visual semantics!) of the layout as dictated by CSS (etc).
edit — if you want the text "stored into an array", I suppose on a node-by-node basis (?), you'd simply substitute array appends for the string concatenation in the above:
/**
 * Collects the text of a DOM subtree as an array with one entry per text
 * node, visiting nodes depth-first in document order and skipping the
 * contents of <script> and <style> elements.
 *
 * Only text nodes (nodeType 3) contribute entries and only element nodes
 * (nodeType 1) are descended into; every other node type is ignored.
 *
 * @param {Node} startingPoint - Root of the subtree to read, e.g. document.body.
 * @returns {string[]} The nodeValue of each text node encountered, or an
 *     empty array when startingPoint is null or undefined.
 */
function getTextArray(startingPoint) {
  var TEXT_NODE = 3;
  var ELEMENT_NODE = 1;
  var text = [];

  function gt(start) {
    if (start.nodeType === TEXT_NODE) {
      text.push(start.nodeValue);
      return;
    }
    if (start.nodeType !== ELEMENT_NODE) {
      return;
    }
    // tagName is upper-case only for HTML elements in HTML documents;
    // inline SVG and XHTML report the name as authored, so normalise
    // before comparing or their <style>/<script> text would leak through.
    var tag = start.tagName.toUpperCase();
    if (tag === "SCRIPT" || tag === "STYLE") {
      return;
    }
    for (var i = 0; i < start.childNodes.length; ++i) {
      gt(start.childNodes[i]);
    }
  }

  // document.body is null until <body> has been parsed; treat a missing
  // root as "no text" instead of throwing a TypeError.
  if (startingPoint == null) {
    return text;
  }
  gt(startingPoint);
  return text;
}