Converting html to textual representation with preserved whitespace meaning of tags — how?
-
21-04-2021 - |
質問
Consider such html piece:
<p>foo</p><p>bar</p>
If you run (for example) jQuery's text() on it, you will get "foobar" -- so it is actually raw text, not a textual representation.
I am looking for some ready to use library to get textual representation, in this case it should be -- "foo\nbar". Or clever hints how to make this as easy as possible ;-).
NOTE: I am not looking for beautiful output text, but just preserved meaning of whitespaces, so for:
<tr><td>foo</td><td>bar</td></tr>
<tr><td>1</td><td>2</td></tr>
I will be happy with
foo bar
1 2
it does NOT have to be:
foo bar
1   2
(but of course no harm done).
解決
Have you looked at the innerText or textContent properties?
/**
 * Returns the text of an element, preferring `innerText` and falling back
 * to `textContent`.
 *
 * @param {Object} element - Object exposing `innerText` and/or `textContent`.
 * @returns {string} The first truthy value of the two properties, or ""
 *     when both are empty or missing.
 */
function getText(element){
  // `||` skips empty/missing values, so an empty innerText still falls
  // through to textContent.
  return element.innerText || element.textContent || "";
}
Example
Adds a PRE tag to the body and appends the body text.
// Append an empty <pre> to the body, then put the body's text inside it.
// appendChild returns the node it appended, so the second call targets the <pre>.
document.body
  .appendChild(document.createElement('pre'))
  .appendChild(document.createTextNode(getText(document.body)));
Edit
Does using a range work with Firefox?
// Create a DOM Range covering the <body> node and log its string form.
var r = document.createRange();
r.selectNode(document.body);
console.log(r.toString());
Edit
It looks like you're stuck with a parsing function like this then.
/**
 * Recursively builds a plain-text representation of a DOM subtree in which
 * the whitespace implied by certain tags is preserved.
 *
 * - iframe / noscript / script / style children are skipped entirely.
 * - tr / br / p / hr children are preceded by a newline.
 * - td / th children are preceded by a tab.
 * - Text nodes contribute their value with all CR/LF characters removed.
 *
 * @param {Object} element - Node whose `childNodes` are walked; each child
 *     must expose `nodeName`, `nodeType`, and (for text nodes) `nodeValue`.
 * @returns {string} The concatenated text of the subtree.
 */
function parse(element) {
  // All tag tests are case-insensitive: nodeName is upper-case in HTML
  // documents but keeps its source case in XML/XHTML documents.
  var skipped = /^(iframe|noscript|script|style)$/i;
  var lineBreaking = /^(tr|br|p|hr)$/i;
  var cell = /^(td|th)$/i;
  var TEXT_NODE = 3;

  var s = "";
  var children = element.childNodes;
  for (var i = 0; i < children.length; i++) {
    var child = children[i];
    var name = child.nodeName;

    if (skipped.test(name)) {
      continue;
    }
    if (lineBreaking.test(name)) {
      s += "\n";
    } else if (cell.test(name)) {
      s += "\t";
    }

    if (child.nodeType === TEXT_NODE) {
      // Global flag: strip every run of line breaks, not only the first one.
      s += child.nodeValue.replace(/[\r\n]+/g, "");
    } else {
      s += parse(child);
    }
  }
  return s;
}
// Log the text produced by parse() for the whole document body.
console.log(parse(document.body));
他のヒント
I started writing my own function probably at the same time as Zapthedingbat, so just for the record:
// Numeric DOM nodeType values used by doTextualRepresentation.
var NodeTypeEnum = { Element : 1,Attribute : 2, Text: 3, Comment :8,Document :9};

/**
 * Recursively converts a DOM node into raw text, wrapping the text of
 * block-like elements in newlines and the text of table cells in tabs.
 *
 * @param {Object} elem - Node exposing `nodeType` and, depending on type,
 *     `nodeValue`, `tagName`, `firstChild`; children expose `nextSibling`.
 * @returns {string} Text nodes yield their value; element and document
 *     nodes yield their children's text (wrapped as described above);
 *     every other node type yields "".
 */
function doTextualRepresentation(elem)
{
    if (elem.nodeType === NodeTypeEnum.Text)
        return elem.nodeValue;

    if (elem.nodeType !== NodeTypeEnum.Element && elem.nodeType !== NodeTypeEnum.Document)
        return "";

    var s = "";
    // `!= null` deliberately matches both null and undefined.
    for (var child = elem.firstChild; child != null; child = child.nextSibling)
        s += doTextualRepresentation(child);

    if (['P','DIV','TABLE','TR','BR','HR'].indexOf(elem.tagName) > -1)
        s = "\n" + s + "\n";
    // 'TH' here, not 'TR': rows are already handled by the branch above,
    // and header cells need the same tab separation as data cells.
    else if (['TD','TH'].indexOf(elem.tagName) > -1)
        s = "\t" + s + "\t";

    return s;
}
/**
 * Produces the cleaned-up textual representation of a node: runs
 * doTextualRepresentation on it and then normalises the whitespace.
 *
 * @param {Object} elem - Node passed through to doTextualRepresentation.
 * @returns {string} The text with newline+whitespace runs and repeated
 *     tabs collapsed.
 */
function TextualRepresentation(elem)
{
    var raw = doTextualRepresentation(elem);
    // A newline followed by any run of whitespace becomes a single newline.
    var singleNewlines = raw.replace(/\n[\s]+/g,"\n");
    // Two or more consecutive tabs become one tab.
    return singleNewlines.replace(/\t{2,}/g,"\t");
}
One thing I am surprised with -- I couldn't get
for (var child in elem.childNodes)
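working, which is a pity, because I spend most of my time in C# and I like this syntax. In theory it looks like it should work in JS, but it doesn't: for...in iterates over the property names of childNodes (the indices, plus members such as length and item), not over the child nodes themselves.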