I am trying to find a way, using JavaScript or jQuery, to write a function that removes all the HTML tags from a page and gives me just the plain text of that page.
How can this be done? Any ideas?
IE & WebKit
document.body.innerText
Others:
document.body.textContent
(as suggested by Amr ElGarhy)
Most JS frameworks implement a cross-browser way to do this. It is usually implemented something like this:
// Use textContent when it yields a non-empty value; otherwise fall back to innerText.
text = document.body.textContent || document.body.innerText;
It seems that WebKit keeps some formatting with textContent, whereas it strips everything with innerText.
The only trouble with textContent or innerText is that they can jam the text from adjacent nodes together, without any white space between them.
If that matters, you can recurse through the body or another container, collect the text of each node in an array, and join the pieces with spaces or newlines.
/**
 * Collects the text of every non-blank text node beneath a DOM node.
 *
 * The subtree is walked depth-first in document order. Each qualifying node
 * contributes one array entry, so the caller decides how to separate the
 * pieces (for example join(' ') or join('\n')).
 *
 * @param {Node} hoo Root node whose descendants are scanned. A falsy value
 *     produces an empty array.
 * @returns {string[]} The data of each text node (nodeType 3) or CDATA
 *     section (nodeType 4) that contains at least one non-whitespace
 *     character. Text inside descendant SCRIPT and STYLE elements is skipped.
 */
document.deepText= function(hoo){
    var found= [];
    var hasVisibleChar= /\S/;
    // SCRIPT/STYLE bodies hold code and CSS rather than page text.
    var nonContentTag= /^(?:script|style)$/i;

    // Appends the qualifying text below `parent` to `found`. Sharing a single
    // array avoids re-copying the results at every level of recursion.
    function walk(parent){
        for(var node= parent.firstChild; node!= null; node= node.nextSibling){
            if(node.nodeType=== 3 || node.nodeType=== 4){
                var tx= node.data || '';
                if(hasVisibleChar.test(tx)) found.push(tx);
            }
            else if(!(node.nodeType=== 1 && nonContentTag.test(node.nodeName))){
                walk(node);
            }
        }
    }

    if(hoo) walk(hoo);
    return found;
};
// Demo: show the non-blank text fragments found under <body>, separated by single spaces.
alert(document.deepText(document.body).join(' '))
// Alternative for use inside a function: return the fragments one per line instead.
// return document.deepText(document.body).join('\n')
Consider handling a nodeType of 4 as well (CDATA), just in case someone wraps their text in it. (This is how jQuery does it, at least.)
I had to convert rich text in an HTML email to plain text. The following worked for me in IE (obj is a jQuery object):
/**
 * Returns the plain text found beneath the first element of a jQuery object.
 *
 * Every text node under that element is visited through a NodeIterator, and
 * each node's value is emitted followed by a newline.
 *
 * @param {Object} obj jQuery object (or any array-like) whose element at
 *     index 0 is used as the root of the traversal.
 * @returns {string} The nodeValue of each text node, each terminated by "\n".
 *     An empty string is returned when obj is missing or holds no element.
 */
function getTextFromHTML(obj) {
    var root = obj && obj[0];
    // An empty selection has no root to iterate; return no text instead of
    // letting createNodeIterator throw on an undefined root.
    if (!root) {
        return "";
    }

    // null = no custom filter. The trailing false is the legacy
    // entityReferenceExpansion argument, kept unchanged from the original call.
    var ni = document.createNodeIterator(root, NodeFilter.SHOW_TEXT, null, false);
    var lines = [];
    for (var nodeLine = ni.nextNode(); nodeLine; nodeLine = ni.nextNode()) {
        lines.push(nodeLine.nodeValue + "\n");
    }
    return lines.join("");
}
I would use:
<script language="javascript" type="text/javascript" src="http://code.jquery.com/jquery-1.4.2.js"></script>
<script type="text/javascript">
// Plugin: replaces the matched set with its html() output after removing
// everything that looks like an opening or closing tag.
jQuery.fn.stripTags = function () {
    var markup = this.html();
    var textOnly = markup.replace(/<\/?[^>]+>/gi, '');
    return this.replaceWith(textOnly);
};

// Applied immediately to <head>, before the DOM-ready handler below runs.
jQuery('head').stripTags();

// Once the DOM is ready, remove every <img> element from the page.
$(document).ready(function () {
    $("img").each(function (index, image) {
        jQuery(image).remove();
    });
});
</script>
This will not release any styles, but will strip all tags out.
Is that what you wanted?
[EDIT] now edited to include removal of image tags[/EDIT]
<body>, then?