Extract Text And Links From Html Using Regular Expressions
I would like to extract the text from an HTML document while keeping the links inside it. For example, from this HTML code:
Solution 2:
// Example in JavaScript:
/**
 * Strip every HTML tag from a string except anchor tags (<a ...> and </a>).
 *
 * Regular expressions cannot parse HTML in general; this only handles simple,
 * well-formed markup (for instance, a '>' inside an attribute value will
 * confuse it). The control characters U+0000 and U+0001 are used as temporary
 * placeholders, so the input must not already contain them.
 *
 * @param {string} string - HTML source text.
 * @returns {string} The text with only the anchor tags left in place.
 */
function stripTagsKeepLinks(string) {
  // Park the anchor tags behind placeholders so the generic tag stripper
  // below leaves them alone. Requiring whitespace (or nothing) after "<a"
  // keeps tags such as <abbr> or <article> from being mistaken for links.
  // '\x00' / '\x01' are used instead of '\0' / '\1' because octal escapes
  // are a syntax error in strict mode and in ES modules.
  var result = string.replace(/<a(\s[^>]*)?>/gi, '\x00$1\x00');
  result = result.replace(/<\/a\s*>/gi, '\x01');

  // Remove every remaining tag. The "g" flag is essential: without it only
  // the first match would be replaced.
  result = result.replace(/<[^>]*>/g, '');

  // Turn the placeholders back into real anchor tags.
  result = result.replace(/\x00([^\x00]*)\x00/g, '<a$1>');
  return result.replace(/\x01/g, '</a>');
}
From additional comments it appears you're operating in a browser, in which case the browser has already parsed the HTML for you into a nice DOM tree. Use DOM methods to walk the tree and process it the way you want:
/**
 * Escape the characters that are significant in HTML text and in
 * double-quoted attribute values.
 *
 * @param {string} text - Raw (already decoded) text.
 * @returns {string} Text that is safe to embed in generated HTML.
 */
function escapeSimpleHTML(text) {
  var entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;' };
  return String(text).replace(/[&<>"]/g, function (ch) {
    return entities[ch];
  });
}

/**
 * Serialize a DOM subtree to "simple" HTML: the text content of the subtree
 * with every element removed except links, which are re-emitted as
 * <a href="...">...</a>.
 *
 * Children named 'SCRIPT' are skipped together with everything inside them.
 * Nodes that are neither elements nor text nodes contribute nothing.
 *
 * @param {Node} domNode - Root of the subtree to serialize.
 * @returns {string} The simplified HTML string.
 */
function simpleHTML(domNode) {
  var ret = "";
  if (domNode.nodeType === Node.ELEMENT_NODE) {
    var children = domNode.childNodes;
    for (var i = 0; i < children.length; i++) {
      var child = children[i];
      // Filter out unwanted nodes to speed up processing.
      // For example, you can ignore 'SCRIPT' nodes etc.
      if (child.nodeName !== 'SCRIPT') {
        if (child.nodeName === 'A') {
          // The href is escaped so that a quote in the URL cannot break out
          // of the attribute value.
          ret += '<a href="' + escapeSimpleHTML(child.href) + '">' +
            simpleHTML(child) +
            '</a>';
        }
        else {
          ret += simpleHTML(child);
        }
      }
    }
  }
  else if (domNode.nodeType === Node.TEXT_NODE) {
    // nodeValue holds decoded text; re-escape it so that something written
    // as "&lt;b&gt;" in the source does not come back out as a live tag.
    ret += escapeSimpleHTML(domNode.nodeValue);
  }
  return ret;
}
// Usage examples for simpleHTML().

// Serialize the whole document:
var simpleDocument = simpleHTML(document.body);

// Serialize a div:
var simpleDiv = simpleHTML(document.getElementById('some_div'));

// Filter an HTML-formatted string: let the browser parse it inside a detached
// element, then serialize that element. original_string is the input HTML
// and must be supplied by the caller.
// NOTE(review): assigning untrusted markup to innerHTML can still trigger
// inline event handlers (e.g. <img onerror=...>) even on a detached element;
// consider parsing untrusted input with DOMParser instead.
var temp = document.createElement('DIV');
temp.innerHTML = original_string;
var simple_string = simpleHTML(temp);
Post a Comment for "Extract Text And Links From Html Using Regular Expressions"