我正在尝试从一个网站解析一些html。
这些HTML中可能包含一些无效的内容,导致解析器无法正常解析。
这是我写的正则表达式
/(\[class\]((=)("|')?.*("|')))|(\[class\])|((\[id\]((=)("|')?.*("|')))|(\[id\]))/
这将删除所有[class]和[id]属性
上面的正则表达式可以很好地处理部分HTML,但并非全部。
示例1有效
<div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>the two of them left that everyone came back to their senses.</p>
但对下面这个示例不起作用:
</div><span id="saved" hidden>Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p>
这是由字符串引起的
It wasn't "
这段文本也被一并移除了。
我只想删除属性及其内容,而不是标签内容
这有可能做到吗?
最终解决方案
感谢@IT goldman,我最终找到了一个解决方案。
我会把它贴出来,以备有人需要。
/**
 * Finds where the attribute occurrence starting at `pos` ends.
 *
 * After the attribute name the scan skips `=`, whitespace and (before any `=`)
 * stray quote characters. A quote seen after an `=` opens the value.
 *
 * @param {string} html - Markup being cleaned.
 * @param {number} pos - Index in `html` where `attr` starts.
 * @param {string} attr - Literal attribute name found at `pos` (non-empty).
 * @returns {number} Exclusive end index of the span to remove; always greater than `pos`.
 */
function findAttributeEnd(html, pos, attr) {
  let eqIndex = -1;
  let quoteIndex = -1;

  for (let i = pos + attr.length; i < html.length; i++) {
    const c = html.charAt(i);
    if (c === '=') {
      if (eqIndex === -1) eqIndex = i;
      continue;
    }
    const isQuote = c === '"' || c === "'";
    if (isQuote && eqIndex !== -1) {
      quoteIndex = i;
      break;
    }
    // Anything other than a quote or whitespace ends the attribute prefix.
    if (!isQuote && c.trim() !== '') break;
  }

  if (quoteIndex === -1) {
    // Bare attribute, or `attr =` without a quoted value: drop the name and,
    // when present, everything up to and including the first `=`.
    return eqIndex === -1 ? pos + attr.length : eqIndex + 1;
  }

  const quote = html.charAt(quoteIndex);
  // Both searches start after the opening quote, so the result is always past `pos`.
  const tagEnd = html.indexOf('>', quoteIndex + 1);
  const closeQuote = html.indexOf(quote, quoteIndex + 1);

  if (closeQuote !== -1 && (tagEnd === -1 || closeQuote < tagEnd)) {
    return closeQuote + 1; // properly closed value: remove through the closing quote
  }
  if (tagEnd !== -1) {
    // Value is not closed before the first `>`: treat it as unterminated and
    // cut right before that `>` so the tag itself stays closed.
    return tagEnd;
  }
  // Neither a closing quote nor a `>` follows: the value runs to the end of input.
  return html.length;
}

/**
 * Removes every occurrence of the given literal attribute names (for example
 * `[class]`, `[id]`) from an HTML string, together with their quoted values.
 *
 * Handled forms: bare `attr`, `attr=` with no quoted value (name and `=` are
 * removed, any unquoted value text is left in place), `attr="..."` / `attr='...'`,
 * and values whose quote is not closed before the first following `>`.
 *
 * Matching is plain substring search, so an attribute name appearing in text
 * content is removed as well.
 *
 * Best-effort: if anything throws (for example `attrs` is not a collection with
 * `forEach`), the error is logged and the markup cleaned so far is returned.
 *
 * @param {string} html - Markup to clean.
 * @param {string[]} attrs - Literal attribute names to strip; empty or non-string entries are ignored.
 * @returns {string} The markup with the attributes removed.
 */
function cleanHTML(html, attrs) {
  let cleaned = html;
  try {
    attrs.forEach((attr) => {
      // An empty name matches at index 0 forever and would never make progress.
      if (typeof attr !== 'string' || attr === '') return;

      let pos;
      // Search from the start each time: a removal can join text into a new match.
      while ((pos = cleaned.indexOf(attr)) > -1) {
        const end = findAttributeEnd(cleaned, pos, attr);
        cleaned = cleaned.substring(0, pos) + cleaned.substring(end);
      }
    });
  } catch (e) {
    console.log(e);
  }
  return cleaned;
}
// Demo input: malformed markup mixing bare, unquoted, unterminated and
// properly quoted bracketed attributes.
var src = `<span [class]= [class][class] id="saved" [id]hidden [class] = '"kjhsdf->Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p><a [class]='another'>sasportas</a>`;

// Strip the bracketed attributes and print the cleaned markup.
{
  const strippedAttrs = ["[class]", "[id]"];
  const cleanedMarkup = cleanHTML(src, strippedAttrs);
  console.log(cleanedMarkup);
}