代码之家  ›  专栏  ›  技术社区  ›  Alen.Toma

Regex清理一些html字符串

  •  0
  • Alen.Toma  · 技术社区  · 2 年前

    我正在尝试从一个网站解析一些html。

    html可能包含一些无效的html,导致解析器无法解析html。

    这是我写的正则表达式

    /(\[class\]((=)("|')?.*("|')))|(\[class\])|((\[id\]((=)("|')?.*("|')))|(\[id\]))/
    

    这将删除所有[class]和[id]属性

    我上面的正则表达式可以很好地处理部分html,但不是全部。下面的示例1可以正常工作:

    <div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>the two of them left that everyone came back to their senses.</p>
    

    但下面这个示例不起作用:

          </div><span id="saved" hidden>Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p>
    

    这是由文本 It wasn't " 中的引号引起的:贪婪的 .* 会一直匹配到这一行最后一个引号,导致这段标签内容也被一并移除。

    我只想删除属性及其内容,而不是标签内容

    这有可能做到吗?

    最终解决方案

    感谢@IT goldman,我最终找到了一个解决方案。 我会把它贴出来,以备有人需要。

    /**
     * Strips literal attribute tokens (e.g. "[class]", "[id]") and their quoted
     * values from an HTML string using plain string scanning, so malformed markup
     * does not have to be parsed first.
     *
     * For every occurrence of a token, the text that follows it is inspected:
     *   - token = "value" / token='value': everything through the closing quote is
     *     removed. If the closing quote is missing, or only appears after the next
     *     `>`, removal stops right before that `>` so the tag stays closed. If
     *     there is neither a closing quote nor a `>`, the rest of the input is
     *     treated as the value and removed.
     *   - token followed by `=` but no quoted value: the token through the `=` is
     *     removed (an unquoted value, if any, is left in place).
     *   - bare token: only the token itself is removed.
     *
     * Known limitation (kept from the original heuristic): a `>` inside a quoted
     * value is taken for the end of the tag.
     *
     * Errors are logged and whatever has been cleaned so far is returned.
     *
     * @param {string} html - Markup to clean.
     * @param {string[]} attrs - Literal attribute tokens to remove.
     * @returns {string} The markup with the tokens (and quoted values) removed.
     */
    function cleanHTML(html, attrs) {
      let cleaned = html;
      try {
        attrs.forEach((attr) => {
          // An empty token matches at index 0 forever, so the loop would never end.
          if (attr === '') {
            return;
          }

          let pos;
          while ((pos = cleaned.indexOf(attr)) > -1) {
            let eqIndex = -1; // index of the last '=' seen after the token
            let quoteIndex = -1; // index of the quote that opens the value

            // Walk over '=', whitespace and stray quotes that follow the token.
            for (let i = pos + attr.length; i < cleaned.length; i++) {
              const c = cleaned.charAt(i);
              if (c === '=') {
                eqIndex = i;
                continue;
              }
              const isQuote = c === '"' || c === "'";
              if (isQuote && eqIndex !== -1) {
                quoteIndex = i;
                break;
              }
              if (!isQuote && c.trim() !== '') {
                break;
              }
            }

            // `end` is always > pos, so every pass shrinks the string and the
            // loop is guaranteed to terminate.
            let end;
            if (quoteIndex !== -1) {
              const quote = cleaned.charAt(quoteIndex);
              const tagEnd = cleaned.indexOf('>', quoteIndex + 1);
              const closingQuote = cleaned.indexOf(quote, quoteIndex + 1);
              if (closingQuote !== -1 && (tagEnd === -1 || closingQuote < tagEnd)) {
                end = closingQuote + 1;
              } else if (tagEnd !== -1) {
                // Unterminated value: drop up to, but not including, the '>'.
                end = tagEnd;
              } else {
                end = cleaned.length;
              }
            } else if (eqIndex !== -1) {
              end = eqIndex + 1;
            } else {
              end = pos + attr.length;
            }

            cleaned = cleaned.substring(0, pos) + cleaned.substring(end);
          }
        });
      } catch (e) {
        console.log(e);
      }
      return cleaned;
    }
    
    
    // Demo input: bare, repeated and quoted [class]/[id] tokens, including one
    // whose quoted value ('"kjhsdf-) is never closed before the tag's '>'.
    var src = `<span [class]= [class][class] id="saved" [id]hidden [class] =  '"kjhsdf->Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p><a [class]='another'>sasportas</a>`
    // Print the markup with every [class] and [id] token stripped.
    console.log(cleanHTML(src, ["[class]", "[id]"]));
    1 回复  |  直到 2 年前
        1
  •  1
  •   IT goldman    2 年前

    这里有一个从HTML字符串中删除特定属性(及其值)的小函数。

    // Demo input: bare, repeated and quoted [class]/[id] tokens, plus tag text
    // that itself contains a stray double quote (It wasn't ").
    var src = `</div><span [class] [class][class] id="saved" [id]hidden>Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p><a [class]='another'>sasportas</a>`
    
    /**
     * Removes literal attribute tokens (e.g. "[class]", "[id]") together with
     * their values from an HTML string, using plain string scanning so that
     * malformed markup does not have to be parsed first.
     *
     * For every occurrence of a token, the text that follows it decides how much
     * is removed:
     *   - no `=` follows (ignoring whitespace): only the token is removed;
     *   - `=` followed by a quoted value: everything through the closing quote is
     *     removed; if the closing quote is missing, removal stops right before
     *     the next `>` (or at the end of the input when there is none);
     *   - `=` followed by an unquoted value: the value is removed up to the next
     *     whitespace character or `>`.
     *
     * @param {string} src - Markup to clean.
     * @param {string[]} attributes_to_remove - Literal attribute tokens to remove.
     * @returns {string} The markup without the tokens and their values.
     */
    function clean_str(src, attributes_to_remove) {
      // Returns the index just past the attribute whose name ends at `nameEnd`.
      function findRemovalEnd(text, nameEnd) {
        let sawEquals = false;
        let i = nameEnd;
        // Skip whitespace and '=' between the name and a possible value.
        for (; i < text.length; i++) {
          const c = text.charAt(i);
          if (c === '=') {
            sawEquals = true;
          } else if (c.trim() !== '') {
            break;
          }
        }

        if (!sawEquals) {
          return nameEnd; // bare attribute: drop the name only
        }
        if (i >= text.length) {
          return text.length; // "name =" at the very end of the input
        }

        const first = text.charAt(i);
        if (first === '"' || first === "'") {
          const closingQuote = text.indexOf(first, i + 1);
          if (closingQuote !== -1) {
            return closingQuote + 1;
          }
          // Unterminated quote: never cut past the end of the tag.
          const tagEnd = text.indexOf('>', i + 1);
          return tagEnd !== -1 ? tagEnd : text.length;
        }

        // Unquoted value: it runs until whitespace or the end of the tag.
        let j = i;
        while (j < text.length && text.charAt(j) !== '>' && text.charAt(j).trim() !== '') {
          j++;
        }
        return j;
      }

      let result = src;
      attributes_to_remove.forEach(function (attr) {
        // An empty token matches at index 0 forever, so the loop would never end.
        if (attr === '') {
          return;
        }
        let pos;
        // Each pass removes at least the token itself, so the string always
        // shrinks and the loop terminates.
        while ((pos = result.indexOf(attr)) > -1) {
          const end = findRemovalEnd(result, pos + attr.length);
          result = result.substring(0, pos) + result.substring(end);
        }
      });
      return result;
    }
    
    // Print the sample markup with every [class] and [id] token stripped.
    console.log(clean_str(src, ["[class]", "[id]"]))
        2
  •  1
  •   xehpuk    2 年前

    这个正则表达式应该可以做到: /\[(?:class|id)](?:=(["']).*?\1)?/

    // Matches a literal "[class]" or "[id]" token, optionally followed by a value
    // wrapped in matching single or double quotes (lazy, so it stops at the first
    // closing quote of the same kind).
    const regex = /\[(?:class|id)](?:=(["']).*?\1)?/g;
    const badHtml = `</div><span id="saved" hidden>Settings saved..</span><div class="clear"></div><div class="par fontsize-16" [class]="'par fontsize-' + fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p>`;

    // Show the raw markup and the cleaned markup side by side in the two <pre> panes.
    const inputPane = document.getElementById('input');
    const outputPane = document.getElementById('output');
    inputPane.innerText = badHtml;
    outputPane.innerText = badHtml.replace(regex, '');
    Input
    <pre id="input"></pre>
    Output
    <pre id="output"></pre>