Cleaning up HTML tag attributes

https://stackoverflow.com/questions/4091757

28-09-2019
|

Question

I need to wade through a lot of HTML using JavaScript to adjust the attribute quote syntax to be all double quotes. I don't need to worry about key-only attributes like "disabled" or "selected".

Here is my current test case:

// Sample tag mixing unquoted, single-quoted and key-only attributes.
var text = "<input class=daily_input type='hidden' size='1' value=3 disabled />";
// The pattern spans the whole tag, so it matches once per tag; the greedy
// ([^>]+) in group 2 absorbs every attribute before the last name=value pair.
var regex = /<([\w]+)([^>]+)([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?([^>]+)>/gi;
text = text.replace( regex, "<$1$2$3=\"$4\"$5>" );

console.log(text); // logs <input class=daily_input type='hidden' size='1' value="3" disabled />

Looks like it is still only adjusting the last attribute. I'm able to easily test for matches using the regex find/replace in TextMate, and the following will match each attribute in the text HTML tag:

/([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?/gi

How can I change this to catch and adjust every attribute, not only the last one? Been fiddling with it for quite a while now without results. Any help is appreciated!

Solution

// Pass 1 turns single-quoted values into double-quoted ones; pass 2 wraps values that start without any quote.
text.replace(/='([^']*)'/g, '="$1"').replace(/=([^"'][^ >]*)/g, '="$1"')

Or (one replace):

// Single-quoted values are captured by group 1 and unquoted values by group 2.
// Only one alternative matches at a time and the other group expands to '',
// so the replacement must emit both ("$1$2"); using "$1" alone blanks unquoted values.
text.replace(/='([^']*)'|=([^"'][^ >]*)/g, '="$1$2"')

OTHER TIPS

I know this is a late answer, but you can always use sanitize-html. It's written for Node, but I'm pretty sure you could run browserify against the library (or your code, for that matter).

Note, it uses lodash, so if you are already using it, then you may want to adjust the package.

This example is more than you're looking for... I use this library to clean up input code, converting it to markdown for storage in the db. From there, I re-hydrate via marked.

// convert/html-to-filtered-markdown.js

'use strict';

var sanitize = require('sanitize-html') //https://www.npmjs.org/package/sanitize-html
    ,toMarkdown = require('to-markdown').toMarkdown
    ;

module.exports = function convertHtmlToFilteredMarkdown(input, options) {
  if (!input) return '';

  options = options || {};

  //basic cleanup, normalize line endings, normalize/reduce whitespace and extra line endings
  var response = (input || '').toString().trim()
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
    .replace(/“/g, '"') //remove fancy quotes
    .replace(/”/g, '"') //remove fancy quotes
    .replace(/‘/g, '\'') //remove fancy quotes
    .replace(/’/g, '\'') //remove fancy quotes
    ;

  //sanitize html input
  response = sanitize(response, {
    //don't allow table elements
    allowedTags: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre' ],

    //make orderd lists
    transformTags: {
      'ol': 'ul'
    }
  }).replace(/\r\n|\r|\n/g,'\n') //normalize line endings;

  if (!options.tables) {
    response = response.replace(/[\s\n]*\<(\/?)(table|thead|tbody|tr|th|td)\>[\s\n]*/g, '\n\n') //replace divs/tables blocks as paragraphs
  }

  //cleanup input further
  response = response
    .replace(/[\s\n]*\<(\/?)(div|p)\>[\s\n]*/g, '\n\n') //divs and p's to simple multi-line expressions
    .replace(/\>#/g, '\n\n#') //cleanup #'s' after closing tag, ex: <a>...</a>\n\n# will be reduced via sanitizer
    .replace(/\\s+\</,'<') //remove space before a tag open
    .replace(/\>\s+\n?/,'>\n') //remove space after a tag close
    .replace(/\&?nbsp\;?/g,' ') //revert nbsp to space
    .replace(/\<\h[12]/g,'<h3').replace(/\<\/\h[12]/g,'</h3') //reduce h1/h2 to h3
    ;

  //convert response to markdown
  response = toMarkdown(response);

  //normalize line endings
  response = response
    .replace(/(?:^|\n)##?[\b\s]/g,'\n### ') //reduce h1 and h2 to h3
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
    .trim()

  return response + '\n';
}

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow