Last active
May 3, 2025 18:53
-
-
Save john-doherty/b9195065884cdbfd2017a4756e6409cc to your computer and use it in GitHub Desktop.
JavaScript function that removes invalid XML characters from a string according to the spec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Removes characters that are not allowed in an XML 1.0 document from a string.
 *
 * Always stripped:
 *  - C0 control characters other than TAB (U+0009), LF (U+000A) and CR (U+000D)
 *  - unpaired UTF-16 surrogates (a high surrogate without a following low
 *    surrogate, or a low surrogate without a preceding high surrogate)
 *  - the non-characters U+FFFE and U+FFFF
 *  - the replacement character U+FFFD (valid XML, but removed deliberately
 *    because it normally marks text that was already corrupted)
 *
 * Stripped only when `removeDiscouragedChars` is truthy (the "discouraged"
 * ranges listed in XML 1.0, section 2.2):
 *  - U+007F-U+0084 and U+0086-U+009F (U+0085, NEL, is kept)
 *  - U+FDD0-U+FDEF
 *  - the last two code points of every supplementary plane
 *    (U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF)
 *
 * @param {string} str - text that may contain invalid XML characters; `null`
 *   and `undefined` are treated as an empty string, any other value is
 *   converted with `String()`
 * @param {boolean} removeDiscouragedChars - when truthy, also remove characters
 *   that are valid but discouraged by the XML 1.0 specification
 * @return {string} the input with the offending characters removed
 */
function removeXMLInvalidChars(str, removeDiscouragedChars) {
    // First alternative: a well-formed surrogate pair. It is captured so that it
    // can be written back unchanged via `$1`, which guarantees that any surrogate
    // reaching the second alternative is unpaired.
    // Second alternative: forbidden control characters, unpaired surrogates and
    // U+FFFD-U+FFFF. For these `$1` is empty, so the character is dropped.
    // (Consuming pairs explicitly avoids regex lookbehind, keeping this ES5-safe,
    // and never touches the character that precedes a lone low surrogate.)
    var invalidChars = /([\uD800-\uDBFF][\uDC00-\uDFFF])|[\0-\x08\x0B\f\x0E-\x1F\uD800-\uDFFF\uFFFD-\uFFFF]/g;

    // Only null/undefined collapse to ''; values such as 0 keep their string form.
    var result = (str === null || str === undefined) ? '' : String(str);
    result = result.replace(invalidChars, '$1');

    if (removeDiscouragedChars) {
        // Every surrogate left in `result` belongs to a valid pair, so matching
        // "<plane-final high surrogate><DFFE|DFFF>" selects exactly the code points
        // U+nFFFE / U+nFFFF for planes 1 through 16.
        var discouragedChars = /[\x7F-\x84\x86-\x9F\uFDD0-\uFDEF]|[\uD83F\uD87F\uD8BF\uD8FF\uD93F\uD97F\uD9BF\uD9FF\uDA3F\uDA7F\uDABF\uDAFF\uDB3F\uDB7F\uDBBF\uDBFF][\uDFFE\uDFFF]/g;
        result = result.replace(discouragedChars, '');
    }

    return result;
}
Hey, sorry for the slow response. Yes, MIT please use away :)
The 2 outer groups (parentheses) in var regex
seem useless to me, i.e.:
var regex = /:[\0-\x08\x0B\f\x0E-\x1F\uFFFD\uFFFE\uFFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/g;
Also, the part matching a low surrogate (DC00–DFFF) not preceded by a valid high surrogate may be changed to (?<![\uD800-\uDBFF])[\uDC00-\uDFFF]
i.e.:
var regex = /:[\0-\x08\x0B\f\x0E-\x1F\uFFFD\uFFFE\uFFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi there, what is the license for this gist? Can I use it under the MIT license?