Forked from john-doherty/remove-invalid-xml-characters.js
Last active
January 11, 2021 18:49
-
-
Save AaronHarris/1c4d48b6058ed9af7935534474b0f59c to your computer and use it in GitHub Desktop.
JavaScript function that escapes invalid XML characters in a string (as \uXXXX sequences) according to the XML 1.0 spec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Matches every character forbidden by the XML 1.0 specification (C0 controls
// other than TAB/LF/CR, unpaired surrogates, U+FFFE, U+FFFF), plus the Unicode
// replacement character U+FFFD.
// The `u` flag makes the engine walk the input by code point, so the surrogate
// range only ever matches *unpaired* surrogates; valid pairs are left alone and
// no neighbouring character is consumed.
const INVALID_XML_REGEX = /[\0-\x08\x0B\f\x0E-\x1F\uD800-\uDFFF\uFFFD-\uFFFF]/gu;

// Matches every character that is valid but discouraged by the XML 1.0
// specification: U+007F-U+0084, U+0086-U+009F, U+FDD0-U+FDEF and the last two
// code points (xxFFFE, xxFFFF) of each supplementary plane 1 through 16.
const DISCOURAGED_XML_REGEX = new RegExp(
  '[\\x7F-\\x84\\x86-\\x9F\\uFDD0-\\uFDEF' +
    Array.from({ length: 16 }, (_, index) => {
      const plane = (index + 1).toString(16).toUpperCase();
      return '\\u{' + plane + 'FFFE}\\u{' + plane + 'FFFF}';
    }).join('') +
    ']',
  'gu'
);

/**
 * Renders every UTF-16 code unit of `chars` as a literal `\uXXXX` sequence
 * (backslash, `u`, four lowercase hex digits).
 * Escaping each unit, not just the first, keeps both halves of an astral
 * code point.
 * @param {string} chars - the matched character(s) to escape
 * @return {string} the concatenated `\uXXXX` escapes
 */
function escapeCodeUnits(chars) {
  let escaped = '';
  for (let i = 0; i < chars.length; i += 1) {
    escaped += '\\u' + ('000' + chars.charCodeAt(i).toString(16)).slice(-4);
  }
  return escaped;
}

/**
 * Escapes invalid XML characters in a string by replacing each one with a
 * literal `\uXXXX` sequence. Nothing is removed; the offending characters are
 * rewritten as visible text.
 * @param {string} str - a string containing potentially invalid XML characters
 *   (control characters such as STX/ETX, unpaired surrogates, etc.);
 *   `null`/`undefined` yield an empty string, other values are stringified
 * @param {boolean} removeDiscouragedChars - when truthy, also escape characters
 *   that are valid but discouraged in XML 1.0
 * @return {string} the input with invalid (and optionally discouraged)
 *   characters replaced by `\uXXXX` escapes
 */
function escapeXMLInvalidChars(str, removeDiscouragedChars) {
  // Only null/undefined fall back to ''; 0, false, NaN are stringified as-is.
  const text = str == null ? '' : String(str);
  let result = text.replace(INVALID_XML_REGEX, (match) => escapeCodeUnits(match));
  if (removeDiscouragedChars) {
    result = result.replace(DISCOURAGED_XML_REGEX, (match) => escapeCodeUnits(match));
  }
  return result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Inspiration: