Skip to content

Instantly share code, notes, and snippets.

@daformat
Last active October 25, 2023 08:19
Show Gist options
  • Save daformat/950411857f01a9b39873ddd1b44d5813 to your computer and use it in GitHub Desktop.
Save daformat/950411857f01a9b39873ddd1b44d5813 to your computer and use it in GitHub Desktop.
A list of the different Unicode space characters
/**
 * Catalogue of Unicode space-like characters.
 *
 * Each entry has:
 *  - name: human-readable character name.
 *  - utf: the character itself, written as a `\uXXXX` escape.
 *  - html: HTML character references for the character, stored as entity
 *    *text* (e.g. '&#x20;'), never as the raw character. Order is: named
 *    entity (when listed), hexadecimal reference, decimal reference.
 *  - breaking: whether a line break is allowed at the character. Not set on
 *    the two control-character entries at the end of the list.
 *  - width: free-text description of the typical advance width.
 *  - unicode_category: Unicode general category, spelled out.
 *  - matched_by_s_character_class: whether the JavaScript regex class `\s`
 *    matches the character.
 *
 * Declared with `const` so that loading this in strict mode or as an ES
 * module does not throw on an undeclared assignment.
 */
const utfSpaces = [
  {
    name: 'Space',
    utf: '\u0020',
    html: ['&#x20;', '&#32;'],
    breaking: true,
    width: 'Typically 1/4 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Thin space',
    utf: '\u2009',
    html: ['&thinsp;', '&#x2009;', '&#8201;'],
    breaking: true,
    width: '1/5 em, can be 1/6 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Hair space',
    utf: '\u200A',
    html: ['&#x200A;', '&#8202;'],
    breaking: true,
    width: 'Narrower than a thin space (less than 1/5 em or 1/6em)',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Zero width space',
    utf: '\u200B',
    html: ['&#x200B;', '&#8203;'],
    breaking: true,
    width: 'None (invisible character)',
    unicode_category: 'Other, Format',
    matched_by_s_character_class: false,
  },
  {
    name: 'Medium mathematical space',
    utf: '\u205F',
    html: ['&#x205F;', '&#8287;'],
    breaking: true,
    width: '4/18 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Ogham space mark',
    utf: '\u1680',
    html: ['&#x1680;', '&#5760;'],
    breaking: true,
    width: 'Usually represented by a 1em dash',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Mongolian vowel separator',
    utf: '\u180E',
    html: ['&#x180E;', '&#6158;'],
    breaking: false,
    width: 'None (invisible character)',
    unicode_category: 'Other, Format',
    matched_by_s_character_class: false,
  },
  {
    name: 'EN quad',
    utf: '\u2000',
    html: ['&#x2000;', '&#8192;'],
    breaking: true,
    width: '1 en (1/2 em)',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'EM quad',
    utf: '\u2001',
    html: ['&#x2001;', '&#8193;'],
    breaking: true,
    width: '1 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'EN space',
    utf: '\u2002',
    html: ['&ensp;', '&#x2002;', '&#8194;'],
    breaking: true,
    width: '1 en (1/2 em)',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'EM space',
    utf: '\u2003',
    html: ['&emsp;', '&#x2003;', '&#8195;'],
    // U+2003..U+2006 are breaking spaces in the Unicode line breaking
    // algorithm, like the EN/EM quads and EN space above.
    breaking: true,
    width: '1 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Three-per-em space',
    utf: '\u2004',
    html: ['&#x2004;', '&#8196;'],
    breaking: true,
    width: '1/3 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Four-per-em space',
    utf: '\u2005',
    html: ['&#x2005;', '&#8197;'],
    breaking: true,
    width: '1/4 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Six-per-em space',
    utf: '\u2006',
    html: ['&#x2006;', '&#8198;'],
    breaking: true,
    width: '1/6 em',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Non breaking space',
    utf: '\u00A0',
    html: ['&nbsp;', '&#xA0;', '&#160;'],
    breaking: false,
    width: 'Typically 1/4 em, same as a regular space but usually not adjusted with justification',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Narrow no-break space',
    utf: '\u202F',
    html: ['&#x202F;', '&#8239;'],
    breaking: false,
    width: 'Narrower than a non-breaking or breaking space',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Figure space',
    utf: '\u2007',
    html: ['&#x2007;', '&#8199;'],
    breaking: false,
    width: 'The width of digits (tabular space)',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Punctuation space',
    utf: '\u2008',
    html: ['&#x2008;', '&#8200;'],
    breaking: true,
    width: 'Width of a period (.)',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Word joiner',
    utf: '\u2060',
    // The decimal reference previously read '&8288;' (missing '#'), which is
    // not a valid character reference.
    html: ['&#x2060;', '&#8288;'],
    breaking: false,
    width: 'None (invisible character)',
    unicode_category: 'Other, Format',
    matched_by_s_character_class: false,
  },
  {
    name: 'Ideographic space',
    utf: '\u3000',
    html: ['&#x3000;', '&#12288;'],
    // Classified as a breaking space in the Unicode line breaking algorithm.
    breaking: true,
    width: 'The width of ideographic (CJK) characters',
    unicode_category: 'Separator, Space',
    matched_by_s_character_class: true,
  },
  {
    name: 'Zero width no-break space (BOM often interpreted as)',
    utf: '\uFEFF',
    html: ['&#xFEFF;', '&#65279;'],
    breaking: false,
    width: 'None (invisible character)',
    unicode_category: 'Other, Format',
    matched_by_s_character_class: true,
  },
  {
    name: 'Character tabulation',
    utf: '\u0009',
    html: ['&Tab;', '&#x9;', '&#9;'],
    width: 'Up to the next tab stop',
    unicode_category: 'Other, Control',
    matched_by_s_character_class: true,
  },
  {
    name: 'Line tabulation',
    utf: '\u000B',
    html: ['&#xB;', '&#11;'],
    width: 'doesn’t apply (vertical)',
    unicode_category: 'Other, Control',
    matched_by_s_character_class: true,
  },
];
@daformat
Copy link
Author

Here's a way to know the type of spaces used in a string:

// Example: list which catalogued space characters occur in a string.
const str = ' 1 016,00 € ';

// Character class built from every catalogued character; none of them is
// special inside a regex character class, so no escaping is needed.
const spacePattern = new RegExp(
  `[${utfSpaces.map((s) => s.utf).join('')}]`,
  'g',
);

// `match` returns null when the string contains no catalogued space, so fall
// back to an empty list instead of throwing. The Set drops duplicates while
// keeping first-occurrence order.
const foundSpaces = [...new Set(str.match(spacePattern) || [])];

console.table(
  foundSpaces
    .map((char) => utfSpaces.find((s) => s.utf === char))
    .map((s) => ({
      ...s,
      // Show the code point as a uniform \uXXXX label; the deprecated
      // `escape()` produced mixed forms such as \20, \A0 and \u2009.
      utf: `\\u${s.utf.charCodeAt(0).toString(16).toUpperCase().padStart(4, '0')}`,
    })),
);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment