Last active
April 22, 2019 16:09
-
-
Save TRex22/e6b3ca1021fe5274ea448fe619604b47 to your computer and use it in GitHub Desktop.
Useful Unicode Regex (Ruby)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##
# Cleans up strings using one of several named strategies.
#
# Use cases:
# 1. Creating reports in different formats with varying support for unicode.
# 2. Data inputs, to remove whitespace and strange characters which are not "language".
# 3. External services which may not be able to handle different character sets.
# 4. Comparisons - when comparing two strings which have whitespace and weird unicode.
#
# Strategies (every strategy except :strip_html_tags also removes spaces that
# sit directly before a comma; every result has its whitespace collapsed):
#
# [:strip_extra_whitespace_only]        Default. Also used for unrecognised strategy names.
# [:ascii_only]                         Drops every non-ASCII character.
# [:ascii_and_language_symbols_only]    Keeps ASCII plus Unicode letters and combining marks.
# [:letters_accents_and_numbers_only]   Keeps Unicode letters, combining marks, ASCII digits
#                                       and ASCII whitespace.
# [:strip_special_unicode_classes_only] Drops the pictograph/symbol ranges listed below.
# [:strip_html_tags]                    Delegates to ActionController's +strip_tags+ helper.
#
# Examples:
#
#   str = "bob, g, Germany 🇩🇪"
#   StringSanitizer.call(str, strategy: :letters_accents_and_numbers_only)
#   => "bob g Germany"
#
#   str = "Добро пожаловать в джунгли. "
#   StringSanitizer.call(str, strategy: :letters_accents_and_numbers_only)
#   => "Добро пожаловать в джунгли"
#
#   str = "ABCDEпожаловатьF"
#   StringSanitizer.call(str, strategy: :ascii_only)
#   => "ABCDEF"
module StringSanitizer
  extend self

  # List of useful REGEX
  # https://www.regular-expressions.info/refunicode.html
  # https://stackoverflow.com/questions/24672834/how-do-i-remove-emoji-from-string
  PICTURE_REGEX = /[\u{1f300}-\u{1f5ff}]/
  ENCLOSED_CHAR_REGEX = /[\u{2500}-\u{2BEF}]/
  EMOTICONS_REGEX = /[\u{1f600}-\u{1f64f}]/
  DINGBATS_REGEX = /[\u{2702}-\u{27b0}]/
  TRANSPORT_AND_MAP_REGEX = /[\u{1f680}-\u{1f6ff}]/
  REGIONAL_INDICATOR_SYMBOL_REGEX = /[\u{1f1e6}-\u{1f1ff}]/ # flags

  # One or more spaces immediately followed by a comma. The match includes
  # the comma, so replace with ',' (not '') to keep the punctuation.
  SPACE_BEFORE_COMMA_REGEX = / +,/

  # Single-pass union of all the "special" ranges above.
  SPECIAL_UNICODE_CLASSES_REGEX = Regexp.union(
    PICTURE_REGEX,
    ENCLOSED_CHAR_REGEX,
    EMOTICONS_REGEX,
    DINGBATS_REGEX,
    TRANSPORT_AND_MAP_REGEX,
    REGIONAL_INDICATOR_SYMBOL_REGEX
  )

  # Inverted Regex: each one matches the characters that must be REMOVED.
  ASCII_ONLY_REGEX = /[^\u{0000}-\u{007f}]/
  # NOTE: a quantifier such as `+` is a literal inside a character class, so it
  # is deliberately absent here - otherwise plus signs would survive.
  LETTERS_ACCENTS_AND_DIGITS_ONLY_REGEX = /[^\p{L}\p{M}\d\s]/
  # ASCII digits are already covered by the \u{0000}-\u{007f} range.
  ASCII_AND_LANGUAGE_CHARACTERS_ONLY_REGEX = /[^\u{0000}-\u{007f}\p{L}\p{M}]/

  ##
  # Applies several strategies in order, feeding each result into the next.
  #
  # str        - the value to sanitize (non-String values are returned as-is).
  # strategies - ordered list of strategy names accepted by +call+.
  #
  # Returns the sanitized string.
  def call_multiple(str, strategies: [:strip_extra_whitespace_only])
    strategies.reduce(str) { |current, strategy| call(current, strategy: strategy) }
  end

  ##
  # Sanitizes +str+ with a single strategy.
  #
  # str      - the value to sanitize. Anything that is not a String is
  #            returned untouched.
  # strategy - one of the strategy names documented on the module; unknown
  #            names fall back to the default whitespace-only cleanup.
  #
  # The input is never modified: its bytes are copied, reinterpreted as UTF-8
  # and any byte sequences that are invalid in UTF-8 are dropped, so frozen
  # or binary-tagged strings are safe to pass in.
  #
  # Returns a new String with runs of whitespace collapsed to single spaces
  # and leading/trailing whitespace removed.
  def call(str, strategy: :strip_extra_whitespace_only)
    return str unless str.is_a?(String)

    text = str.dup.force_encoding(Encoding::UTF_8).scrub('')

    cleaned =
      case strategy
      when :ascii_and_language_symbols_only    then ascii_and_language_symbols_only(text)
      when :ascii_only                         then ascii_only(text)
      when :letters_accents_and_numbers_only   then letters_accents_and_numbers_only(text)
      when :strip_special_unicode_classes_only then strip_special_unicode_classes_only(text)
      when :strip_html_tags                    then strip_html_tags(text)
      else strip_space_before_comma(text)
      end

    collapse_whitespace(cleaned)
  end

  private

  # Dangerous to just strip all characters which are not-ascii because then we cannot
  # support other locales like ru, de, etc ...
  def ascii_and_language_symbols_only(str)
    strip_space_before_comma(str).gsub(ASCII_AND_LANGUAGE_CHARACTERS_ONLY_REGEX, '')
  end

  # Removes every character outside U+0000..U+007F.
  def ascii_only(str)
    strip_space_before_comma(str).gsub(ASCII_ONLY_REGEX, '')
  end

  # Removes everything except letters, combining marks, ASCII digits and
  # ASCII whitespace.
  def letters_accents_and_numbers_only(str)
    strip_space_before_comma(str).gsub(LETTERS_ACCENTS_AND_DIGITS_ONLY_REGEX, '')
  end

  # Removes characters in the picture, enclosed-character, emoticon, dingbat,
  # transport/map and regional-indicator ranges, leaving all other text alone.
  def strip_special_unicode_classes_only(str)
    strip_space_before_comma(str).gsub(SPECIAL_UNICODE_CLASSES_REGEX, '')
  end

  # Removes the space(s) directly before a comma while keeping the comma:
  # "a , b" becomes "a, b".
  def strip_space_before_comma(str)
    str.gsub(SPACE_BEFORE_COMMA_REGEX, ',')
  end

  # Requires ActionController (Rails) to be loaded by the host application.
  def strip_html_tags(str)
    ActionController::Base.helpers.strip_tags(str)
  end

  # Collapses every run of whitespace (including Unicode whitespace matched by
  # [[:space:]]) into one space and trims both ends. Implemented locally so the
  # module does not depend on ActiveSupport's String#squish.
  def collapse_whitespace(str)
    str.gsub(/[[:space:]]+/, ' ').strip
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment