Created
November 8, 2010 18:20
-
-
Save wam/668025 to your computer and use it in GitHub Desktop.
PHP function that replaces HTML entities with ASCII near-equivalents.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Replace HTML entities with plain-ASCII near-equivalents.
 *
 * Three spellings of every supported character are recognised:
 *   - the named entity            (e.g. "&eacute;"),
 *   - the decimal numeric entity  (e.g. "&#233;"),
 *   - the literal character itself, encoded as UTF-8.
 * In addition, decimal entities for printable ASCII (32..126) are decoded,
 * including the zero-padded three-digit form for 32..99 (e.g. "&#065;"),
 * and "&amp;" becomes "&".
 *
 * The replacement is done in a single pass, so text produced by one
 * replacement is never re-interpreted as another entity:
 * "&#38;eacute;" yields "&eacute;", not "e".
 *
 * Entities that are not in the table (e.g. "&lt;", hexadecimal forms) are
 * left untouched. Literal-character matching assumes $text is UTF-8.
 *
 * Note: the output may contain "]]>" (for instance from "&#93;&#93;&#62;").
 * Callers embedding the result in a CDATA section must deal with that
 * themselves.
 *
 * @param string|array $text Text to convert. An array is processed entry by
 *                           entry (keys preserved; nested arrays and objects
 *                           are returned unchanged), as str_replace() would.
 * @return string|array The converted text, or an array of converted texts.
 */
function asciify($text) {
    // Built once per request; the table never changes between calls.
    static $map = null;

    // Earlier versions delegated to str_replace(), which accepts an array of
    // subjects. Keep that contract for existing callers.
    if (is_array($text)) {
        $converted = array();
        foreach ($text as $key => $value) {
            $converted[$key] = (is_array($value) || is_object($value))
                ? $value
                : asciify($value);
        }
        return $converted;
    }

    if ($map === null) {
        $map = array();

        // Decimal entities for printable ASCII, 32 through 126.
        for ($i = 32; $i < 127; $i++) {
            $map["&#$i;"] = chr($i);
        }
        // 32 through 99 also occur zero-padded to three digits.
        for ($i = 32; $i < 100; $i++) {
            $map["&#0$i;"] = chr($i);
        }
        $map['&amp;'] = '&'; // ampersand

        // entity name => array(Unicode code point, ASCII stand-in)
        $table = array(
            // Latin-1 punctuation and symbols
            'nbsp'   => array(160, ' '),      // non-breaking space
            'iexcl'  => array(161, '!'),      // inverted exclamation mark
            'cent'   => array(162, 'cents'),  // cent sign
            'pound'  => array(163, 'pounds'), // pound sign
            'curren' => array(164, '$'),      // currency sign
            'yen'    => array(165, 'yen'),    // yen sign
            'brvbar' => array(166, '|'),      // broken vertical bar
            'sect'   => array(167, 'Ss'),     // section sign
            'uml'    => array(168, '``'),     // spacing diaeresis - umlaut
            'copy'   => array(169, '(c)'),    // copyright sign
            'ordf'   => array(170, 'a'),      // feminine ordinal indicator
            'laquo'  => array(171, '<<'),     // left double angle quotes
            'not'    => array(172, '~'),      // not sign
            'shy'    => array(173, '-'),      // soft hyphen
            'reg'    => array(174, '(r)'),    // registered trade mark sign
            'macr'   => array(175, '-'),      // spacing macron - overline
            'deg'    => array(176, 'deg'),    // degree sign
            'plusmn' => array(177, '+/-'),    // plus-or-minus sign
            'sup2'   => array(178, '^2'),     // superscript two - squared
            'sup3'   => array(179, '^3'),     // superscript three - cubed
            'acute'  => array(180, "'"),      // acute accent - spacing acute
            'micro'  => array(181, 'u'),      // micro sign
            'para'   => array(182, 'par'),    // pilcrow sign - paragraph sign
            'middot' => array(183, '.'),      // middle dot - Georgian comma
            'cedil'  => array(184, ','),      // spacing cedilla
            'sup1'   => array(185, '^1'),     // superscript one
            'ordm'   => array(186, '^o'),     // masculine ordinal indicator
            'raquo'  => array(187, '>>'),     // right double angle quotes
            'frac14' => array(188, '1/4'),    // fraction one quarter
            'frac12' => array(189, '1/2'),    // fraction one half
            'frac34' => array(190, '3/4'),    // fraction three quarters
            'iquest' => array(191, '?'),      // inverted question mark

            // Latin-1 capital letters
            'Agrave' => array(192, 'A'),      // A with grave
            'Aacute' => array(193, 'A'),      // A with acute
            'Acirc'  => array(194, 'A'),      // A with circumflex
            'Atilde' => array(195, 'A'),      // A with tilde
            'Auml'   => array(196, 'A'),      // A with diaeresis
            'Aring'  => array(197, 'A'),      // A with ring above
            'AElig'  => array(198, 'AE'),     // ligature AE
            'Ccedil' => array(199, 'C'),      // C with cedilla
            'Egrave' => array(200, 'E'),      // E with grave
            'Eacute' => array(201, 'E'),      // E with acute
            'Ecirc'  => array(202, 'E'),      // E with circumflex
            'Euml'   => array(203, 'E'),      // E with diaeresis
            'Igrave' => array(204, 'I'),      // I with grave
            'Iacute' => array(205, 'I'),      // I with acute
            'Icirc'  => array(206, 'I'),      // I with circumflex
            'Iuml'   => array(207, 'I'),      // I with diaeresis
            'ETH'    => array(208, 'EDH'),    // capital ETH
            'Ntilde' => array(209, 'N'),      // N with tilde
            'Ograve' => array(210, 'O'),      // O with grave
            'Oacute' => array(211, 'O'),      // O with acute
            'Ocirc'  => array(212, 'O'),      // O with circumflex
            'Otilde' => array(213, 'O'),      // O with tilde
            'Ouml'   => array(214, 'O'),      // O with diaeresis
            'times'  => array(215, 'x'),      // multiplication sign
            // Previously '0' (zero) for one spelling and 'O' for the other.
            'Oslash' => array(216, 'O'),      // O with slash
            'Ugrave' => array(217, 'U'),      // U with grave
            'Uacute' => array(218, 'U'),      // U with acute
            'Ucirc'  => array(219, 'U'),      // U with circumflex
            'Uuml'   => array(220, 'U'),      // U with diaeresis
            'Yacute' => array(221, 'Y'),      // Y with acute
            // Previously 'dh', which is the sound of eth, not thorn.
            'THORN'  => array(222, 'TH'),     // capital THORN
            // Previously 'th'; sharp s is conventionally written "ss".
            'szlig'  => array(223, 'ss'),     // small sharp s - ess-zed

            // Latin-1 small letters
            'agrave' => array(224, 'a'),      // a with grave
            'aacute' => array(225, 'a'),      // a with acute
            'acirc'  => array(226, 'a'),      // a with circumflex
            'atilde' => array(227, 'a'),      // a with tilde
            'auml'   => array(228, 'a'),      // a with diaeresis
            'aring'  => array(229, 'a'),      // a with ring above
            'aelig'  => array(230, 'ae'),     // ligature ae
            'ccedil' => array(231, 'c'),      // c with cedilla
            'egrave' => array(232, 'e'),      // e with grave
            'eacute' => array(233, 'e'),      // e with acute
            'ecirc'  => array(234, 'e'),      // e with circumflex
            'euml'   => array(235, 'e'),      // e with diaeresis
            'igrave' => array(236, 'i'),      // i with grave
            'iacute' => array(237, 'i'),      // i with acute
            'icirc'  => array(238, 'i'),      // i with circumflex
            'iuml'   => array(239, 'i'),      // i with diaeresis
            'eth'    => array(240, 'edh'),    // small eth
            'ntilde' => array(241, 'n'),      // n with tilde
            'ograve' => array(242, 'o'),      // o with grave
            'oacute' => array(243, 'o'),      // o with acute
            'ocirc'  => array(244, 'o'),      // o with circumflex
            'otilde' => array(245, 'o'),      // o with tilde
            'ouml'   => array(246, 'o'),      // o with diaeresis
            'divide' => array(247, '/'),      // division sign
            'oslash' => array(248, 'o'),      // o with slash
            'ugrave' => array(249, 'u'),      // u with grave
            'uacute' => array(250, 'u'),      // u with acute
            'ucirc'  => array(251, 'u'),      // u with circumflex
            'uuml'   => array(252, 'u'),      // u with diaeresis
            'yacute' => array(253, 'y'),      // y with acute
            'thorn'  => array(254, 'th'),     // small thorn
            'yuml'   => array(255, 'y'),      // y with diaeresis

            // Latin Extended
            'OElig'  => array(338, 'OE'),     // capital ligature OE
            'oelig'  => array(339, 'oe'),     // small ligature oe
            'Scaron' => array(352, 'S'),      // S with caron
            'scaron' => array(353, 's'),      // s with caron
            // Previously 'U', although the letter is a Y.
            'Yuml'   => array(376, 'Y'),      // Y with diaeresis
            'fnof'   => array(402, 'f'),      // small f with hook - function

            // Higher punctuation
            'ensp'   => array(8194, ' '),     // en space
            'emsp'   => array(8195, ' '),     // em space
            'thinsp' => array(8201, ' '),     // thin space
            'zwnj'   => array(8204, ''),      // zero width non-joiner
            'zwj'    => array(8205, ''),      // zero width joiner
            'lrm'    => array(8206, ''),      // left-to-right mark
            'rlm'    => array(8207, ''),      // right-to-left mark
            'ndash'  => array(8211, '-'),     // en dash
            'mdash'  => array(8212, '--'),    // em dash
            'lsquo'  => array(8216, "'"),     // left single quotation mark
            'rsquo'  => array(8217, "'"),     // right single quotation mark
            // Previously '"'; made consistent with bdquo => ',,'.
            'sbquo'  => array(8218, ','),     // single low-9 quotation mark
            'ldquo'  => array(8220, '"'),     // left double quotation mark
            'rdquo'  => array(8221, '"'),     // right double quotation mark
            'bdquo'  => array(8222, ',,'),    // double low-9 quotation mark
            'dagger' => array(8224, '*'),     // dagger
            'Dagger' => array(8225, '**'),    // double dagger
            'bull'   => array(8226, '*'),     // bullet
            'hellip' => array(8230, '...'),   // horizontal ellipsis
            'permil' => array(8240, '0/00'),  // per mille sign
            'lsaquo' => array(8249, '<'),     // single left-pointing angle quote
            'rsaquo' => array(8250, '>'),     // single right-pointing angle quote
            'euro'   => array(8364, 'euro'),  // euro sign
            'trade'  => array(8482, '(TM)'),  // trade mark sign
        );

        foreach ($table as $name => $entry) {
            list($codepoint, $replacement) = $entry;

            $map["&$name;"] = $replacement;      // named entity
            $map["&#$codepoint;"] = $replacement; // decimal entity
            // Literal character: decode the numeric entity to its UTF-8 bytes.
            $literal = html_entity_decode("&#$codepoint;", ENT_QUOTES, 'UTF-8');
            $map[$literal] = $replacement;
        }
    }

    // strtr() with an array scans the input once, preferring the longest
    // matching key, and never re-scans text it has already replaced.
    return strtr((string) $text, $map);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment