Created
April 18, 2010 04:30
-
-
Save eculver/370004 to your computer and use it in GitHub Desktop.
Convert Microsoft Word special characters to HTML entities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Windows-1252 assigns printable characters to most of the code points
# 0x80-0x9F, which ISO-8859-1 and Unicode reserve for C1 control characters.
# Keys are code points (the form ``translate`` expects); values are written
# as ``\u`` escapes so the table does not depend on this file's encoding.
# 0x81, 0x8D, 0x8F, 0x90 and 0x9D are absent from the table on purpose, so
# those code points pass through unchanged.
# Mapping from http://www.microsoft.com/typography/unicode/1252.htm
_CP1252_TO_UNICODE = {
    0x80: u"\u20ac",  # EURO SIGN
    0x82: u"\u201a",  # SINGLE LOW-9 QUOTATION MARK
    0x83: u"\u0192",  # LATIN SMALL LETTER F WITH HOOK
    0x84: u"\u201e",  # DOUBLE LOW-9 QUOTATION MARK
    0x85: u"\u2026",  # HORIZONTAL ELLIPSIS
    0x86: u"\u2020",  # DAGGER
    0x87: u"\u2021",  # DOUBLE DAGGER
    0x88: u"\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: u"\u2030",  # PER MILLE SIGN
    0x8A: u"\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    0x8B: u"\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C: u"\u0152",  # LATIN CAPITAL LIGATURE OE
    0x8E: u"\u017d",  # LATIN CAPITAL LETTER Z WITH CARON
    0x91: u"\u2018",  # LEFT SINGLE QUOTATION MARK
    0x92: u"\u2019",  # RIGHT SINGLE QUOTATION MARK
    0x93: u"\u201c",  # LEFT DOUBLE QUOTATION MARK
    0x94: u"\u201d",  # RIGHT DOUBLE QUOTATION MARK
    0x95: u"\u2022",  # BULLET
    0x96: u"\u2013",  # EN DASH
    0x97: u"\u2014",  # EM DASH
    0x98: u"\u02dc",  # SMALL TILDE
    0x99: u"\u2122",  # TRADE MARK SIGN
    0x9A: u"\u0161",  # LATIN SMALL LETTER S WITH CARON
    0x9B: u"\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C: u"\u0153",  # LATIN SMALL LIGATURE OE
    0x9E: u"\u017e",  # LATIN SMALL LETTER Z WITH CARON
    0x9F: u"\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}


def convert_1252_codes(text):
    """Replace stray windows-1252 code points with the intended characters.

    Text pasted from Microsoft Word often carries code points in the range
    0x80-0x9F that were meant as windows-1252 punctuation (curly quotes,
    dashes, ellipsis, ...). Each such code point is replaced with the
    Unicode character windows-1252 assigns to it; everything else is left
    untouched.

    Note: the replacements are the Unicode characters themselves, not
    ``&name;`` HTML entity references. Escape the result separately if
    entity references are required.

    @param text String to filter. Byte strings are decoded as iso-8859-1
        first, so every byte maps to exactly one code point and decoding
        cannot fail.
    @type text bytes/bytearray/unicode
    @return unicode version of the filtered string
    Adapted from: http://effbot.org/zone/unicode-gremlins.htm
    """
    # Decode unconditionally rather than only when a 0x80-0x9F byte is
    # present: otherwise byte input such as "caf\xe9" would be returned
    # undecoded (or fail in an implicit ASCII conversion on Python 2).
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("iso-8859-1")
    # Single pass over the text; code points missing from the table are
    # copied through as-is.
    return text.translate(_CP1252_TO_UNICODE)
Yessss, thank you! My gist shall be forever replaced by fix_1252_codes.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This Cp1252-to-Unicode hashtable has been helpful to me:
And the function: