Question

I'm getting UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-51: ordinal not in range(128) exception trying to use string.maketrans in Python. I'm kinda discouraged with this kind of error in following code (gist):

# -*- coding: utf-8 -*-

import string

def translit1(string):
    """ This function works just fine """
    capital_letters = {
        u'А': u'A',
        u'Б': u'B',
        u'В': u'V',
        u'Г': u'G',
        u'Д': u'D',
        u'Е': u'E',
        u'Ё': u'E',
        u'Ж': u'Zh',
        u'З': u'Z',
        u'И': u'I',
        u'Й': u'Y',
        u'К': u'K',
        u'Л': u'L',
        u'М': u'M',
        u'Н': u'N',
        u'О': u'O',
        u'П': u'P',
        u'Р': u'R',
        u'С': u'S',
        u'Т': u'T',
        u'У': u'U',
        u'Ф': u'F',
        u'Х': u'H',
        u'Ц': u'Ts',
        u'Ч': u'Ch',
        u'Ш': u'Sh',
        u'Щ': u'Sch',
        u'Ъ': u'',
        u'Ы': u'Y',
        u'Ь': u'',
        u'Э': u'E',
        u'Ю': u'Yu',
        u'Я': u'Ya'
    }

    lower_case_letters = {
        u'а': u'a',
        u'б': u'b',
        u'в': u'v',
        u'г': u'g',
        u'д': u'd',
        u'е': u'e',
        u'ё': u'e',
        u'ж': u'zh',
        u'з': u'z',
        u'и': u'i',
        u'й': u'y',
        u'к': u'k',
        u'л': u'l',
        u'м': u'm',
        u'н': u'n',
        u'о': u'o',
        u'п': u'p',
        u'р': u'r',
        u'с': u's',
        u'т': u't',
        u'у': u'u',
        u'ф': u'f',
        u'х': u'h',
        u'ц': u'ts',
        u'ч': u'ch',
        u'ш': u'sh',
        u'щ': u'sch',
        u'ъ': u'',
        u'ы': u'y',
        u'ь': u'',
        u'э': u'e',
        u'ю': u'yu',
        u'я': u'ya'
    }

    translit_string = ""

    for index, char in enumerate(string):
        if char in lower_case_letters.keys():
            char = lower_case_letters[char]
        elif char in capital_letters.keys():
            char = capital_letters[char]
            if len(string) > index+1:
                if string[index+1] not in lower_case_letters.keys():
                    char = char.upper()
            else:
                char = char.upper()
        translit_string += char

    return translit_string


def translit2(text):
    """ This method should be more easy to grasp, 
    but throws exception:
    UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-51: ordinal not in range(128)
    """

    symbols = string.maketrans(u"абвгдеёзийклмнопрстуфхъыьэАБВГДЕЁЗИЙКЛМНОПРСТУФХЪЫЬЭ",
                               u"abvgdeezijklmnoprstufh'y'eABVGDEEZIJKLMNOPRSTUFH'Y'E")
    sequence = {
        u'ж':'zh',
        u'ц':'ts',
        u'ч':'ch',
        u'ш':'sh',
        u'щ':'sch',
        u'ю':'ju',
        u'я':'ja',
        u'Ж':'Zh',
        u'Ц':'Ts',
        u'Ч':'Ch'
    }

    for char in sequence.keys():
        text = text.replace(char, sequence[char])

    return text.translate(symbols)

if __name__ == "__main__":
    print translit1(u"Привет") # prints Privet as expected
    print translit2(u"Привет") # throws exception: UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-51: ordinal not in range(128)

Original trace:

Traceback (most recent call last):
  File "translit_error.py", line 124, in <module>
    print translit2(u"Привет") # throws exception: UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-51: ordinal not in range(128)
  File "translit_error.py", line 103, in translit2
    u"abvgdeezijklmnoprstufh'y'eABVGDEEZIJKLMNOPRSTUFH'Y'E")
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-51: ordinal not in range(128)

I mean, why Python string.maketrans trying to use ascii table anyway? And how comes English alphabet letters are out of 0-128 range?

$ python -c "print ord(u'A')"
65
$ python -c "print ord(u'z')"
122
$ python -c "print ord(u\"'\")"
39

After several hours I feel like absolutely exhausted to solve this issue.

Can someone say what is happening and how to fix it?

Was it helpful?

Solution 2

translate behaves differently when used with unicode strings. Instead of a maketrans table, you have to provide a dictionary ord(search)->ord(replace):

symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
           u"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA")

tr = {ord(a):ord(b) for a, b in zip(*symbols)}

# for Python 2.*:
# tr = dict( [ (ord(a), ord(b)) for (a, b) in zip(*symbols) ] )

text = u'Добрый Ден'
print text.translate(tr)  # looks good

That said, I'd second the suggestion not to reinvent the wheel and to use an established library: http://pypi.python.org/pypi/Unidecode

OTHER TIPS

You can use transliterate package (https://pypi.python.org/pypi/transliterate)

Example #1:

from transliterate import translit
print translit("Lorem ipsum dolor sit amet", "ru")
# Лорем ипсум долор сит амет

Example #2:

print translit(u"Лорем ипсум долор сит амет", "ru", reversed=True)
# Lorem ipsum dolor sit amet

Check out the CyrTranslit package, it's specifically made to transliterate from and to Cyrillic script text. It currently supports Serbian, Montenegrin, Macedonian, and Russian.

Example usage:

>>> import cyrtranslit
>>> cyrtranslit.supported()
['me', 'sr', 'mk', 'ru']

>>> cyrtranslit.to_latin('Моё судно на воздушной подушке полно угрей', 'ru')
'Moyo sudno na vozdushnoj podushke polno ugrej'

>>> cyrtranslit.to_cyrillic('Moyo sudno na vozdushnoj podushke polno ugrej')
'Моё судно на воздушной подушке полно угрей'

Here is another short solution with more accurate transliteration:

symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ ",
    (*list(u'abvgdee'), 'zh', *list(u'zijklmnoprstuf'), 'kh', 'z', 'ch', 'sh', 'sh', '',
    'y', '', 'e', 'yu','ya', *list(u'ABVGDEE'), 'ZH', 
    *list(u'ZIJKLMNOPRSTUF'), 'KH', 'Z', 'CH', 'SH', 'SH', *list(u'_Y_E'), 'YU', 'YA', ' '))

coding_dict = {source: dest for source, dest in zip(*symbols)}
translate = lambda x: ''.join([coding_dict[i] for i in x])

text = u'Добро пожаловать'
translate(text)
# 'Dobro pozhalovat'
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top