Unicode renders differently in python due to overlooking incorrect logic flow of two seemingly identical procedures

https://stackoverflow.com/questions/20497868

31-08-2022
|

문제

I've tried to write a test() function to help parse strings into Tibetan Unicode, so I can check that my invariants still hold when adding new features. Here is the code in its entirety:

import sys
from math import *

''' Translator
    Wylie to utf-8 conversion.

'''

# Wylie spellings of the root letters, four per row, in the same order as
# the code points in U_ROOTLETTERS.
W_ROOTLETTERS = (
    "k  kh  g  ng "
    "c  ch  j  ny "
    "t  th  d  n  "
    "p  ph  b  m  "
    "ts tsh dz w  "
    "zh z   '  y  "
    "r  l   sh s  "
    "h  a"
).split()

# Tibetan code point for each entry of W_ROOTLETTERS (index-aligned).
U_ROOTLETTERS = list(
    u'\u0f40\u0f41\u0f42\u0f44'
    u'\u0f45\u0f46\u0f47\u0f49'
    u'\u0f4f\u0f50\u0f51\u0f53'
    u'\u0f54\u0f55\u0f56\u0f58'
    u'\u0f59\u0f5a\u0f5b\u0f5d'
    u'\u0f5e\u0f5f\u0f60\u0f61'
    u'\u0f62\u0f63\u0f64\u0f66'
    u'\u0f67\u0f68'
)

# Wylie vowels and their index-aligned Tibetan vowel signs.
W_VOWELS = list('iueo')
U_VOWELS = list(u'\u0f72\u0f74\u0f7a\u0f7c')

# Syllable delimiter appended by Syllable.tsheg().
TSHEG = u'\u0f0b'

# Letters that Translator.isSuper() looks for in second-to-last position.
SUPER = list('rls')

# Letters that Translator.isSub() looks for in last position.
SUB = list('yrlw')

# Added to a root letter's code point by Translator.toSub()
# (e.g. U+0F40 + 0x50 = U+0F90).
SUBOFFSET = 0x50

class Translator(object):
    '''Builds a Tibetan syllable step by step from Wylie input.

    All state lives on the class, not on instances: __init__ stores the
    lookup tables (first, wTable, uTable) as class attributes, and the
    syllable currently being built is kept in Translator.syllable.
    '''

    def __init__(self):
        '''Build the Wylie/Unicode tables and the Wylie -> Unicode dict.'''
        wTable = W_ROOTLETTERS + W_VOWELS
        uTable = U_ROOTLETTERS + U_VOWELS
        # The two tables are index-aligned, so zip pairs each Wylie
        # spelling with its code point.
        Translator.first = dict(zip(wTable, uTable))
        Translator.wTable = wTable
        Translator.uTable = uTable

    def mkSyllable(self, wylie):
        '''Replace the current syllable with a new one made from wylie.

        wylie must be a key of Translator.first; the lookup in toUni()
        raises KeyError otherwise.
        '''
        Translator.syllable = Syllable(self.toUni(wylie), wylie)

    def toUni(self, syllable):
        '''Return the table entry for str(syllable).'''
        return Translator.first[str(syllable)]

    def toSub(self, syllable):
        '''Return the character SUBOFFSET code points above the table entry.'''
        return unichr(ord(Translator.first[str(syllable)]) + SUBOFFSET)

    def out(self):
        '''Write the current syllable's Unicode text to stdout.'''
        sys.stdout.write(Translator.syllable.uni)

    def addSuper(self, s):
        '''Append the toSub() form of s to the current syllable.

        add() calls this for both the isSuper and the isSub case.
        '''
        Translator.syllable.add(self.toSub(s), s)

    def add(self, s):
        '''Extend the current syllable with the Wylie character s.

        The checks run in this order and the first match wins:
        1. current wylie + s is itself a table entry: restart from it;
        2. the combined text ends in a multi-character table entry:
           replace the last output character via uniMutate();
        3. isSuper() or isSub() holds: append the toSub() form of s;
        4. otherwise append the plain table entry for s.
        '''
        # TODO: Remove redundant join
        syll = ''.join([Translator.syllable.wylie, s])

        if syll in Translator.wTable:
            self.mkSyllable(syll)
            return

        byteCnt = self.multibyte(syll)

        # Has multibyte wylie character:
        if byteCnt > 1:
            self.uniMutate(syll, byteCnt, self.isSuper(syll) or self.isSub(syll))
            return

        if self.isSuper(syll):
            self.addSuper(s)
            return

        if self.isSub(syll):
            self.addSuper(s)
            return

        # Has singlebyte wylie character:
        Translator.syllable.add(self.toUni(s), s)


    def uniMutate(self, s, i, doSub):
        '''Replace the last output character with the entry for s[-i:].

        The toSub() form is used when doSub is true, the plain table
        entry otherwise. Only syllable.uni is rewritten here;
        syllable.wylie is left unchanged.
        '''
        # Drop the character that the multi-character spelling supersedes.
        old = Translator.syllable.uni[:-1]

        if doSub:
            new = self.toSub(s[-i:])
        else:
            new = Translator.first[s[-i:]]

        Translator.syllable.uni = u''.join([old, new])

    def multibyte(self, s):
        '''Return the length of a multi-character table entry ending s.

        Returns 3 if s ends in 'tsh', 2 if its last two characters are
        a key of Translator.first, and 0 otherwise.
        '''
        if len(s) < 2:
            return 0
        elif len(s) >= 3 and s[-3:] == 'tsh':
            return 3
        elif len(s) >= 2 and s[-2:] in Translator.first:
            return 2
        else:
            return 0

    def isSuper(self, s):
        '''Return True if the second-to-last character of s is in SUPER.'''
        if len(s) < 2 or not s[-2] in SUPER:
            return False
        else:
            return True

    def isSub(self, s):
        '''Return True if the last character of s is in SUB.'''
        if s[-1] in SUB:
            return True
        else:
            return False

    def isVow(self, s, byteCnt):
        '''Return True if the character before the last byteCnt characters
        of s is a Wylie vowel.
        '''
        if s[-byteCnt-1] in W_VOWELS:
            return True
        else:
            return False

    def tsheg(self):
        '''Append the tsheg delimiter to the current syllable and print it.'''
        Translator.syllable.tsheg()
        self.out()

    def alphabet(self):
        '''Print every root letter followed by a tsheg, four per line.'''
        i = 0

        for key in W_ROOTLETTERS:
            self.mkSyllable(key)
            self.tsheg()
            i += 1

            # Line break after every fourth letter.
            if i % 4 == 0:
                sys.stdout.write("\n")

        sys.stdout.write("\n")

    def vowels(self):
        '''Print 'a' combined with each vowel, then a newline.'''
        for key in W_VOWELS:
            self.mkSyllable('a')
            self.add(key)
            self.tsheg()

        print

    def test(self, string):
        '''Print string as a label, then build and print it as a syllable.

        The syllable is seeded from the first character with
        mkSyllable(); add() is then called for every character of
        string, the first one included.
        '''
        print string + ":"
        i = 0
        for s in string:
            if i == 0:
               self.mkSyllable(s)

            self.add(s)
            i += 1

        self.tsheg()
        print

class Syllable(object):
    '''A syllable held in two parallel forms.

    wylie -- the Wylie transliteration collected so far
    uni   -- the corresponding Unicode text
    '''

    def __init__(self, uni, wylie):
        self.wylie = wylie
        self.uni = uni

    def __str__(self):
        return self.wylie

    # repr() shows the same Wylie text as str().
    __repr__ = __str__

    def tsheg(self):
        '''Append the tsheg delimiter to the Unicode form.'''
        self.uni += TSHEG

    def add(self, uni, wylie):
        '''Append one piece to both forms, the Wylie form first.'''
        # Joining on a unicode separator gives a unicode result even
        # when both pieces are plain byte strings.
        self.wylie = u''.join((self.wylie, wylie))
        self.uni = u''.join((self.uni, uni))

def main():
    '''Print the alphabet and vowel tables, then build 'skyongs' twice:
    once through Translator.test() and once through explicit
    mkSyllable()/add() calls.
    '''
    t = Translator()
    t.alphabet()
    t.vowels()
    # print u''.join([Translator.first['s'], t.toSub('k'), t.toSub('y'), Translator.first['o'], Translator.first['ng']])
    t.test('skyongs') #renders glyphs incorrectly
    t.mkSyllable('s') # from this line down to the final bare print: works fine
    t.add('k')
    t.add('y')
    t.add('o')
    t.add('n')
    t.add('g')
    t.add('s')
    t.tsheg()
    print

if __name__ =='__main__':
    main()

This outputs:

ཀ་ཁ་ག་ང་
ཅ་ཆ་ཇ་ཉ་
ཏ་ཐ་ད་ན་
པ་ཕ་བ་མ་
ཙ་ཚ་ཛ་ཝ་
ཞ་ཟ་འ་ཡ་
ར་ལ་ཤ་ས་
ཧ་ཨ་
ཨི་ཨུ་ཨེ་ཨོ་
skyongs:
སྶྐྱོངས་
སྐྱོངས་

Apart from some stabs at duckduckgo/google, I've already asked #python@freenode, but didn't get any response. Understanding this would really be of great help. Thanks.

해결책

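Found a difference: in your test() method, the first loop iteration calls self.mkSyllable('s') and then also self.add('s'), so the initial 's' is processed twice — that is where the extra subjoined 's' in the first output line comes from. In the individual calls below it, without the for loop, you only call t.mkSyllable('s') and never t.add('s').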

라이센스 : CC-BY-SA ~와 함께 속성

제휴하지 않습니다 StackOverflow