Source code for ksx1026.normalization

"""
========================================================================
 Hangul normalization functions defined in KS X 1026-1
========================================================================
.. moduleauthor:: Wonsup Yoon <pusnow@me.com>


Reference
============

 * http://www.unicode.org/L2/L2008/08225-n3422.pdf

 """
from __future__ import unicode_literals
from .constants import SBase, LBase, VBase, TBase
from .constants import LCount, VCount, TCount, NCount, SCount
from .constants import CPJAMO, HWJAMO, PCJAMO
from . import uchar
import six


def decomposeHangul(S):
    """
    Split a Wanseong Modern Hangul Syllable Block into its Johab form.

    The offset of ``S`` from ``SBase`` is broken down arithmetically into a
    leading consonant, a vowel and, when the trailing index is non-zero, a
    trailing consonant.

    :param char S: a single character (it is passed to ``ord``). When its
        offset from ``SBase`` is outside ``[0, SCount)`` it is returned
        unchanged.
    :return: a two- or three-character string, or ``S`` itself.
    """
    offset = ord(S) - SBase
    if not (0 <= offset < SCount):
        # Not inside the syllable range: hand the input back untouched.
        return S

    jamo = [
        six.unichr(LBase + offset // NCount),
        six.unichr(VBase + (offset % NCount) // TCount),
    ]
    # A trailing index of zero means the syllable has no final consonant.
    tail = offset % TCount
    if tail != 0:
        jamo.append(six.unichr(TBase + tail))
    return "".join(jamo)
def decomposeHangulStr(source):
    """
    Return a Johab Modern Hangul Syllable String for the given Wanseong
    Modern Hangul Syllable String.

    Every character of ``source`` is passed through :func:`decomposeHangul`
    on its own and the pieces are concatenated in order.

    :param string source: unicode string to decompose.
    :return: the concatenation of the per-character results.
    """
    return "".join(decomposeHangul(ch) for ch in source)
def composeHangul(source):
    """
    Return a Wanseong Modern Hangul Syllable Block for the given Johab
    Modern Hangul Syllable Block.

    Even when a portion of an Old Hangul Syllable Block is a Modern Hangul
    Syllable Block, unlike UAX #15, that portion is not transformed to a
    Wanseong Modern Hangul Syllable Block: an LV syllable that turns out to
    be followed by an old jongseong is split back into its L and V letters.

    :param string source: unicode string.
    :return: the composed unicode string (``""`` for empty input).
    """
    if len(source) == 0:
        return ""

    # Invariant: ``last`` is always the final element of ``result``.
    last = source[0]
    result = [last]
    for ch in source[1:]:
        # 1. L followed by V: merge into an LV syllable.
        LIndex = ord(last) - LBase
        if 0 <= LIndex < LCount:
            VIndex = ord(ch) - VBase
            if 0 <= VIndex < VCount:
                last = six.unichr(SBase + (LIndex * VCount + VIndex) * TCount)
                result[-1] = last
                continue  # ch has been absorbed

        # 2. LV syllable (no trailing consonant yet) followed by something.
        SIndex = ord(last) - SBase
        if 0 <= SIndex < SCount and SIndex % TCount == 0:
            TIndex = ord(ch) - TBase
            if 0 < TIndex < TCount:
                # Modern T: merge into an LVT syllable.
                last = six.unichr(ord(last) + TIndex)
                result[-1] = last
                continue  # ch has been absorbed
            if uchar.isOldJongseong(ch):
                # Old T: undo the LV composition and emit L, V, T separately.
                result[-1] = six.unichr(LBase + SIndex // NCount)
                result.append(six.unichr(VBase + (SIndex % NCount) // TCount))
                result.append(ch)
                # Keep the invariant; otherwise the next iteration would
                # still see the LV syllable and overwrite ``ch`` in result.
                last = ch
                continue

        # Nothing to combine: copy the character through.
        last = ch
        result.append(ch)
    return "".join(result)
def recomposeHangul(source):
    """
    Restore the L V T format of Old Hangul Syllable Blocks after UAX #15
    normalization.

    If one uses a UAX #15 algorithm instead of :func:`composeHangul` for
    normalization, an Old Hangul Syllable Block can end up as a Wanseong
    Modern Hangul Syllable Block followed by Johab Hangul Letter(s). This
    function takes a string in Normalization Form NFC or NFKC and, wherever
    an LV syllable is directly followed by an old jongseong, replaces the
    syllable with its separate L and V letters.

    :param string source: unicode string.
    :return: the recomposed unicode string (``""`` for empty input).
    """
    if len(source) == 0:
        return ""

    # Invariant: ``last`` is always the final element of ``result``.
    last = source[0]
    result = [last]
    for ch in source[1:]:
        # Check to see if two consecutive characters are a Wanseong Modern
        # Hangul Syllable Block (without a final) and a Syllable-Final Letter.
        SIndex = ord(last) - SBase
        if 0 <= SIndex < SCount and SIndex % TCount == 0:
            if uchar.isOldJongseong(ch):
                # Replace the LV syllable by L, V and keep the old T after it.
                result[-1] = six.unichr(LBase + SIndex // NCount)
                result.append(six.unichr(VBase + (SIndex % NCount) // TCount))
                result.append(ch)
                # Keep the invariant; otherwise a second old jongseong would
                # be matched against the stale LV syllable and clobber ``ch``.
                last = ch
                continue

        last = ch
        result.append(ch)
    return "".join(result)
def normalizeJamoKDKC(source):
    """
    Normalizing Compatibility/Halfwidth Hangul Letters and Hangul-embedded
    symbols (NormalizeJamoKDKC).

    Characters matched by one of the four branches below are mapped through
    the ``CPJAMO``, ``PCJAMO`` or ``HWJAMO`` table and then padded with
    filler characters according to which jamo class the mapped letter falls
    in. All other characters are copied through unchanged.

    :param string source: unicode string.
    :return: the normalized unicode string (``""`` for empty input).
    """
    # Code point ranges handled via the PCJAMO table.
    PHBase, PHEnd = 0x3200, 0x320D  # wrapped in U+0028 / U+0029 on output
    CHBase, CHEnd = 0x3260, 0x326D  # emitted without any wrapping

    if len(source) == 0:
        return ""

    pieces = []
    for ch in source:
        closing = None  # character to emit after the jamo, if any
        if uchar.isCompatibilityLetter(ch):
            jamo = six.unichr(CPJAMO[ord(ch) - 0x3131])
        elif PHBase <= ord(ch) <= PHEnd:
            pieces.append('\u0028')
            jamo = six.unichr(PCJAMO[ord(ch) - PHBase])
            closing = '\u0029'
        elif CHBase <= ord(ch) <= CHEnd:
            jamo = six.unichr(PCJAMO[ord(ch) - CHBase])
        elif uchar.isHalfwidthLetter(ch):
            jamo = six.unichr(HWJAMO[ord(ch) - 0xFFA0])
        else:
            # Not a character this function rewrites.
            pieces.append(ch)
            continue

        # Pad the lone letter with fillers: U+115F stands in for a missing
        # choseong, U+1160 for a missing jungseong. A mapped character that
        # is in none of the three classes produces no output here.
        if uchar.isChoseongJamo(jamo):
            pieces.extend((jamo, '\u1160'))
        elif uchar.isJungseongJamo(jamo):
            pieces.extend(('\u115F', jamo))
        elif uchar.isJongseongJamo(jamo):
            pieces.extend(('\u115F', '\u1160', jamo))

        if closing is not None:
            pieces.append(closing)
    return "".join(pieces)