Source code for ksx1026.sorting

"""
========================================================================
 Hangul sorting functions defined in KS X 1026-1
========================================================================
.. moduleauthor:: Wonsup Yoon <pusnow@me.com>


Reference
============

 * http://www.unicode.org/L2/L2008/08225-n3422.pdf

 """
from __future__ import unicode_literals
from . import uchar
from .constants import INDEX1100, INDEXA960, INDEXD7B0, INDEXD7CB
from .constants import HWJAMO, CPJAMO, PACHAR, CLCHAR
from .normalization import decomposeHangul
import six


[docs]def getHangulWeightLVT(L, V, T, _type=0): """ for the Syllable-Initial, Syllable-Peak and Syllable-Final Letters, determine a weight. _type: 0 is assigned to a Johab Hangul Syllable Block or Wanseong Hangul Syllable Block. 1 is assigned when there is only a Syllable-Final Letter. 2 is assigned to a Halfwidth Hangul Letter. 3 is assigned to a Hangul Compatibility Letter. 4 is assigned to a Parenthesized Hangul Letter/Syllable Block. 5 is assigned to a Circled Hangul Letter/Syllable Block. :param char L: Single character string :param char V: Single character string :param char T: Single character string :param int _type: inteager (0, 1, 2, 3, 4, 5) """ weight = 0 LW = 0 VW = 0 TW = 0 if uchar.isChoseongJamo(L): if uchar.isJungseongJamo(V): if T is None or uchar.isJongseongJamo(T): if ord(L) < 0x1200: LW = INDEX1100[ord(L) - 0x1100] else: LW = INDEXA960[ord(L) - 0xA960] if ord(V) < 0x1200: VW = INDEX1100[ord(V) - 0x1100] else: VW = INDEXD7B0[ord(V) - 0xD7B0] if T is not None: if ord(V) < 0x1200: TW = INDEX1100[ord(T) - 0x1100] else: TW = INDEXD7CB[ord(T) - 0xD7CB] if ord(L) == 0x115F and \ ord(V) == 0x1160 and \ T is not None: weight = (TW << 24) + _type else: weight = (LW << 24) + (VW << 16) + (TW << 8) + _type return weight
[docs]def getHangulWeight(hc): """ determine a weight for a Wanseong Hangul Syllable Block, a Hangul Letter or Hangul-embedded Symbol :param char hc: Single character string """ _type = 0 index = ord(hc) weight = 0 L = six.unichr(0x115F) V = six.unichr(0x1160) T = None """ _type: 0 is assigned to a Johab Hangul Syllable Block or Wanseong Hangul Syllable Block. 1 is assigned when there is only a Syllable-Final Letter. 2 is assigned to a Halfwidth Hangul Letter. 3 is assigned to a Hangul Compatibility Letter. 4 is assigned to a Parenthesized Hangul Letter/Syllable Block. 5 is assigned to a Circled Hangul Letter/Syllable Block. """ if uchar.isJongseongJamo(hc): _type = 1 T = hc elif uchar.isHalfwidthLetter(hc): _type = 2 index = HWJAMO[index - 0xFFA0] if index == ord(hc): raise elif uchar.isCompatibilityLetter(hc): _type = 3 index = CPJAMO[index - 0x3131] elif uchar.isParenthesizedLetter(hc): _type = 4 index = PACHAR[index - 0x3200] if index == ord(hc): raise elif uchar.isCircledLetter(hc): _type = 5 index = CLCHAR[index - 0x3260] if index == ord(hc): raise index = six.unichr(index) if uchar.isChoseongJamo(index): L = index elif uchar.isJungseongJamo(index): V = index elif uchar.isJongseongJamo(index): T = index elif uchar.isPrecomposedSyllable(index): SIndex = decomposeHangul(index) L = SIndex[0] V = SIndex[1] if len(SIndex) == 3: T = SIndex[2] else: T = None else: return 0 weight = getHangulWeightLVT(L, V, T, _type) return weight
[docs]def sortKey(text, hangul_first=True): """ key function for sorted :param string text: A string for weight :param bool hangul_first: Boolean """ weights = [] itr = enumerate(text) for i, ch in itr: if uchar.isChoseongJamo(ch): L = ch V = six.unichr(0x1160) T = None if i + 1 < len(text) and uchar.isJungseongJamo(text[i + 1]): _, V = next(itr) if i + 2 < len(text) and uchar.isJongseongJamo(text[i + 2]): _, T = next(itr) _type = 0 weight = getHangulWeightLVT(L, V, T, _type) _type = weight & 3 weight = weight >> 2 weight = weight << 1 weight = weight | _type if not hangul_first: weight += 1 << 31 elif uchar.isJongseongJamo(ch): L = six.unichr(0x115F) V = ch if i + 1 < len(text) and uchar.isJongseongJamo(text[i + 1]): _, T = next(itr) _type = 0 weight = getHangulWeightLVT(L, V, T, _type) _type = weight & 3 weight = weight >> 2 weight = weight << 1 weight = weight | _type if not hangul_first: weight += 1 << 31 elif uchar.isHangulLetter(ch): weight = getHangulWeight(ch) _type = weight & 3 weight = weight >> 2 weight = weight << 1 weight = weight | _type if not hangul_first: weight += 1 << 31 else: weight = ord(ch) if hangul_first: weight += 1 << 31 weights.append(weight) return weights