'l10n-support/pology/l10n/sr/hook'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    l10n-support/pology/l10n/sr/hook
From:       Chusslove Illich <caslav.ilic () gmx ! net>
Date:       2009-11-09 20:06:31
Message-ID: 1257797191.171371.25874.nullmailer () svn ! kde ! org
[Download RAW message or body]

SVN commit 1046831 by ilic:

Functions for hybridizing and splitting hybridizations.

 M  +209 -0    wconv.py  


--- trunk/l10n-support/pology/l10n/sr/hook/wconv.py #1046830:1046831
@@ -117,6 +117,7 @@
 from pology.misc.report import warning
 from pology.misc.resolve import resolve_alternatives_simple
 from pology.misc.resolve import resolve_alternatives
+from pology.misc.diff import word_diff
 
 
 # Transliteration table Serbian Cyrillic->Latin.
@@ -236,38 +237,127 @@
                                        outfilter=ctol)
 
 
+def hctocl (htext):
+    """
+    Split hybrid Cyrillic-Latin text into its two clean script variants.
+
+    @param htext: hybrid text
+    @type htext: string
+
+    @returns: clean Cyrillic text and clean Latin text, in that order
+    @rtype: (string, string)
+    """
+    textc, textl = hctoc(htext), hctol(htext)
+    return textc, textl
+
+
+def cltoh (textc, textl, delims=u"/|¦", full=False):
+    """
+    Construct hybrid Cyrillic text out of clean Cyrillic and Latin texts.
+
+    Hybridization is performed by inserting alternatives directives
+    for parts which cannot be resolved by direct transliteration.
+    If C{full} is set to C{True}, complete texts are unconditionally
+    wrapped into single alternatives directive.
+
+    @param textc: Cyrillic text
+    @type textc: string
+    @param textl: Latin text
+    @type textl: string
+    @param delims: possible delimiter characters
+    @type delims: string
+    @param full: whether to wrap full texts as single alternatives directive
+    @type full: bool
+
+    @returns: hybrid Cyrillic text
+    @rtype: string
+    """
+
+    if full:
+        return _shyb_althead + _delimit([textc, textl], delims)
+
+    # Diff the transliterated Cyrillic text against the given Latin text.
+    wdiff = word_diff(ctol(textc), textl)
+    # Pad the Cyrillic text (see _padc); slices below use Latin-side lengths.
+    textc = _padc(textc)
+    segs = []
+    i = 0
+    ic = 0
+    while i < len(wdiff):
+        tag, seg = wdiff[i]
+        segc = textc[ic:ic + len(seg)]
+        if tag == " ":
+            segs.append(segc)
+        else:
+            # The next diff entry is taken as the Latin counterpart.
+            segl = wdiff[i + 1][1]
+            i += 1
+            segs.append(_shyb_althead + _delimit([segc, segl], delims))
+        ic += len(seg)
+        i += 1
+    return _unpadc("".join(segs))
+
+
+_padc_chr = u"\x04" # padding character inserted by _padc()
+_padc_alphas = tuple(u"љњџЉЊЏ") # letters which _padc() pads
+
+def _padc (text):
+    # Insert padding before each letter in _padc_alphas; undone by _unpadc().
+    for letter in _padc_alphas:
+        text = text.replace(letter, _padc_chr + letter)
+    return text
+
+def _unpadc (text):
+    # Undo _padc(): drop the padding in front of each letter in _padc_alphas.
+    for letter in _padc_alphas:
+        text = text.replace(_padc_chr + letter, letter)
+    return text
+
+
 # Jat-reflex map Cyrillic->Cyrillic and Latin->Latin.
 _reflex_map = {
     # - basic
     u"ије": u"е",
+    u"Ије": u"Е",
     u"ИЈЕ": u"Е",
     u"иј": u"е",
+    u"Иј": u"Е",
     u"ИЈ": u"Е",
     u"је": u"е",
+    u"Је": u"Е",
     u"ЈЕ": u"Е",
     u"ље": u"ле",
+    u"Ље": u"Ле",
     u"ЉЕ": u"ЛЕ",
     u"ње": u"не",
+    u"Ње": u"Не",
     u"ЊЕ": u"НЕ",
     u"ио": u"ео",
+    u"Ио": u"Ео",
     u"ИО": u"ЕО",
     u"иљ": u"ел",
+    u"Иљ": u"Ел",
     u"ИЉ": u"ЕЛ",
 
     # - special cases
     u"лије": u"ли",
+    u"Лије": u"Ли",
     u"ЛИЈЕ": u"ЛИ",
     u"лијен": u"лењ",
     u"Лијен": u"Лењ",
     u"ЛИЈЕН": u"ЛЕЊ",
     u"мија": u"меја",
+    u"Мија": u"Меја",
     u"МИЈА": u"МЕЈА",
     u"мије": u"мејe",
+    u"Мије": u"Мејe",
     u"МИЈЕ": u"МЕЈE",
     u"није": u"ни",
+    u"Није": u"Ни",
     u"НИЈЕ": u"НИ",
 }
 _reflex_map.update(map(lambda x: map(ctol, x), _reflex_map.items()))
+_max_reflex_btrk = 1 # max. characters eitoh() backtracks to match special cases
 _max_reflex_len = max(map(lambda x: len(x), _reflex_map.keys()))
 
 _reflex_mark = u"›"
@@ -375,3 +465,122 @@
 
     return spans
 
+
+def hitoei (htext):
+    """
+    Split hybrid Ijekavian-Ekavian text into its two clean dialect variants.
+
+    @param htext: hybrid text
+    @type htext: string
+
+    @returns: clean Ekavian text and clean Ijekavian text, in that order
+    @rtype: (string, string)
+    """
+    texte, texti = hitoe(htext), hitoi(htext)
+    return texte, texti
+
+
+def eitoh (texte, texti, delims=u"/|¦"):
+    """
+    Construct hybrid Ijekavian text out of clean Ekavian and Ijekavian texts.
+
+    Hybridization is performed by inserting reflex marks where possible,
+    and alternatives directives by dialect otherwise.
+    Both input texts should be in same script, Cyrillic or Latin.
+
+    @param texte: Ekavian text
+    @type texte: string
+    @param texti: Ijekavian text
+    @type texti: string
+    @param delims: possible delimiter characters
+    @type delims: string
+
+    @returns: hybrid Ijekavian text
+    @rtype: string
+    """
+
+    lene = len(texte)
+    leni = len(texti)
+    ie = 0 # current position in Ekavian text
+    iep = 0 # end of Ekavian text already put into segments
+    ii = 0 # current position in Ijekavian text
+    iip = 0 # end of Ijekavian text already accounted for
+    segs = []
+    while ie < lene and ii < leni:
+        while ie < lene and ii < leni and texte[ie] == texti[ii]: # common run
+            ie += 1
+            ii += 1
+        for btrk in range(_max_reflex_btrk, -1, -1): # longest backtrack first
+            ieb = ie - btrk
+            iib = ii - btrk
+            if ieb < iep or iib < iip: # would reach into emitted text
+                continue
+            maxrlen = _max_reflex_len - _max_reflex_btrk + btrk
+            frme = None
+            for rlen in range(maxrlen, 0, -1): # longest Ijekavian form first
+                frmi = texti[iib:iib + rlen]
+                frme = _reflex_map.get(frmi)
+                if frme is not None and frme == texte[ieb:ieb + len(frme)]:
+                    break
+            if frme is not None:
+                break
+        if frme is not None:
+            segs.append(texte[iep:ieb])
+            segs.append(_reflex_mark + frmi)
+            iep = ieb + len(frme)
+            iip = iib + len(frmi)
+        else:
+            segs.append(texte[iep:ie])
+            frme, frmi = texte[ie:], texti[ii:] # whole tails if one is empty
+            if frme and frmi:
+                wdiff = word_diff(frme, frmi)
+                frme, frmi = wdiff[0][1], wdiff[1][1] # first differing pair
+            if frme or frmi:
+                segs.append(_dhyb_althead + _delimit([frme, frmi], delims))
+            iep = ie + len(frme)
+            iip = ii + len(frmi)
+        ie, ii = iep, iip
+    if ie < lene or ii < leni:
+        # One text ended right after a segment: keep the other text's tail.
+        segs.append(_dhyb_althead + _delimit([texte[ie:], texti[ii:]], delims))
+    return "".join(segs)
+
+
+def hictoall (htext):
+    """
+    Split hybrid Ijekavian-Ekavian Cyrillic-Latin text into
+    its four clean dialect and script variants.
+
+    @param htext: hybrid text
+    @type htext: string
+
+    @returns: Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic,
+        and Ijekavian Latin text
+    @rtype: (string, string, string, string)
+    """
+
+    # Resolve by script first, then by dialect on each single-script text.
+    byscript = (hctoc(htext), hctol(htext))
+    ekavian = tuple(hitoe(text) for text in byscript)
+    return ekavian + tuple(hitoi(text) for text in byscript)
+
+
+def _delimit (alts, delims):
+    """
+    Wrap alternatives into a delimited string, using the first character
+    of C{delims} which does not appear in any of the alternatives.
+    Raises C{StandardError} if every delimiter is taken.
+    """
+
+    for delim in delims:
+        for alt in alts:
+            if delim in alt:
+                break
+        else:
+            # No alternative contains this delimiter, so it is usable.
+            return delim + delim.join(alts) + delim
+
+    raise StandardError("No delimiter from '%s' can be used for "
+                        "alternatives directive on: %s."
+                        % (delims, " ".join(["{%s}" % x for x in alts])))
+
[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic