Source code for pythainlp.util.spell_words

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
import re
from typing import List
from pythainlp import (
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_follow_vowels,
    thai_lead_vowels,
    thai_letters,
    thai_tonemarks,
)
from pythainlp.tokenize import subword_tokenize, Tokenizer


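# Vowel-form templates used to build the spelling rules below: "-" (in _r1)
# and "–" (in _r2) mark the position of the initial consonant, and ":"
# (in _r2) marks where a tone mark may appear.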
_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]
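# Map each Thai tone mark character to its spoken name,
# e.g. mai ek ("่") -> "ไม้เอก".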
tonemarks = {
    i: "ไม้" + j
    for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])
}

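# Regular expressions built from the templates above:
# rule1: _r1 forms, capturing the consonant (with an optional tone mark);
# rule2: _r2 forms with the tone-mark slot dropped;
# rule3: _r2 forms, capturing the consonant and the tone mark separately.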
rule1 = [
    i.replace("-", f"([{thai_letters}]([{thai_tonemarks}])?)") for i in _r1
]
rule2 = [i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2]
rule3 = [
    i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])")
    for i in _r2
]
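# Spelled-out names of the vowel forms as they appear after _clean():
# compound forms from _r1/_r2 map to themselves (with "อ" standing in for
# the consonant), and single lead/follow/above/below vowels are paired
# with a placeholder "อ".  dict_vowel_ex keeps only the compound forms.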
dict_vowel_ex = {}
for i in _r1 + _r2:
    dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
        i.replace("-", "อ").replace(":", "").replace("–", "อ")
    )
dict_vowel = {}
for i in _r1 + _r2:
    dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
        i.replace("-", "อ").replace(":", "").replace("–", "อ")
    )
for i in thai_lead_vowels:
    dict_vowel[i] = i + "อ"
for i in thai_follow_vowels:
    dict_vowel[i] = "อ" + i
for i in thai_above_vowels:
    dict_vowel[i] = "อ" + i
for i in thai_below_vowels:
    dict_vowel[i] = "อ" + i

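# Maximal-matching tokenizer over the spelled-out vowel forms and the Thai
# consonants; spell_syllable() uses it to split a cleaned syllable into its
# consonant and vowel components.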
_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm")


def _clean(w):
    # Rewrite a syllable so that the initial consonant comes first and its
    # vowel pattern follows in spelled-out form (with "อ" as the consonant
    # placeholder), ready to be split by _cut.
    if bool(re.match("|".join(rule3), w)):
        # Compound vowel with a tone mark: keep the consonant, append the
        # spelled-out vowel form, then re-attach the tone mark.
        for r in rule3:
            if bool(re.match(r, w)):
                w = re.sub(r, "\\1==\\2==", w)
                temp = w.split("==")
                w = (
                    temp[0]
                    + r.replace(f"([{thai_letters}])", "อ").replace(
                        f"([{thai_tonemarks}])", ""
                    )
                    + temp[1]
                )
    elif bool(re.match("|".join(rule2), w)):
        # Compound vowel without a tone mark.
        for r in rule2:
            if bool(re.match(r, w)):
                w = re.sub(r, "\\1", w) + r.replace(f"([{thai_letters}])", "อ")
    elif bool(re.match("|".join(rule1), w)):
        # Simple surrounding vowel (with an optional tone mark).
        for r in rule1:
            if bool(re.match(r, w)):
                w = re.sub(r, "\\1", w) + r.replace(
                    f"([{thai_letters}]([{thai_tonemarks}])?)", "อ"
                )
    return w


def spell_syllable(text: str) -> List[str]:
    """
    Spell out a Thai syllable in Thai word distribution form.

    :param str text: Thai syllable only
    :return: list of spelled-out parts of the syllable
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util.spell_words import spell_syllable

        print(spell_syllable("แมว"))
        # output: ['มอ', 'วอ', 'แอ', 'แมว']
    """
    tokens = _cut.word_tokenize(_clean(text))
    c_only = [tok + "อ" for tok in tokens if tok in set(thai_consonants)]
    v_only = [dict_vowel[tok] for tok in tokens if tok in set(dict_vowel)]
    t_only = [tonemarks[tok] for tok in tokens if tok in set(tonemarks.keys())]
    return c_only + v_only + t_only + [text]


def spell_word(text: str) -> List[str]:
    """
    Spell out a Thai word in Thai word distribution form.

    :param str text: Thai word only
    :return: list of spelled-out parts of the word
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util.spell_words import spell_word

        print(spell_word("คนดี"))
        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
    """
    spellouts = []
    tokens = subword_tokenize(text, engine="han_solo")
    for tok in tokens:
        spellouts.extend(spell_syllable(tok))
    if len(tokens) > 1:
        spellouts.append(text)
    return spellouts
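

# A minimal usage sketch (not part of the module itself); the expected
# outputs are the ones shown in the docstrings above.
if __name__ == "__main__":
    print(spell_syllable("แมว"))  # ['มอ', 'วอ', 'แอ', 'แมว']
    print(spell_word("คนดี"))  # ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']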