Source code for pythainlp.khavee.core

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901

from typing import List, Union

from pythainlp.tokenize import subword_tokenize
from pythainlp.util import remove_tonemark, sound_syllable


class KhaveeVerifier:
[docs]    def __init__(self):
        """
        KhaveeVerifier: Thai Poetry verifier
        """

[docs]    def check_sara(self, word: str) -> str:
        """
        Check the vowels in the Thai word.

        :param str word: Thai word
        :return: vowel name of the word
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_sara("เริง"))
            # output: 'เออ'
        """
        sara = []
        countoa = 0

        # In case of การันย์
        if "์" in word[-1]:
            word = word[:-2]

        # In case of สระเดี่ยว
        for i in word:
            if i in ("ะ", "ั"):
                sara.append("อะ")
            elif i == "ิ":
                sara.append("อิ")
            elif i == "ุ":
                sara.append("อุ")
            elif i == "ึ":
                sara.append("อึ")
            elif i == "ี":
                sara.append("อี")
            elif i == "ู":
                sara.append("อู")
            elif i == "ื":
                sara.append("อือ")
            elif i == "เ":
                sara.append("เอ")
            elif i == "แ":
                sara.append("แอ")
            elif i == "า":
                sara.append("อา")
            elif i == "โ":
                sara.append("โอ")
            elif i == "ำ":
                sara.append("อำ")
            elif i == "อ":
                countoa += 1
                sara.append("ออ")
            elif i == "ั" and "ว" in word:
                sara.append("อัว")
            elif i in ("ไ", "ใ"):
                sara.append("ไอ")
            elif i == "็":
                sara.append("ออ")
            elif "รร" in word:
                if self.check_marttra(word) == "กม":
                    sara.append("อำ")
                else:
                    sara.append("อะ")

        # In case of ออ
        if countoa == 1 and "อ" in word[-1] and "เ" not in word:
            sara.remove("ออ")

        # In case of เอ เอ
        countA = 0
        for i in sara:
            if i == "เอ":
                countA = countA + 1
            if countA > 1:
                sara.remove("เอ")
                sara.remove("เอ")
                sara.append("แ")

        # In case of สระประสม
        if "เอ" in sara and "อะ" in sara:
            sara.remove("เอ")
            sara.remove("อะ")
            sara.append("เอะ")
        elif "แอ" in sara and "อะ" in sara:
            sara.remove("แอ")
            sara.remove("อะ")
            sara.append("แอะ")

        if "เอะ" in sara and "ออ" in sara:
            sara.remove("เอะ")
            sara.remove("ออ")
            sara.append("เออะ")
        elif "เอ" in sara and "อิ" in sara:
            sara.remove("เอ")
            sara.remove("อิ")
            sara.append("เออ")
        elif "เอ" in sara and "ออ" in sara and "อ" in word[-1]:
            sara.remove("เอ")
            sara.remove("ออ")
            sara.append("เออ")
        elif "โอ" in sara and "อะ" in sara:
            sara.remove("โอ")
            sara.remove("อะ")
            sara.append("โอะ")
        elif "เอ" in sara and "อี" in sara:
            sara.remove("เอ")
            sara.remove("อี")
            sara.append("เอีย")
        elif "เอ" in sara and "อือ" in sara:
            sara.remove("เอ")
            sara.remove("อือ")
            sara.append("อัว")
        elif "เอ" in sara and "อา" in sara:
            sara.remove("เอ")
            sara.remove("อา")
            sara.append("เอา")
        elif "เ" in word and "า" in word and "ะ" in word:
            sara = []
            sara.append("เอาะ")

        if "อือ" in sara and "เออ" in sara:
            sara.remove("เออ")
            sara.remove("อือ")
            sara.append("เอือ")
        elif "ออ" in sara and len(sara) > 1:
            sara.remove("ออ")
        elif "ว" in word and len(sara) == 0:
            sara.append("อัว")

        if "ั" in word and self.check_marttra(word) == "กา":
            sara = []
            sara.append("ไอ")

        # In case of อ
        if word == "เออะ":
            sara = []
            sara.append("เออะ")
        elif word == "เออ":
            sara = []
            sara.append("เออ")
        elif word == "เอ":
            sara = []
            sara.append("เอ")
        elif word == "เอะ":
            sara = []
            sara.append("เอะ")
        elif word == "เอา":
            sara = []
            sara.append("เอา")
        elif word == "เอาะ":
            sara = []
            sara.append("เอาะ")

        if "ฤา" in word or "ฦา" in word:
            sara = []
            sara.append("อือ")
        elif "ฤ" in word or "ฦ" in word:
            sara = []
            sara.append("อึ")

        # In case of กน
        if not sara and len(word) == 2:
            if word[-1] != "ร":
                sara.append("โอะ")
            else:
                sara.append("ออ")
        elif not sara and len(word) == 3:
            sara.append("ออ")

        # In case of บ่
        if word == "บ่":
            sara = []
            sara.append("ออ")

        if "ํ" in word:
            sara = []
            sara.append("อำ")

        if "เ" in word and "ื" in word and "อ" in word:
            sara = []
            sara.append("เอือ")

        if not sara:
            return "Can't find Sara in this word"

        return sara[0]

[docs]    def check_marttra(self, word: str) -> str:
        """
        Check the Thai spelling Section in the Thai word.

        :param str word: Thai word
        :return: name of spelling Section of the word.
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_marttra('สาว'))
            # output: 'เกอว'
        """
        if word[-1] == "ร" and word[-2] in ["ต", "ท"]:
            word = word[:-1]
        word = self.handle_karun_sound_silence(word)
        word = remove_tonemark(word)
        if (
            "ำ" in word
            or ("ํ" in word and "า" in word)
            or "ไ" in word
            or "ใ" in word
        ):
            return "กา"
        elif (
            word[-1] in ["า", "ะ", "ิ", "ี", "ุ", "ู", "อ"]
            or ("ี" in word and "ย" in word[-1])
            or ("ื" in word and "อ" in word[-1])
        ):
            return "กา"
        elif word[-1] in ["ง"]:
            return "กง"
        elif word[-1] in ["ม"]:
            return "กม"
        elif word[-1] in ["ย"]:
            if "ั" in word:
                return "กา"
            else:
                return "เกย"
        elif word[-1] in ["ว"]:
            return "เกอว"
        elif word[-1] in ["ก", "ข", "ค", "ฆ"]:
            return "กก"
        elif word[-1] in [
            "จ",
            "ช",
            "ซ",
            "ฎ",
            "ฏ",
            "ฐ",
            "ฑ",
            "ฒ",
            "ด",
            "ต",
            "ถ",
            "ท",
            "ธ",
            "ศ",
            "ษ",
            "ส",
        ]:
            return "กด"
        elif word[-1] in ["ญ", ", ณ", "น", "ร", "ล", "ฬ"]:
            return "กน"
        elif word[-1] in ["บ", "ป", "พ", "ฟ", "ภ"]:
            return "กบ"
        else:
            if "็" in word:
                return "กา"
            else:
                return "Cant find Marttra in this word"

[docs]    def is_sumpus(self, word1: str, word2: str) -> bool:
        """
        Check the rhyme between two words.

        :param str word1: Thai word
        :param str word2: Thai word
        :return: boolean
        :rtype: bool

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.is_sumpus('สรร', 'อัน'))
            # output: True

            print(kv.is_sumpus('สรร', 'แมว'))
            # output: False
        """
        marttra1 = self.check_marttra(word1)
        marttra2 = self.check_marttra(word2)
        sara1 = self.check_sara(word1)
        sara2 = self.check_sara(word2)
        if sara1 == "อะ" and marttra1 == "เกย":
            sara1 = "ไอ"
            marttra1 = "กา"
        elif sara2 == "อะ" and marttra2 == "เกย":
            sara2 = "ไอ"
            marttra2 = "กา"
        if sara1 == "อำ" and marttra1 == "กม":
            sara1 = "อำ"
            marttra1 = "กา"
        elif sara2 == "อำ" and marttra2 == "กม":
            sara2 = "อำ"
            marttra2 = "กา"
        return bool(marttra1 == marttra2 and sara1 == sara2)

[docs]    def check_karu_lahu(self, text):
        if (
            self.check_marttra(text) != "กา"
            or (
                self.check_marttra(text) == "กา"
                and self.check_sara(text)
                in [
                    "อา",
                    "อี",
                    "อือ",
                    "อู",
                    "เอ",
                    "แอ",
                    "โอ",
                    "ออ",
                    "เออ",
                    "เอีย",
                    "เอือ",
                    "อัว",
                ]
            )
            or self.check_sara(text) in ["อำ", "ไอ", "เอา"]
        ) and text not in ["บ่", "ณ", "ธ", "ก็"]:
            return "karu"
        else:
            return "lahu"

[docs]    def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]:
        """
        Check the suitability of the poem according to Thai principles.

        :param str text: Thai poem
        :param int k_type: type of Thai poem
        :return: the check results of the suitability of the poem according to Thai principles.
        :rtype: Union[List[str], str]

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง \
                มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: The poem is correct according to the principle.

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง \
                เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: [
                "Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2",
                "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"
            ]
        """
        if k_type == 8:
            try:
                error = []
                list_sumpus_sent1 = []
                list_sumpus_sent2h = []
                list_sumpus_sent2l = []
                list_sumpus_sent3 = []
                list_sumpus_sent4 = []
                for i, sent in enumerate(text.split()):
                    sub_sent = subword_tokenize(sent, engine="dict")
                    if len(sub_sent) > 10:
                        error.append(
                            "In sentence "
                            + str(i + 2)
                            + ", there are more than 10 words. "
                            + str(sub_sent)
                        )
                    if (i + 1) % 4 == 1:
                        list_sumpus_sent1.append(sub_sent[-1])
                    elif (i + 1) % 4 == 2:
                        list_sumpus_sent2h.append(
                            [
                                sub_sent[1],
                                sub_sent[2],
                                sub_sent[3],
                                sub_sent[4],
                            ]
                        )
                        list_sumpus_sent2l.append(sub_sent[-1])
                    elif (i + 1) % 4 == 3:
                        list_sumpus_sent3.append(sub_sent[-1])
                    elif (i + 1) % 4 == 0:
                        list_sumpus_sent4.append(sub_sent[-1])
                if (
                    len(list_sumpus_sent1) != len(list_sumpus_sent2h)
                    or len(list_sumpus_sent2h) != len(list_sumpus_sent2l)
                    or len(list_sumpus_sent2l) != len(list_sumpus_sent3)
                    or len(list_sumpus_sent3) != len(list_sumpus_sent4)
                    or len(list_sumpus_sent4) != len(list_sumpus_sent1)
                ):
                    return "The poem does not have 4 complete sentences."
                else:
                    for i in range(len(list_sumpus_sent1)):
                        countwrong = 0
                        for j in list_sumpus_sent2h[i]:
                            if (
                                self.is_sumpus(list_sumpus_sent1[i], j)
                                is False
                            ):
                                countwrong += 1
                        if countwrong > 3:
                            error.append(
                                "Can't find rhyme between paragraphs "
                                + str(
                                    (
                                        list_sumpus_sent1[i],
                                        list_sumpus_sent2h[i],
                                    )
                                )
                                + " in paragraph "
                                + str(i + 1)
                            )
                        if (
                            self.is_sumpus(
                                list_sumpus_sent2l[i], list_sumpus_sent3[i]
                            )
                            is False
                        ):
                            error.append(
                                "Can't find rhyme between paragraphs "
                                + str(
                                    (
                                        list_sumpus_sent2l[i],
                                        list_sumpus_sent3[i],
                                    )
                                )
                                + " in paragraph "
                                + str(i + 1)
                            )
                        if i > 0:
                            if (
                                self.is_sumpus(
                                    list_sumpus_sent2l[i],
                                    list_sumpus_sent4[i - 1],
                                )
                                is False
                            ):
                                error.append(
                                    "Can't find rhyme between paragraphs "
                                    + str(
                                        (
                                            list_sumpus_sent2l[i],
                                            list_sumpus_sent4[i - 1],
                                        )
                                    )
                                    + " in paragraph "
                                    + str(i + 1)
                                )
                    if not error:
                        return (
                            "The poem is correct according to the principle."
                        )
                    else:
                        return error
            except:
                return "Something went wrong. Make sure you enter it in the correct form of klon 8."
        elif k_type == 4:
            try:
                error = []
                list_sumpus_sent1 = []
                list_sumpus_sent2h = []
                list_sumpus_sent2l = []
                list_sumpus_sent3 = []
                list_sumpus_sent4 = []
                for i, sent in enumerate(text.split()):
                    sub_sent = subword_tokenize(sent, engine="dict")
                    if len(sub_sent) > 5:
                        error.append(
                            "In sentence "
                            + str(i + 2)
                            + ", there are more than 4 words. "
                            + str(sub_sent)
                        )
                    if (i + 1) % 4 == 1:
                        list_sumpus_sent1.append(sub_sent[-1])
                    elif (i + 1) % 4 == 2:
                        list_sumpus_sent2h.append([sub_sent[1], sub_sent[2]])
                        list_sumpus_sent2l.append(sub_sent[-1])
                    elif (i + 1) % 4 == 3:
                        list_sumpus_sent3.append(sub_sent[-1])
                    elif (i + 1) % 4 == 0:
                        list_sumpus_sent4.append(sub_sent[-1])
                if (
                    len(list_sumpus_sent1) != len(list_sumpus_sent2h)
                    or len(list_sumpus_sent2h) != len(list_sumpus_sent2l)
                    or len(list_sumpus_sent2l) != len(list_sumpus_sent3)
                    or len(list_sumpus_sent3) != len(list_sumpus_sent4)
                    or len(list_sumpus_sent4) != len(list_sumpus_sent1)
                ):
                    return "The poem does not have 4 complete sentences."
                else:
                    for i in range(len(list_sumpus_sent1)):
                        countwrong = 0
                        for j in list_sumpus_sent2h[i]:
                            if (
                                self.is_sumpus(list_sumpus_sent1[i], j)
                                is False
                            ):
                                countwrong += 1
                        if countwrong > 1:
                            error.append(
                                "Can't find rhyme between paragraphs "
                                + str(
                                    (
                                        list_sumpus_sent1[i],
                                        list_sumpus_sent2h[i],
                                    )
                                )
                                + " in paragraph "
                                + str(i + 1)
                            )
                        if (
                            self.is_sumpus(
                                list_sumpus_sent2l[i], list_sumpus_sent3[i]
                            )
                            is False
                        ):
                            error.append(
                                "Can't find rhyme between paragraphs "
                                + str(
                                    (
                                        list_sumpus_sent2l[i],
                                        list_sumpus_sent3[i],
                                    )
                                )
                                + " in paragraph "
                                + str(i + 1)
                            )
                        if i > 0:
                            if (
                                self.is_sumpus(
                                    list_sumpus_sent2l[i],
                                    list_sumpus_sent4[i - 1],
                                )
                                is False
                            ):
                                error.append(
                                    "Can't find rhyme between paragraphs "
                                    + str(
                                        (
                                            list_sumpus_sent2l[i],
                                            list_sumpus_sent4[i - 1],
                                        )
                                    )
                                    + " in paragraph "
                                    + str(i + 1)
                                )
                    if not error:
                        return (
                            "The poem is correct according to the principle."
                        )
                    else:
                        return error
            except:
                return "Something went wrong. Make sure you enter it in the correct form."

        else:
            return "Something went wrong. Make sure you enter it in the correct form."

[docs]    def check_aek_too(
        self, text: Union[List[str], str], dead_syllable_as_aek: bool = False
    ) -> Union[List[bool], List[str], bool, str]:
        """
        Checker of Thai tonal words

        :param Union[List[str], str] text: Thai word or list of Thai words
        :param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek
        :return: the check result if the word is aek or too or False (not both) or list of check results if input is list
        :rtype: Union[List[bool], List[str], bool, str]

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            # การเช็คคำเอกโท
            print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง'))
            # -> False, aek, too
            print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง'])) # ใช้ List ได้เหมือนกัน
            # -> [False, 'aek', 'too']


        """
        if isinstance(text, list):
            return [self.check_aek_too(t, dead_syllable_as_aek) for t in text]

        if not isinstance(text, str):
            raise TypeError("text must be str or iterable list[str]")

        word_characters = [*text]
        if "่" in word_characters and "้" not in word_characters:
            return "aek"
        elif "้" in word_characters and "่" not in word_characters:
            return "too"
        if dead_syllable_as_aek and sound_syllable(text) == "dead":
            return "aek"
        else:
            return False

[docs]    def handle_karun_sound_silence(self, word: str) -> str:
        """
        Handle silent sounds in Thai words using '์' character (Karun)
        by stripping all characters before the 'Karun' character that should be silenced

        :param str text: Thai word
        :return: Thai word with silent words stripped
        :rtype: str
        """
        sound_silenced = word.endswith("์")
        if not sound_silenced:
            return word
        thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"
        locate_silenced = word.rfind("์") - 1
        can_silence_two = word[locate_silenced - 2] in thai_consonants
        cut_off = 2 if can_silence_two else 1
        word = word[: locate_silenced + 1 - cut_off]
        return word