# Source code for pythainlp.khavee.core

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from pythainlp.tokenize import subword_tokenize
from pythainlp.util import sound_syllable
from pythainlp.util import remove_tonemark
class KhaveeVerifier:
    """
    Rule-based verifier for Thai poetry.

    The class is stateless: every method works only on its arguments.
    It can name the vowel (sara) and final-consonant class (marttra) of a
    syllable, decide whether two syllables rhyme, and check a whole poem
    (klon 4 / klon 8) against the rhyme scheme.
    """

    # Sentinel strings returned (not raised) when analysis fails.
    _NO_SARA = "Can't find Sara in this word"
    _NO_MARTTRA = 'Cant find Marttra in this word'

    # Vowel name contributed by a single character of the word.
    _SINGLE_SARA = {
        'ะ': 'อะ',
        'ั': 'อะ',
        'ิ': 'อิ',
        'ุ': 'อุ',
        'ึ': 'อึ',
        'ี': 'อี',
        'ู': 'อู',
        'ื': 'อือ',
        'เ': 'เอ',
        'แ': 'แอ',
        'า': 'อา',
        'โ': 'โอ',
        'ำ': 'อำ',
        'อ': 'ออ',
        'ไ': 'ไอ',
        'ใ': 'ไอ',
        '็': 'ออ',
    }

    # Whole words that are spelled only with vowel letters around 'อ'.
    _WHOLE_WORD_SARA = ('เออะ', 'เออ', 'เอ', 'เอะ', 'เอา', 'เอาะ')

    # Vowel names treated as long for the karu/lahu decision.
    _LONG_SARA = (
        'อา', 'อี', 'อือ', 'อู', 'เอ', 'แอ', 'โอ', 'ออ', 'เออ',
        'เอีย', 'เอือ', 'อัว',
    )

    def __init__(self):
        """
        KhaveeVerifier: Thai Poetry verifier
        """

    def check_sara(self, word: str) -> str:
        """
        Check the vowels in the Thai word.

        :param str word: Thai word
        :return: vowel name of the word, or the message
            "Can't find Sara in this word" when none can be determined
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_sara('เริง'))
            # output: 'เออ'
        """
        # A trailing karun silences the consonant before it: drop both.
        # endswith() keeps an empty word from raising IndexError.
        if word.endswith('์'):
            word = word[:-2]

        # Pass 1: collect the vowel hinted at by each character.
        sara = []
        has_double_ror = 'รร' in word
        ror_sound = None  # resolved lazily, only if a 'รร' word needs it
        for char in word:
            name = self._SINGLE_SARA.get(char)
            if name is not None:
                sara.append(name)
            elif has_double_ror:
                # Every non-vowel character of a 'รร' word contributes the
                # sound of the double ror (อำ before ม, otherwise อะ).
                if ror_sound is None:
                    ror_sound = (
                        'อำ' if self.check_marttra(word) == 'กม' else 'อะ'
                    )
                sara.append(ror_sound)

        # A single final 'อ' (with no leading 'เ') is a consonant, not ออ.
        if word.count('อ') == 1 and word.endswith('อ') and 'เ' not in word:
            sara.remove('ออ')

        # Two 'เ' written side by side stand for 'แ'. Counting first avoids
        # mutating the list while it is being iterated.
        if sara.count('เอ') > 1:
            sara.remove('เอ')
            sara.remove('เอ')
            sara.append('แ')

        # Pass 2: merge simple vowels into compound vowels. The three
        # chains below are independent and their order matters.
        if 'เอ' in sara and 'อะ' in sara:
            sara.remove('เอ')
            sara.remove('อะ')
            sara.append('เอะ')
        elif 'แอ' in sara and 'อะ' in sara:
            sara.remove('แอ')
            sara.remove('อะ')
            sara.append('แอะ')

        if 'เอะ' in sara and 'ออ' in sara:
            sara.remove('เอะ')
            sara.remove('ออ')
            sara.append('เออะ')
        elif 'เอ' in sara and 'อิ' in sara:
            sara.remove('เอ')
            sara.remove('อิ')
            sara.append('เออ')
        elif 'เอ' in sara and 'ออ' in sara and word.endswith('อ'):
            sara.remove('เอ')
            sara.remove('ออ')
            sara.append('เออ')
        elif 'โอ' in sara and 'อะ' in sara:
            sara.remove('โอ')
            sara.remove('อะ')
            sara.append('โอะ')
        elif 'เอ' in sara and 'อี' in sara:
            sara.remove('เอ')
            sara.remove('อี')
            sara.append('เอีย')
        elif 'เอ' in sara and 'อือ' in sara:
            sara.remove('เอ')
            sara.remove('อือ')
            sara.append('อัว')
        elif 'เอ' in sara and 'อา' in sara:
            sara.remove('เอ')
            sara.remove('อา')
            sara.append('เอา')
        elif 'เ' in word and 'า' in word and 'ะ' in word:
            sara = ['เอาะ']

        if 'อือ' in sara and 'เออ' in sara:
            sara.remove('เออ')
            sara.remove('อือ')
            sara.append('เอือ')
        elif 'ออ' in sara and len(sara) > 1:
            sara.remove('ออ')
        elif 'ว' in word and not sara:
            sara.append('อัว')

        # Pass 3: whole-word overrides; later rules win over earlier ones.
        if 'ั' in word and self.check_marttra(word) == 'กา':
            sara = ['ไอ']

        if word in self._WHOLE_WORD_SARA:
            sara = [word]

        if 'ฤา' in word or 'ฦา' in word:
            sara = ['อือ']
        elif 'ฤ' in word or 'ฦ' in word:
            sara = ['อึ']

        # Words written with consonants only carry an implicit vowel.
        if not sara and len(word) == 2:
            sara.append('ออ' if word[-1] == 'ร' else 'โอะ')
        elif not sara and len(word) == 3:
            sara.append('ออ')

        if word == 'บ่':
            sara = ['ออ']
        if 'ํ' in word:
            sara = ['อำ']
        if 'เ' in word and 'ื' in word and 'อ' in word:
            sara = ['เอือ']

        return sara[0] if sara else self._NO_SARA

    def check_marttra(self, word: str) -> str:
        """
        Check the Thai spelling Section in the Thai word.

        :param str word: Thai word
        :return: name of spelling Section of the word, or the message
            'Cant find Marttra in this word' when none can be determined
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_marttra('สาว'))
            # output: 'เกอว'
        """
        if not word:
            return self._NO_MARTTRA

        # A final 'ร' after 'ต'/'ท' is dropped before classification.
        # The length guard keeps one-character words from raising.
        if len(word) >= 2 and word[-1] == 'ร' and word[-2] in ('ต', 'ท'):
            word = word[:-1]
        word = self.handle_karun_sound_silence(word)
        word = remove_tonemark(word)
        if not word:
            # Nothing left once silent letters and tone marks are removed.
            return self._NO_MARTTRA

        last = word[-1]
        if (
            'ำ' in word
            or ('ํ' in word and 'า' in word)
            or 'ไ' in word
            or 'ใ' in word
        ):
            return 'กา'
        if (
            last in ('า', 'ะ', 'ิ', 'ี', 'ุ', 'ู', 'อ')
            or ('ี' in word and last == 'ย')
        ):
            return 'กา'
        if last == 'ง':
            return 'กง'
        if last == 'ม':
            return 'กม'
        if last == 'ย':
            return 'กา' if 'ั' in word else 'เกย'
        if last == 'ว':
            return 'เกอว'
        if last in ('ก', 'ข', 'ค', 'ฆ'):
            return 'กก'
        if last in (
            'จ', 'ช', 'ซ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ',
            'ด', 'ต', 'ถ', 'ท', 'ธ', 'ศ', 'ษ', 'ส',
        ):
            return 'กด'
        # 'ณ' was previously written as ', ณ' and therefore never matched.
        if last in ('ญ', 'ณ', 'น', 'ร', 'ล', 'ฬ'):
            return 'กน'
        if last in ('บ', 'ป', 'พ', 'ฟ', 'ภ'):
            return 'กบ'
        if '็' in word:
            return 'กา'
        return self._NO_MARTTRA

    @staticmethod
    def _normalise_rhyme(sara: str, marttra: str):
        """
        Map alternative spellings of one sound onto a single
        (sara, marttra) pair so that they compare equal.
        """
        if sara == 'อะ' and marttra == 'เกย':
            return 'ไอ', 'กา'
        if sara == 'อำ' and marttra == 'กม':
            return 'อำ', 'กา'
        return sara, marttra

    def is_sumpus(self, word1: str, word2: str) -> bool:
        """
        Check the rhyme between two words.

        Two words rhyme when, after normalising equivalent spellings,
        both their vowel and their spelling Section are identical.

        :param str word1: Thai word
        :param str word2: Thai word
        :return: boolean
        :rtype: bool

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.is_sumpus('สรร', 'อัน'))
            # output: True

            print(kv.is_sumpus('สรร', 'แมว'))
            # output: False
        """
        marttra1 = self.check_marttra(word1)
        marttra2 = self.check_marttra(word2)
        sara1 = self.check_sara(word1)
        sara2 = self.check_sara(word2)
        # Normalise BOTH words; normalising only one of them made pairs
        # such as two 'รร' + 'ม' words compare as different.
        rhyme1 = self._normalise_rhyme(sara1, marttra1)
        rhyme2 = self._normalise_rhyme(sara2, marttra2)
        return rhyme1 == rhyme2

    def check_karu_lahu(self, text: str) -> str:
        """
        Classify a syllable as 'karu' or 'lahu'.

        The syllable is 'karu' when it has a spelling Section other than
        'กา', or a long vowel, or one of the vowels อำ / ไอ / เอา. The
        words บ่, ณ, ธ and ก็ are always 'lahu'.

        :param str text: Thai syllable
        :return: 'karu' or 'lahu'
        :rtype: str
        """
        if text in ('บ่', 'ณ', 'ธ', 'ก็'):
            return 'lahu'
        marttra = self.check_marttra(text)
        sara = self.check_sara(text)
        if (
            marttra != 'กา'
            or sara in self._LONG_SARA
            or sara in ('อำ', 'ไอ', 'เอา')
        ):
            return 'karu'
        return 'lahu'

    def _check_klon_rhymes(
        self,
        text: str,
        max_subwords: int,
        stated_limit: int,
        head_positions: List[int],
    ) -> Union[List[str], str]:
        """
        Run the rhyme-scheme check shared by every klon type.

        :param str text: poem, sentences separated by whitespace
        :param int max_subwords: a sentence with more subwords is reported
        :param int stated_limit: number quoted in that report
        :param List[int] head_positions: subword indexes of the second
            sentence that may rhyme with the end of the first sentence
        :return: success message, incomplete-stanza message or error list
        """
        errors = []
        first_tails = []
        second_heads = []
        second_tails = []
        third_tails = []
        fourth_tails = []
        for i, sent in enumerate(text.split()):
            sub_sent = subword_tokenize(sent, engine='dict')
            if len(sub_sent) > max_subwords:
                # NOTE(review): the sentence number is reported as i + 2
                # and stated_limit differs from max_subwords for klon 4;
                # both are kept as they were - confirm the intended text.
                errors.append(
                    'In sentence ' + str(i + 2)
                    + ', there are more than ' + str(stated_limit)
                    + ' words. ' + str(sub_sent)
                )
            position = (i + 1) % 4  # place of the sentence in its stanza
            if position == 1:
                first_tails.append(sub_sent[-1])
            elif position == 2:
                second_heads.append([sub_sent[p] for p in head_positions])
                second_tails.append(sub_sent[-1])
            elif position == 3:
                third_tails.append(sub_sent[-1])
            else:
                fourth_tails.append(sub_sent[-1])

        sizes = {
            len(first_tails), len(second_heads), len(second_tails),
            len(third_tails), len(fourth_tails),
        }
        if len(sizes) != 1:
            return 'The poem does not have 4 complete sentences.'

        for i, first_tail in enumerate(first_tails):
            heads = second_heads[i]
            mismatches = sum(
                1 for head in heads if not self.is_sumpus(first_tail, head)
            )
            # Only an error when no candidate position rhymes at all.
            if mismatches >= len(heads):
                errors.append(
                    'Can\'t find rhyme between paragraphs '
                    + str((first_tail, heads))
                    + ' in paragraph ' + str(i + 1)
                )
            if not self.is_sumpus(second_tails[i], third_tails[i]):
                errors.append(
                    'Can\'t find rhyme between paragraphs '
                    + str((second_tails[i], third_tails[i]))
                    + ' in paragraph ' + str(i + 1)
                )
            # Stanzas are chained: this one must pick up the rhyme that
            # closed the previous stanza.
            if i > 0 and not self.is_sumpus(
                second_tails[i], fourth_tails[i - 1]
            ):
                errors.append(
                    'Can\'t find rhyme between paragraphs '
                    + str((second_tails[i], fourth_tails[i - 1]))
                    + ' in paragraph ' + str(i + 1)
                )

        if not errors:
            return 'The poem is correct according to the principle.'
        return errors

    def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]:
        """
        Check the suitability of the poem according to Thai principles.

        :param str text: Thai poem
        :param int k_type: type of Thai poem (4 or 8)
        :return: the check results of the suitability of the poem
            according to Thai principles: a success message, a list of
            problems found, or a message that the input could not be
            processed.
        :rtype: Union[List[str], str]

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง '
                'ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: The poem is correct according to the principle.

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง '
                'ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: [
            #   "Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2",
            #   "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"
            # ]
        """
        if k_type == 8:
            try:
                return self._check_klon_rhymes(text, 10, 10, [1, 2, 3, 4])
            except Exception:
                # Malformed input (e.g. too-short sentences) ends up here;
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                return (
                    'Something went wrong. Make sure you enter it in the '
                    'correct form of klon 8.'
                )
        if k_type == 4:
            try:
                return self._check_klon_rhymes(text, 5, 4, [1, 2])
            except Exception:
                return (
                    'Something went wrong. Make sure you enter it in the '
                    'correct form.'
                )
        return 'Something went wrong. Make sure you enter it in the correct form.'

    def check_aek_too(
        self,
        text: Union[List[str], str],
        dead_syllable_as_aek: bool = False
    ) -> Union[List[bool], List[str], bool, str]:
        """
        Checker of Thai tonal words

        :param Union[List[str], str] text: Thai word or list of Thai words
        :param bool dead_syllable_as_aek: if True, dead syllable will be
            considered as aek
        :return: the check result if the word is aek or too or False
            (not both) or list of check results if input is list
        :rtype: Union[List[bool], List[str], bool, str]
        :raises TypeError: if text is neither a str nor a list

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            # checking aek / too words
            print(
                kv.check_aek_too('เอง'),
                kv.check_aek_too('เอ่ง'),
                kv.check_aek_too('เอ้ง'),
            )
            # -> False, aek, too

            # a list works the same way
            print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง']))
            # -> [False, 'aek', 'too']
        """
        if isinstance(text, list):
            return [self.check_aek_too(t, dead_syllable_as_aek) for t in text]
        if not isinstance(text, str):
            raise TypeError('text must be str or iterable list[str]')

        has_mai_ek = '่' in text
        has_mai_tho = '้' in text
        # A word carrying both marks is classified as neither.
        if has_mai_ek and not has_mai_tho:
            return 'aek'
        if has_mai_tho and not has_mai_ek:
            return 'too'
        if dead_syllable_as_aek and sound_syllable(text) == 'dead':
            return 'aek'
        return False

    def handle_karun_sound_silence(self, word: str) -> str:
        """
        Handle silent sounds in Thai words using '์' character (Karun)
        by stripping all characters before the 'Karun' character that
        should be silenced

        One consonant is always silenced; two are silenced when the
        character two places before the silenced consonant is itself a
        consonant. Words that do not end with Karun are returned as is.

        :param str word: Thai word
        :return: Thai word with silent words stripped
        :rtype: str
        """
        if not word.endswith('์'):
            return word

        thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"
        silenced_index = len(word) - 2  # consonant right before the karun
        probe_index = silenced_index - 2
        # Guard against negative indexes: they would wrap around to the
        # end of the string (or raise) on very short words.
        can_silence_two = (
            probe_index >= 0 and word[probe_index] in thai_consonants
        )
        cut_off = 2 if can_silence_two else 1
        return word[:max(silenced_index + 1 - cut_off, 0)]