Source code for pythainlp.khavee.core

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
from pythainlp.tokenize import subword_tokenize
from pythainlp.util import sound_syllable


class KhaveeVerifier:
    """
    KhaveeVerifier: Thai Poetry verifier

    The methods classify a Thai syllable by its vowel (``check_sara``) and
    final-consonant class (``check_marttra``) and build rhyme, weight and
    stanza checks on top of those two classifications.
    """

    # Sentinel strings returned when a syllable cannot be classified.
    _NO_SARA = 'Cant find Sara in this word'
    _NO_MARTTRA = 'Cant find Marttra in this word'

    # Vowel name contributed by a single character of the word.
    # NOTE: 'ั' always maps to 'อะ'. The original if/elif chain also had a
    # later "'ั' and 'ว' in word -> 'อัว'" branch, but it was unreachable
    # because 'ั' had already been matched; that behaviour is kept here.
    _SINGLE_VOWELS = {
        'ะ': 'อะ',
        'ั': 'อะ',
        'ิ': 'อิ',
        'ุ': 'อุ',
        'ึ': 'อึ',
        'ี': 'อี',
        'ู': 'อู',
        'ื': 'อือ',
        'เ': 'เอ',
        'แ': 'แอ',
        'า': 'อา',
        'โ': 'โอ',
        'ำ': 'อำ',
        'อ': 'ออ',
        'ไ': 'ไอ',
        'ใ': 'ไอ',
        '็': 'ออ',
    }

    # Words that are themselves the name of their vowel.
    _WHOLE_WORD_VOWELS = ('เออะ', 'เออ', 'เอ', 'เอะ', 'เอา', 'เอาะ')

    # Last characters that mark an open syllable (marttra 'กา').
    _OPEN_ENDINGS = ('า', 'ะ', 'ิ', 'ี', 'ุ', 'ู', 'อ')

    # Final consonant -> marttra name ('ย' is handled separately because
    # its result depends on the rest of the word).
    _FINAL_CONSONANT_MARTTRA = {
        **dict.fromkeys('ง', 'กง'),
        **dict.fromkeys('ม', 'กม'),
        **dict.fromkeys('ว', 'เกอว'),
        **dict.fromkeys('กขคฆ', 'กก'),
        **dict.fromkeys('จชซฎฏฐฑฒดตถทธศษส', 'กด'),
        **dict.fromkeys('ญณนรลฬ', 'กน'),
        **dict.fromkeys('บปพฟภ', 'กบ'),
    }

    # Vowels that make an open ('กา') syllable count as 'karu'.
    _KARU_OPEN_VOWELS = (
        'อา', 'อี', 'อือ', 'อู', 'เอ', 'แอ', 'โอ', 'ออ', 'เออ', 'เอีย',
        'เอือ', 'อัว', 'อำ', 'ไอ', 'เอา',
    )

    # Words that are always reported as 'lahu'.
    _ALWAYS_LAHU = ('บ่', 'ณ', 'ธ', 'ก็')

    # Per poem type: (largest allowed token count per line, number quoted in
    # the "too many words" message, token positions of line 2 that may rhyme
    # with the end of line 1, message returned when checking fails).
    _KLON8_RULES = (
        10,
        '10',
        (1, 2, 3, 4),
        'Something went wrong Make sure you enter it in correct form of klon 8.',
    )
    _KLON4_RULES = (
        5,
        '4',
        (1, 2),
        'Something went wrong Make sure you enter it in correct form.',
    )

    def __init__(self):
        """
        KhaveeVerifier: Thai Poetry verifier
        """
        pass

    @staticmethod
    def _combine(sara: List[str], first: str, second: str, combined: str) -> bool:
        """
        Replace ``first`` and ``second`` in ``sara`` by ``combined``.

        :return: True when both names were present and the list was changed.
        """
        if first in sara and second in sara:
            sara.remove(first)
            sara.remove(second)
            sara.append(combined)
            return True
        return False

    def check_sara(self, word: str) -> str:
        """
        Check the vowels in the Thai word.

        :param str word: Thai word
        :return: name vowel of the word, or ``'Cant find Sara in this word'``
                 when no vowel can be determined (including empty input).
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_sara('เริง'))
            # output: 'เออ'
        """
        if not word:
            return self._NO_SARA

        # A trailing karan mark is dropped together with the character
        # before it.
        if word[-1] == '์':
            word = word[:-2]

        # With 'รร' in the word, every character that is not itself a vowel
        # contributes 'อำ' (final ม) or 'อะ' (anything else).
        rr_vowel = None
        if 'รร' in word:
            rr_vowel = 'อำ' if self.check_marttra(word) == 'กม' else 'อะ'

        # Single vowels, in the order they occur in the word.
        sara = []
        for char in word:
            vowel = self._SINGLE_VOWELS.get(char)
            if vowel is not None:
                sara.append(vowel)
            elif rr_vowel is not None:
                sara.append(rr_vowel)

        # A lone 'อ' in final position of a word without 'เ' is dropped.
        if word.count('อ') == 1 and word[-1] == 'อ' and 'เ' not in word:
            sara.remove('ออ')

        # Two 'เ' typed in place of 'แ': merge them once. The original
        # removed items from the list while iterating over it, which could
        # raise ValueError, and appended 'แ', a name no other branch uses.
        if sara.count('เอ') > 1:
            sara.remove('เอ')
            sara.remove('เอ')
            sara.append('แอ')

        # Compound vowels. The three groups below are evaluated one after
        # another; inside a group only the first matching rule applies.
        if not self._combine(sara, 'เอ', 'อะ', 'เอะ'):
            self._combine(sara, 'แอ', 'อะ', 'แอะ')

        if self._combine(sara, 'เอะ', 'ออ', 'เออะ'):
            pass
        elif self._combine(sara, 'เอ', 'อิ', 'เออ'):
            pass
        elif 'เอ' in sara and 'ออ' in sara and word[-1] == 'อ':
            sara.remove('เอ')
            sara.remove('ออ')
            sara.append('เออ')
        elif self._combine(sara, 'โอ', 'อะ', 'โอะ'):
            pass
        elif self._combine(sara, 'เอ', 'อี', 'เอีย'):
            pass
        elif self._combine(sara, 'เอ', 'อือ', 'อัว'):
            pass
        elif self._combine(sara, 'เอ', 'อา', 'เอา'):
            pass
        elif 'เ' in word and 'า' in word and 'ะ' in word:
            sara = ['เอาะ']

        if self._combine(sara, 'เออ', 'อือ', 'เอือ'):
            pass
        elif 'ออ' in sara and len(sara) > 1:
            sara.remove('ออ')
        elif 'ว' in word and not sara:
            sara.append('อัว')

        # 'ั' in an open syllable (e.g. 'ั' followed by 'ย') is reported as 'ไอ'.
        if 'ั' in word and self.check_marttra(word) == 'กา':
            sara = ['ไอ']

        # The word is itself a vowel name written on the 'อ' carrier.
        if word in self._WHOLE_WORD_VOWELS:
            sara = [word]

        if 'ฤา' in word or 'ฦา' in word:
            sara = ['อือ']
        elif 'ฤ' in word or 'ฦ' in word:
            sara = ['อึ']

        # No written vowel at all: fall back on the word length.
        if not sara and len(word) == 2:
            sara.append('โอะ' if word[-1] != 'ร' else 'ออ')
        elif not sara and len(word) == 3:
            sara.append('ออ')

        # Special case for 'บ่'.
        if 'บ่' in word:
            sara = ['ออ']

        if not sara:
            return self._NO_SARA
        return sara[0]

    def check_marttra(self, word: str) -> str:
        """
        Check the Thai spelling Section in the Thai word.

        :param str word: Thai word
        :return: name spelling Section of the word, or
                 ``'Cant find Marttra in this word'`` when it cannot be
                 determined (including empty input).
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_marttra('สาว'))
            # output: 'เกอว'
        """
        if not word:
            return self._NO_MARTTRA

        # A final 'ร' directly after 'ต' or 'ท' is ignored.
        if len(word) >= 2 and word[-1] == 'ร' and word[-2] in ('ต', 'ท'):
            word = word[:-1]

        # Drop a trailing karan mark with the character it sits on, plus the
        # 'ิ' / 'ุ' vowel written under/over that character when present.
        if word[-1] == '์':
            if len(word) >= 2 and word[-2] in ('ิ', 'ุ'):
                word = word[:-3]
            else:
                word = word[:-2]
        if not word:
            return self._NO_MARTTRA

        last = word[-1]

        if (
            'ำ' in word
            or ('ํ' in word and 'า' in word)
            or 'ไ' in word
            or 'ใ' in word
        ):
            return 'กา'
        if (
            last in self._OPEN_ENDINGS
            or ('ี' in word and last == 'ย')
            or ('ื' in word and last == 'อ')
        ):
            return 'กา'
        if last == 'ย':
            return 'กา' if 'ั' in word else 'เกย'

        marttra = self._FINAL_CONSONANT_MARTTRA.get(last)
        if marttra is not None:
            return marttra

        return 'กา' if '็' in word else self._NO_MARTTRA

    @staticmethod
    def _normalize_rhyme(sara: str, marttra: str):
        """
        Map spellings that ``is_sumpus`` treats as the same sound onto one
        (vowel, marttra) pair: 'อะ'+'เกย' becomes 'ไอ'+'กา' and 'อำ'+'กม'
        becomes 'อำ'+'กา'.
        """
        if sara == 'อะ' and marttra == 'เกย':
            sara, marttra = 'ไอ', 'กา'
        if sara == 'อำ' and marttra == 'กม':
            marttra = 'กา'
        return sara, marttra

    def is_sumpus(self, word1: str, word2: str) -> bool:
        """
        Check the rhyme between two words.

        Two words rhyme when they have the same vowel and the same spelling
        section after both have been normalised independently. (Previously
        only one of the two words was normalised, so two words that both
        needed it, such as 'ธรรม' and 'กรรม', were reported as not rhyming.)

        :param str word1: Thai word
        :param str word2: Thai word
        :return: True if the two words rhyme; False otherwise, and False
                 when either word is empty.
        :rtype: bool

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.is_sumpus('สรร','อัน'))
            # output: True

            print(kv.is_sumpus('สรร','แมว'))
            # output: False
        """
        if not word1 or not word2:
            return False

        sara1, marttra1 = self._normalize_rhyme(
            self.check_sara(word1), self.check_marttra(word1)
        )
        sara2, marttra2 = self._normalize_rhyme(
            self.check_sara(word2), self.check_marttra(word2)
        )
        return marttra1 == marttra2 and sara1 == sara2

    def check_karu_lahu(self, text: str) -> str:
        """
        Classify a syllable as 'karu' or 'lahu'.

        A syllable is 'karu' when its spelling section is not 'กา', or when
        it is 'กา' and its vowel is one of ``_KARU_OPEN_VOWELS``. The words
        listed in ``_ALWAYS_LAHU`` are always 'lahu'.

        :param str text: Thai syllable
        :return: ``'karu'`` or ``'lahu'``
        :rtype: str
        """
        if self.check_marttra(text) != 'กา':
            is_karu = True
        else:
            is_karu = self.check_sara(text) in self._KARU_OPEN_VOWELS

        if is_karu and text not in self._ALWAYS_LAHU:
            return 'karu'
        return 'lahu'

    def _verify_klon(
        self,
        text: str,
        max_tokens: int,
        limit_label: str,
        head_positions,
    ) -> Union[List[str], str]:
        """
        Shared stanza check behind ``check_klon``.

        :param str text: whitespace-separated poem lines
        :param int max_tokens: a line with more tokens than this is reported
        :param str limit_label: number quoted in the "too many words" message
        :param head_positions: token indexes of each second line that are
            compared with the last token of the first line
        :return: success message, "incomplete" message, or list of errors
        """
        errors = []
        # line_ends[k] holds the last token of line k+1 of every stanza.
        line_ends = ([], [], [], [])
        second_line_heads = []

        for index, sent in enumerate(text.split()):
            sub_sent = subword_tokenize(sent, engine='dict')
            if len(sub_sent) > max_tokens:
                errors.append(
                    'In the sentence' + str(index + 2)
                    + 'there are more than ' + limit_label + ' words.'
                    + str(sub_sent)
                )
            position = index % 4
            if position == 1:
                second_line_heads.append(
                    [sub_sent[pos] for pos in head_positions]
                )
            line_ends[position].append(sub_sent[-1])

        # Every stanza must supply all four lines.
        if len({len(ends) for ends in line_ends}) != 1:
            return 'The poem does not complete 4 sentences.'

        stanzas = zip(line_ends[0], second_line_heads, line_ends[1], line_ends[2])
        for number, (end1, heads, end2, end3) in enumerate(stanzas):
            # End of line 1 must rhyme with at least one head token of line 2.
            misses = sum(1 for head in heads if not self.is_sumpus(end1, head))
            if misses == len(heads):
                errors.append(
                    'Cant find rhyme between paragraphs '
                    + str((end1, heads)) + 'in paragraph ' + str(number + 1)
                )
            # End of line 2 must rhyme with end of line 3.
            if not self.is_sumpus(end2, end3):
                errors.append(
                    'Cant find rhyme between paragraphs '
                    + str((end2, end3)) + 'in paragraph ' + str(number + 1)
                )
            # End of line 2 must rhyme with end of the previous stanza.
            if number > 0:
                previous_end = line_ends[3][number - 1]
                if not self.is_sumpus(end2, previous_end):
                    errors.append(
                        'Cant find rhyme between paragraphs '
                        + str((end2, previous_end))
                        + 'in paragraph ' + str(number + 1)
                    )

        if not errors:
            return 'The poem is correct according to the principle.'
        return errors

    def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]:
        """
        Check the suitability of the poem according to Thai principles.

        :param str text: Thai poem
        :param int k_type: Type of Thai poem (8 or 4)
        :return: the check of the suitability of the poem according to Thai
                 principles: a success message, a list of error messages, or
                 a "Something went wrong" message when ``k_type`` is not
                 supported or the poem cannot be processed.
        :rtype: Union[List[str], str]

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร''', k_type=4))
            # output: The poem is correct according to the principle.

            print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
            # # -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
        """
        if k_type == 8:
            rules = self._KLON8_RULES
        elif k_type == 4:
            rules = self._KLON4_RULES
        else:
            return 'Something went wrong Make sure you enter it in correct form.'

        max_tokens, limit_label, head_positions, failure_message = rules
        try:
            return self._verify_klon(text, max_tokens, limit_label, head_positions)
        except Exception:
            # Malformed input (e.g. a line too short to index) is reported as
            # a message; KeyboardInterrupt/SystemExit are no longer swallowed.
            return failure_message

    def check_aek_too(
        self,
        text: Union[List[str], str],
        dead_syllable_as_aek: bool = False,
    ) -> Union[List[bool], List[str], bool, str]:
        """
        Thai tonal word checker

        :param Union[List[str], str] text: Thai word or list of Thai words
        :param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek
        :return: the check if the word is aek or too or False(not both) or list of the check if input is list
        :rtype: Union[List[bool], List[str], bool, str]
        :raises TypeError: if ``text`` is neither a str nor a list

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            # Checking aek / too words
            print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง'))
            ## -> False, aek, too
            print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง'])) # a list works as well
            ## -> [False, 'aek', 'too']
        """
        if isinstance(text, list):
            return [self.check_aek_too(item, dead_syllable_as_aek) for item in text]

        if not isinstance(text, str):
            raise TypeError('text must be str or iterable list[str]')

        has_aek_mark = '่' in text
        has_too_mark = '้' in text
        if has_aek_mark and not has_too_mark:
            return 'aek'
        if has_too_mark and not has_aek_mark:
            return 'too'

        # No tone mark, or both marks: optionally fall back on syllable type.
        if dead_syllable_as_aek and sound_syllable(text) == 'dead':
            return 'aek'
        return False