# Source code for pythainlp.util.thaiwordcheck

# -*- coding: utf-8 -*-
"""
Check if a word is a "native Thai word"

Adapted from
https://github.com/wannaphongcom/open-thai-nlp-document/blob/master/check_thai_word.md

References
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
"""
import re

_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Non-native Thai characters
_TH_NON_NATIVE_CHARS = {
    "ฆ",
    "ณ",
    "ฌ",
    "ฎ",
    "ฏ",
    "ฐ",
    "ฑ",
    "ฒ",
    "ธ",
    "ศ",
    "ษ",
    "ฬ",
    _THANTHAKHAT_CHAR,
}

# Native Thai final consonants
_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}

# Known native Thai words (exceptions)
_TH_NATIVE_WORDS = {
    "ฆ่า",
    "เฆี่ยน",
    "ศึก",
    "ศอก",
    "เศิก",
    "เศร้า",
    "ธ",
    "ณ",
    "ฯพณฯ",
    "ใหญ่",
    "หญ้า",
    "ควาย",
    "ความ",
    "กริ่งเกรง",
    "ผลิ",
}

# Diphthong prefixes (can starts native Thai word)
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)


def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")

    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    The checks are applied in this order, and the first one that matches
    decides the result:

    1. not a string, or empty/whitespace-only -> False
    2. listed in the known native word exceptions -> True
    3. contains any non-native character -> False
    4. contains no Thai consonant -> False
    5. contains exactly one Thai consonant -> True
    6. last character is a native final consonant -> True
    7. is exactly one of the diphthong prefixes -> True
    8. otherwise -> False

    :param str word: word
    :return: True or False
    :rtype: bool

    :Example:

    English word::

        from pythainlp.util import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str) or not word.strip():
        return False

    word = word.strip()

    # Known native Thai words (exceptions); checked first so that they win
    # over the non-native character rule below
    if word in _TH_NATIVE_WORDS:
        return True

    # If a word contains a non-native Thai character, it is not native Thai
    if any(ch in word for ch in _TH_NON_NATIVE_CHARS):
        return False

    # If it does not contain any Thai consonant -> cannot be Thai
    consonants = _TH_CONSONANTS_PATTERN.findall(word)
    if not consonants:
        return False

    # If there is only one Thai consonant -> it can be native Thai
    if len(consonants) == 1:
        return True

    # If a word ends with a native final, it can be native Thai
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # NOTE(review): this compares the whole word, not its prefix, so it only
    # matches a word that is exactly one of the prefixes. Prefix-sensitive
    # tokenization would be required to check real prefixes; the original
    # behaviour is kept on purpose.
    if word in _TH_PREFIX_DIPHTHONG:
        return True

    return False