Source code for pythainlp.morpheme.thaiwordcheck

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Check if a word is a "native Thai word"

Adapted from

- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้
- วารุณี บำรุงรส 2010. คำไทยแท้
"""
import re

# Thanthakhat mark (cancellation of sound)
_THANTHAKHAT_CHAR = "\u0e4c"

# Non-native Thai characters
# NOTE(review): no assignment follows this heading in the visible source,
# yet is_native_thai() reads _TH_NON_NATIVE_CHARS -- confirm the definition
# was not lost.

# Consonants that may close a syllable in a native Thai word
_TH_NATIVE_FINALS = {
    "ก",
    "ด",
    "บ",
    "น",
    "ง",
    "ม",
    "ย",
    "ว",
}

# Known native Thai words (exceptions)
# NOTE(review): no assignment follows this heading in the visible source,
# yet is_native_thai() reads _TH_NATIVE_WORDS -- confirm the definition
# was not lost.

# Diphthong prefixes that can start a native Thai word
_TH_PREFIX_DIPHTHONG = {
    "กะ",
    "กระ",
    "ปะ",
    "ประ",
}

# Matches a single Thai consonant.
# O ANG (U+0E2D) is left out on purpose, as it can be considered a vowel.
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.UNICODE)

def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้").

    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    The checks are applied in this order; the first one that matches
    decides the result:

    1. not a string, or empty/whitespace-only -> False
    2. listed in the known native words (exceptions) -> True
    3. contains any non-native Thai character -> False
    4. contains no Thai consonant -> False
    5. contains exactly one Thai consonant -> True
    6. last character is a native final consonant -> True
    7. the whole word equals one of the diphthong prefixes -> True
    8. otherwise -> False

    :param str word: word
    :return: True or False
    :rtype: bool

    :Example:

    English word::

        from pythainlp.util import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    # Anything that is not a string cannot be a word at all
    if not isinstance(word, str):
        return False

    # Surrounding whitespace is ignored; nothing left means no word
    word = word.strip()
    if not word:
        return False

    # Known native Thai words (exceptions) win over every rule below
    if word in _TH_NATIVE_WORDS:
        return True

    # A word containing any non-native character is not a native Thai word
    if any(ch in word for ch in _TH_NON_NATIVE_CHARS):
        return False

    # If it does not contain any Thai consonants -> it cannot be Thai
    consonants = _TH_CONSONANTS_PATTERN.findall(word)
    if not consonants:
        return False

    # If there's only one Thai consonant -> it can be a native Thai word
    if len(consonants) == 1:
        return True

    # If a word ends with a native final, it can be a native Thai word
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # NOTE: this compares the whole word with the prefix set, so it only
    # matches a word that *is* one of the prefixes, not a word that starts
    # with one. Prefix-sensitive tokenization is required in order to be
    # able to check real prefixes.
    if word in _TH_PREFIX_DIPHTHONG:
        return True

    return False