Source code for pythainlp.util.thai

# -*- coding: utf-8 -*-
"""
Utilities for checking whether characters and text are Thai
"""
import string

# Characters that countthai() leaves out of its calculation by default:
# ASCII whitespace, digits, and punctuation.
_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation


def isthaichar(ch: str) -> bool:
    """
    Tell whether a single character is a Thai character.

    A character counts as Thai when its code point lies in the range
    U+0E00 to U+0E7F (3584 to 3711), inclusive.

    :param str ch: input character (a string of length one)
    :return: True if the character is in the Thai range, False otherwise
    """
    return 0x0E00 <= ord(ch) <= 0x0E7F
def isthai(word: str, ignore_chars: str = ".") -> bool:
    """
    Tell whether every character of a text is Thai.

    Characters listed in ``ignore_chars`` are accepted as if they were
    Thai. An empty ``word`` yields True.

    :param str word: input text
    :param str ignore_chars: characters to be ignored
        (i.e. will be considered as Thai); a falsy value such as
        ``None`` means no character is ignored
    :return: True if all characters are Thai or ignored, False otherwise
    """
    skipped = ignore_chars or ""
    return all(ch in skipped or isthaichar(ch) for ch in word)
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """
    Compute the percentage of Thai characters in a text.

    Characters found in ``ignore_chars`` are excluded from both the Thai
    count and the total, so they do not affect the result.

    :param str text: input text
    :param str ignore_chars: characters to be left out of the calculation;
        defaults to whitespace, digits, and punctuation. A falsy value
        (``None`` or ``""``) means no character is ignored.
    :return: float, percentage (0 to 100) of the non-ignored characters
        that are Thai; 0.0 if ``text`` is empty, is not a string, or
        consists only of ignored characters
    """
    if not text or not isinstance(text, str):
        return 0.0

    if not ignore_chars:
        ignore_chars = ""

    num_thai = 0
    num_ignore = 0
    for ch in text:
        if ch in ignore_chars:
            num_ignore += 1
        elif isthaichar(ch):
            num_thai += 1

    num_count = len(text) - num_ignore
    if num_count == 0:
        # Every character was ignored: there is nothing to measure, and
        # dividing would raise ZeroDivisionError.
        return 0.0

    return (num_thai / num_count) * 100