Source code for pythainlp.util.thai

# -*- coding: utf-8 -*-
"""
Check if it is Thai text
"""
import string
from typing import Tuple

from pythainlp import thai_above_vowels, thai_tonemarks
from pythainlp.transliterate import pronunciate
from pythainlp.util.syllable import tone_detector

_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation
_TH_FIRST_CHAR_ASCII = 3584
_TH_LAST_CHAR_ASCII = 3711


[docs]def isthaichar(ch: str) -> bool: """Check if a character is a Thai character. :param ch: input character :type ch: str :return: True if ch is a Thai characttr, otherwise False. :rtype: bool :Example: :: from pythainlp.util import isthaichar isthaichar("ก") # THAI CHARACTER KO KAI # output: True isthaichar("๕") # THAI DIGIT FIVE # output: True """ ch_val = ord(ch) if ch_val >= _TH_FIRST_CHAR_ASCII and ch_val <= _TH_LAST_CHAR_ASCII: return True return False
[docs]def isthai(text: str, ignore_chars: str = ".") -> bool: """Check if every characters in a string are Thai character. :param text: input text :type text: str :param ignore_chars: characters to be ignored, defaults to "." :type ignore_chars: str, optional :return: True if every characters in the input string are Thai, otherwise False. :rtype: bool :Example: :: from pythainlp.util import isthai isthai("กาลเวลา") # output: True isthai("กาลเวลา.") # output: True isthai("กาล-เวลา") # output: False isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,") # output: True """ if not ignore_chars: ignore_chars = "" for ch in text: if ch not in ignore_chars and not isthaichar(ch): return False return True
[docs]def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: """Find proportion of Thai characters in a given text :param text: input text :type text: str :param ignore_chars: characters to be ignored, defaults to whitespaces,\\ digits, and puntuations. :type ignore_chars: str, optional :return: proportion of Thai characters in the text (percent) :rtype: float :Example: :: from pythainlp.util import countthai countthai("ไทยเอ็นแอลพี 3.0") # output: 100.0 countthai("PyThaiNLP 3.0") # output: 0.0 countthai("ใช้งาน PyThaiNLP 3.0") # output: 40.0 countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="") # output: 30.0 """ if not text or not isinstance(text, str): return 0.0 if not ignore_chars: ignore_chars = "" num_thai = 0 num_ignore = 0 for ch in text: if ch in ignore_chars: num_ignore += 1 elif isthaichar(ch): num_thai += 1 num_count = len(text) - num_ignore if num_count == 0: return 0.0 return (num_thai / num_count) * 100
[docs]def display_thai_char(ch: str) -> str: """Prefix an underscore (_) to a high-position vowel or a tone mark, to ease readability. :param ch: input character :type ch: str :return: "_" + ch :rtype: str :Example: :: from pythainlp.util import display_thai_char display_thai_char("้") # output: "_้" """ if ( ch in thai_above_vowels or ch in thai_tonemarks or ch in "\u0e33\u0e4c\u0e4d\u0e4e" ): # last condition is Sra Aum, Thanthakhat, Nikhahit, Yamakkan return "_" + ch else: return ch
[docs]def thai_word_tone_detector(word: str) -> Tuple[str, str]: """ Thai tone detector for word. It use pythainlp.transliterate.pronunciate for convert word to\ pronunciation. :param str word: Thai word. :return: Thai pronunciation with tone each syllables.\ (l, m, h, r, f or empty if it cannot detector) :rtype: Tuple[str, str] :Example: :: from pythainlp.util import thai_word_tone_detector print(thai_word_tone_detector("คนดี")) # output: [('คน', 'm'), ('ดี', 'm')] print(thai_word_tone_detector("มือถือ")) # output: [('มือ', 'm'), ('ถือ', 'r')] """ _pronunciate = pronunciate(word).split('-') return [(i, tone_detector(i.replace('หฺ', 'ห'))) for i in _pronunciate]