# Source code for pythainlp.tokenize.longest

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Dictionary-based longest-matching Thai word segmentation. The implementation is
based on code by Patorn Utenpattanun.

:See Also:
    * `GitHub Repository \
       <https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py>`_

"""
import re
from typing import List, Union

from pythainlp import thai_tonemarks
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

# Characters that depend on the character in front of them: when one of
# these is found where no dictionary word or non-Thai run starts, it is
# appended to the previous token instead of starting a new one.
# Every entry must be exactly one character, because the tokenizer tests
# membership with a single character (``text[pos] in _FRONT_DEP_CHAR``).
_FRONT_DEP_CHAR = [
    "ะ",
    "ั",
    "า",
    "ำ",
    "ิ",
    "ี",
    "ึ",
    "ื",
    "ุ",
    "ู",
    "ๅ",
    "็",
    "์",
    "ํ",
]

# Characters that depend on what follows them: whatever comes right after
# one of these is appended to the token that contains it.
_REAR_DEP_CHAR = ["ั", "ื", "เ", "แ", "โ", "ใ", "ไ", "ํ"]

# Characters that are kept together with the dictionary word they
# immediately follow.
_TRAILING_CHAR = ["ๆ", "ฯ"]

# Matches a (possibly empty) run of ASCII letters and digits. Because of
# the ``*`` quantifier, ``search`` always succeeds at position 0, so the
# pattern effectively tests only the start of the string.
# NOTE: in a ``str`` pattern ``\d`` also matches non-ASCII decimal digits.
_RE_NONTHAI = re.compile(r"[A-Za-z\d]*")

# Status flags recorded per token while segmenting: whether the token came
# from a dictionary / non-Thai match (_KNOWN) or was built from unmatched
# characters (_UNKNOWN).
_KNOWN = True
_UNKNOWN = False


class LongestMatchTokenizer:
    """
    Dictionary-based longest-matching word tokenizer.

    The tokenizer scans the text from left to right. At each position it
    first looks for a leading run of non-Thai characters (as defined by
    ``_RE_NONTHAI``); otherwise it takes the longest dictionary word
    starting there, preferring a word that is followed by something that
    can itself start a valid token. Characters that match nothing are
    emitted as, or merged into, "unknown" tokens.
    """

    def __init__(self, trie: Trie):
        """
        :param pythainlp.util.Trie trie: dictionary used for word lookup;
            only membership tests (``word in trie``) are performed on it
        """
        self.__trie = trie

    @staticmethod
    def __search_nonthai(text: str) -> Union[None, str]:
        """
        Return the lower-cased run of non-Thai characters at the start of
        *text*, or ``None`` if *text* does not start with one.
        """
        # The pattern can match the empty string, so search() always
        # returns a match object at position 0; only its content matters.
        match = _RE_NONTHAI.search(text)
        if match.group(0):
            return match.group(0).lower()
        return None

    def __is_next_word_valid(self, text: str, begin_pos: int) -> bool:
        """
        Tell whether the remainder of *text* from *begin_pos* (ignoring
        surrounding whitespace) is empty, starts with a non-Thai run, or
        has some prefix that is a dictionary word.
        """
        text = text[begin_pos:].strip()

        if not text:
            return True

        if self.__search_nonthai(text):
            return True

        return any(
            text[0:pos] in self.__trie for pos in range(len(text) + 1)
        )

    def __longest_matching(self, text: str, begin_pos: int) -> str:
        """
        Return the token that starts at *begin_pos*, or ``""`` if neither
        a non-Thai run nor a dictionary word starts there.

        A matched dictionary word is extended by one character when it is
        immediately followed by a character in ``_TRAILING_CHAR``.
        """
        text = text[begin_pos:]

        match = self.__search_nonthai(text)
        if match:
            return match

        word = None  # longest dictionary word found so far
        word_valid = None  # longest one that is followed by a valid start

        for pos in range(len(text) + 1):
            w = text[0:pos]
            if w in self.__trie:
                word = w
                if self.__is_next_word_valid(text, pos):
                    word_valid = w

        if not word:
            return ""

        # Fall back to the plain longest match when no candidate is
        # followed by a valid continuation.
        if not word_valid:
            word_valid = word

        # Explicit bounds check: the word may end exactly at the end of
        # the text, in which case there is no trailing character to absorb.
        end = len(word_valid)
        if end < len(text) and text[end] in _TRAILING_CHAR:
            return text[0 : end + 1]
        return word_valid

    def __segment(self, text: str) -> List[str]:
        """
        Split *text* into tokens and return them in order of appearance.
        """
        begin_pos = 0
        len_text = len(text)
        tokens = []
        # Parallel to ``tokens``: _KNOWN / _UNKNOWN flag for each token.
        token_statuses = []
        while begin_pos < len_text:
            match = self.__longest_matching(text, begin_pos)
            if not match:
                # Nothing starts here. A non-space character is glued to
                # the previous token when it depends on it, or when the
                # previous token is itself unknown; ``begin_pos != 0``
                # guarantees that a previous token exists.
                if (
                    begin_pos != 0
                    and not text[begin_pos].isspace()
                    and (
                        text[begin_pos] in _FRONT_DEP_CHAR
                        or text[begin_pos - 1] in _REAR_DEP_CHAR
                        or text[begin_pos] in thai_tonemarks
                        or (token_statuses and token_statuses[-1] == _UNKNOWN)
                    )
                ):
                    tokens[-1] += text[begin_pos]
                    token_statuses[-1] = _UNKNOWN
                else:
                    tokens.append(text[begin_pos])
                    token_statuses.append(_UNKNOWN)
                begin_pos += 1
            else:
                # A match that directly follows a rear-dependent character
                # belongs to the token containing that character.
                if begin_pos != 0 and text[begin_pos - 1] in _REAR_DEP_CHAR:
                    tokens[-1] += match
                else:
                    tokens.append(match)
                    token_statuses.append(_KNOWN)
                begin_pos += len(match)

        return tokens

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize *text* into a list of words.

        :param str text: text to be tokenized
        :return: list of tokens, in order of appearance
        """
        return self.__segment(text)


def segment(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> List[str]:
    """
    Dictionary-based longest matching word segmentation.

    :param str text: text to be tokenized into words
    :param pythainlp.util.Trie custom_dict: dictionary for tokenization;
        a falsy value (such as ``None``) selects the default dictionary
    :return: list of words, tokenized from the text; an empty list if
        *text* is empty or is not a ``str``
    """
    if not text or not isinstance(text, str):
        return []

    # Fall back to the default dictionary when the caller passes a falsy
    # value for custom_dict.
    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE

    return LongestMatchTokenizer(custom_dict).tokenize(text)