# Source code for pythainlp.tokenize.nercut

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
nercut 0.2

Dictionary-based maximal matching word segmentation, constrained by
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import NER

# Default named-entity tagger for segment(); built once, when this module is
# imported, with the "thainer" engine.
_thainer = NER(engine="thainer")


def segment(
    text: str,
    taglist: Iterable[str] = [
        "ORGANIZATION",
        "PERSON",
        "PHONE",
        "EMAIL",
        "DATE",
        "TIME",
    ],
    tagger=_thainer,
) -> List[str]:
    """
    Tokenize text into words, merging consecutive tokens that are parts of
    the same named entity into a single token.

    Word boundaries and tags come from ``tagger.tag(text, pos=False)``,
    which is expected to yield ``(word, tag)`` pairs whose tags are either
    ``"O"`` or prefixed with ``"B-"`` / ``"I-"``. A ``B-`` token whose
    entity type is in ``taglist`` starts a merged token, and the ``I-``
    tokens that follow (with a type in ``taglist``) are appended to it.
    Every other token is emitted unchanged. No input token is ever dropped:
    concatenating the returned words gives back the concatenation of the
    tagger's words.

    :param str text: text to be tokenized into words
    :param list taglist: named entity tags whose tokens should be merged
    :param class tagger: NER tagger engine; must provide
        ``tag(text, pos=False)``
    :return: list of words, tokenized from the text; an empty list if
        ``text`` is not a ``str``
    """
    if not isinstance(text, str):
        return []

    # Snapshot the tags once: gives O(1) membership tests per token and
    # keeps working if the caller passes a one-shot iterable. The default
    # list is only read here, never mutated.
    wanted_tags = frozenset(taglist)

    words: List[str] = []
    pending = ""  # text of the entity currently being merged, if any

    for curr_word, curr_tag in tagger.tag(text, pos=False):
        # Strip the two-character "B-" / "I-" prefix to get the entity type.
        tag = curr_tag[2:] if curr_tag != "O" else "O"
        mergeable = tag in wanted_tags

        if pending and mergeable and curr_tag.startswith("I-"):
            pending += curr_word
            continue

        # Any other token ends the entity in progress. Emit it *before*
        # handling the current token so its text is not lost when a new
        # entity starts or a non-mergeable token follows.
        if pending:
            words.append(pending)
            pending = ""

        if mergeable and curr_tag.startswith("B-"):
            pending = curr_word
        else:
            words.append(curr_word)

    # Emit an entity that runs up to the end of the input.
    if pending:
        words.append(pending)

    return words