Source code for pythainlp.tokenize.nercut

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
nercut 0.2

Dictionary-based maximal matching word segmentation, constrained by
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import NER

# Shared NER tagger instance, created once when this module is imported and
# used as the default value of the ``tagger`` parameter of segment().
_thainer = NER(engine="thainer")


def segment(
    text: str,
    taglist: Iterable[str] = (
        "ORGANIZATION",
        "PERSON",
        "PHONE",
        "EMAIL",
        "DATE",
        "TIME",
    ),
    tagger=_thainer,
) -> List[str]:
    """
    Tokenize text into words and combine consecutive tokens that are
    parts of the same named entity.

    The text is passed to ``tagger.tag(text, pos=False)``, which must
    yield ``(word, tag)`` pairs with BIO-style tags (``"B-<TYPE>"``,
    ``"I-<TYPE>"`` or ``"O"``). A ``B-`` token whose type is in
    ``taglist`` starts a combined word, and the ``I-`` tokens that follow
    it (with a type in ``taglist``) are appended to that word. Every
    other token is emitted unchanged as its own word.

    A combined word that is still being built is always emitted before
    the next non-continuing token, so no token returned by the tagger is
    dropped from the result.

    :param str text: text to be tokenized into words
    :param list taglist: named-entity types (without the ``B-``/``I-``
        prefix) whose tokens should be combined
    :param class tagger: NER tagger engine; any object with a
        ``tag(text, pos=False)`` method returning ``(word, tag)`` pairs
    :return: list of words, tokenized from the text; an empty list if
        ``text`` is not a ``str``
    :rtype: List[str]
    """
    if not isinstance(text, str):
        return []

    # Materialise once: constant-time membership tests, and safe even when
    # the caller passes a one-shot iterable such as a generator.
    wanted_tags = frozenset(taglist)

    words: List[str] = []
    combining_word = ""  # entity text collected so far; "" means none pending
    for curr_word, curr_tag in tagger.tag(text, pos=False):
        # Strip the two-character "B-" / "I-" prefix to get the entity type.
        entity = curr_tag[2:] if curr_tag != "O" else "O"
        mergeable = entity in wanted_tags

        if curr_tag.startswith("I-") and mergeable and combining_word != "":
            # Continuation of the entity currently being built.
            combining_word += curr_word
            continue

        # Any other token ends the pending entity. Emit it first so that it
        # is neither overwritten by a new "B-" token nor silently discarded.
        if combining_word != "":
            words.append(combining_word)
            combining_word = ""

        if curr_tag.startswith("B-") and mergeable:
            combining_word = curr_word
        else:
            words.append(curr_word)

    # Emit an entity that runs up to the end of the text.
    if combining_word != "":
        words.append(combining_word)

    return words