# Source code for pythainlp.tag.tltk

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple, Union
try:
    from tltk import nlp
except ImportError:
    raise ImportError("Not found tltk! Please install tltk by pip install tltk")
from pythainlp.tokenize import word_tokenize

# Module-level side effect: both TLTK loaders run once, when this module is
# first imported, before any function below can be called.
# NOTE(review): presumably these load the POS and NER models used by
# nlp.pos_tag_wordlist() / nlp.ner() -- confirm against the tltk docs.
nlp.pos_load()
nlp.ner_load()


def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    """
    Part-of-speech tag a list of words with **TLTK**.

    :param list[str] words: already-tokenized words to be tagged
    :param str corpus: name of the tag set; only ``"tnc"`` is accepted
    :return: whatever ``nlp.pos_tag_wordlist(words)`` returns
             (annotated as a list of ``(str, str)`` tuples)
    :rtype: list[tuple[str, str]]
    :raises ValueError: if ``corpus`` is anything other than ``"tnc"``
    """
    if corpus != "tnc":
        # Name the corpus the caller actually asked for in the message.
        raise ValueError("tltk not support {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)


def _post_process(text: str) -> str:
    return text.replace("<s/>", " ")


def get_ner(
    text: str, pos: bool = True, tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text in IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: To include POS tags in the results (`True`) or
                     exclude (`False`). The default value is `True`
    :param bool tag: output like html tag.
    :return: a list of tuples of tokenized word, POS tag and NER tag
             (if the parameter `pos` is `True`), or a string marked up
             like html tags (if the parameter `tag` is `True`).
             Otherwise, a list of tuples of tokenized word and NER tag.
             An empty `text` always yields an empty list.
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []

    # Spaces are passed to the tagger as the "<s/>" placeholder;
    # _post_process() turns them back into spaces afterwards.
    list_word = [
        "<s/>" if token == " " else token
        for token in word_tokenize(text, engine="tltk")
    ]
    _pos = nlp.pos_tag_wordlist(list_word)
    sent_ner = [
        (_post_process(word), word_pos, ner)
        for word, word_pos, ner in nlp.ner(_pos)
    ]

    if tag:
        open_label = ""  # entity type of the tag currently open, "" if none
        parts = []
        for word, _, ner in sent_ner:
            if ner.startswith("B-"):
                # A new entity begins: close the previous one, if any.
                if open_label:
                    parts.append("</" + open_label + ">")
                open_label = ner[2:]
                parts.append("<" + open_label + ">")
            elif ner == "O" and open_label:
                # Left the entity: close it before emitting this word.
                parts.append("</" + open_label + ">")
                open_label = ""
            parts.append(word)
        # An entity that runs to the end of the text still needs closing.
        if open_label:
            parts.append("</" + open_label + ">")
        return "".join(parts)

    # Identity test kept on purpose: only a literal False drops the POS column.
    if pos is False:
        return [(word, ner) for word, _, ner in sent_ner]

    return sent_ner