Source code for pythainlp.tag.named_entity

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Named-entity recognizer
"""
import warnings
from typing import List, Tuple, Union


class NER:
    """
    Named-entity recognizer class

    :param str engine: Named-entity recognizer engine
    :param str corpus: corpus (default: ``"thainer"``)

    **Options for engine**
        * *thainer* - Thai NER engine
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
        * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0
        * *wangchanberta* - WangchanBERTa-based tagger

    **Options for corpus**
        * *thainer* - Thai NER corpus

    **Note**: the *tltk* engine only supports the NER model shipped with
    tltk, so the ``corpus`` argument is not consulted for that engine.
    """

    def __init__(self, engine: str, corpus: str = "thainer") -> None:
        """
        Create the recognizer and immediately load the requested backend.

        :param str engine: Named-entity recognizer engine
        :param str corpus: corpus
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """
        Select and instantiate the backend for ``engine`` and ``corpus``.

        Backends are imported lazily inside each branch so that only the
        selected engine's module is imported.

        :param str engine: Named-entity recognizer engine
        :param str corpus: corpus
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        # Both attributes are assigned before dispatching, so they are set
        # even when the combination turns out to be unsupported.
        self.name_engine = engine
        self.engine = None

        if engine == "thainer" and corpus == "thainer":
            from pythainlp.tag.thainer import ThaiNameTagger

            self.engine = ThaiNameTagger()
        elif engine == "thainer-v2" and corpus == "thainer":
            from pythainlp.wangchanberta import NamedEntityRecognition

            self.engine = NamedEntityRecognition(
                model="pythainlp/thainer-corpus-v2-base-model"
            )
        elif engine == "tltk":
            # The tltk module itself is used as the engine object;
            # corpus is deliberately not checked here.
            from pythainlp.tag import tltk

            self.engine = tltk
        elif engine == "wangchanberta" and corpus == "thainer":
            from pythainlp.wangchanberta import ThaiNameTagger

            self.engine = ThaiNameTagger(dataset_name=corpus)
        else:
            raise ValueError(
                "NER class not support {0} engine or {1} corpus.".format(
                    engine, corpus
                )
            )

    def tag(
        self, text: str, pos: bool = False, tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        This function tags named entities in text in IOB format.

        The call is delegated to the loaded engine's ``get_ner``.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tag
                         (not supported by wangchanberta).
        :param bool tag: output like html tag.
        :return: a list of tuples of tokenized word, NER tag and POS tag
                 (if the parameter `pos` is specified as `True`), or an
                 html-like tagged string (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, a list of tuples of tokenized word and NER tag
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'O'),
            ('นาย', 'B-PERSON'),
            ('วรรณ', 'I-PERSON'),
            ('พงษ์', 'I-PERSON'),
            (' ', 'I-PERSON'),
            ('ภัททิย', 'I-PERSON'),
            ('ไพบูลย์', 'I-PERSON')]
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        return self.engine.get_ner(text, tag=tag, pos=pos)
class NNER:
    """
    Nested Named Entity Recognition

    :param str engine: Nested Named entity recognizer engine
                       (default: ``"thai_nner"``)

    **Options for engine**
        * *thai_nner* - Thai NER engine
    """

    def __init__(self, engine: str = "thai_nner") -> None:
        """
        Create the recognizer and immediately load the backend.

        :param str engine: Nested Named entity recognizer engine
        """
        self.load_engine(engine)

    def load_engine(self, engine: str = "thai_nner") -> None:
        """
        Instantiate the nested-NER backend.

        NOTE(review): ``engine`` is accepted but not read here; ``Thai_NNER``
        is always loaded regardless of the value passed.

        :param str engine: Nested Named entity recognizer engine
        """
        # Imported lazily, at call time rather than at module import.
        from pythainlp.tag.thai_nner import Thai_NNER

        self.engine = Thai_NNER()

    def tag(self, text: str) -> Tuple[List[str], List[dict]]:
        """
        This function tags nested named entities.

        The call is delegated to the loaded engine's ``tag``.

        :param str text: text in Thai to be tagged

        :return: a tuple of the list of tokens and the list of entity
                 dicts (``text``, ``span``, ``entity_type``), as shown in
                 the example below.
        :rtype: Tuple[List[str], List[dict]]

        :Example:

            >>> from pythainlp.tag.named_entity import NNER
            >>> nner = NNER()
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
            ([
                '<s>',
                '',
                'แมว',
                'ทํา',
                '',
                'อะไร',
                'ตอน',
                '',
                'ห้า',
                '',
                'โมง',
                '',
                'เช้า',
                '</s>'
            ],
            [
                {
                    'text': ['', 'ห้า'],
                    'span': [7, 9],
                    'entity_type': 'cardinal'
                },
                {
                    'text': ['', 'ห้า', '', 'โมง'],
                    'span': [7, 11],
                    'entity_type': 'time'
                },
                {
                    'text': ['', 'โมง'],
                    'span': [9, 11],
                    'entity_type': 'unit'
                }
            ])
        """
        return self.engine.tag(text)