Source code for pythainlp.tag.named_entity

# -*- coding: utf-8 -*-
"""
Named-entity recognizer
"""
import warnings
from typing import List, Tuple, Union


class NER:
    """
    Named-entity recognizer class.

    :param str engine: named-entity recognizer engine
    :param str corpus: corpus the engine's model was trained on

    **Options for engine**
        * *thainer* - Thai NER engine
        * *wangchanberta* - wangchanberta model
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.

    **Options for corpus**
        * *thainer* - Thai NER corpus
        * *lst20* - lst20 corpus (wangchanberta only)

    **Note**: the tltk engine only supports the NER model shipped with
    tltk; the ``corpus`` argument is not used for that engine.
    """

    def __init__(self, engine: str, corpus: str = "thainer") -> None:
        """
        Create a recognizer and load the requested engine right away.

        :param str engine: named-entity recognizer engine
        :param str corpus: corpus name (default: ``"thainer"``)
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """
        Load (or replace) the underlying NER engine.

        The backing module is imported lazily, so only the dependencies of
        the selected engine are needed.

        :param str engine: ``"thainer"``, ``"wangchanberta"`` or ``"tltk"``
        :param str corpus: corpus name; ``"thainer"`` is the only corpus
                           accepted by the *thainer* engine
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        # Build the engine into a local first: if validation (or the lazy
        # import) fails, an already-working instance keeps its previous
        # engine instead of being left with a bad name and ``engine=None``.
        if engine == "thainer" and corpus == "thainer":
            from pythainlp.tag.thainer import ThaiNameTagger

            loaded_engine = ThaiNameTagger()
        elif engine == "wangchanberta":
            from pythainlp.wangchanberta import ThaiNameTagger

            loaded_engine = ThaiNameTagger(dataset_name=corpus)
        elif engine == "tltk":
            from pythainlp.tag import tltk

            # The tltk module itself exposes ``get_ner``.
            loaded_engine = tltk
        else:
            raise ValueError(
                "NER class not support {0} engine or {1} corpus.".format(
                    engine, corpus
                )
            )

        self.name_engine = engine
        self.engine = loaded_engine

    def tag(
        self, text: str, pos: bool = True, tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        Tag named entities in text, using the IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags
                         (not supported by wangchanberta)
        :param bool tag: output as an HTML-like tagged string
        :return: a list of tuples of tokenized word, POS tag and NER tag
                 when `pos` is `True` (see the example below); a list of
                 tuples of tokenized word and NER tag when `pos` is
                 `False`; or an HTML-like tagged string when `tag` is
                 `True`
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'VV', 'O'),
            ('นาย', 'NN', 'B-PERSON'),
            ('วรรณ', 'NN', 'I-PERSON'),
            ('พงษ์', 'NN', 'I-PERSON'),
            (' ', 'PU', 'I-PERSON'),
            ('ภัททิย', 'NN', 'I-PERSON'),
            ('ไพบูลย์', 'NN', 'I-PERSON')]
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        if self.name_engine == "wangchanberta":
            if pos:
                # stacklevel=2 attributes the warning to the caller of
                # ``tag`` rather than to this library module.
                warnings.warn(
                    "wangchanberta is not support part-of-speech tag. "
                    "It have not part-of-speech tag in output.",
                    stacklevel=2,
                )
            # This engine is called without ``pos``.
            return self.engine.get_ner(text, tag=tag)

        return self.engine.get_ner(text, tag=tag, pos=pos)