# Source code for pythainlp.tag

# -*- coding: utf-8 -*-
"""
Tagging each token in a sentence with supplementary information,
such as its Part-of-Speech (POS) tag, and Named Entity Recognition (NER) tag.


"""

from typing import List, Tuple

__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"]
from .locations import tag_provinces

# Mapping from Orchid POS tags to Universal Dependencies (UD) POS tags.
# Mapping contributed by Korakot Chaovavanich.
#
# Each row below is (UD tag, tags that map to it). Most rows list the UD tag
# itself first, so a tag that is already UD maps to itself; the VERB and INTJ
# rows have no such identity entry.
_TAG_MAP_UD = {
    source_tag: ud_tag
    for ud_tag, source_tags in (
        (
            "NOUN",
            ("NOUN", "NCMN", "NTTL", "CNIT", "CLTV", "CMTR", "CFQC", "CVBL"),
        ),
        ("VERB", ("VACT", "VSTA")),
        ("PROPN", ("PROPN", "NPRP")),
        ("ADJ", ("ADJ", "NONM", "VATT", "DONM")),
        ("ADV", ("ADV", "ADVN", "ADVI", "ADVP", "ADVS")),
        ("INTJ", ("INT",)),
        ("PRON", ("PRON", "PPRS", "PDMN", "PNTR")),
        (
            "DET",
            ("DET", "DDAN", "DDAC", "DDBQ", "DDAQ", "DIAC", "DIBQ", "DIAQ"),
        ),
        ("NUM", ("NUM", "NCNM", "NLBL", "DCNM")),
        ("AUX", ("AUX", "XVBM", "XVAM", "XVMM", "XVBB", "XVAE")),
        ("ADP", ("ADP", "RPRE")),
        ("CCONJ", ("CCONJ", "JCRG")),
        ("SCONJ", ("SCONJ", "PREL", "JSBR", "JCMP")),
        ("PART", ("PART", "FIXN", "FIXV", "EAFF", "EITT", "AITT", "NEG")),
        ("PUNCT", ("PUNCT", "PUNC")),
    )
    for source_tag in source_tags
}


def _UD_Exception(w: str, tag: str) -> str:
    """
    Apply word-specific overrides to a Universal Dependencies tag.

    :param str w: the word being tagged
    :param str tag: the UD tag already chosen for the word
    :return: ``"NOUN"`` when the word is "การ" or "ความ", otherwise
        *tag* unchanged
    :rtype: str
    """
    return "NOUN" if w in ("การ", "ความ") else tag


def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
    """
    Convert (word, Orchid tag) pairs into (word, UD tag) pairs.

    Each Orchid tag is looked up in ``_TAG_MAP_UD`` and the result is then
    passed through ``_UD_Exception`` for word-specific overrides.

    :param tag: a sequence of pairs whose first item is the word and whose
        second item is its tag
    :return: a new list of (word, UD tag) tuples, in the same order
    :rtype: list[tuple[str, str]]
    :raises KeyError: if a tag is not a key of ``_TAG_MAP_UD``
    """
    return [
        (pair[0], _UD_Exception(pair[0], _TAG_MAP_UD[pair[1]]))
        for pair in tag
    ]


def pos_tag(
    words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
    """
    Tag a list of tokenized words with Part-of-Speech (POS) tags,
    such as 'NOUN', 'VERB', 'ADJ', and 'DET'.

    :param list words: a list of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger; any value other than
          ``"perceptron"`` also selects this engine
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_
          (default)
        * *orchid_ud* - annotated Thai academic articles *Orchid*, with
          the POS tags mapped to comply with
          `Universal Dependencies <https://universaldependencies.org/u/pos>`_
          POS tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_
          treebanks
    :return: a list of (word, tag) tuples as produced by the selected
        tagger; an empty list if *words* is empty
    :rtype: list[tuple[str, str]]

    :Example:

    Tag words with corpus `orchid` (default)::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words)
        # output:
        # [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'),
        #  ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'),
        #  ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')]

    Tag words with corpus `orchid_ud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='orchid_ud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'),
        #  ('รอด', 'NOUN'), ('ใน', 'ADP'), ('อาคาร', 'NOUN'),
        #  ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'),
        #  ('เชอร์ชิล', 'NOUN')]

    Tag words with corpus `pud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='pud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'),
        #  ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
        #  ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]

    Tag words with different engines including *perceptron* and *unigram*::

        from pythainlp.tag import pos_tag

        words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3']

        pos_tag(words, engine='perceptron', corpus='orchid')
        # output:
        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
        #  ('ขา', 'NCMN'), (' ', 'PUNC'),
        #  ('=', 'PUNC'), ('3', 'NCNM')]

        pos_tag(words, engine='unigram', corpus='pud')
        # output:
        # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
        #  ('<space>', None), ('<equal>', None), ('3', 'NUM')]
    """
    # Nothing to tag: return before importing any engine module.
    if not words:
        return []

    # "orchid_ud" is handled as the "orchid" corpus whose tags are mapped
    # to Universal Dependencies after tagging.
    to_ud = corpus == "orchid_ud"
    if to_ud:
        corpus = "orchid"

    # Import only the module of the engine that was selected.
    if engine == "perceptron":
        from .perceptron import tag as tag_
    else:  # fallback for every other value: the "unigram" ("old") engine
        from .unigram import tag as tag_

    word_tags = tag_(words, corpus=corpus)

    if to_ud:
        word_tags = _orchid_to_ud(word_tags)

    return word_tags
def pos_tag_sents(
    sentences: List[List[str]],
    engine: str = "perceptron",
    corpus: str = "orchid",
) -> List[List[Tuple[str, str]]]:
    """
    Tag multiple lists of tokenized words with Part-of-Speech (POS) tags.

    Each sentence is tagged independently by calling :func:`pos_tag` with
    the same *engine* and *corpus*.

    :param list sentences: a list of lists of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_
          (default)
        * *orchid_ud* - annotated Thai academic articles using
          `Universal Dependencies <https://universaldependencies.org/>`_
          tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_
          treebanks
    :return: one list of (word, tag) tuples per given sentence, in the
        same order; an empty list if *sentences* is empty
    :rtype: list[list[tuple[str, str]]]

    :Example:

    Label POS for two sentences::

        from pythainlp.tag import pos_tag_sents

        sentences = [['เก้าอี้','มี','3','ขา'],
                     ['นก', 'บิน', 'กลับ', 'รัง']]
        pos_tag_sents(sentences, corpus='pud')
        # output:
        # [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
        #   ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
        #   ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
    """
    if not sentences:
        return []

    return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]