Source code for pythainlp.tokenize.crfcut

# -*- coding: utf-8 -*-
"""
CRFCut - Thai sentence segmenter.

Thai sentence segmentation using a conditional random field (CRF);
the default model is trained on the TED dataset.

Performance:
- ORCHID - space-correct accuracy 87% vs 95% state-of-the-art
  (Zhou et al., 2016; https://www.aclweb.org/anthology/C16-1031.pdf)
- TED dataset - space-correct accuracy 82%

See development notebooks at https://github.com/vistec-AI/ted_crawler.
POS features are not used because reliable POS tagging is not available.
"""

import os
from typing import List

import pycrfsuite
from pythainlp.corpus import corpus_path
from pythainlp.tokenize import word_tokenize

_ENDERS = {
    # ending honorifics
    "ครับ",
    "ค่ะ",
    "คะ",
    "นะคะ",
    "นะ",
    "จ้ะ",
    "จ้า",
    "จ๋า",
    "ฮะ",
    # enders
    "ๆ",
    "ได้",
    "แล้ว",
    "ด้วย",
    "เลย",
    "มาก",
    "น้อย",
    "กัน",
    "เช่นกัน",
    "เท่านั้น",
    "อยู่",
    "ลง",
    "ขึ้น",
    "มา",
    "ไป",
    "ไว้",
    "เอง",
    "อีก",
    "ใหม่",
    "จริงๆ",
    "บ้าง",
    "หมด",
    "ทีเดียว",
    "เดียว",
    # demonstratives
    "นั้น",
    "นี้",
    "เหล่านี้",
    "เหล่านั้น",
    # questions
    "อย่างไร",
    "ยังไง",
    "หรือไม่",
    "มั้ย",
    "ไหน",
    "ไหม",
    "อะไร",
    "ทำไม",
    "เมื่อไหร่",
    "เมื่อไร",
}
_STARTERS = {
    # pronouns
    "ผม",
    "ฉัน",
    "ดิฉัน",
    "ชั้น",
    "คุณ",
    "มัน",
    "เขา",
    "เค้า",
    "เธอ",
    "เรา",
    "พวกเรา",
    "พวกเขา",
    "กู",
    "มึง",
    "แก",
    "ข้าพเจ้า",
    # connectors
    "และ",
    "หรือ",
    "แต่",
    "เมื่อ",
    "ถ้า",
    "ใน",
    "ด้วย",
    "เพราะ",
    "เนื่องจาก",
    "ซึ่ง",
    "ไม่",
    "ตอนนี้",
    "ทีนี้",
    "ดังนั้น",
    "เพราะฉะนั้น",
    "ฉะนั้น",
    "ตั้งแต่",
    "ในที่สุด",
    "ก็",
    "กับ",
    "แก่",
    "ต่อ",
    # demonstratives
    "นั้น",
    "นี้",
    "เหล่านี้",
    "เหล่านั้น",
}
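
# Note: these two sets do not cut sentences by themselves; they are only used
# as binary cues ("ender"/"starter" vs "normal") in extract_features() below.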


def extract_features(
    doc: List[str], window: int = 2, max_n_gram: int = 3
) -> List[List[str]]:
    """
    Extract features for the CRF by sliding n-grams of up to `max_n_gram`
    tokens over a window of +/- `window` tokens around the current token.

    :param List[str] doc: tokens from which features are to be extracted
    :param int window: size of window before and after the current token
    :param int max_n_gram: create n-grams from 1-gram to `max_n_gram`-gram \
        within the `window`
    :return: list of lists of features to be fed to the CRF
    """
    doc_features = []

    # pad both ends so every token has a full window of context
    doc = (
        ["xxpad" for i in range(window)]
        + doc
        + ["xxpad" for i in range(window)]
    )

    # add enders and starters
    doc_ender = []
    doc_starter = []
    for i in range(len(doc)):
        if doc[i] in _ENDERS:
            doc_ender.append("ender")
        else:
            doc_ender.append("normal")

        if doc[i] in _STARTERS:
            doc_starter.append("starter")
        else:
            doc_starter.append("normal")

    # for each word
    for i in range(window, len(doc) - window):
        # bias term
        word_features = ["bias"]

        # ngram features
        for n_gram in range(1, min(max_n_gram + 1, 2 + window * 2)):
            for j in range(i - window, i + window + 2 - n_gram):
                feature_position = f"{n_gram}_{j-i}_{j-i+n_gram}"

                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f"word_{feature_position}={word_}"]

                ender_ = f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f"ender_{feature_position}={ender_}"]

                starter_ = f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f"starter_{feature_position}={starter_}"]

        # append to feature per word
        doc_features.append(word_features)

    return doc_features
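
# Illustrative note (not part of the original module): for a short token list
# such as ["ผม", "กิน", "ข้าว"] with window=1 and max_n_gram=2, the feature
# list for the middle token begins with
#   ["bias", "word_1_-1_0=ผม", "ender_1_-1_0=normal", "starter_1_-1_0=starter",
#    "word_1_0_1=กิน", ...]
# i.e. each feature names an n-gram at a position relative to the current
# token, plus whether the words in it are known "enders"/"starters".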

_CRFCUT_DATA_FILENAME = "sentenceseg_crfcut.model"

_tagger = pycrfsuite.Tagger()
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))
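
# The tagger assigns one label per word token; segment() below starts a new
# sentence after every token labelled "E".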


def segment(text: str) -> List[str]:
    """
    CRF-based sentence segmentation.

    :param str text: text to be split into sentences
    :return: list of sentences segmented from the text
    """
    toks = word_tokenize(text)
    feat = extract_features(toks)
    labs = _tagger.tag(feat)
    labs[-1] = "E"  # make sure it cuts the last sentence

    sentences = []
    sentence = ""
    for i, w in enumerate(toks):
        sentence = sentence + w
        if labs[i] == "E":
            sentences.append(sentence)
            sentence = ""

    return sentences
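

# A minimal usage sketch (an illustration, not part of the original module):
# running this file directly segments a small sample text with the default
# model loaded above. The sample sentence is made up for demonstration.
if __name__ == "__main__":
    sample = "ผมกินข้าวแล้ว คุณจะไปไหนครับ"
    for i, sent in enumerate(segment(sample)):
        print(i, sent)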