Source code for pythainlp.tokenize.crfcut

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CRFCut - Thai sentence segmenter.

Thai sentence segmentation using a conditional random field,
with the default model trained on the TED dataset.

Performance (space-correct accuracy):

- ORCHID: 87%, vs. 95% state of the art (Zhou et al., 2016)
- TED dataset: 82%

See the project's development notebooks for details.
POS features are not used because no sufficiently reliable POS tagger
is available.
"""

import os
from typing import List

import pycrfsuite
from pythainlp.corpus import corpus_path
from pythainlp.tokenize import word_tokenize

# NOTE: abridged cue-word sets; the full lists in the library contain
# more entries than are reconstructed here.
_ENDERS = {
    # ending honorifics
    "ครับ", "ค่ะ", "คะ", "นะคะ", "นะ", "จ้ะ", "ฮะ",
    # enders
    "ๆ", "ด้วย", "เลย", "มาก", "กัน", "อยู่", "มา", "ไป", "ไว้", "เอง", "อีก",
    # demonstratives
    "นั้น", "นี้", "เหล่านี้", "เหล่านั้น",
    # questions
    "อย่างไร", "หรือไม่", "มั้ย", "ไหม", "อะไร", "ทำไม",
}

_STARTERS = {
    # pronouns
    "ผม", "ฉัน", "ดิฉัน", "คุณ", "มัน", "เขา", "เธอ", "เรา", "พวกเขา",
    # connectors
    "และ", "หรือ", "แต่", "เมื่อ", "ถ้า", "เพราะ", "เนื่องจาก", "ซึ่ง", "ดังนั้น", "ฉะนั้น",
    # demonstratives
    "นั้น", "นี้", "เหล่านี้", "เหล่านั้น",
}
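
# A quick illustration (a hedged sketch; the sets above are abridged) of
# how these cue words behave at the Python prompt:
#
#   >>> "ครับ" in _ENDERS    # polite particle, often closes a sentence
#   True
#   >>> "และ" in _STARTERS   # connector, often opens a clause
#   True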

def extract_features(
    doc: List[str], window: int = 2, max_n_gram: int = 3
) -> List[List[str]]:
    """
    Extract features for CRF by sliding n-grams of up to `max_n_gram`
    tokens within +/- `window` of the current token.

    :param List[str] doc: tokens from which features are to be extracted
    :param int window: size of window before and after the current token
    :param int max_n_gram: create n-grams from 1-gram to `max_n_gram`-gram \
        within the `window`
    :return: list of lists of features to be fed to CRF
    """
    doc_features = []
    doc = (
        ["xxpad" for i in range(window)]
        + doc
        + ["xxpad" for i in range(window)]
    )

    # add enders and starters
    doc_ender = []
    doc_starter = []
    for i in range(len(doc)):
        if doc[i] in _ENDERS:
            doc_ender.append("ender")
        else:
            doc_ender.append("normal")

        if doc[i] in _STARTERS:
            doc_starter.append("starter")
        else:
            doc_starter.append("normal")

    # for each word
    for i in range(window, len(doc) - window):
        # bias term
        word_features = ["bias"]

        # ngram features
        for n_gram in range(1, min(max_n_gram + 1, 2 + window * 2)):
            for j in range(i - window, i + window + 2 - n_gram):
                feature_position = f"{n_gram}_{j - i}_{j - i + n_gram}"
                word_ = "|".join(doc[j:(j + n_gram)])
                word_features += [f"word_{feature_position}={word_}"]
                ender_ = "|".join(doc_ender[j:(j + n_gram)])
                word_features += [f"ender_{feature_position}={ender_}"]
                starter_ = "|".join(doc_starter[j:(j + n_gram)])
                word_features += [f"starter_{feature_position}={starter_}"]

        # append features per word
        doc_features.append(word_features)

    return doc_features
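
# Sketch of the feature output (illustrative; the example tokens are
# hypothetical and each full feature list is longer than shown):
#
#   >>> feats = extract_features(["ผม", "กิน", "ข้าว"])
#   >>> len(feats)      # one feature list per input token
#   3
#   >>> feats[0][0]     # every token's list starts with the bias term
#   'bias'
#   >>> feats[0][1]     # names encode n-gram size and offsets, e.g.
#   'word_1_-2_-1=xxpad'
#
# The "xxpad" padding tokens let the window extend past both ends of `doc`.
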
_CRFCUT_DATA_FILENAME = "sentenceseg_crfcut.model"
_tagger = pycrfsuite.Tagger()
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))
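
# Note (inferred from segment() below, not stated elsewhere in this file):
# the model assigns one label per token, and segment() cuts a sentence
# after every token labeled "E" (end of sentence). Other labels (commonly
# "I" for inside-sentence; an assumption, since only "E" is checked) are
# ignored.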
def segment(text: str) -> List[str]:
    """
    CRF-based sentence segmentation.

    :param str text: text to be tokenized into sentences
    :return: list of sentences, tokenized from the text
    """
    if isinstance(text, str):
        toks = word_tokenize(text)
    else:
        toks = text

    feat = extract_features(toks)
    labs = _tagger.tag(feat)
    labs[-1] = "E"  # make sure it cuts the last sentence

    sentences = []
    sentence = ""
    for i, w in enumerate(toks):
        sentence = sentence + w
        if labs[i] == "E":
            sentences.append(sentence)
            sentence = ""

    return sentences
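
# Example usage (a sketch; the exact split depends on the installed model
# and on word_tokenize, so the output shown is illustrative):
#
#   >>> from pythainlp.tokenize.crfcut import segment
#   >>> segment("ฉันไปโรงเรียน เธอไปทำงาน")
#   ['ฉันไปโรงเรียน ', 'เธอไปทำงาน']
#
# Because every token is appended to exactly one sentence, joining the
# returned list reproduces the original text.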