# Source code for pythainlp.tokenize.han_solo

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-FileCopyrightText: Copyright 2019 Ponrawee Prasertsom
# SPDX-License-Identifier: Apache-2.0
"""
🪿 Han-solo: Thai syllable segmenter

GitHub: https://github.com/PyThaiNLP/Han-solo
"""
from typing import List
from pythainlp.corpus import path_pythainlp_corpus

# pycrfsuite is an optional dependency; fail fast with an actionable message.
try:
    import pycrfsuite
except ImportError as err:
    # Chain the original error so the real import failure stays visible.
    raise ImportError(
        "ImportError; Install pycrfsuite by pip install python-crfsuite"
    ) from err

# Module-level CRF tagger, loaded once at import time from the bundled
# Han-solo model shipped with the PyThaiNLP corpus.
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus("han_solo.crfsuite"))


class Featurizer:  # This class from ssg at https://github.com/ponrawee/ssg.
    """Character-window featurizer for CRF syllable segmentation.

    For every character position it emits positional single-character
    features (``"offset|char"``) plus character n-gram features over the
    surrounding window, and a binary cut label (1 when the character equals
    ``delimiter``, else 0).
    """

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        """Configure the feature window.

        :param int N: character n-gram size
        :param int sequence_size: extra context beyond the n-gram size
        :param delimiter: character marking a cut point in training data,
            or ``None`` when featurizing raw (unlabeled) text
        """
        self.N = N
        self.delimiter = delimiter
        # Number of context characters taken on each side of a position.
        self.radius = N + sequence_size

    def pad(self, sentence, padder="#"):
        """Pad *sentence* with ``radius`` padder characters on both sides."""
        return padder * (self.radius) + sentence + padder * (self.radius)

    def featurize(
        self, sentence, padding=True, indiv_char=True, return_type="list"
    ):
        """Turn *sentence* into per-position CRF features and cut labels.

        :param str sentence: input text (delimiter-annotated for training)
        :param bool padding: pad with ``#`` so edge positions have context
        :param bool indiv_char: include single-character positional features
        :param str return_type: ``"list"`` for feature lists and string
            labels, ``"dict"`` for feature dicts and integer labels
        :return: ``{"X": all_features, "Y": all_labels}``
        """
        if padding:
            sentence = self.pad(sentence)
        all_features = []
        all_labels = []
        skip_next = False
        for current_position in range(
            self.radius, len(sentence) - self.radius + 1
        ):
            if skip_next:
                # Previous position was a delimiter; the delimiter itself
                # was consumed as the label for the preceding character.
                skip_next = False
                continue
            features = {}
            if return_type == "list":
                features = []
            cut = 0
            char = sentence[current_position]
            if char == self.delimiter:
                cut = 1
                skip_next = True
            counter = 0
            chars_left = ""
            chars_right = ""
            abs_index_left = current_position  # left side starts at -1
            abs_index_right = current_position - 1  # right side starts at 0
            while counter < self.radius:
                # From position 0 this yields -1, -2, ..., -radius.
                abs_index_left -= 1
                char_left = sentence[abs_index_left]
                # Delimiters are annotation, not text: skip over them.
                while char_left == self.delimiter:
                    abs_index_left -= 1
                    char_left = sentence[abs_index_left]
                relative_index_left = -counter - 1
                # Collect the character for the n-gram window.
                chars_left = char_left + chars_left
                # Add the positional single-character feature.
                if indiv_char:
                    left_key = "|".join([str(relative_index_left), char_left])
                    if return_type == "dict":
                        features[left_key] = 1
                    else:
                        features.append(left_key)
                # From position 0 this yields 0, 1, ..., radius - 1.
                abs_index_right += 1
                char_right = sentence[abs_index_right]
                while char_right == self.delimiter:
                    abs_index_right += 1
                    char_right = sentence[abs_index_right]
                relative_index_right = counter
                chars_right += char_right
                if indiv_char:
                    right_key = "|".join(
                        [str(relative_index_right), char_right]
                    )
                    if return_type == "dict":
                        features[right_key] = 1
                    else:
                        features.append(right_key)
                counter += 1
            # N-gram features over the full (left + right) context window.
            chars = chars_left + chars_right
            for i in range(0, len(chars) - self.N + 1):
                ngram = chars[i : i + self.N]
                ngram_key = "|".join([str(i - self.radius), ngram])
                if return_type == "dict":
                    features[ngram_key] = 1
                else:
                    features.append(ngram_key)
            all_features.append(features)
            if return_type == "list":
                # pycrfsuite expects string labels.
                cut = str(cut)
            all_labels.append(cut)
        return {"X": all_features, "Y": all_labels}
# Shared Featurizer used by segment(); default window (N=2 -> radius 3).
_to_feature = Featurizer()
def segment(text: str) -> List[str]:
    """Segment Thai text into syllables with the Han-solo CRF model.

    :param str text: Thai text
    :return: list of syllables (empty list for empty input)
    :rtype: List[str]
    """
    features = _to_feature.featurize(text)["X"]
    labels = tagger.tag(features)
    syllables: List[str] = []
    # zip truncates to len(text); featurize may emit one extra position
    # for the trailing padding character.
    for char, label in zip(text, labels):
        # "1" marks a syllable start. Also start a syllable when the list
        # is still empty, so a model output that does not flag the very
        # first character cannot raise IndexError on syllables[-1].
        if label == "1" or not syllables:
            syllables.append(char)
        else:
            syllables[-1] += char
    return syllables