Source code for pythainlp.tokenize.han_solo

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
🪿 Han-solo: Thai syllable segmenter

GitHub: https://github.com/PyThaiNLP/Han-solo
"""
from typing import List
from pythainlp.corpus import path_pythainlp_corpus
try:
    import pycrfsuite
except ImportError:
    raise ImportError("ImportError; Install pycrfsuite by pip install python-crfsuite")

# Module-level pycrfsuite tagger, created once at import time and opened on
# the 'han_solo.crfsuite' model file resolved through path_pythainlp_corpus.
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))


class Featurizer:
    """Turn a string into per-position character-window features.

    For every anchor position the featurizer looks ``radius`` characters to
    the left and ``radius`` characters to the right (``radius`` is
    ``N + sequence_size``) and emits keys of the form
    ``"<relative offset>|<character(s)>"``: one key per individual character
    in the window (optional) followed by one key per ``N``-gram of the window.
    """
    #  This class from ssg at https://github.com/ponrawee/ssg.
    #  Copyright 2019 Ponrawee Prasertsom
    #  Licensed under the Apache License, Version 2.0 (the "License");
    #  you may not use this file except in compliance with the License.
    #  You may obtain a copy of the License at
    #       http://www.apache.org/licenses/LICENSE-2.0
    #  Unless required by applicable law or agreed to in writing, software
    #  distributed under the License is distributed on an "AS IS" BASIS,
    #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    #  See the License for the specific language governing permissions and
    #  limitations under the License.
    # {
    #      "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1
    # }

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        """Store the window configuration.

        :param int N: length of the character n-grams that are emitted
        :param int sequence_size: added to ``N`` to obtain the window radius
        :param delimiter: character that marks a cut in the input; positions
            holding it are labelled 1 and it is skipped when collecting
            window characters. ``None`` never matches a character.
        """
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size

    def pad(self, sentence, padder='#'):
        """Return ``sentence`` with ``radius`` copies of ``padder`` on each side.

        :param str sentence: text to pad
        :param str padder: padding string repeated ``radius`` times per side
        :return: the padded text
        :rtype: str
        """
        padding = padder * self.radius
        return padding + sentence + padding

    @staticmethod
    def _add_feature(features, key):
        """Record ``key`` in a dict-style or list-style feature container."""
        if isinstance(features, dict):
            features[key] = 1
        else:
            features.append(key)

    def featurize(self, sentence, padding=True, indiv_char=True,
                  return_type='list'):
        """Extract features (and cut labels) for every anchor position.

        :param str sentence: text to featurize
        :param bool padding: pad ``sentence`` with :meth:`pad` first
        :param bool indiv_char: also emit one key per single window character
        :param str return_type: ``'list'`` gives each position a list of keys
            and a string label; ``'dict'`` gives a ``{key: 1}`` dict and an
            integer label
        :return: ``{'X': features per position, 'Y': labels per position}``
        :rtype: dict
        :raises ValueError: if ``return_type`` is neither ``'list'`` nor
            ``'dict'``
        """
        # Fail early with a clear message; any other value used to surface
        # later as an AttributeError from calling ``append`` on a dict.
        if return_type not in ('list', 'dict'):
            raise ValueError(
                "return_type must be 'list' or 'dict', got %r" % (return_type,)
            )
        as_dict = return_type == 'dict'

        if padding:
            sentence = self.pad(sentence)

        all_features = []
        all_labels = []
        skip_next = False
        for current_position in range(self.radius,
                                      len(sentence) - self.radius + 1):
            if skip_next:
                # The previous position was a delimiter and already produced
                # the entry for this stretch of text.
                skip_next = False
                continue

            features = {} if as_dict else []
            cut = 0
            if sentence[current_position] == self.delimiter:
                cut = 1
                skip_next = True

            chars_left = ''
            chars_right = ''
            abs_index_left = current_position  # first left read is offset -1
            abs_index_right = current_position - 1  # first right read is offset 0
            for counter in range(self.radius):
                # Step one character to the left, jumping over delimiters.
                abs_index_left -= 1
                char_left = sentence[abs_index_left]
                while char_left == self.delimiter:
                    abs_index_left -= 1
                    char_left = sentence[abs_index_left]
                # Prepend so that chars_left stays in reading order.
                chars_left = char_left + chars_left
                if indiv_char:
                    left_key = '|'.join([str(-counter - 1), char_left])
                    self._add_feature(features, left_key)

                # Step one character to the right, jumping over delimiters.
                abs_index_right += 1
                char_right = sentence[abs_index_right]
                while char_right == self.delimiter:
                    abs_index_right += 1
                    char_right = sentence[abs_index_right]
                chars_right += char_right
                if indiv_char:
                    right_key = '|'.join([str(counter), char_right])
                    self._add_feature(features, right_key)

            # N-grams over the whole window, keyed by the offset of their
            # first character relative to the anchor.
            chars = chars_left + chars_right
            for i in range(len(chars) - self.N + 1):
                ngram = chars[i:i + self.N]
                ngram_key = '|'.join([str(i - self.radius), ngram])
                self._add_feature(features, ngram_key)

            all_features.append(features)
            # List mode uses string labels; dict mode keeps the integer.
            all_labels.append(cut if as_dict else str(cut))

        return {
            'X': all_features,
            'Y': all_labels
        }
# Shared Featurizer built with its default settings (N=2, sequence_size=1,
# no delimiter); segment() uses it to featurize its input text.
_to_feature = Featurizer()
def segment(text: str) -> List[str]:
    """Split text into syllables using the Han-solo tagger.

    The text is featurized character by character, each character is tagged,
    and characters are grouped so that every character tagged ``"1"`` starts
    a new syllable.

    :param str text: text to segment
    :return: list of syllables; an empty list when ``text`` is empty
    :rtype: List[str]
    """
    if not text:
        # Nothing to segment; skip featurizing and tagging altogether.
        return []

    features = _to_feature.featurize(text)["X"]
    labels = tagger.tag(features)

    syllables = []
    for char, label in zip(text, labels):
        # The first character always opens a syllable, even if the tagger did
        # not label it "1"; appending to ``syllables[-1]`` on an empty list
        # would raise IndexError.
        if label == "1" or not syllables:
            syllables.append(char)
        else:
            syllables[-1] += char
    return syllables