# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
🪿 Han-solo: Thai syllable segmenter
GitHub: https://github.com/PyThaiNLP/Han-solo
"""
from typing import List
from pythainlp.corpus import path_pythainlp_corpus
try:
    import pycrfsuite
except ImportError as err:
    # The ``pycrfsuite`` module is shipped by the ``python-crfsuite`` package.
    # Chain the original error so the underlying import failure stays visible.
    raise ImportError(
        "ImportError; Install pycrfsuite by pip install python-crfsuite"
    ) from err

# Module-level CRF tagger, created once at import time and reused by
# ``segment``. It opens the 'han_solo.crfsuite' model file whose location is
# resolved through ``path_pythainlp_corpus``.
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))
class Featurizer:
    """Turn a string into character-window features, one set per position.

    For every candidate position the featurizer looks at ``radius``
    characters on each side (``radius = N + sequence_size``) and emits
    string keys of the form ``"<relative index>|<character(s)>"``:
    one key per individual character in the window (optional) and one key
    per character N-gram inside the window.
    """

    # This class from ssg at https://github.com/ponrawee/ssg.
    # Copyright 2019 Ponrawee Prasertsom
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    # http://www.apache.org/licenses/LICENSE-2.0
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    # {
    # "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1
    # }

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        """Configure the window.

        :param N: length of the character N-grams emitted as features.
        :param sequence_size: added to ``N`` to obtain the window radius.
        :param delimiter: character that marks a cut in the input; it is
            never emitted as a feature character. ``None`` matches no
            character, i.e. the input carries no cut marks.
        """
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size

    def pad(self, sentence, padder='#'):
        """Return ``sentence`` with ``radius`` copies of ``padder`` on each side."""
        padding = padder * self.radius
        return padding + sentence + padding

    def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
        """Extract window features and cut labels for ``sentence``.

        Positions ``radius`` .. ``len(sentence) - radius`` (inclusive, after
        optional padding) are visited, so a padded string of ``n`` characters
        without delimiters yields ``n + 1`` feature sets.

        :param sentence: text to featurize.
        :param padding: pad the text with ``pad`` first so that the window
            never leaves the string.
        :param indiv_char: also emit one key per single character in the
            window, in addition to the N-gram keys.
        :param return_type: ``'list'`` gives each feature set as a list of
            keys and each label as ``'0'``/``'1'``; ``'dict'`` gives each
            feature set as ``{key: 1}`` and each label as ``0``/``1``.
        :return: ``{'X': feature sets, 'Y': labels}`` where a label is 1 when
            the character at the visited position is the delimiter.
        :raises ValueError: if ``return_type`` is neither ``'list'`` nor
            ``'dict'``.
        """
        if return_type not in ('list', 'dict'):
            # Previously this fell through to the dict initialisation and then
            # failed with an AttributeError on ``.append``.
            raise ValueError(
                "return_type must be 'list' or 'dict', got {!r}".format(return_type)
            )
        as_list = return_type == 'list'

        if padding:
            sentence = self.pad(sentence)

        all_features = []
        all_labels = []
        skip_next = False
        for current_position in range(self.radius, len(sentence) - self.radius + 1):
            if skip_next:
                # The previous position was a delimiter and already produced
                # the features for this boundary.
                skip_next = False
                continue

            cut = 0
            if sentence[current_position] == self.delimiter:
                cut = 1
                skip_next = True

            keys = []
            chars_left = ''
            chars_right = ''
            abs_index_left = current_position  # first left character is at -1
            abs_index_right = current_position - 1  # first right character is at 0
            for counter in range(self.radius):
                # With the anchor at position 0 the left side visits
                # -1, -2, -3, -4, -5 (for radius = 5), skipping delimiters.
                abs_index_left -= 1
                char_left = sentence[abs_index_left]
                while char_left == self.delimiter:
                    abs_index_left -= 1
                    char_left = sentence[abs_index_left]
                # Collect the characters (kept in reading order).
                chars_left = char_left + chars_left
                # Add the single-character feature.
                if indiv_char:
                    keys.append('|'.join([str(-counter - 1), char_left]))

                # With the anchor at position 0 the right side visits
                # 0, 1, 2, 3, 4 (for radius = 5), skipping delimiters.
                abs_index_right += 1
                char_right = sentence[abs_index_right]
                while char_right == self.delimiter:
                    abs_index_right += 1
                    char_right = sentence[abs_index_right]
                chars_right += char_right
                if indiv_char:
                    keys.append('|'.join([str(counter), char_right]))

            # N-gram features over the whole window, indexed relative to the
            # anchor (the window starts at -radius).
            chars = chars_left + chars_right
            for i in range(len(chars) - self.N + 1):
                keys.append('|'.join([str(i - self.radius), chars[i:i + self.N]]))

            if as_list:
                all_features.append(keys)
                all_labels.append(str(cut))
            else:
                all_features.append(dict.fromkeys(keys, 1))
                all_labels.append(cut)

        return {
            'X': all_features,
            'Y': all_labels
        }
# Shared featurizer built with the default settings (N=2, sequence_size=1,
# no delimiter); segment() reuses it on every call.
_to_feature = Featurizer()
def segment(text: str) -> List[str]:
    """Split ``text`` into syllables using the Han-solo CRF model.

    The text is featurized with the module-level featurizer, the
    module-level tagger predicts one label per position, and a label of
    ``"1"`` starts a new syllable at that character.

    :param text: text to segment.
    :return: list of syllables; an empty list for empty input.
    """
    if not text:
        # Nothing to segment; avoid running the tagger at all.
        return []

    features = _to_feature.featurize(text)["X"]
    labels = tagger.tag(features)

    syllables: List[str] = []
    # The featurizer yields one more position than there are characters;
    # zip() stops at the last character.
    for char, label in zip(text, labels):
        # The first character always opens a syllable: without this guard a
        # first label other than "1" would index into an empty list.
        if label == "1" or not syllables:
            syllables.append(char)
        else:
            syllables[-1] += char
    return syllables