# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sys import stderr
from typing import List
from nlpo3 import segment as nlpo3_segment
from nlpo3 import load_dict as nlpo3_load_dict
from pythainlp.corpus.common import _THAI_WORDS_FILENAME
from pythainlp.corpus import path_pythainlp_corpus
# Name under which the default dictionary is registered with nlpO3; it is
# also the default value of the ``custom_dict`` argument used for tokenizing.
_NLPO3_DEFAULT_DICT_NAME = "_67a47bf9"

# Preload the default dictionary at import time so that it can be referenced
# by _NLPO3_DEFAULT_DICT_NAME.
# NOTE(review): the original call was truncated after the opening parenthesis;
# the arguments below are reconstructed from the module's imports
# (path_pythainlp_corpus, _THAI_WORDS_FILENAME) and the keyword names used by
# load_dict() -- confirm against the upstream file.
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
    file_path=path_pythainlp_corpus(_THAI_WORDS_FILENAME),
    dict_name=_NLPO3_DEFAULT_DICT_NAME,
)
def load_dict(file_path: str, dict_name: str) -> bool:
    """Load a dictionary file into an in-memory dictionary collection.

    The loaded dictionary will be accessible through the assigned dict_name.
    *** This function does not override an existing dict name. ***

    :param file_path: Path to a dictionary file
    :type file_path: str
    :param dict_name: A unique dictionary name, used for reference.
    :type dict_name: str
    :return: the success flag returned by nlpO3's ``load_dict``
    :rtype: bool
    """
    msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name)
    # Test the returned flag, not the builtin ``bool`` type: ``bool is False``
    # is never true, so a failed load would never be reported.
    if not success:
        # NOTE(review): reporting the message on stderr is reconstructed from
        # the module's ``stderr`` import and the otherwise-unused ``msg`` --
        # confirm against the upstream file.
        print(msg, file=stderr)
    return success
custom_dict: str = _NLPO3_DEFAULT_DICT_NAME,
safe_mode: bool = False,
parallel_mode: bool = False,
) -> List[str]:
"""Break text into tokens.
Python binding for nlpO3. It is newmm engine in Rust.
:param str text: text to be tokenized
:param str custom_dict: dictionary name, as assigned with load_dict(),\
defaults to pythainlp/corpus/common/words_th.txt
:param bool safe_mode: reduce chance for long processing time for long text\
with many ambiguous breaking points, defaults to False
:param bool parallel_mode: Use multithread mode, defaults to False
:return: list of tokens