
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenizer classes for ULMFiT
"""

from typing import Collection, List
from pythainlp.tokenize import THAI2FIT_TOKENIZER


class BaseTokenizer:
    """Basic class for a tokenizer function. (code from `fastai`)"""

    def __init__(self, lang: str):
        self.lang = lang

    def tokenizer(self, t: str) -> List[str]:
        return t.split(" ")

    def add_special_cases(self, toks: Collection[str]):
        pass

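# A minimal usage sketch (illustrative comment only, not part of the
# original module): BaseTokenizer naively splits on single spaces, which
# is why Thai text needs the dictionary-based ThaiTokenizer below.
#
# >>> BaseTokenizer(lang="en").tokenizer("hello world")
# ['hello', 'world']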

class ThaiTokenizer(BaseTokenizer):
    """
    Wrapper around a frozen newmm tokenizer to make it a
    :class:`fastai.BaseTokenizer`.
    (see: https://docs.fast.ai/text.transform#BaseTokenizer)
    """

    def __init__(self, lang: str = "th"):
        self.lang = lang

    @staticmethod
    def tokenizer(text: str) -> List[str]:
        """
        Tokenize text with a frozen *newmm* engine and the dictionary
        specifically for `ulmfit`-related functions
        (see: `Dictionary file (.txt) \
        <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th_thai2fit_201810.txt>`_).

        :param str text: text to tokenize
        :return: tokenized text
        :rtype: list[str]

        :Example:

        Using :func:`pythainlp.ulmfit.ThaiTokenizer.tokenizer` is
        similar to :func:`pythainlp.tokenize.word_tokenize`
        with the *ulmfit* engine.

        >>> from pythainlp.ulmfit import ThaiTokenizer
        >>> from pythainlp.tokenize import word_tokenize
        >>>
        >>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา"
        >>> ThaiTokenizer.tokenizer(text)
        ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', ' ', 'ภาวนามยปัญญา']
        >>>
        >>> word_tokenize(text, engine='ulmfit')
        ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', ' ', 'ภาวนามยปัญญา']
        """
        return THAI2FIT_TOKENIZER.word_tokenize(text)

    def add_special_cases(self, toks: Collection[str]):
        pass
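

# A hedged, runnable sketch (added for illustration, not part of the
# original module): tokenize the docstring's example sentence when this
# file is executed directly. With fastai v1, a class like this was
# typically plugged in as Tokenizer(tok_func=ThaiTokenizer, lang="th");
# that usage is assumed from the fastai v1 text.transform API referenced
# in the class docstring above.
if __name__ == "__main__":
    sample = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา"
    # Expected output (per the docstring example):
    # ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', ' ', 'ภาวนามยปัญญา']
    print(ThaiTokenizer.tokenizer(sample))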