Source code for pythainlp.util.phoneme

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Phonemes util
"""
import unicodedata
from pythainlp.util.trie import Trie
from pythainlp.tokenize import Tokenizer

consonants_ipa_nectec = [
# ipa, initial, final

monophthong_ipa_nectec = [
    ("","@@"), #-อ long

diphthong_ipa_nectec = [

tones_ipa_nectec = [

# Lookup table from a NECTEC symbol to its IPA symbol.
# Every table row keeps the IPA form at index 0 and the NECTEC form at index 1.
dict_nectec_to_ipa = dict(
    (row[1], row[0])
    for row in (
        consonants_ipa_nectec
        + monophthong_ipa_nectec
        + diphthong_ipa_nectec
        + tones_ipa_nectec
    )
)
# Consonant rows that carry a third element (the table header labels the
# columns "ipa, initial, final") register that element as an additional key
# pointing at the same IPA symbol.
dict_nectec_to_ipa.update(
    (row[2], row[0]) for row in consonants_ipa_nectec if len(row) >= 3
)

def nectec_to_ipa(pronunciation: str) -> str:
    """
    Convert NECTEC system to IPA system

    The input is split on ``-``. Each piece that is a key of
    ``dict_nectec_to_ipa`` is replaced by its mapped value; any other piece
    is kept unchanged. The pieces are then joined with single spaces.

    :param str pronunciation: NECTEC phoneme
    :return: IPA that is converted
    :rtype: str

    :Example:
    ::

        from pythainlp.util import nectec_to_ipa

        print(nectec_to_ipa("kl-uua-j^-2"))
        # output : 'kl uua j ˥˩'

    References
    ----------

    Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing
    and Speech corpus. In: Handbook of Thai Electronic Corpus. 1st ed.
    p. 122–56.
    """
    symbols = pronunciation.split("-")
    # Unknown symbols fall through unchanged via the ``get`` default.
    return " ".join(dict_nectec_to_ipa.get(symbol, symbol) for symbol in symbols)
# IPA phoneme -> RTGS (Royal Thai General System of Transcription) spelling.
# The keys also serve as the vocabulary of the IPA tokenizer built below.
dict_ipa_rtgs = {
    "b": "b",
    "d": "d",
    "f": "f",
    "h": "h",
    # The conversion of j depends on its position in the syllable.
    # But, unfortunately, the current implementation cannot handle both cases.
    # To remove confusions without changing the behavior and breaking
    # existing codes, it is suggested that the first key-value mapping of j
    # be simply commented out, as it would be overridden by the second one
    # and thus never take effect from the beginning.
    # See #846 for a more detailed discussion.
    # "j":"y",
    "k": "k",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "ŋ": "ng",
    "p": "p",
    "pʰ": "ph",
    "r": "r",
    "s": "s",
    "t": "t",
    "tʰ": "th",
    "tɕ": "ch",
    "tɕʰ": "ch",
    "w": "w",
    "ʔ": "",
    "j": "i",
    # Short monophthongs
    "a": "a",
    "e": "e",
    "ɛ": "ae",
    "i": "i",
    "o": "o",
    "ɔ": "o",
    "u": "u",
    "ɯ": "ue",
    "ɤ": "oe",
    # Long monophthongs (same spelling as their short counterparts)
    "aː": "a",
    "eː": "e",
    "ɛː": "ae",
    "iː": "i",
    "oː": "o",
    "ɔː": "o",
    "uː": "u",
    "ɯː": "ue",
    "ɤː": "oe",
    # Diphthongs
    "ia": "ia",
    "ua": "ua",
    "ɯa": "uea",
    # Vowel + glide sequences
    "aj": "ai",
    "aw": "ao",
    "ew": "eo",
    "ɛw": "aeo",
    "iw": "io",
    # Fixed: this entry used to be "io", duplicating the "iw" row above.
    # Every other short/long pair in this table shares one spelling and
    # "ɔːj" is "oi", so the short form must be "oi" as well.
    "ɔj": "oi",
    "uj": "ui",
    "aːj": "ai",
    "aːw": "ao",
    "eːw": "eo",
    "ɛːw": "aeo",
    "oːj": "oi",
    "ɔːj": "oi",
    "ɤːj": "oei",
    "iaw": "iao",
    "uaj": "uai",
    "ɯaj": "ueai",
    # Syllable separator is passed through unchanged.
    ".": ".",
}

# Overrides that ipa_to_rtgs applies only to the very last token of the input.
dict_ipa_rtgs_final = {
    "w": "o",
}

# Dictionary-based tokenizer whose vocabulary is every key of the two tables
# above; ipa_to_rtgs uses it to cut an IPA string into phoneme tokens.
trie = Trie(list(dict_ipa_rtgs) + list(dict_ipa_rtgs_final))
ipa_cut = Tokenizer(custom_dict=trie, engine="newmm")
def ipa_to_rtgs(ipa: str) -> str:
    """
    Convert IPA system to The Royal Thai General System of Transcription (RTGS)

    The input is tokenized with ``ipa_cut``. The last token is looked up in
    ``dict_ipa_rtgs_final`` first; every token (including the last one when
    it has no final-position override) is then looked up in
    ``dict_ipa_rtgs``. Tokens found in neither table are kept unchanged.
    The joined result is NFKD-normalized and every non-ASCII character is
    dropped.

    :param str ipa: IPA phoneme
    :return: The RTGS that is converted, according to rules listed in the
        Wikipedia page
    :rtype: str

    :Example:
    ::

        from pythainlp.util import ipa_to_rtgs

        print(ipa_to_rtgs("kluaj"))
        # output : 'kluai'
    """
    phonemes = ipa_cut.word_tokenize(ipa)
    last_index = len(phonemes) - 1

    romanized = []
    for position, phoneme in enumerate(phonemes):
        if position == last_index and phoneme in dict_ipa_rtgs_final:
            # Final-position override takes priority over the general table.
            romanized.append(dict_ipa_rtgs_final[phoneme])
        else:
            romanized.append(dict_ipa_rtgs.get(phoneme, phoneme))

    joined = "".join(romanized)
    # Decompose accented characters, then discard whatever is not ASCII.
    ascii_bytes = unicodedata.normalize("NFKD", joined).encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")
def remove_tone_ipa(ipa: str) -> str:
    """
    Remove Thai Tones from IPA system

    Each of the five tone-letter sequences ``˩˩˦``, ``˥˩``, ``˨˩``, ``˦˥``
    and ``˧`` is deleted from the input, one after another in that order.
    All other characters are left as they are.

    :param str ipa: IPA phoneme
    :return: IPA phoneme with tones removed
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tone_ipa

        print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
        # output : laː.sa.maj
    """
    tone_marks = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")
    stripped = ipa
    # Replacements run sequentially, so the order of tone_marks is preserved.
    for mark in tone_marks:
        stripped = stripped.replace(mark, "")
    return stripped