Source code for pythainlp.util.phoneme

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Phonemes util
"""
import unicodedata
from pythainlp.util.trie import Trie
from pythainlp.tokenize import Tokenizer

# Consonant phonemes as (IPA, NECTEC initial[, NECTEC final]) tuples.
# Rows with only two elements have no syllable-final form in this table.
consonants_ipa_nectec = [
    ("k", "k", "k^"),
    ("kʰ", "kh"),
    ("ŋ", "ng", "ng^"),
    ("tɕ", "c"),
    ("tɕʰ", "ch"),
    ("s", "s"),
    ("j", "j", "j^"),
    ("d", "d"),
    # The initial form is "t"; it was previously misspelled as "y", which
    # made the lookup translate a stray "y" into "t".
    ("t", "t", "t^"),
    ("tʰ", "th"),
    ("n", "n", "n^"),
    ("b", "b"),
    ("p", "p", "p^"),
    ("pʰ", "ph"),
    ("f", "f"),
    ("m", "m", "m^"),
    ("r", "r"),
    ("l", "l"),
    ("w", "w", "w^"),
    ("h", "h"),
    # Glottal stop: use the IPA letter (the same symbol dict_ipa_rtgs keys
    # on) rather than an ASCII question mark.
    ("ʔ", "z", "z^"),
]

# Vowel phonemes as (IPA, NECTEC) pairs. Despite the name, the list also
# carries the vowel+final combinations "am^", "aj^" and "aw^".
monophthong_ipa_nectec = [
    # short vowels
    ("i", "i"),
    ("e", "e"),
    ("ɛ", "x"),
    ("ɯ", "v"),  # short counterpart of "vv"; it was missing from the table
    ("ɤ", "q"),
    ("a", "a"),
    ("am", "am^"),
    ("aj", "aj^"),
    ("aw", "aw^"),
    ("u", "u"),
    ("o", "o"),
    ("ɔ", "@"),
    # long vowels: the IPA symbol is doubled, mirroring the NECTEC spelling
    ("ii", "ii"),
    ("ee", "ee"),
    ("ɛɛ", "xx"),
    ("ɯɯ", "vv"),
    ("ɤɤ", "qq"),
    ("aa", "aa"),
    ("uu", "uu"),
    ("oo", "oo"),
    # Long open-o (Thai vowel -อ). It previously mapped to an empty string,
    # which silently dropped the vowel from the converted output.
    ("ɔɔ", "@@"),
]

# Diphthongs as (IPA, NECTEC) pairs: three short forms followed by their
# long counterparts, which double the first vowel symbol.
diphthong_ipa_nectec = [
    # short
    ("ia", "ia"),
    ("ɯa", "va"),
    ("ua", "ua"),
    # long
    ("iia", "iia"),
    ("ɯɯa", "vva"),
    ("uua", "uua"),
]

# Tones as (IPA tone letters, NECTEC tone digit) pairs.
tones_ipa_nectec = [
    ("˧", "0"),  # level contour
    ("˨˩", "1"),  # low falling contour
    ("˥˩", "2"),  # high falling contour
    ("˦˥", "3"),  # high rising contour
    ("˩˩˦", "4"),  # low rising contour
]

# Lookup from a NECTEC symbol to its IPA symbol. Every table row contributes
# its second element (the NECTEC form) as the key and its first element (the
# IPA form) as the value.
dict_nectec_to_ipa = {
    row[1]: row[0]
    for table in (
        consonants_ipa_nectec,
        monophthong_ipa_nectec,
        diphthong_ipa_nectec,
        tones_ipa_nectec,
    )
    for row in table
}
# Consonant rows that carry a third element also register that final form.
dict_nectec_to_ipa.update(
    (row[2], row[0]) for row in consonants_ipa_nectec if len(row) > 2
)


def nectec_to_ipa(pronunciation: str) -> str:
    """
    Convert a pronunciation written in the NECTEC system to the IPA system.

    The input is split on "-". Each piece that has an entry in
    ``dict_nectec_to_ipa`` is replaced by its IPA symbol; any other piece
    is kept unchanged. The pieces are then joined with single spaces.

    :param str pronunciation: NECTEC phonemes separated by "-"
    :return: the converted phonemes, separated by spaces
    :rtype: str

    :Example:
    ::

        from pythainlp.util import nectec_to_ipa

        print(nectec_to_ipa("kl-uua-j^-2"))
        # output : 'kl uua j ˥˩'

    References
    ----------

    Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech
    processing and Speech corpus. In: Handbook of Thai Electronic Corpus.
    1st ed. p. 122–56.
    """
    # dict.get with the piece itself as the default gives the
    # "translate if known, otherwise pass through" rule in one lookup.
    return " ".join(
        dict_nectec_to_ipa.get(phoneme, phoneme)
        for phoneme in pronunciation.split("-")
    )
# IPA phoneme -> RTGS romanization used for every token that is not handled
# by dict_ipa_rtgs_final below.
dict_ipa_rtgs = {
    # consonants
    "b": "b",
    "d": "d",
    "f": "f",
    "h": "h",
    # "j" used to appear twice in this literal ("y", then "i"); the later
    # entry silently won, so an initial /j/ came out as "i". The "i" value
    # now lives in dict_ipa_rtgs_final, parallel to "w".
    "j": "y",
    "k": "k",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "ŋ": "ng",
    "p": "p",
    "pʰ": "ph",
    "r": "r",
    "s": "s",
    "t": "t",
    "tʰ": "th",
    "tɕ": "ch",
    "tɕʰ": "ch",
    "w": "w",
    "ʔ": "",
    # short vowels
    "a": "a",
    "e": "e",
    "ɛ": "ae",
    "i": "i",
    "o": "o",
    "ɔ": "o",
    "u": "u",
    "ɯ": "ue",
    "ɤ": "oe",
    # long vowels
    "aː": "a",
    "eː": "e",
    "ɛː": "ae",
    "iː": "i",
    "oː": "o",
    "ɔː": "o",
    "uː": "u",
    "ɯː": "ue",
    "ɤː": "oe",
    # diphthongs
    "ia": "ia",
    "ua": "ua",
    "ɯa": "uea",
    # vowel + glide combinations
    "aj": "ai",
    "aw": "ao",
    "ew": "eo",
    "ɛw": "aeo",
    "iw": "io",
    "ɔj": "oi",  # was "io", a transposition; compare "ɔːj" -> "oi"
    "uj": "ui",
    "aːj": "ai",
    "aːw": "ao",
    "eːw": "eo",
    "ɛːw": "aeo",
    "oːj": "oi",
    "ɔːj": "oi",
    "ɤːj": "oei",
    "iaw": "iao",
    "uaj": "uai",
    "ɯaj": "ueai",
    # syllable separator is kept as is
    ".": ".",
}

# Overrides applied by ipa_to_rtgs only to the very last token of the input,
# where a glide is romanized as a vowel letter.
dict_ipa_rtgs_final = {
    "w": "o",
    "j": "i",
}

# Dictionary-based tokenizer restricted to the phoneme keys above, used to
# split an IPA string into units that can be looked up in the two tables.
trie = Trie([*dict_ipa_rtgs, *dict_ipa_rtgs_final])
ipa_cut = Tokenizer(custom_dict=trie, engine="newmm")
def ipa_to_rtgs(ipa: str) -> str:
    """
    Convert IPA to the Royal Thai General System of Transcription (RTGS).

    The input is tokenized with ``ipa_cut``. The last token is looked up in
    ``dict_ipa_rtgs_final`` first; every token (including the last one when
    it has no final-form entry) is looked up in ``dict_ipa_rtgs``. Tokens
    found in neither table are kept unchanged. Finally, any character that
    is still non-ASCII after NFKD normalization is dropped.

    Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

    :param str ipa: IPA phoneme
    :return: the RTGS romanization
    :rtype: str

    :Example:
    ::

        from pythainlp.util import ipa_to_rtgs

        print(ipa_to_rtgs("kluaj"))
        # output : 'kluai'
    """
    tokens = ipa_cut.word_tokenize(ipa)
    last_index = len(tokens) - 1
    parts = []
    for index, phoneme in enumerate(tokens):
        # Membership is tested on the dicts directly instead of rebuilding
        # list(d.keys()) for every token.
        if index == last_index and phoneme in dict_ipa_rtgs_final:
            parts.append(dict_ipa_rtgs_final[phoneme])
        else:
            parts.append(dict_ipa_rtgs.get(phoneme, phoneme))
    text = "".join(parts)
    # NFKD decomposes characters into base letters plus combining marks;
    # encoding to ASCII with "ignore" then discards whatever is not ASCII.
    ascii_bytes = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")
# Deletion table for the five Chao tone letters U+02E5..U+02E9 (extra-high
# through extra-low), from which every IPA tone contour is composed.
_IPA_TONE_LETTER_TABLE = str.maketrans("", "", "\u02e5\u02e6\u02e7\u02e8\u02e9")


def remove_tone_ipa(ipa: str) -> str:
    """
    Remove Thai tone marks from an IPA string.

    Every Chao tone letter (U+02E5 to U+02E9) is deleted, so a contour is
    removed regardless of how it is spelled (for example both "˩˩˦" and
    "˩˦"). All other characters are returned unchanged.

    :param str ipa: IPA phoneme
    :return: the IPA phoneme with the tone letters deleted
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tone_ipa

        print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
        # output : laː.sa.maj
    """
    # One pass over the string instead of one str.replace per contour.
    return ipa.translate(_IPA_TONE_LETTER_TABLE)