# Source code for pythainlp.corpus.th_en_translit

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Thai-English Transliteration Dictionary v1.4

Wannaphong Phatthiyaphaibun. (2022).
wannaphong/thai-english-transliteration-dictionary: v1.4 (v1.4).
Zenodo. https://doi.org/10.5281/zenodo.6716672
"""

# Names exported by ``from pythainlp.corpus.th_en_translit import *``.
# NOTE(review): TRANSLITERATE_DICT is defined at module level but is not
# listed here -- confirm that omission is intentional.
__all__ = [
    "get_transliteration_dict",
    "TRANSLITERATE_EN",
    "TRANSLITERATE_FOLLOW_RTSG",
]

from collections import defaultdict

from pythainlp.corpus import path_pythainlp_corpus

# Corpus file name that get_transliteration_dict() resolves through
# path_pythainlp_corpus() and then parses as tab-separated text.
_FILE_NAME = "th_en_transliteration_v1.4.tsv"
# Key of the per-word list holding the English transliterations.
TRANSLITERATE_EN = "en"
# Key of the per-word list holding the flag read from the file's third column
# (one entry per transliteration, parallel to the TRANSLITERATE_EN list).
# NOTE(review): "RTSG" presumably stands for RTGS (Royal Thai General System
# of Transcription) -- confirm. Name and value are public, so left unchanged.
TRANSLITERATE_FOLLOW_RTSG = "follow_rtsg"


# Spellings of the third-column flag that mean "false". ``bool()`` on a
# string is True for *any* non-empty text, including "False" and "0", so the
# flag has to be interpreted by value instead.
_FALSE_FLAG_VALUES = frozenset({"", "false", "0", "no", "n", "f"})


def _parse_follow_rtgs_flag(raw: str) -> bool:
    """Return False if *raw* spells a false value (case-insensitive), else True."""
    return raw.strip().casefold() not in _FALSE_FLAG_VALUES


def get_transliteration_dict() -> defaultdict:
    """
    Get Thai to English transliteration dictionary.

    The corpus file is read as UTF-8, tab-separated text. The first line is
    treated as a header and skipped; blank lines are ignored. Each remaining
    row is ``<thai>\\t<english>[\\t<flag>]``.

    The returned object is a ``defaultdict`` keyed by Thai word (``str``).
    Each value is a plain ``dict`` holding two parallel lists:

    * ``TRANSLITERATE_EN`` -- ``List[str]``: the English transliterations of
      the word, with every space character removed.
    * ``TRANSLITERATE_FOLLOW_RTSG`` -- ``List[Optional[bool]]``: the flag from
      the third column for the transliteration at the same index. It is
      ``False`` when the cell spells a false value ("false", "0", "no", ...;
      case-insensitive), ``True`` for any other non-empty cell, and ``None``
      when the row does not have exactly three columns.

    Because the result is a ``defaultdict``, looking up a word that is not in
    the file inserts and returns an entry with two empty lists.

    :return: mapping from Thai word to its transliterations and flags
    :rtype: defaultdict
    :raises FileNotFoundError: if the corpus file cannot be located
    :raises ValueError: if the file cannot be decoded as UTF-8 or a
        non-blank row has fewer than two columns
    """
    path = path_pythainlp_corpus(_FILE_NAME)
    if not path:
        raise FileNotFoundError(
            f"Unable to load transliteration dictionary. "
            f"{_FILE_NAME} is not found under pythainlp/corpus."
        )

    # use list, as one word can have multiple transliterations.
    trans_dict = defaultdict(
        lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
    )
    try:
        with open(path, "r", encoding="utf-8") as f:
            # assume that the first row contains column names, so skip it.
            # next() with a default also copes with a completely empty file.
            next(f, None)
            # data rows start on physical line 2 (line 1 is the header).
            for line_no, line in enumerate(f, start=2):
                stripped = line.strip()
                if not stripped:
                    continue

                th, *en_checked = stripped.split("\t")
                if not en_checked:
                    # Without this check the row would fail below with a
                    # bare IndexError that bypasses the handler.
                    raise ValueError(
                        f"line {line_no}: expected at least 2 "
                        f"tab-separated columns, got 1"
                    )

                # replace in-between whitespace to prevent mismatched results
                # from different tokenizers.
                # e.g. "บอยแบนด์"
                # route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband"
                # route 2: "บอยแบนด์" -> ["บอยแบนด์"] -> ["boy band"] -> "boy band"
                en_translit = en_checked[0].replace(" ", "")
                trans_dict[th][TRANSLITERATE_EN].append(en_translit)

                # The flag is only trusted when the row has exactly the
                # expected three columns; otherwise it is recorded as unknown.
                en_follow_rtgs = (
                    _parse_follow_rtgs_flag(en_checked[1])
                    if len(en_checked) == 2
                    else None
                )
                trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append(
                    en_follow_rtgs
                )
    except ValueError as err:
        # Also covers UnicodeDecodeError, which is a ValueError subclass.
        raise ValueError(
            f"Unable to parse {_FILE_NAME}. "
            "Make sure it is a 3-column tab-separated file with header."
        ) from err

    return trans_dict


# Built once, at import time, by reading the corpus file. Importing this
# module therefore raises if get_transliteration_dict() cannot find or parse
# the file.
TRANSLITERATE_DICT = get_transliteration_dict()