Source code for pythainlp.util.keyboard

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functions related to keyboard layout.
"""

EN_TH_KEYB_PAIRS = {
    "Z": "(",
    "z": "ผ",
    "X": ")",
    "x": "ป",
    "C": "ฉ",
    "c": "แ",
    "V": "ฮ",
    "v": "อ",
    "B": "\u0e3a",  # พินทุ
    "b": "\u0e34",  # สระอุ
    "N": "\u0e4c",  # การันต์
    "n": "\u0e37",  # สระอือ
    "M": "?",
    "m": "ท",
    "<": "ฒ",
    ",": "ม",
    ">": "ฬ",
    ".": "ใ",
    "?": "ฦ",
    "/": "ฝ",
    "A": "ฤ",
    "a": "ฟ",
    "S": "ฆ",
    "s": "ห",
    "D": "ฏ",
    "d": "ก",
    "F": "โ",
    "f": "ด",
    "G": "ฌ",
    "g": "เ",
    "H": "\u0e47",  # ไม้ไต่คู้
    "h": "\u0e49",  # ไม้โท
    "J": "\u0e4b",  # ไม้จัตวา
    "j": "\u0e48",  # ไม้เอก
    "K": "ษ",
    "k": "า",
    "L": "ศ",
    "l": "ส",
    ":": "ซ",
    ";": "ว",
    '"': ".",
    "'": "ง",
    "Q": "๐",
    "q": "ๆ",
    "W": '"',
    "w": "ไ",
    "E": "ฎ",
    "e": "\u0e33",  # สระอำ
    "R": "ฑ",
    "r": "พ",
    "T": "ธ",
    "t": "ะ",
    "Y": "\u0e4d",  # นิคหิต
    "y": "\u0e31",  # ไม้หันอากาศ
    "U": "\u0e4a",  # ไม้ตรี
    "u": "\u0e35",  # สระอ ี
    "I": "ณ",
    "i": "ร",
    "O": "ฯ",
    "o": "น",
    "P": "ญ",
    "p": "ย",
    "{": "ฐ",
    "[": "บ",
    "}": ",",
    "]": "ล",
    "|": "ฅ",
    "\\": "ฃ",
    "~": "%",
    "`": "_",
    "@": "๑",
    "2": "/",
    "#": "๒",
    "3": "-",
    "$": "๓",
    "4": "ภ",
    "%": "๔",
    "5": "ถ",
    "^": "\u0e39",  # สระอู
    "6": "\u0e38",  # สระอุ
    "&": "฿",
    "7": "\u0e36",  # สระอึ
    "*": "๕",
    "8": "ค",
    "(": "๖",
    "9": "ต",
    ")": "๗",
    "0": "จ",
    "_": "๘",
    "-": "ข",
    "+": "๙",
    "=": "ช",
}

TH_EN_KEYB_PAIRS = {v: k for k, v in EN_TH_KEYB_PAIRS.items()}

EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)

TIS_820_2531_MOD = [
    ["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
    ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
    ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
    ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
]
TIS_820_2531_MOD_SHIFT = [
    ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
    ["๐", '"', "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
    ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
    ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
]


[docs]def eng_to_thai(text: str) -> str: """ Corrects the given text that was incorrectly typed using English-US Qwerty keyboard layout to the originally intended keyboard layout that is the Thai Kedmanee keyboard. :param str text: incorrect text input (type Thai with English keyboard) :return: Thai text where incorrect typing with a keyboard layout is corrected :rtype: str :Example: Intentionally type "ธนาคารแห่งประเทศไทย", but got "Tok8kicsj'xitgmLwmp":: from pythainlp.util import eng_to_thai eng_to_thai("Tok8kicsj'xitgmLwmp") # output: ธนาคารแห่งประเทศไทย """ return text.translate(EN_TH_TRANSLATE_TABLE)
[docs]def thai_to_eng(text: str) -> str: """ Corrects the given text that was incorrectly typed using Thai Kedmanee keyboard layout to the originally intended keyboard layout that is the English-US Qwerty keyboard. :param str text: incorrect text input (type English with Thai keyboard) :return: English text where incorrect typing with a keyboard layout is corrected :rtype: str :Example: Intentionally type "Bank of Thailand", but got "ฺฟืา นด ธ้ฟรสฟืก":: from pythainlp.util import eng_to_thai thai_to_eng("ฺฟืา นด ธ้ฟรสฟืก") # output: 'Bank of Thailand' """ return text.translate(TH_EN_TRANSLATE_TABLE)
def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float: """ Calculate euclidean distance between two Thai characters according to their location on a Thai keyboard layout. A modified TIS 820-2531 standard keyboard layout, which is developed from Kedmanee layout and is the most commonly used Thai keyboard layout, is used in distance calculation. The modified TIS 820-2531 is TIS 820-2531 with few key extensions proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html Noted that the latest TIS 820-2538 has slight changes in layout from TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in https://www.nectec.or.th/it-standards/std820/std820.html Since TIS 820-2538 is not widely adopted by keyboard manufacturer, this function uses the de facto standard modified TIS 820-2531 instead. :param str c1: first character :param str c2: second character :param str shift_dist: return value if they're shifted :return: euclidean distance between two characters :rtype: float :Example: from pythainlp.util import thai_keyboard_dist thai_keyboard_dist("ด", "ะ") # output: 1.4142135623730951 thai_keyboard_dist("ฟ", "ฤ") # output: 0.0 thai_keyboard_dist("ฟ", "ห") # output: 1.0 thai_keyboard_dist("ฟ", "ก") # output: 2.0 thai_keyboard_dist("ฟ", "ฤ", 0.5) # output: 0.5 """ def get_char_coord( ch: str, layouts=[TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT] ): for layout in layouts: for row in layout: if ch in row: r = layout.index(row) c = row.index(ch) return (r, c) raise ValueError(ch + " not found in given keyboard layout") coord1 = get_char_coord(c1) coord2 = get_char_coord(c2) distance = ( (coord1[0] - coord2[0]) ** 2 + (coord1[1] - coord2[1]) ** 2 ) ** (0.5) if distance == 0 and c1 != c2: return shift_dist return distance