Source code for pythainlp.util.normalize

# -*- coding: utf-8 -*-
"""
Text normalization
"""
import re

from pythainlp import thai_tonemarks

_NORMALIZE_RULE1 = [
    "ะ",
    "ั",
    "็",
    "า",
    "ิ",
    "ี",
    "ึ",
    "่",
    "ํ",
    "ุ",
    "ู",
    "ใ",
    "ไ",
    "โ",
    "ื",
    "่",
    "้",
    "๋",
    "๊",
    "ึ",
    "์",
    "๋",
    "ำ",
]  # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา


_NORMALIZE_RULE2 = [
    ("เเ", "แ"),  # เ เ -> แ
    ("ํ(t)า", "\\1ำ"),
    ("ํา(t)", "\\1ำ"),
    ("([่-๋])([ัิ-ื])", "\\2\\1"),
    ("([่-๋])([ูุ])", "\\2\\1"),
    ("ำ([่-๋])", "\\1ำ"),
    ("(์)([ัิ-ู])", "\\2\\1"),
]  # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ


[docs]def normalize(text: str) -> str: """ Thai text normalize :param str text: thai text :return: thai text **Example**:: >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก True """ for data in _NORMALIZE_RULE2: text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text) for data in list(zip(_NORMALIZE_RULE1, _NORMALIZE_RULE1)): text = re.sub(data[0].replace("t", "[่้๊๋]") + "+", data[1], text) return text
[docs]def deletetone(text: str) -> str: """ Remove tonemarks :param str text: thai text :return: thai text """ chars = [ch for ch in text if ch not in thai_tonemarks] return "".join(chars)