# Source code for pythainlp.tools.misspell

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List
import numpy as np

# Thai key rows typed without Shift: four strings, one per row, where a
# character's index inside its string is its position within that row.
# NOTE(review): these rows look like they run bottom-to-top of a Thai
# Kedmanee keyboard, the reverse of the English tables below — confirm.
THAI_CHARACTERS_WITHOUT_SHIFT = [
    "ผปแอิืทมใฝ",
    "ฟหกดเ้่าสวง",
    "ๆไำพะัีรนยบลฃ",
    "ๅ/_ภถุึคตจขช",
]

# Thai key rows typed with Shift held; same four-row structure as above.
THAI_CHARACTERS_WITH_SHIFT = [
    "()ฉฮฺ์?ฒฬฦ",
    "ฤฆฏโฌ็๋ษศซ.",
    '๐"ฎฑธํ๊ณฯญฐ,',
    "+๑๒๓๔ู฿๕๖๗๘๙",
]

# English (QWERTY) key rows typed without Shift, from the number row down.
ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
    "1234567890-=",
    "qwertyuiop[]\\",
    "asdfghjkl;'",
    "zxcvbnm,./",
]

# English (QWERTY) key rows typed with Shift held, from the number row down.
ENGLISH_CHARACTERS_WITH_SHIFT = [
    "!@#$%^&*()_+",
    "QWERTYUIOP{}|",
    'ASDFGHJKL:"',
    "ZXCVBNM<>?",
]


# Combined lookup table, indexed as ALL_CHARACTERS[language_ix][table_row]:
# language_ix 0 is Thai and 1 is English. Within each language, table rows
# 0-3 are the unshifted rows and table rows 4-7 are the shifted rows, so
# table_row == is_shift * 4 + row.
ALL_CHARACTERS = [
    THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
    ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
]


def search_location_of_character(char: str):
    """
    Locate a character in the keyboard layout tables.

    The Thai tables are scanned first, then the English ones, and the
    first row containing ``char`` wins.

    :param str char: character to look up

    :return: ``(language_ix, is_shift, row, pos)`` for the first match,
        or ``None`` when no row contains ``char``
    :rtype: tuple or None
    """
    for language_ix in (0, 1):
        for table_row, keys in enumerate(ALL_CHARACTERS[language_ix]):
            if char not in keys:
                continue
            # Each language stores 4 unshifted rows followed by 4 shifted
            # rows, so the quotient is the shift flag and the remainder
            # is the row within that shift state.
            is_shift, row = divmod(table_row, 4)
            return (language_ix, is_shift, row, keys.index(char))
    return None


def find_neighbour_locations(
    loc: tuple,
    char: str,
    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
):
    """
    List the in-bounds key locations adjacent to a given key location.

    Neighbours are searched only within the same language and the same
    shift state as ``loc``.

    :param tuple loc: ``(language_ix, is_shift, row, pos)`` of the key
    :param str char: character carried through unchanged as the last
        element of every returned tuple
    :param List kernel: ``(row_offset, pos_offset)`` pairs describing which
        surrounding keys count as neighbours; it is only read, never
        mutated, so the shared default list is safe

    :return: ``(language_ix, is_shift, row, pos, char)`` for every offset
        that lands on an existing key
    :rtype: list
    """
    language_ix, is_shift, row, pos = loc
    layout = ALL_CHARACTERS[language_ix]

    valid_neighbours = []
    for kr, ks in kernel:
        _row, _pos = row + kr, pos + ks
        if not 0 <= _row <= 3:
            continue
        # Strict upper bound: the last valid index is len(row) - 1. The
        # previous inclusive check let a position one past the end of the
        # row through.
        if 0 <= _pos < len(layout[is_shift * 4 + _row]):
            valid_neighbours.append((language_ix, is_shift, _row, _pos, char))

    return valid_neighbours


def find_misspell_candidates(char: str, verbose: bool = False):
    """
    Collect the characters on keys adjacent to the key that types ``char``.

    :param str char: character to find neighbouring keys for
    :param bool verbose: accepted for backward compatibility; this function
        does not read it

    :return: characters found at the neighbouring key locations, or
        ``None`` when ``char`` is not in any keyboard layout table
    :rtype: list or None
    """
    loc = search_location_of_character(char)
    if loc is None:
        return None

    candidates = []
    for language_ix, is_shift, row, pos, _ in find_neighbour_locations(
        loc, char
    ):
        try:
            neighbour = ALL_CHARACTERS[language_ix][is_shift * 4 + row][pos]
        except IndexError:
            # Rows differ in length, so a reported neighbour position may
            # fall past the end of a shorter row; skip it.
            continue
        candidates.append(neighbour)

    return candidates


def misspell(sentence: str, ratio: float = 0.05) -> str:
    """
    Simulate some misspellings of the input sentence.
    The number of misspelled locations is governed by ratio.

    :param str sentence: sentence to be misspelled
    :param float ratio: fraction of the characters to pick for misspelling
        (e.g. 0.05 picks 5 positions per 100 characters). If this yields
        fewer than one position the sentence is returned unchanged; a
        ratio above 1 is treated as 1. Defaults to 0.05.

    :return: sentence containing some misspelled words
    :rtype: str

    :Example:
    ::

        from pythainlp.tools.misspell import misspell

        sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"

        misspell(sentence, ratio=0.1)
        # output (random; one possible result):
        ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
    """
    # Clamp to the sentence length: sampling without replacement cannot
    # draw more positions than there are characters.
    num_misspells = min(
        int(np.floor(len(sentence) * ratio)), len(sentence)
    )
    if num_misspells <= 0:
        # Nothing to change (empty sentence, tiny or non-positive ratio).
        return sentence

    positions = np.random.choice(
        len(sentence), size=num_misspells, replace=False
    )

    # Strings are immutable, so edit a list of characters instead.
    misspelled = list(sentence)
    for pos in positions:
        potential_candidates = find_misspell_candidates(sentence[pos])
        # None means the character is not on a known keyboard layout; an
        # empty list would make np.random.choice raise. Skip both.
        if not potential_candidates:
            continue

        misspelled[pos] = str(np.random.choice(potential_candidates))

    return "".join(misspelled)