Source code for pythainlp.corpus.oscar

# -*- coding: utf-8 -*-
"""
Thai unigram word frequency from OSCAR Corpus (icu word tokenize)

Credit: Korakot Chaovavanich
https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
"""

__all__ = [
    "word_freqs",
    "unigram_word_freqs"
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus_path

_FILENAME = "oscar_icu"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (icu word tokenize)

    The file located by ``get_corpus_path(_FILENAME)`` is read as
    comma-separated text. The first line is skipped (presumably a column
    header -- confirm against the corpus file) and every remaining line
    contributes at most one ``(word, count)`` pair.

    Lines with fewer than two fields, and lines whose word field contains a
    double quote, are ignored. A word that is exactly one space character is
    reported as ``"<s/>"``.

    :return: (word, frequency) pairs in file order
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if the count field of an accepted line is not an
        integer
    """
    freqs = []
    _path = get_corpus_path(_FILENAME)
    # "utf-8-sig" drops a leading byte-order mark when the file has one and
    # matches the encoding used by unigram_word_freqs().
    with open(_path, "r", encoding="utf-8-sig") as f:
        # Skip the first line; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            # Remove only the line terminator here. A full strip() would also
            # eat a leading space, so the single-space word could never be
            # recognised below.
            fields = line.rstrip("\r\n").split(",")
            if len(fields) < 2:
                continue
            word = fields[0]
            if '"' in word:
                continue
            if word == " ":
                word = "<s/>"
            else:
                # Any other word keeps the leading-whitespace trimming that
                # stripping the whole line used to apply.
                word = word.lstrip()
            freqs.append((word, int(fields[1])))

    return freqs


def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from OSCAR Corpus (icu word tokenize)

    The file located by ``get_corpus_path(_FILENAME)`` is read as
    comma-separated text. The first line is skipped (presumably a column
    header -- confirm against the corpus file) and every remaining line
    sets at most one ``word -> count`` entry, where the count is taken from
    the last field of the line.

    Lines with fewer than two fields, and lines whose word field contains a
    double quote, are ignored. A word that is exactly one space character is
    stored under the key ``"<s/>"``.

    :return: mapping from word to frequency; missing words default to 0
    :rtype: defaultdict
    :raises ValueError: if the count field of an accepted line is not an
        integer
    """
    _path = get_corpus_path(_FILENAME)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Skip the first line; the default keeps an empty file from raising.
        next(fh, None)
        for line in fh:
            # Remove only the line terminator here. A full strip() would also
            # eat a leading space, so the single-space word could never be
            # recognised below.
            fields = line.rstrip("\r\n").split(",")
            # Same guard as word_freqs(): blank or single-field lines carry
            # no count and would otherwise reach int("") and raise.
            if len(fields) < 2:
                continue
            word = fields[0]
            if '"' in word:
                continue
            if word == " ":
                word = "<s/>"
            else:
                # Any other word keeps the leading-whitespace trimming that
                # stripping the whole line used to apply.
                word = word.lstrip()
            _word_freqs[word] = int(fields[-1])

    return _word_freqs