# Source code for pyaiml21.utils.text_preprocessors

"""Utilities to preprocess user's input."""
from abc import ABC, abstractmethod
import unicodedata
from typing import List


##############################################################################
#    CJK = Chinese-Japanese-Korean alphabet
##############################################################################
# original post: https://stackoverflow.com/a/30070664/19021375

# Inclusive (first, last) code-point bounds of the Unicode blocks that
# `is_cjk` treats as CJK. Written as integers so the bounds can be compared
# directly with the result of `ord`.
ranges = [
    # CJK Compatibility
    (0x3300, 0x33FF),
    # CJK Compatibility Forms
    (0xFE30, 0xFE4F),
    # CJK Compatibility Ideographs
    (0xF900, 0xFAFF),
    # CJK Compatibility Ideographs Supplement
    (0x2F800, 0x2FA1F),

    # Japanese Hiragana
    (0x3040, 0x309F),
    # Japanese Katakana
    (0x30A0, 0x30FF),
    # CJK Radicals Supplement
    (0x2E80, 0x2EFF),
    # CJK Unified Ideographs
    (0x4E00, 0x9FFF),
    # CJK Unified Ideographs Extension A
    (0x3400, 0x4DBF),
    # CJK Unified Ideographs Extension B
    (0x20000, 0x2A6DF),
    # CJK Unified Ideographs Extension C
    (0x2A700, 0x2B73F),
    # CJK Unified Ideographs Extension D
    (0x2B740, 0x2B81F),
    # CJK Unified Ideographs Extension E (included as of Unicode 8.0)
    (0x2B820, 0x2CEAF),
]


def is_cjk(char: str) -> bool:
    """
    Return True if given character is CJK character.

    A character counts as CJK when its code point lies inside one of the
    inclusive ``(first, last)`` intervals of the module-level ``ranges``.

    :param char: a single character, i.e. a string of length one
    :return: True if the code point of ``char`` falls into any interval
        of ``ranges``, False otherwise
    :raises TypeError: propagated from ``ord`` when ``char`` is not
        a string of length one
    """
    # Resolve the code point once instead of once per interval.
    code_point = ord(char)
    # A generator lets `any` stop at the first matching interval instead of
    # evaluating every interval into a throwaway list first.
    return any(first <= code_point <= last for first, last in ranges)
##############################################################################
#    W o r d   N o r m a l i s a t i o n
##############################################################################

class _Normalizer(ABC):
    """Interface of objects that convert a sentence to its canonical form."""

    @staticmethod
    @abstractmethod
    def normalize(sentence: str) -> str:
        """
        Normalize given sentence to canonical form.

        The canonical form of the sentence in the AIML is defined
        as a sentence with characters capitalized and the punctuation
        omitted.

        :param sentence: a sentence to be normalized
        :return: canonical form of the sentence, one that can be used
            with a GraphMaster
        """


class _SimpleNormalizer(_Normalizer):
    """Normalizer that upper-cases the text and drops punctuation."""

    # Unicode normalisation form applied to the sentence before upper-casing.
    UNICODE_FORM = "NFC"

    @staticmethod
    def normalize(sentence: str) -> str:
        """
        Normalize sentence by converting to uppercase and ignoring punctuation.

        Perform unicode normalisation (form ``UNICODE_FORM``), convert the
        text to uppercase and keep only alphanumeric characters and
        whitespace. Whitespace is kept as is, it is not collapsed.

        :param sentence: sentence to be normalized
        :return: canonical form of the sentence

        >>> _SimpleNormalizer.normalize("Simple sentence.")
        'SIMPLE SENTENCE'
        >>> _SimpleNormalizer.normalize("This? is, a sentence!")
        'THIS IS A SENTENCE'
        """
        # Use the class constant rather than a second hard-coded "NFC" so
        # the declared form and the applied form cannot drift apart.
        composed = unicodedata.normalize(
            _SimpleNormalizer.UNICODE_FORM, sentence
        )
        return "".join(
            char for char in composed.upper()
            if char.isspace() or char.isalnum()
        )


##############################################################################
#    S e n t e n c e   S p l i t t i n g
##############################################################################

class _SentenceSplitter(ABC):
    """Interface of objects that split a text into sentences."""

    @staticmethod
    @abstractmethod
    def split(text: str) -> List[str]:
        """
        Split user input into a list of sentences.

        :param text: text to be split
        :return: list of sentences from the text
        """


class _SimpleSentenceSplitter(_SentenceSplitter):
    """Sentence splitter cutting the text after sentence-ending punctuation."""

    # Characters that end a sentence: ASCII '?', '!', '.', the ideographic
    # full stop (U+3002) and the full-width question mark (U+FF1F). Spelled
    # with escapes so the non-ASCII members survive re-encoding of the file.
    DELIMITERS = "?!.\u3002\uff1f"

    @staticmethod
    def split(text: str) -> List[str]:
        """
        Split text on usual sentence boundaries.

        Given ``text``, finds all occurrences of the characters in
        ``DELIMITERS`` and splits it on these positions. Keeps the last
        characters of the sentence, i.e. the sentence boundary is returned
        with the sentence it belongs to; a run of consecutive delimiters
        (such as ``?!`` or ``...``) ends just one sentence. Additionally,
        keeps all whitespace characters, so the concatenation of the result
        is exactly the input ``text``.

        :param text: text to be split
        :return: list of sentences from the text; empty for empty ``text``

        >>> splitter = _SimpleSentenceSplitter()
        >>> exp = ["Hello.", " This is a sentence?", " Definitely!"]
        >>> splitter.split("Hello. This is a sentence? Definitely!") == exp
        True
        >>> expected = ["Anyone?!", " Yes!!!", " I will do it."]
        >>> splitter.split("Anyone?! Yes!!! I will do it.") == expected
        True
        >>> expected = [".", "a.", "b.", "c.", "d"]
        >>> splitter.split(".a.b.c.d") == expected
        True
        >>> splitter.split("")
        []
        """
        delimiters = _SimpleSentenceSplitter.DELIMITERS
        length = len(text)
        sentences = []
        start = 0
        for index, symbol in enumerate(text):
            if symbol not in delimiters:
                continue
            # Inside a run of delimiters: the sentence ends only after
            # the last delimiter of the run.
            if index + 1 < length and text[index + 1] in delimiters:
                continue
            sentences.append(text[start:index + 1])
            start = index + 1
        # Trailing text that is not terminated by a delimiter.
        if start < length:
            sentences.append(text[start:])
        return sentences


##############################################################################
#    W o r d   S p l i t t i n g
##############################################################################

class _WordSplitter(ABC):
    """Interface of objects that split a sentence into words."""

    @staticmethod
    @abstractmethod
    def split(sentence: str) -> List[str]:
        """
        Split the `sentence` to a list of words.

        :param sentence: string to be split
        :return: list of words of the sentence
        """


class _SimpleWordSplitter(_WordSplitter):
    """Word splitter treating whitespace as the word separator."""

    @staticmethod
    def split(sentence: str) -> List[str]:
        """
        Split sentence into words, ignoring any whitespace characters.

        More precisely, return all non-empty subsequences from the sentence,
        that do not contain whitespace characters.

        :param sentence: string to be split
        :return: list of words

        >>> splitter = _SimpleWordSplitter()
        >>> expected = ["This", "is", "a", "sentence."]
        >>> splitter.split("This is a sentence.") == expected
        True
        >>> got = splitter.split("And   it has    long spaces")
        >>> expected = ["And", "it", "has", "long", "spaces"]
        >>> got == expected
        True
        >>> splitter.split("   ")
        []
        """
        return sentence.split()


class _CJKSplitter(_WordSplitter):
    """Word splitter treating every non-space character as a word."""

    @staticmethod
    def split(sentence: str) -> List[str]:
        """
        Split the sentence into a list of characters.

        Treating each character as a word, return a list of non-space
        characters from the sentence.

        :param sentence: text to be split
        :return: list of characters from the sentence, in the same order

        >>> splitter = _CJKSplitter()
        >>> e = ["S", "o", "m", "e",
        ...      "s", "e", "n", "t", "e", "n", "c", "e",
        ...      "."]
        >>> splitter.split("Some sentence.") == e
        True
        >>> expected = ["こ", "れ", "は", "文", "で", "す"]
        >>> splitter.split("これは文です") == expected
        True
        >>> splitter.split("   ")
        []
        """
        return [char for char in sentence if not char.isspace()]


# the functions
def split_sentences(s: str) -> List[str]:
    """
    Break the text `s` into the sentences it consists of.

    :param s: text to be split
    :return: list of sentences as produced by
        ``_SimpleSentenceSplitter.split``
    """
    sentences = _SimpleSentenceSplitter.split(s)
    return sentences
def normalize_user_input(s: str) -> List[List[str]]:
    """
    Turn latin-alphabet user input into normalised sentences of words.

    The input is first cut into sentences with `split_sentences`, every
    sentence is brought to its canonical form by
    ``_SimpleNormalizer.normalize`` and finally divided into words by
    ``_SimpleWordSplitter.split``.

    :param s: user input to normalize
    :return: list of sentences, each sentence is a list of words

    Example:

    >>> text = "Hello. How ARE you...."
    >>> expected = [["HELLO"], ["HOW", "ARE", "YOU"]]
    >>> normalize_user_input(text) == expected
    True
    """
    return [
        _SimpleWordSplitter.split(_SimpleNormalizer.normalize(sentence))
        for sentence in split_sentences(s)
    ]
def normalize_cjk_user_input(s: str) -> List[List[str]]:
    """
    Turn CJK user input into normalised sentences of single characters.

    CJK (Chinese, Japanese, Korean) normalisation is equivalent to using
    <explode> on each word: the input is cut into sentences with
    `split_sentences`, every sentence is brought to its canonical form by
    ``_SimpleNormalizer.normalize`` and then divided into its non-space
    characters by ``_CJKSplitter.split``.

    :param s: user input to normalize
    :return: list of sentences, each sentence is a list of words

    Example:

    >>> text = u"こんにちは。この企画を気に入っていただけたでしょうか?"
    >>> expected = [list("こんにちは"),
    ...             list("この企画を気に入っていただけたでしょうか")]
    >>> normalize_cjk_user_input(text) == expected
    True
    """
    return [
        _CJKSplitter.split(_SimpleNormalizer.normalize(sentence))
        for sentence in split_sentences(s)
    ]