| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import re |
| | import unicodedata |
| |
|
# Maps a single non-ASCII punctuation character to its ASCII replacement.
#
# Every key must be exactly one character: the table is consumed per character
# by replace_unicode_punct and is also joined into a regex character class.
#
# Fullwidth forms (U+FF01..U+FF5E) are written as escapes on purpose. They are
# visually indistinguishable from ASCII and are silently folded to ASCII by
# NFKC-normalising tools; an ASCII key here would rewrite or delete ordinary
# ASCII punctuation and digits.
UNICODE_PUNCT = {
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    # NOTE(review): fullwidth digit one mapped to a double quote looks odd but
    # is kept as found in the table - confirm intent.
    "\uff11": '"',  # fullwidth digit one
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "\uff1a": ":",  # fullwidth colon
    "\uff1f": "?",  # fullwidth question mark
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff08": "(",  # fullwidth left parenthesis
    "\uff09": ")",  # fullwidth right parenthesis
    "\uff1b": ";",  # fullwidth semicolon
    "–": "-",
    "—": " - ",
    "\uff0e": ". ",  # fullwidth full stop
    "\uff5e": "~",  # fullwidth tilde
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "\uff05": "%",  # fullwidth percent sign
    "►": "-",
}
| |
|
# Character class matching any single key of UNICODE_PUNCT.
# The keys are escaped so that a key which is special inside a character class
# (']', '^', '-', '\\') cannot break or silently change the class. The pattern
# still starts with '[' and ends with ']', which the pattern-merging code
# further down relies on.
UNICODE_PUNCT_RE = re.compile("[" + re.escape("".join(UNICODE_PUNCT)) + "]")
| |
|
# Pattern source for inline/display math: a '$' or '$$' that is not preceded
# by a backslash, the shortest possible non-empty body, then '$' or '$$'.
MATH_RE = r"(?<!\\)(\$\$?.+?\$\$?)"
# Pattern source for code spans: one to three backticks, the shortest possible
# body (may be empty), then one to three backticks.
CODE_RE = r"\`{1,3}.*?\`{1,3}"
| |
|
| |
|
def replace_unicode_punct(text: str) -> str:
    """Return text with each character found in UNICODE_PUNCT replaced.

    Characters that are not keys of UNICODE_PUNCT are copied through unchanged.
    """
    mapped = [UNICODE_PUNCT[ch] if ch in UNICODE_PUNCT else ch for ch in text]
    return "".join(mapped)
| |
|
| |
|
def remove_unicode_punct(text: str) -> str:
    """Delete every character matched by UNICODE_PUNCT_RE from text.

    Unlike replace_unicode_punct, matched characters are dropped rather than
    mapped to a replacement, using a single pass of the precompiled pattern.
    """
    cleaned = re.sub(UNICODE_PUNCT_RE, "", text)
    return cleaned
| |
|
| |
|
def strip_accents(line: str) -> str:
    """Strip accents from a piece of text.

    The text is decomposed with NFD and every nonspacing mark (Unicode
    category "Mn") is dropped. The result is left in decomposed form; it is
    not recomposed.

    Args:
        line: the text to process.

    Returns:
        The NFD-normalised text without nonspacing marks.
    """
    nfd = unicodedata.normalize("NFD", line)
    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
    # Fast path: nothing was filtered out, so joining would only rebuild nfd.
    # The comparison must be against len(nfd), not len(line): "\u00e9" has
    # length 1 both before decomposition and after stripping.
    if len(output) == len(nfd):
        return nfd
    return "".join(output)
| |
|
| |
|
| | |
# Character class covering code points 0-31 and 127-159.
NON_PRINTING_CHARS_RE = re.compile(
    "["
    + "".join(chr(cp) for span in (range(32), range(127, 160)) for cp in span)
    + "]"
)
# Matches one digit character.
DIGIT_RE = re.compile(r"\d")
# Single character class matching what either UNICODE_PUNCT_RE or
# NON_PRINTING_CHARS_RE matches: the two "[...]" sources are concatenated and
# the "][" seam between them is removed.
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    "".join((UNICODE_PUNCT_RE.pattern, NON_PRINTING_CHARS_RE.pattern)).replace(
        "][", ""
    )
)
| |
|
| |
|
def remove_non_printing_char(text: str) -> str:
    """Return text with every match of NON_PRINTING_CHARS_RE deleted."""
    printable_only = re.sub(NON_PRINTING_CHARS_RE, "", text)
    return printable_only
| |
|
| |
|
def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    """Normalise spacing and punctuation ahead of tokenisation.

    The rule set has the shape of the Moses ``normalize-punctuation`` script.
    Steps, in order: drop carriage returns, tidy spacing around parentheses
    and punctuation, map typographic quotes/dashes/ellipsis to ASCII, handle
    guillemets and non-breaking ("pseudo") spaces, then apply the
    language-dependent quote/comma ordering and digit-group separator.

    Non-breaking spaces (U+00A0) and the fullwidth full stop (U+FF0E) are
    written as escapes so they cannot be confused with, or silently folded
    into, their ASCII look-alikes.

    Args:
        text: the text to normalise.
        language: language code. "en" moves commas/periods inside a closing
            double quote; "cs"/"cz" leave quote order alone; any other value
            moves them outside. "de", "es", "cz", "cs" and "fr" use "," as the
            digit-group separator, every other value uses ".".

    Returns:
        The normalised text.
    """
    res = text.replace("\r", "").replace("(", " (").replace(")", ") ")
    # " +" is a regex (collapse runs of spaces), not a literal to look for.
    res = re.sub(r" +", " ", res)
    # Replacement templates must not carry "\)", "\%" or "\"": Python leaves
    # such escapes untouched, which would put a backslash into the output.
    res = re.sub(r"\) ([.!:?;,])", r")\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) %", r"\1%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')

    # Typographic quotes and dashes.
    res = (
        res.replace("„", '"')
        .replace("“", '"')
        .replace("”", '"')
        .replace("–", "-")
        .replace("—", " - ")
    )
    res = re.sub(r" +", " ", res)
    res = res.replace("´", "'")
    # A curly single quote between two letters is an apostrophe ("don’t").
    # Lookarounds keep adjacent matches from hiding each other.
    res = re.sub(r"(?<=[a-zA-Z])[‘’](?=[a-zA-Z])", "'", res)
    res = (
        res.replace("‘", '"')
        .replace("‚", '"')
        .replace("’", '"')
        .replace("''", '"')
        .replace("´´", '"')
        .replace("…", "...")
    )

    # French quotes; the inner padding may be a plain or a non-breaking space.
    res = re.sub(r"[ \u00a0]«[ \u00a0]", ' "', res)
    res = re.sub(r"«[ \u00a0]?", '"', res)
    res = re.sub(r"[ \u00a0]»[ \u00a0]", '" ', res)
    res = re.sub(r"[ \u00a0]?»", '"', res)

    # Pseudo-spaces: drop a plain or non-breaking space in front of these
    # marks, and turn the remaining non-breaking spaces below into plain ones.
    res = re.sub(r"[ \u00a0]([%:?!;])", r"\1", res)
    res = (
        res.replace("nº\u00a0", "nº ")
        .replace("\u00a0ºC", " ºC")
        .replace("\u00a0cm", " cm")
        .replace(",\u00a0", ", ")
    )
    res = re.sub(r" +", " ", res)
    # Fullwidth full stop only; rewriting the ASCII "." would split decimals
    # and abbreviations ("3.14" -> "3. 14").
    res = res.replace("\uff0e", ". ")

    if language == "en":
        # English style: "quotation," - punctuation goes inside the quote.
        res = re.sub(r'"([,.]+)', r'\1"', res)
    elif language not in ("cs", "cz"):
        # Other languages (Czech excepted): "quotation", - punctuation outside.
        res = res.replace(',"', '",')
        res = re.sub(r'(\.+)"(\s*[^<])', r'"\1\2', res)

    # Digits separated by a non-breaking space are one number; join them with
    # the language's separator. A plain space is left alone so that unrelated
    # numbers ("page 1 2") are not fused.
    separator = "," if language in ("de", "es", "cz", "cs", "fr") else "."
    res = re.sub(r"(?<=\d)\u00a0(?=\d)", separator, res)
    return res
| |
|
| |
|
def normalize(line: str, accent=True, case=True, numbers=True, math=True, code=True, punct=1) -> str:
    """Apply the selected normalisation steps to a line of text.

    The line is stripped first; an empty result is returned immediately.
    The remaining steps run in this fixed order:

    Args:
        line: the text to normalise.
        case: lower-case the text.
        accent: strip accents with strip_accents.
        numbers: replace every DIGIT_RE match with "0".
        punct: 1 replaces UNICODE_PUNCT characters, 2 removes them, any other
            value leaves them untouched.
        math: replace MATH_RE matches with "[EQUATION]".
        code: replace CODE_RE matches with "[CODE]".

    Returns:
        The normalised text, with "<s>"/"</s>" markers and non-printing
        characters removed.
    """
    text = line.strip()
    if len(text) == 0:
        return text

    text = text.lower() if case else text
    text = strip_accents(text) if accent else text
    text = DIGIT_RE.sub("0", text) if numbers else text

    if punct == 1:
        text = replace_unicode_punct(text)
    else:
        if punct == 2:
            text = remove_unicode_punct(text)

    # The substitutions below see the text after lower-casing and digit
    # masking have already been applied.
    if math:
        text = re.sub(MATH_RE, "[EQUATION]", text, flags=re.DOTALL)
    if code:
        text = re.sub(CODE_RE, "[CODE]", text, flags=re.DOTALL)

    for marker in ("<s>", "</s>"):
        text = text.replace(marker, "")
    return remove_non_printing_char(text)
| |
|
| |
|
def slow_normalize_for_dedup(line: str) -> str:
    """Normalise a line for deduplication by delegating to normalize.

    Accents are kept, case and digits are normalised, and Unicode punctuation
    is removed (punct=2); the math and code options keep their defaults.
    """
    dedup_options = {"accent": False, "case": True, "numbers": True, "punct": 2}
    return normalize(line, **dedup_options)
| |
|
| |
|
def normalize_for_dedup(line: str) -> str:
    """Normalise a line for deduplication.

    Strips the line, lower-cases it, replaces every DIGIT_RE match with "0"
    and deletes every PUNCT_OR_NON_PRINTING_CHARS_RE match. A line that is
    empty after stripping is returned as the empty string.
    """
    text = line.strip()
    if text:
        text = DIGIT_RE.sub("0", text.lower())
        text = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", text)
    return text