from collections import OrderedDict
from itertools import chain
import regex as re
from dateutil import parser
from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz
from dateparser.utils import combine_dicts, normalize_unicode
from .dictionary import ALWAYS_KEEP_TOKENS, Dictionary, NormalizedDictionary
# Matches runs of digits. The capturing group makes ``split`` keep the digit
# runs as separate tokens, which ``Locale._translate_numerals`` relies on.
NUMERAL_PATTERN = re.compile(r"(\d+)", re.U)
[docs]
class Locale:
    """
    Class that deals with applicability and translation from a locale.

    :param shortname:
        A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'.
    :type shortname: str

    :param language_info:
        Language info (translation data) of the language the locale belongs to.
    :type language_info: dict

    :return: A Locale instance
    """

    # Lazily populated caches. Each one stays ``None`` until the matching
    # ``_get_*`` / ``_set_*`` / ``_generate_*`` method fills it in.

    # Dictionary objects built by ``_generate_dictionary`` and
    # ``_generate_normalized_dictionary``.
    _dictionary = None
    _normalized_dictionary = None
    # Lists of ``{compiled_pattern: replacement}`` built by ``_get_simplifications``.
    _simplifications = None
    _normalized_simplifications = None
    # ``{"wordchars": set, "capturing": set}`` built by ``_set_splitters``.
    _splitters = None
    # Set of characters built by ``_set_wordchars``.
    _wordchars = None
    # ``{compiled_pattern: replacement}`` maps built by
    # ``_generate_relative_translations``.
    _relative_translations = None
    _normalized_relative_translations = None
    # Dictionary entries ending with "." collected by ``_get_abbreviations``.
    _abbreviations = None
    # Per-word view of the dictionary built by ``_get_split_dictionary``.
    _split_dictionary = None
    # Set of characters built by ``get_wordchars_for_detection``.
    _wordchars_for_detection = None
def __init__(self, shortname, language_info):
    """
    Store the locale code and build the merged translation data.

    :param shortname: locale code used to look up locale-specific overrides.
    :param language_info: language data; its optional ``"locale_specific"``
        mapping is consulted for ``shortname`` and then dropped from the
        merged result stored in ``self.info``.
    """
    self.shortname = shortname
    per_locale = language_info.get("locale_specific", {})
    overrides = per_locale.get(shortname, {})
    merged = combine_dicts(language_info, overrides)
    # The per-locale table has been applied; it is not kept on the instance.
    merged.pop("locale_specific", None)
    self.info = merged
[docs]
def is_applicable(self, date_string, strip_timezone=False, settings=None):
    """
    Check if the locale is applicable to translate date string.

    The string goes through numeral translation, optional unicode
    normalization and simplification; it is then split by the locale
    dictionary and the dictionary's ``are_tokens_valid`` decides the result.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param strip_timezone:
        If True, timezone is stripped from date string.
    :type strip_timezone: bool

    :param settings:
        Settings object; its ``NORMALIZE`` attribute is read unconditionally,
        so ``None`` is not usable despite being the default.

    :return: whatever ``are_tokens_valid`` returns for the split tokens.
    """
    candidate = date_string
    if strip_timezone:
        candidate, _timezone = pop_tz_offset_from_string(candidate, as_offset=False)

    candidate = self._translate_numerals(candidate)
    if settings.NORMALIZE:
        candidate = normalize_unicode(candidate)
    candidate = self._simplify(candidate, settings=settings)

    dictionary = self._get_dictionary(settings)
    return dictionary.are_tokens_valid(dictionary.split(candidate))
[docs]
def count_applicability(self, text, strip_timezone=False, settings=None):
    """
    Count how many words of ``text`` are known to this locale.

    :param text: text to inspect.
    :param strip_timezone: if True, a timezone offset is popped from ``text`` first.
    :param settings: settings object forwarded to the helper methods.
    :return: the result of ``_count_words_present_in_the_dictionary`` for the
        tokens of every sentence in the simplified text.
    """
    if strip_timezone:
        text, _timezone = pop_tz_offset_from_string(text, as_offset=False)

    simplified = self._simplify(text, settings=settings)
    words = [
        word
        for sentence in self._sentence_split(simplified, settings=settings)
        for word in self._split(sentence, keep_formatting=False, settings=settings)
    ]
    return self._count_words_present_in_the_dictionary(words, settings)
def _count_words_present_in_the_dictionary(self, words, settings=None):
dictionary = self.clean_dictionary(
self._get_split_dictionary(settings=settings)
)
dict_cnt = 0
skip_cnt = 0
for word in set(words):
if word in dictionary:
if dictionary[word]:
dict_cnt += 1
else:
skip_cnt += 1
elif word.isdigit():
skip_cnt += 1
return [dict_cnt, skip_cnt]
[docs]
@staticmethod
def clean_dictionary(dictionary, threshold=2):
del_keys = []
for key in dictionary:
if len(key) < threshold:
del_keys.append(key)
for del_key in del_keys:
del dictionary[del_key]
return dictionary
[docs]
def translate(self, date_string, keep_formatting=False, settings=None):
    """
    Translate the date string to its English equivalent.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param keep_formatting:
        If True, retain formatting of the date string after translation.
    :type keep_formatting: bool

    :param settings:
        Settings object; its ``NORMALIZE`` attribute is read unconditionally.

    :return: translated date string.
    """
    text = self._translate_numerals(date_string)
    if settings.NORMALIZE:
        text = normalize_unicode(text)
    text = self._simplify(text, settings=settings)

    dictionary = self._get_dictionary(settings)
    tokens = dictionary.split(text, keep_formatting)
    relative_translations = self._get_relative_translations(settings=settings)

    for index, token in enumerate(tokens):
        lowered = token.lower()
        # Relative-date patterns take precedence over plain dictionary lookup.
        matching = next(
            (pattern for pattern in relative_translations if pattern.match(lowered)),
            None,
        )
        if matching is not None:
            tokens[index] = matching.sub(relative_translations[matching], lowered)
        elif lowered in dictionary:
            # When formatting is kept, non-alphabetic tokens that translate to
            # nothing are left as they are instead of being dropped.
            keep_as_is = keep_formatting and not lowered.isalpha()
            tokens[index] = dictionary[lowered] or (lowered if keep_as_is else "")

    if "in" in tokens:
        tokens = self._clear_future_words(tokens)

    return self._join(
        [token for token in tokens if token],
        separator="" if keep_formatting else " ",
        settings=settings,
    )
def _translate_numerals(self, date_string):
    """
    Rewrite every decimal digit run through ``int``/``str``, keeping its width.

    :param date_string: text possibly containing digit runs.
    :return: the text with each all-decimal token replaced by
        ``str(int(token))`` zero-padded to the token's original length.
    """
    pieces = NUMERAL_PATTERN.split(date_string)
    return "".join(
        str(int(piece)).zfill(len(piece)) if piece.isdecimal() else piece
        for piece in pieces
    )
def _get_relative_translations(self, settings=None):
if settings.NORMALIZE:
if self._normalized_relative_translations is None:
self._normalized_relative_translations = (
self._generate_relative_translations(normalize=True)
)
return self._normalized_relative_translations
else:
if self._relative_translations is None:
self._relative_translations = self._generate_relative_translations(
normalize=False
)
return self._relative_translations
def _generate_relative_translations(self, normalize=False):
relative_translations = self.info.get("relative-type-regex", {})
relative_dictionary = OrderedDict()
for key, value in relative_translations.items():
if normalize:
value = list(map(normalize_unicode, value))
pattern = "|".join(sorted(value, key=len, reverse=True))
pattern = pattern.replace(r"(\d+", r"(?P<n>\d+")
pattern = re.compile(
r"^(?:{})$".format(pattern), re.UNICODE | re.IGNORECASE
)
relative_dictionary[pattern] = key
return relative_dictionary
[docs]
def translate_search(self, search_string, settings=None):
    """
    Find and translate the runs of date-like tokens inside free text.

    Each sentence is tokenized into aligned original/simplified tokens.
    Consecutive tokens that are dictionary words, contain digits, or (after at
    least one accepted token) are timezone words form a chunk; any other token
    closes the current chunk.

    :param search_string: text to scan.
    :param settings: settings object forwarded to the helper methods.
    :return: a tuple ``(translated, original)`` of equally long lists of
        strings: the translated chunks and the matching source substrings.
    """
    dashes = ["-", "——", "—", "~"]
    word_joint_unsupported_languages = ["zh", "ja"]
    strip_chars = "()\"'{}[],.،"
    sentences = self._sentence_split(search_string, settings=settings)
    dictionary = self._get_dictionary(settings=settings)
    translated = []
    original = []
    for sentence in sentences:
        original_tokens, simplified_tokens = self._simplify_split_align(
            sentence, settings=settings
        )
        translated_chunk = []
        original_chunk = []
        last_token_index = len(simplified_tokens) - 1
        skip_next_token = False
        for i, word in enumerate(simplified_tokens):
            if skip_next_token:
                # This token was already consumed as the second half of a pair.
                skip_next_token = False
                continue

            has_next = i < last_token_index
            next_word = simplified_tokens[i + 1] if has_next else ""
            current_and_next_joined = self._join_chunk(
                [word, next_word], settings=settings
            )
            stripped = word.strip(strip_chars)

            if word == "" or word == " ":
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            elif (
                # A pair needs a real second token. Without this guard the last
                # token could match on its own (joined with "") and the
                # ``i + 1`` lookup below would raise IndexError.
                has_next
                and current_and_next_joined in dictionary
                and word not in dashes
                and self.shortname not in word_joint_unsupported_languages
            ):
                translated_chunk.append(dictionary[current_and_next_joined])
                original_chunk.append(
                    self._join_chunk(
                        [original_tokens[i], original_tokens[i + 1]],
                        settings=settings,
                    )
                )
                skip_next_token = True
            elif word in dictionary and word not in dashes:
                translated_chunk.append(dictionary[word])
                original_chunk.append(original_tokens[i])
            elif stripped in dictionary and word not in dashes:
                # Only trailing punctuation is re-attached. Measure it with
                # ``rstrip`` so stripped leading characters do not shift the
                # slice into the word itself.
                punct = word[len(word.rstrip(strip_chars)):]
                translation = dictionary[stripped]
                if punct and translation:
                    translated_chunk.append(translation + punct)
                else:
                    translated_chunk.append(translation)
                original_chunk.append(original_tokens[i])
            elif self._token_with_digits_is_ok(word):
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            # Use original token because word_is_tz is case sensitive
            elif translated_chunk and word_is_tz(original_tokens[i]):
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            elif translated_chunk:
                # Unknown token: close the chunk collected so far.
                translated.append(translated_chunk)
                translated_chunk = []
                original.append(original_chunk)
                original_chunk = []
        if translated_chunk:
            translated.append(translated_chunk)
            original.append(original_chunk)

    translated_strings = []
    original_strings = []
    for translated_chunk, original_chunk in zip(translated, original):
        if "in" in translated_chunk:
            translated_chunk = self._clear_future_words(translated_chunk)
        translated_strings.append(
            self._join_chunk(list(filter(bool, translated_chunk)), settings=settings)
        )
        original_strings.append(
            self._join_chunk(list(filter(bool, original_chunk)), settings=settings)
        )
    return translated_strings, original_strings
def _get_abbreviations(self, settings):
dictionary = self._get_dictionary(settings=settings)
abbreviations = []
if self._abbreviations is None:
for item in dictionary:
if item.endswith(".") and len(item) > 1:
abbreviations.append(item)
self._abbreviations = abbreviations
return self._abbreviations
def _sentence_split(self, string, settings):
abbreviations = self._get_abbreviations(settings=settings)
digit_abbreviations = ["[0-9]"] # numeric date with full stop
abbreviation_string = ""
for abbreviation in abbreviations:
abbreviation_string += (
"(?<! " + abbreviation[:-1] + ")"
) # negative lookbehind
if self.shortname in ["fi", "cs", "hu", "de", "da"]:
for digit_abbreviation in digit_abbreviations:
abbreviation_string += (
"(?<!" + digit_abbreviation + ")"
) # negative lookbehind
splitters_dict = {
1: r"[\.!?;…\r\n]+(?:\s|$)*", # most European, Tagalog, Hebrew, Georgian,
# Indonesian, Vietnamese
2: r"[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+", # Spanish
3: r"[|!?;\r\n]+(?:\s|$)+", # Hindi and Bangla
4: r"[。…‥\.!??!;\r\n]+(?:\s|$)+", # Japanese and Chinese
5: r"[\r\n]+", # Thai
6: r"[\r\n؟!\.…]+(?:\s|$)+",
} # Arabic and Farsi
if "sentence_splitter_group" not in self.info:
split_reg = abbreviation_string + splitters_dict[1]
sentences = re.split(split_reg, string)
else:
split_reg = (
abbreviation_string
+ splitters_dict[self.info["sentence_splitter_group"]]
)
sentences = re.split(split_reg, string)
sentences = filter(None, sentences)
return sentences
def _simplify_split_align(self, original, settings):
    """
    Tokenize a sentence twice and make the two token lists the same length.

    ``original`` is word-split as it is, and again after unicode
    normalization and simplification. When the lists differ in length, empty
    strings are inserted into the shorter one and then removed again from
    whichever list ended up longer.

    :param original: the sentence text.
    :param settings: settings object forwarded to the helper methods.
    :return: a tuple ``(original_tokens, simplified_tokens)``.
    """
    # TODO: Switch to new split method.
    original_tokens = self._word_split(original, settings=settings)
    simplified_tokens = self._word_split(
        self._simplify(normalize_unicode(original), settings=settings),
        settings=settings,
    )
    if len(original_tokens) == len(simplified_tokens):
        return original_tokens, simplified_tokens

    elif len(original_tokens) < len(simplified_tokens):
        # Simplification produced more tokens: pad the original side.
        add_empty = False
        for i, token in enumerate(simplified_tokens):
            if i < len(original_tokens):
                if token == normalize_unicode(original_tokens[i].lower()):
                    add_empty = False
                else:
                    # The first mismatch in a row is tolerated; a placeholder
                    # is inserted only from the second one onwards.
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        original_tokens.insert(i, "")
            else:
                # Ran past the end of the shorter list.
                original_tokens.insert(i, "")
    else:
        # Mirror image: simplification produced fewer tokens, so the
        # simplified side is padded instead.
        # NOTE(review): ``original_tokens`` is only read in this branch, but
        # ``simplified_tokens`` grows while the loop runs — confirm intended.
        add_empty = False
        for i, token in enumerate(original_tokens):
            if i < len(simplified_tokens):
                if normalize_unicode(token.lower()) == simplified_tokens[i]:
                    add_empty = False
                else:
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        simplified_tokens.insert(i, "")
            else:
                simplified_tokens.insert(i, "")

    # Padding can overshoot; drop placeholders from the longer list until the
    # lengths agree.
    # NOTE(review): ``list.remove`` raises ValueError when no "" is left in
    # the longer list — presumably that never happens; verify.
    while len(original_tokens) != len(simplified_tokens):
        if len(original_tokens) > len(simplified_tokens):
            original_tokens.remove("")
        else:
            simplified_tokens.remove("")
    return original_tokens, simplified_tokens
def _get_split_dictionary(self, settings):
if self._split_dictionary is None:
settings.NORMALIZE = True
dictionary = self._get_dictionary(settings=settings)
self._split_dictionary = self._split_dict(dictionary)
return self._split_dictionary
def _split_dict(self, dictionary):
newdict = {}
for item in dictionary:
if " " in item:
items = item.split()
for i in items:
newdict[i] = dictionary[item]
else:
newdict[item] = dictionary[item]
return newdict
def _word_split(self, string, settings):
if "no_word_spacing" in self.info:
return self._split(string, keep_formatting=True, settings=settings)
else:
return string.split()
def _split(self, date_string, keep_formatting, settings=None):
tokens = [date_string]
tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)"))
tokens = list(
self._split_tokens_by_known_words(
tokens, keep_formatting, settings=settings
)
)
return tokens
def _split_tokens_with_regex(self, tokens, regex):
tokens = tokens[:]
for i, token in enumerate(tokens):
tokens[i] = re.split(regex, token)
return filter(bool, chain.from_iterable(tokens))
def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
dictionary = self._get_dictionary(settings)
for i, token in enumerate(tokens):
tokens[i] = dictionary.split(token, keep_formatting)
return list(chain.from_iterable(tokens))
def _join_chunk(self, chunk, settings):
if "no_word_spacing" in self.info:
return self._join(chunk, separator="", settings=settings)
else:
return re.sub(r"\s{2,}", " ", " ".join(chunk))
def _token_with_digits_is_ok(self, token):
if "no_word_spacing" in self.info:
if re.search(r"[\d\.:\-/]+", token) is not None:
return True
else:
return False
else:
if re.search(r"\d+", token) is not None:
return True
else:
return False
def _simplify(self, date_string, settings=None):
date_string = date_string.lower()
simplifications = self._get_simplifications(settings=settings)
for simplification in simplifications:
pattern, replacement = list(simplification.items())[0]
date_string = pattern.sub(replacement, date_string).lower()
return date_string
def _get_simplifications(self, settings=None):
no_word_spacing = eval(self.info.get("no_word_spacing", "False"))
if settings.NORMALIZE:
if self._normalized_simplifications is None:
self._normalized_simplifications = []
simplifications = self._generate_simplifications(normalize=True)
for simplification in simplifications:
pattern, replacement = list(simplification.items())[0]
if not no_word_spacing:
pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
pattern = re.compile(pattern, flags=re.I | re.U)
self._normalized_simplifications.append({pattern: replacement})
return self._normalized_simplifications
else:
if self._simplifications is None:
self._simplifications = []
simplifications = self._generate_simplifications(normalize=False)
for simplification in simplifications:
pattern, replacement = list(simplification.items())[0]
if not no_word_spacing:
pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
pattern = re.compile(pattern, flags=re.I | re.U)
self._simplifications.append({pattern: replacement})
return self._simplifications
def _generate_simplifications(self, normalize=False):
simplifications = []
for simplification in self.info.get("simplifications", []):
c_simplification = {}
key, value = list(simplification.items())[0]
if normalize:
key = normalize_unicode(key)
if isinstance(value, int):
c_simplification[key] = str(value)
else:
c_simplification[key] = normalize_unicode(value) if normalize else value
simplifications.append(c_simplification)
return simplifications
def _clear_future_words(self, words):
freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
if set(words).isdisjoint(freshness_words):
words.remove("in")
return words
def _join(self, tokens, separator=" ", settings=None):
if not tokens:
return ""
capturing_splitters = self._get_splitters(settings)["capturing"]
joined = tokens[0]
for i in range(1, len(tokens)):
left, right = tokens[i - 1], tokens[i]
if left not in capturing_splitters and right not in capturing_splitters:
joined += separator
joined += right
return joined
def _get_dictionary(self, settings=None):
if not settings.NORMALIZE:
if self._dictionary is None:
self._generate_dictionary()
self._dictionary._settings = settings
return self._dictionary
else:
if self._normalized_dictionary is None:
self._generate_normalized_dictionary()
self._normalized_dictionary._settings = settings
return self._normalized_dictionary
def _get_wordchars(self, settings=None):
if self._wordchars is None:
self._set_wordchars(settings)
return self._wordchars
def _get_splitters(self, settings=None):
if self._splitters is None:
self._set_splitters(settings)
return self._splitters
def _set_splitters(self, settings=None):
    """
    Compute the locale's splitter sets and store them in ``self._splitters``.

    :param settings: settings object forwarded to ``_get_wordchars``.
    """
    # The ones that are not filtered out from tokens after split:
    capturing = set(ALWAYS_KEEP_TOKENS)
    wordchars = self._get_wordchars(settings)
    candidates = set(self.info.get("skip", [])) | capturing
    # The ones that split string only if they are not surrounded by letters
    # from both sides: candidates made only of non-word characters that are
    # also listed among the locale's word characters.
    word_splitters = {
        token
        for token in candidates
        if re.match(r"^\W+$", token, re.UNICODE) and token in wordchars
    }
    self._splitters = {
        "wordchars": word_splitters,
        "capturing": capturing,
    }
def _set_wordchars(self, settings=None):
wordchars = set()
for word in self._get_dictionary(settings):
if re.match(r"^[\W\d_]+$", word, re.UNICODE):
continue
for char in word:
wordchars.add(char.lower())
self._wordchars = wordchars - {" "} | {
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
}
[docs]
def get_wordchars_for_detection(self, settings):
    """
    Return the characters of dictionary words that are used for detection.

    Entries made only of non-word characters, digits and underscores are
    ignored. Characters are lower-cased; digits, ``:``, ``(``, ``)``, ``'``,
    the letters ``q``, ``a``, ``m``, ``p`` and the space are excluded. The
    result is cached on the instance.

    :param settings: settings object forwarded to ``_get_dictionary``.
    :return: set of characters.
    """
    if self._wordchars_for_detection is not None:
        return self._wordchars_for_detection

    letters = {
        char.lower()
        for word in self._get_dictionary(settings)
        if not re.match(r"^[\W\d_]+$", word, re.UNICODE)
        for char in word
    }
    excluded = set("0123456789") | {":", "(", ")", "'", "q", "a", "m", "p", " "}
    self._wordchars_for_detection = letters - excluded
    return self._wordchars_for_detection
def _generate_dictionary(self, settings=None):
    """
    Build the plain ``Dictionary`` for this locale and cache it.

    :param settings: settings object handed to the ``Dictionary`` constructor.
    """
    built = Dictionary(self.info, settings=settings)
    self._dictionary = built
def _generate_normalized_dictionary(self, settings=None):
    """
    Build the ``NormalizedDictionary`` for this locale and cache it.

    :param settings: settings object handed to the ``NormalizedDictionary``
        constructor.
    """
    built = NormalizedDictionary(self.info, settings=settings)
    self._normalized_dictionary = built
[docs]
def to_parserinfo(self, base_cls=parser.parserinfo):
    """
    Build a parser-info class populated from this locale's data.

    :param base_cls: class the generated class derives from.
    :return: a new class named ``"<name>ParserInfo"`` (``<name>`` taken from
        ``info["name"]``) whose ``JUMP``, ``PERTAIN``, ``WEEKDAYS``, ``MONTHS``
        and ``HMS`` attributes are filled from ``self.info``.
    :raises KeyError: if a weekday, month, time-unit or ``"name"`` entry is
        missing from ``self.info``.
    """
    weekday_keys = (
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
    )
    month_keys = (
        "january",
        "february",
        "march",
        "april",
        "may",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
    )
    attributes = {
        "JUMP": self.info.get("skip", []),
        "PERTAIN": self.info.get("pertain", []),
        "WEEKDAYS": [self.info[day] for day in weekday_keys],
        "MONTHS": [self.info[month] for month in month_keys],
        "HMS": [self.info["hour"], self.info["minute"], self.info["second"]],
    }
    name = "{language}ParserInfo".format(language=self.info["name"])
    # ``type`` takes (name, bases, namespace) positionally and the bases must
    # be a tuple. Passing ``bases=`` / ``dict=`` as keywords raises TypeError.
    return type(name, (base_cls,), attributes)