from itertools import chain, zip_longest
from operator import methodcaller
import regex as re
from dateparser.utils import normalize_unicode
# Separator characters; included in ALWAYS_KEEP_TOKENS below so they survive splitting.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]
# Locale-independent tokens; mapped from their lowercase form to this canonical casing.
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]
# Tokens kept when splitting even when formatting is being discarded.
ALWAYS_KEEP_TOKENS = ["+"] + PARSER_HARDCODED_TOKENS
# Canonical word tokens; locale info may supply localized variants for each of these.
KNOWN_WORD_TOKENS = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
    "decade",
    "year",
    "month",
    "week",
    "day",
    "hour",
    "minute",
    "second",
    "ago",
    "in",
    "am",
    "pm",
]
# Matches "(" and ")"; used to strip grouping parentheses from relative-type regexes.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")
# Captures runs of digits so numerals become separate tokens when splitting.
NUMERAL_PATTERN = re.compile(r"(\d+)")
# Matches any token containing at least one word character other than "_".
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.U)
class UnknownTokenError(Exception):
    """Error for date-string tokens not recognized by the locale dictionary.

    Not raised in this module; presumably raised by callers elsewhere.
    """

    pass
class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :return: a Dictionary instance.
    """

    # Class-level caches shared across all instances, keyed first by
    # ``settings.registry_key`` and then by the locale name.
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        dictionary = {}
        self._settings = settings
        self.info = locale_info
        # "skip" and "pertain" words translate to None (they carry no meaning).
        if "skip" in locale_info:
            skip = map(methodcaller("lower"), locale_info["skip"])
            dictionary.update(zip_longest(skip, [], fillvalue=None))
        if "pertain" in locale_info:
            pertain = map(methodcaller("lower"), locale_info["pertain"])
            dictionary.update(zip_longest(pertain, [], fillvalue=None))
        # Map each localized variant to its canonical English word token.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                translations = map(methodcaller("lower"), locale_info[word])
                dictionary.update(zip_longest(translations, [], fillvalue=word))
        # Separator tokens map to themselves; parser-known tokens map from
        # their lowercase form to the canonical spelling.
        dictionary.update(zip_longest(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
        dictionary.update(
            zip_longest(
                map(methodcaller("lower"), PARSER_KNOWN_TOKENS), PARSER_KNOWN_TOKENS
            )
        )
        # Relative expressions map to their relative-type key.
        relative_type = locale_info.get("relative-type", {})
        for key, value in relative_type.items():
            relative_translations = map(methodcaller("lower"), value)
            dictionary.update(zip_longest(relative_translations, [], fillvalue=key))
        self._dictionary = dictionary
        # Locale data stores this flag as the string "True"/"False".  Parse it
        # with a plain string comparison instead of eval(): eval() would
        # execute arbitrary expressions coming from the locale data.
        no_word_spacing = locale_info.get("no_word_spacing", "False")
        self._no_word_spacing = str(no_word_spacing).lower() == "true"
        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    def __contains__(self, key):
        # Tokens listed in settings.SKIP_TOKENS are always treated as known.
        if key in self._settings.SKIP_TOKENS:
            return True
        return key in self._dictionary

    def __getitem__(self, key):
        # Skip tokens translate to nothing, exactly like "skip" words.
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        :param tokens:
            a list of string tokens.
        :type tokens: list
        :return: True if tokens are valid, False otherwise.
        """
        # A string consisting of nothing but separators carries no date
        # information, so it is not considered valid.
        has_only_keep_tokens = not set(tokens) - set(ALWAYS_KEEP_TOKENS)
        if has_only_keep_tokens:
            return False
        match_relative_regex = self._get_match_relative_regex_cache()
        # Every token must be a number, a relative expression, or a known word.
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be split.
        :type string: str
        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool
        :return: A list of string tokens formed after splitting the date string.
        """
        if not string:
            return string
        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()
        # First carve out whole relative expressions, then split the remaining
        # fragments on known words.
        tokens = split_relative_regex.split(string)
        for i, token in enumerate(tokens):
            if match_relative_regex.match(token):
                tokens[i] = [token]
            else:
                tokens[i] = self._split_by_known_words(token, keep_formatting)
        return list(filter(bool, chain.from_iterable(tokens)))

    def _add_to_cache(self, value, cache):
        # Store under (registry key, locale name); evict the oldest registry
        # entry when the configured cache size limit is exceeded.
        cache.setdefault(self._settings.registry_key, {})[self.info["name"]] = value
        if (
            self._settings.CACHE_SIZE_LIMIT
            and len(cache) > self._settings.CACHE_SIZE_LIMIT
        ):
            cache.pop(list(cache.keys())[0])

    def _split_by_known_words(self, string, keep_formatting):
        # Recursively peel off the first known word: text before it is split
        # on numerals, text after it is processed by the same routine.
        if not string:
            return string
        regex = self._get_split_regex_cache()
        match = regex.match(string)
        if not match:
            # No known word present; the whole fragment is either captured
            # (split on numerals) or dropped.
            return (
                self._split_by_numerals(string, keep_formatting)
                if self._should_capture(string, keep_formatting)
                else []
            )
        unparsed, known, unknown = match.groups()
        splitted = [known] if self._should_capture(known, keep_formatting) else []
        if unparsed and self._should_capture(unparsed, keep_formatting):
            splitted = self._split_by_numerals(unparsed, keep_formatting) + splitted
        if unknown:
            splitted.extend(self._split_by_known_words(unknown, keep_formatting))
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        # Separate digit runs from the surrounding text.
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        # Keep a token when formatting is preserved, when it is a separator we
        # always keep, or when it contains a word character other than "_".
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        # All known words (including skip tokens), longest first so that regex
        # alternation prefers the longest match.
        if (
            self._settings.registry_key not in self._sorted_words_cache
            or self.info["name"]
            not in self._sorted_words_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_words_cache,
                value=sorted(self, key=len, reverse=True),
            )
        return self._sorted_words_cache[self._settings.registry_key][self.info["name"]]

    def _get_split_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_regex_cache
            or self.info["name"]
            not in self._split_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_regex()
        return self._split_regex_cache[self._settings.registry_key][self.info["name"]]

    def _construct_split_regex(self):
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            # Words may abut anything in no-word-spacing locales.
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            # Otherwise a known word must be delimited by a non-word
            # character, "_" or a digit on both sides.
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        # Relative-type regex sources with grouping parentheses stripped,
        # longest first for the same alternation-precedence reason as above.
        if (
            self._settings.registry_key not in self._sorted_relative_strings_cache
            or self.info["name"]
            not in self._sorted_relative_strings_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_relative_strings_cache,
                value=sorted(
                    [
                        PARENTHESES_PATTERN.sub("", key)
                        for key in self._relative_strings
                    ],
                    key=len,
                    reverse=True,
                ),
            )
        return self._sorted_relative_strings_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _get_split_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_relative_regex_cache
            or self.info["name"]
            not in self._split_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_relative_regex()
        return self._split_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_split_relative_regex(self):
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            # Lookaround keeps the delimiters out of the captured token.
            regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._match_relative_regex_cache
            or self.info["name"]
            not in self._match_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_match_relative_regex()
        return self._match_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_match_relative_regex(self):
        # Anchored variant: the whole token must be a relative expression.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )
class NormalizedDictionary(Dictionary):
    """Dictionary variant whose keys are Unicode-normalized after construction."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        # Rebuild the translation table with Unicode-normalized keys.  A key
        # whose normalized form collides with a different existing key is set
        # aside and only re-added if it is a "skip"/"pertain" word.
        normalized_map = {}
        deferred = []
        for token, target in self._dictionary.items():
            folded = normalize_unicode(token)
            if folded != token and folded in self._dictionary:
                deferred.append(token)
            else:
                normalized_map[folded] = target
        priority_tokens = self.info.get("skip", []) + self.info.get("pertain", [])
        for token in deferred:
            if token in priority_tokens:
                normalized_map[normalize_unicode(token)] = self._dictionary[token]
        self._dictionary = normalized_map
        self._relative_strings = [
            normalize_unicode(item) for item in self._relative_strings
        ]