# -*- coding: utf-8 -*-
import regex as re
from itertools import chain
from dateutil import parser
from dateparser.timezone_parser import pop_tz_offset_from_string
from dateparser.utils import wrap_replacement_for_regex
from .dictionary import Dictionary, ALWAYS_KEEP_TOKENS
from .validation import LanguageValidator
class Language(object):
    """Language-specific helper for recognising and translating date strings.

    Wraps a language definition mapping (``language_info``) and lazily builds
    the ``Dictionary``, the set of word characters and the splitter sets that
    are derived from it.
    """

    # Lazily populated caches; filled on first use by the ``_get_*`` accessors.
    _dictionary = None
    _splitters = None
    _wordchars = None

    def __init__(self, shortname, language_info):
        """Store the language id and its definition.

        Integer replacement values found in ``language_info['simplifications']``
        are converted to strings. Note that this rewrites the entries of
        ``language_info`` in place.

        :param shortname: identifier of the language; passed as
            ``language_id`` to the validator.
        :param language_info: mapping describing the language.
        """
        self.shortname = shortname
        self.info = language_info
        for simplification in self.info.get('simplifications', []):
            # Each simplification is read as a single-item mapping
            # ``{pattern: replacement}``; only its first item is used.
            key, value = list(simplification.items())[0]
            if isinstance(value, int):
                simplification[key] = str(value)

    def validate_info(self, validator=None):
        """Validate the language definition.

        :param validator: object exposing ``validate_info(language_id, info)``;
            defaults to ``LanguageValidator``.
        :return: whatever the validator's ``validate_info`` returns.
        """
        if validator is None:
            validator = LanguageValidator
        return validator.validate_info(language_id=self.shortname, info=self.info)

    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """Tell whether ``date_string`` can be handled by this language.

        The string is simplified and tokenized; it is applicable when every
        token is either made of digits or present in the language dictionary.

        :param date_string: the text to check.
        :param strip_timezone: when true, the string is first passed through
            ``pop_tz_offset_from_string`` and only the returned string is used.
        :param settings: settings object forwarded to the dictionary.
        :return: ``True`` if applicable, ``False`` otherwise.
        """
        if strip_timezone:
            date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

        date_string = self._simplify(date_string)
        tokens = self._split(date_string, keep_formatting=False, settings=settings)
        if self._is_date_consists_of_digits_only(tokens):
            return True
        return self._are_all_words_in_the_dictionary(tokens, settings)

    def translate(self, date_string, keep_formatting=False, settings=None):
        """Replace the known words of ``date_string`` with their dictionary values.

        :param date_string: the text to translate.
        :param keep_formatting: forwarded to the tokenizer; when true the
            resulting tokens are joined without a separator, otherwise with a
            single space.
        :param settings: settings object forwarded to the dictionary.
        :return: the translated string.
        """
        date_string = self._simplify(date_string)
        words = self._split(date_string, keep_formatting, settings=settings)

        dictionary = self._get_dictionary(settings)
        for i, word in enumerate(words):
            word = word.lower()
            if word in dictionary:
                # A falsy dictionary value becomes '' and is dropped below.
                words[i] = dictionary[word] or ''

        return self._join(
            [word for word in words if word],
            separator="" if keep_formatting else " ",
            settings=settings)

    def _simplify(self, date_string):
        """Lower-case ``date_string`` and apply the language's simplifications.

        Each simplification is a ``{pattern: replacement}`` regex substitution.
        Unless the language sets ``no_word_spacing``, the pattern is wrapped so
        that it only matches between boundaries (string start/end, a digit, an
        underscore or a non-word character) and the replacement is passed
        through ``wrap_replacement_for_regex`` (presumably so the captured
        boundary characters are kept -- confirm against that helper).
        """
        date_string = date_string.lower()
        uses_word_spacing = not self.info.get('no_word_spacing', False)
        for simplification in self.info.get('simplifications', []):
            pattern, replacement = list(simplification.items())[0]
            if uses_word_spacing:
                replacement = wrap_replacement_for_regex(replacement, pattern)
                pattern = r'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern
            date_string = re.sub(
                pattern, replacement, date_string,
                flags=re.IGNORECASE | re.UNICODE).lower()
        return date_string

    def _is_date_consists_of_digits_only(self, tokens):
        """Return ``True`` when every token is a digit string (or there are none)."""
        return all(token.isdigit() for token in tokens)

    def _are_all_words_in_the_dictionary(self, words, settings=None):
        """Return ``True`` when every word is numeric or known to the dictionary.

        Words are lower-cased before the lookup. An empty ``words`` yields
        ``True``.
        """
        dictionary = self._get_dictionary(settings=settings)
        lowered = (word.lower() for word in words)
        return all(word.isdigit() or word in dictionary for word in lowered)

    def _split(self, date_string, keep_formatting, settings=None):
        """Tokenize ``date_string``: first around digit runs, then by known words."""
        tokens = [date_string]
        tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)"))
        tokens = list(
            self._split_tokens_by_known_words(tokens, keep_formatting, settings=settings))
        return tokens

    def _split_tokens_with_regex(self, tokens, regex):
        """Split every token with ``regex`` and drop the empty pieces.

        ``tokens`` is not modified.

        :return: an iterable over the non-empty pieces, in order.
        """
        # Split eagerly so the result does not depend on later changes to
        # ``tokens``; only the filtering of empty pieces is lazy.
        pieces = [re.split(regex, token) for token in tokens]
        return filter(bool, chain.from_iterable(pieces))

    def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
        """Split every token with the language dictionary's ``split`` method.

        ``tokens`` is not modified.

        :return: a flat list of the resulting tokens, in order.
        """
        dictionary = self._get_dictionary(settings)
        return list(chain.from_iterable(
            dictionary.split(token, keep_formatting) for token in tokens))

    def _join(self, tokens, separator=" ", settings=None):
        """Concatenate ``tokens`` into one string.

        ``separator`` is inserted between two neighbouring tokens only when
        neither of them is a capturing splitter.

        :return: the joined string, or ``""`` for an empty ``tokens``.
        """
        if not tokens:
            return ""

        capturing_splitters = self._get_splitters(settings)['capturing']
        joined = tokens[0]
        for i in range(1, len(tokens)):
            left, right = tokens[i - 1], tokens[i]
            if left not in capturing_splitters and right not in capturing_splitters:
                joined += separator
            joined += right
        return joined

    def _get_dictionary(self, settings=None):
        """Return the cached dictionary, creating it on first use.

        The dictionary's ``_settings`` attribute is overwritten with
        ``settings`` on every call.
        """
        if self._dictionary is None:
            self._generate_dictionary()
        self._dictionary._settings = settings
        return self._dictionary

    def _get_wordchars(self, settings=None):
        """Return the cached set of word characters, computing it on first use."""
        if self._wordchars is None:
            self._set_wordchars(settings)
        return self._wordchars

    def _get_splitters(self, settings=None):
        """Return the cached splitter sets, computing them on first use."""
        if self._splitters is None:
            self._set_splitters(settings)
        return self._splitters

    def _set_splitters(self, settings=None):
        """Compute and cache the ``'wordchars'`` and ``'capturing'`` splitter sets."""
        splitters = {
            'wordchars': set(),  # The ones that split string only if they are not surrounded by letters from both sides
            'capturing': set(),  # The ones that are not filtered out from tokens after split
        }
        splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get('skip', [])) | splitters['capturing']
        for token in skip:
            # Only tokens made purely of non-word characters are considered.
            if not re.match(r'^\W+$', token, re.UNICODE):
                continue
            if token in wordchars:
                splitters['wordchars'].add(token)

        self._splitters = splitters

    def _set_wordchars(self, settings=None):
        """Compute and cache the set of characters that occur inside words.

        Collects the lower-cased characters of every dictionary entry that is
        not made solely of non-word characters, digits and underscores. The
        space is then removed and the ten ASCII digits are added.
        """
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r'^[\W\d_]+$', word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())

        self._wordchars = (wordchars - {" "}) | set("0123456789")

    def _generate_dictionary(self, settings=None):
        """Build the ``Dictionary`` for this language and cache it."""
        self._dictionary = Dictionary(self.info, settings=settings)

    def to_parserinfo(self, base_cls=parser.parserinfo):
        """Create a ``parserinfo`` subclass populated from this language.

        :param base_cls: class to derive from; defaults to
            ``dateutil.parser.parserinfo``.
        :return: a new class named ``'<name>ParserInfo'`` carrying ``JUMP``,
            ``PERTAIN``, ``WEEKDAYS``, ``MONTHS`` and ``HMS`` attributes.
        :raises KeyError: if the language definition lacks ``'name'`` or any
            of the weekday, month or hour/minute/second entries.
        """
        info = self.info
        weekday_keys = ('monday', 'tuesday', 'wednesday', 'thursday',
                        'friday', 'saturday', 'sunday')
        month_keys = ('january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november',
                      'december')
        hms_keys = ('hour', 'minute', 'second')

        attributes = {
            'JUMP': info.get('skip', []),
            'PERTAIN': info.get('pertain', []),
            'WEEKDAYS': [info[key] for key in weekday_keys],
            'MONTHS': [info[key] for key in month_keys],
            'HMS': [info[key] for key in hms_keys],
        }
        name = '{language}ParserInfo'.format(language=info['name'])
        # The three-argument form of type() takes positional arguments and
        # requires the bases to be a tuple.
        return type(name, (base_cls,), attributes)