Source code for dateparser.freshness_date_parser

from datetime import datetime, time, timezone

import regex as re
from dateutil.relativedelta import relativedelta
from tzlocal import get_localzone

from dateparser.utils import apply_timezone, localize_timezone, strip_braces

from .parser import time_parser
from .timezone_parser import pop_tz_offset_from_string

_UNITS = r"decade|year|month|week|day|hour|minute|second"
PATTERN = re.compile(r"(\d+[.,]?\d*)\s*(%s)\b" % _UNITS, re.I | re.S | re.U)


[docs] class FreshnessDateDataParser: """Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """ def _are_all_words_units(self, date_string): skip = [_UNITS, r"ago|in|\d+", r":|[ap]m"] date_string = re.sub(r"\s+", " ", date_string.strip()) words = [x for x in re.split(r"\W", date_string) if x] words = [x for x in words if not re.match(r"%s" % "|".join(skip), x)] return not words def _parse_time(self, date_string, settings): """Attempts to parse time part of date strings like '1 day ago, 2 PM'""" date_string = PATTERN.sub("", date_string) date_string = re.sub(r"\b(?:ago|in)\b", "", date_string) try: return time_parser(date_string) except: pass
[docs] def get_local_tz(self): return get_localzone()
[docs] def parse(self, date_string, settings): date_string = strip_braces(date_string) date_string, ptz = pop_tz_offset_from_string(date_string) _time = self._parse_time(date_string, settings) _settings_tz = settings.TIMEZONE.lower() def apply_time(dateobj, timeobj): if not isinstance(_time, time): return dateobj return dateobj.replace( hour=timeobj.hour, minute=timeobj.minute, second=timeobj.second, microsecond=timeobj.microsecond, ) if settings.RELATIVE_BASE: now = settings.RELATIVE_BASE if "local" not in _settings_tz: now = localize_timezone(now, settings.TIMEZONE) if ptz: if now.tzinfo: now = now.astimezone(ptz) else: if hasattr(ptz, "localize"): now = ptz.localize(now) else: now = now.replace(tzinfo=ptz) if not now.tzinfo: if hasattr(self.get_local_tz(), "localize"): now = self.get_local_tz().localize(now) else: now = now.replace(tzinfo=self.get_local_tz()) elif ptz: localized_now = datetime.now(ptz) if "local" in _settings_tz: now = localized_now else: now = apply_timezone(localized_now, settings.TIMEZONE) else: if "local" not in _settings_tz: utc_dt = datetime.now(tz=timezone.utc) now = apply_timezone(utc_dt, settings.TIMEZONE) else: now = datetime.now(self.get_local_tz()) date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM) if date: old_date = date date = apply_time(date, _time) if settings.RETURN_TIME_AS_PERIOD and old_date != date: period = "time" if settings.TO_TIMEZONE: date = apply_timezone(date, settings.TO_TIMEZONE) if not settings.RETURN_AS_TIMEZONE_AWARE or ( settings.RETURN_AS_TIMEZONE_AWARE and "default" == settings.RETURN_AS_TIMEZONE_AWARE and not ptz ): date = date.replace(tzinfo=None) return date, period
def _parse_date(self, date_string, now, prefer_dates_from): if not self._are_all_words_units(date_string): return None, None kwargs = self.get_kwargs(date_string) if not kwargs: return None, None period = "day" if "days" not in kwargs: for k in ["weeks", "months", "years"]: if k in kwargs: period = k[:-1] break td = relativedelta(**kwargs) if ( re.search(r"\bin\b", date_string) or re.search(r"\bfuture\b", prefer_dates_from) and not re.search(r"\bago\b", date_string) ): date = now + td else: date = now - td return date, period
[docs] def get_kwargs(self, date_string): m = PATTERN.findall(date_string) if not m: return {} kwargs = {} for num, unit in m: kwargs[unit + "s"] = float(num.replace(",", ".")) if "decades" in kwargs: kwargs["years"] = 10 * kwargs["decades"] + kwargs.get("years", 0) del kwargs["decades"] return kwargs
[docs] def get_date_data(self, date_string, settings=None): from dateparser.date import DateData date, period = self.parse(date_string, settings) return DateData(date_obj=date, period=period)
freshness_date_parser = FreshnessDateDataParser()