dateparser package¶
Submodules¶
dateparser.date module¶
-
class
dateparser.date.
DateDataParser
(language=None, allow_redetect_language=False)[source]¶ Bases:
object
-
get_date_data
(date_string, date_formats=None)[source]¶ Return a dictionary with a date object and a period. Period values can be ‘day’ (default), ‘week’, ‘month’ or ‘year’. It aims to solve the following issue: for example, a forum could display “2 weeks ago” in the thread list (the thread itself shows the right date), so the engine will translate “2 weeks ago” to a certain date. The next thread summary displays “3 weeks ago”, which is translated to another date seven days before the first date. A valid date_string between both dates won’t be scraped because it’s not an exact date match. The period field helps to build better date range detection.
TODO: Timezone issues
-
-
dateparser.date.
parse_with_formats
(date_string, date_formats, final_call=False, alt_parser=None)[source]¶ Parse with formats and return a result that depends on the final_call argument. If final_call is True, return a dictionary with ‘period’ and ‘obj_date’, because this data won’t be processed by any method outside. If final_call is False, return only an ‘obj_date’, because it will be processed further.
Returns: datetime.datetime, dict or None
dateparser.date_parser module¶
-
class
dateparser.date_parser.
AutoDetectLanguage
(language=None, allow_redetection=False, *args, **kwargs)[source]¶ Bases:
dateparser.date_parser.DateParsingStrategy
Date parser with support for language detection.
It uses the get_language_candidates() function to get the possible languages for each date, keeps track of the previously detected languages and uses this information to reduce the set of possible languages.
-
class
dateparser.date_parser.
BaseParserInfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateutil.parser.parserinfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']']¶
-
-
class
dateparser.date_parser.
DateParser
(language=None, allow_redetect_language=False)[source]¶ Bases:
object
-
class
dateparser.date_parser.
ExactLanguage
(language, *args, **kwargs)[source]¶ Bases:
dateparser.date_parser.DateParsingStrategy
Date parser that works only for a specific language
-
exception
dateparser.date_parser.
LanguageWasNotSeenBeforeError
[source]¶ Bases:
exceptions.RuntimeError
-
class
dateparser.date_parser.
cz_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
MONTHS
= [(u'leden', u'led'), (u'\xfanor', u'\xfano'), (u'b\u0159ezen', u'b\u0159e'), (u'duben', u'dub'), (u'kv\u011bten', u'kv\u011b'), (u'\u010derven', u'\u010der'), (u'\u010dervenec', u'\u010drc'), (u'srpen', u'srp'), (u'z\xe1\u0159\xed', u'z\xe1\u0159'), (u'\u0159\xedjen', u'\u0159\xedj'), (u'listopad', u'lis'), (u'prosinec', u'pro')]¶
-
WEEKDAYS
= [(u'pond\u011bl\xed', u'pon'), (u'\xfater\xfd', u'\xfate'), (u'st\u0159eda', u'st\u0159'), (u'\u010dtvrtek', u'\u010dtv'), (u'p\xe1tek', u'p\xe1t'), (u'sobota', u'sob'), (u'ned\u011ble', u'ned')]¶
-
-
dateparser.date_parser.
dateutil_parse
(date_string, **kwargs)[source]¶ Wrapper function around dateutil.parser.parse
-
class
dateparser.date_parser.
de_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']', u'um', u'uhr']¶
-
MONTHS
= [(u'Januar', u'Jan'), (u'Februar', u'Feb'), (u'M\xe4rz',), (u'April', u'Apr'), (u'Mai',), (u'Juni',), (u'Juli',), (u'August', u'Aug'), (u'September', u'Sept'), (u'Oktober', u'Okt'), (u'November', u'Nov'), (u'Dezember', u'Dez')]¶
-
-
class
dateparser.date_parser.
en_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u'and', u'@', u'ad', u'on', u'm', u'at', u'[', u'st', u']', u' ', u"'", u'of', u'nd', u'-', u',', u'/', u'.', u'rd', u't', u'th', u';', u'|']¶
-
-
class
dateparser.date_parser.
es_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']', u'de', u'del']¶
-
MONTHS
= [(u'enero', u'ene'), (u'febrero', u'feb'), (u'marzo', u'mar'), (u'abril', u'abr'), (u'mayo', u'may'), (u'junio', u'jun'), (u'julio', u'jul'), (u'agosto', u'ago'), (u'septiembre', u'setiembre', u'sep', u'set'), (u'octubre', u'oct'), (u'noviembre', u'nov'), (u'diciembre', u'dic')]¶
-
PERTAIN
= [u'de', u'del']¶
-
WEEKDAYS
= [(u'Lunes',), (u'Martes',), (u'Mi\xe9rcoles',), (u'Jueves',), (u'Viernes',), (u'S\xe1bado',), (u'Domingo',)]¶
-
-
class
dateparser.date_parser.
fr_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']', u'le']¶
-
MONTHS
= [(u'janvier', u'janv', u'jan'), (u'f\xe9vrier', u'f\xe9vr', u'f\xe9v'), (u'mars', u'mar'), (u'avril', u'avr'), u'mai', u'juin', (u'juillet', u'juil'), (u'ao\xfbt', u'ao\xfb'), (u'septembre', u'sept', u'sep'), (u'octobre', u'oct'), (u'novembre', u'nov'), (u'd\xe9cembre', u'd\xe9c')]¶
-
WEEKDAYS
= [(u'Lundi',), (u'Mardi',), (u'Mercredi',), (u'Jeudi',), (u'Vendredi',), (u'Samedi',), (u'Dimanche',)]¶
-
-
dateparser.date_parser.
get_language_candidates
(tokens, languages=None, exclude_languages=None)[source]¶ Find the languages that have a word matching at least one of the given tokens and in which all of the given tokens are known.
-
class
dateparser.date_parser.
it_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
MONTHS
= [(u'gennaio', u'gen'), (u'febbraio', u'feb'), (u'marzo', u'mar'), (u'aprile', u'apr'), (u'maggio', u'mag'), (u'giugno', u'giu'), (u'luglio', u'lug'), (u'agosto', u'ago'), (u'settembre', u'set'), (u'ottobre', u'ott'), (u'novembre', u'nov'), (u'dicembre', u'dic')]¶
-
-
class
dateparser.date_parser.
new_relativedelta
(dt1=None, dt2=None, years=0, months=0, days=0, leapdays=0, weeks=0, hours=0, minutes=0, seconds=0, microseconds=0, year=None, month=None, day=None, weekday=None, yearday=None, nlyearday=None, hour=None, minute=None, second=None, microsecond=None)[source]¶ Bases:
dateutil.relativedelta.relativedelta
dateutil does not check whether the result of parsing a weekday is in the future. Since item dates are already in the past, we need to fix this particular case.
-
class
dateparser.date_parser.
nl_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
MONTHS
= [(u'januari', u'jan'), (u'februari', u'feb'), (u'maart', u'mrt'), (u'april', u'apr'), (u'mei',), (u'juni', u'jun'), (u'juli', u'jul'), (u'augustus', u'aug'), (u'september', u'sep'), (u'oktober', u'okt'), (u'november', u'nov'), (u'december', u'dec')]¶
-
WEEKDAYS
= [(u'Maandag', u'ma'), (u'Dinsdag', u'di'), (u'Woensdag', u'wo'), (u'Donderdag', u'do'), (u'Vrijdag', u'vr'), (u'Zaterdag', u'za'), (u'Zondag', u'zo')]¶
-
-
dateparser.date_parser.
parse_using_languages
(date_string, date_format, languages)[source]¶ Try parsing the date using the given format for each of the languages given as an argument.
-
class
dateparser.date_parser.
pt_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']', u'de']¶
-
MONTHS
= [(u'janeiro', u'jan'), (u'fevereiro', u'fev'), (u'mar\xe7o', u'mar'), (u'abril', u'abr'), (u'maio', u'mai'), (u'junho', u'jun'), (u'julho', u'jul'), (u'agosto', u'ago'), (u'septembro', u'setembro', u'septemberembro', u'set'), (u'outubro', u'out'), (u'novembro', u'nov'), (u'dezembro', u'dez')]¶
-
PERTAIN
= [u'de']¶
-
WEEKDAYS
= [(u'Segunda-feira',), (u'Ter\xe7a-feira',), (u'Quarta-feira',), (u'Quinta-feira',), (u'Sexta-feira',), (u'S\xe1bado',), (u'Domingo',)]¶
-
-
class
dateparser.date_parser.
ro_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
MONTHS
= [(u'ianuarie', u'ian'), (u'februarie', u'feb'), (u'martie', u'mar'), (u'aprilie', u'apr'), (u'mai',), (u'iunie',), (u'iulie',), (u'august', u'aug'), (u'septembrie', u'sept'), (u'octombrie', u'oct'), (u'noiembrie', u'noiem'), (u'decembrie', u'dec')]¶
-
-
class
dateparser.date_parser.
ru_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
JUMP
= [u' ', u'.', u',', u';', u'-', u'/', u"'", u'|', u'@', u'[', u']', u'\u0432']¶
-
MONTHS
= [(u'\u044f\u043d\u0432\u0430\u0440\u044f', u'\u042f\u043d\u0432\u0430\u0440\u044f'), (u'\u0444\u0435\u0432\u0440\u0430\u043b\u044f', u'\u0424\u0435\u0432\u0440\u0430\u043b\u044f'), (u'\u043c\u0430\u0440\u0442\u0430', u'\u041c\u0430\u0440\u0442\u0430'), (u'\u0430\u043f\u0440\u0435\u043b\u044f', u'\u0410\u043f\u0440\u0435\u043b\u044f'), (u'\u043c\u0430\u044f', u'\u041c\u0430\u044f'), (u'\u0438\u044e\u043d\u044f', u'\u0418\u044e\u043d\u044f'), (u'\u0438\u044e\u043b\u044f', u'\u0418\u044e\u043b\u044f'), (u'\u0430\u0432\u0433\u0443\u0441\u0442\u0430', u'\u0410\u0432\u0433\u0443\u0441\u0442\u0430'), (u'\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044f', u'\u0421\u0435\u043d\u0442\u044f\u0431\u0440\u044f'), (u'\u043e\u043a\u0442\u044f\u0431\u0440\u044f', u'\u041e\u043a\u0442\u044f\u0431\u0440\u044f'), (u'\u043d\u043e\u044f\u0431\u0440\u044f', u'\u041d\u043e\u044f\u0431\u0440\u044f'), (u'\u0434\u0435\u043a\u0430\u0431\u0440\u044f', u'\u0414\u0435\u043a\u0430\u0431\u0440\u044f')]¶
-
-
dateparser.date_parser.
tokenize_date
(s)¶
-
class
dateparser.date_parser.
tr_parserinfo
(dayfirst=False, yearfirst=False)[source]¶ Bases:
dateparser.date_parser.BaseParserInfo
-
MONTHS
= [(u'Ocak',), (u'\u015eubat',), (u'Mart',), (u'Nisan',), (u'May\u0131s',), (u'Haziran',), (u'Temmuz',), (u'A\u011fustos',), (u'Eyl\xfcl',), (u'Ekim',), (u'Kas\u0131m',), (u'Aral\u0131k',)]¶
-
dateparser.dateparser module¶
dateparser.freshness_date_parser module¶
-
class
dateparser.freshness_date_parser.
FreshnessDateDataParser
(now=None)[source]¶ Bases:
object
Parses date strings like “1 year, 2 months ago” and “3 hours, 50 minutes ago”.
-
langs
= {u'ru': {u'units': {u'week': (u'\u043d\u0435\u0434\u0435\u043b\u044f', u'\u043d\u0435\u0434\u0435\u043b\u0438', u'\u043d\u0435\u0434\u0435\u043b\u044c', u'\u043d\u0435\u0434\u0435\u043b\u044e'), u'hour': (u'\u0447\u0430\u0441', u'\u0447\u0430\u0441\u0430', u'\u0447\u0430\u0441\u043e\u0432'), u'month': (u'\u043c\u0435\u0441\u044f\u0446', u'\u043c\u0435\u0441\u044f\u0446\u0430', u'\u043c\u0435\u0441\u044f\u0446\u0435\u0432'), u'second': (u'\u0441\u0435\u043a\u0443\u043d\u0434\u0430', u'\u0441\u0435\u043a\u0443\u043d\u0434\u044b', u'\u0441\u0435\u043a\u0443\u043d\u0434', u'\u0441\u0435\u043a\u0443\u043d\u0434\u0443'), u'year': (u'\u0433\u043e\u0434', u'\u0433\u043e\u0434\u0430', u'\u043b\u0435\u0442'), u'day': (u'\u0434\u0435\u043d\u044c', u'\u0434\u043d\u044f', u'\u0434\u043d\u0435\u0439'), u'minute': (u'\u043c\u0438\u043d\u0443\u0442\u0430', u'\u043c\u0438\u043d\u0443\u0442\u0430', u'\u043c\u0438\u043d\u0443\u0442', u'\u043c\u0438\u043d\u0443\u0442\u0443')}, u'word_replacements': [(u'1 \u0434\u043d\u0435\u0439', [u'\u0432\u0447\u0435\u0440\u0430\u0432\u0447\u0435\u0440\u0430', u'\u0412\u0447\u0435\u0440\u0430 \u0432', u'\u0432\u0447\u0435\u0440\u0430', u'\u0412\u0447\u0435\u0440\u0430']), (u'0 \u0434\u0435\u043d\u044c', [u'\u0441\u0435\u0433\u043e\u0434\u043d\u044f']), (u'\u0447\u0430\u0441', [u'\u0447']), (u'\u043c\u0438\u043d\u0443\u0442\u0443', [u'\u043c\u0438\u043d']), (u'1 \u043c\u0438\u043d\u0443\u0442\u0443', [u'^\u043c\u0438\u043d\u0443\u0442\u0443']), (u'1 \u0447\u0430\u0441', [u'^\u0447\u0430\u0441']), (u'44 \u0441\u0435\u043a\u0443\u043d\u0434\u044b', [u'\u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0441\u0435\u043a\u0443\u043d\u0434'])]}, u'fr': {u'units': {u'week': (u'semaine', u'semaines'), u'hour': (u'heure', u'heures'), u'month': (u'mois', u'mois'), u'year': (u'an', u'ann\xe9e', u'ann\xe9es'), u'day': (u'jour', u'jours'), u'minute': (u'minute', u'minutes')}, u'word_replacements': [(u'2 jour', [u'avant-hier']), (u'1 jour', [u'hier']), 
(u'0 jours', [u"aujourd'hui"]), (u'1', [u'un', u'une'])]}, u'en': {u'units': {u'week': (u'week', u'weeks'), u'hour': (u'hour', u'hours'), u'month': (u'month', u'months'), u'second': (u'second', u'seconds'), u'year': (u'year', u'years'), u'day': (u'day', u'days'), u'minute': (u'minute', u'minutes')}, u'word_replacements': [(u'2 days', [u'the day before yesterday']), (u'1 day', [u'yesterday']), (u'0 days', [u'today']), (u'1', [u'an', u'a', u'one']), (u'\\1 hour\\2', [u'(\\d+)\\s*hr(s?)']), (u'\\1 minute\\2', [u'(\\d+)\\s*min(s?)']), (u'\\1 second\\2', [u'(\\d+)\\s*sec(s?)'])]}, u'cn': {u'units': {u'week': (u'\u5468', u'\u661f\u671f'), u'hour': (u'\u5c0f\u65f6',), u'month': (u'\u6708', u'\u4e2a\u6708'), u'year': (u'\u5e74',), u'day': (u'\u5929',), u'minute': (u'\u5206', u'\u5206\u949f')}, u'word_replacements': [(u'1\u5929', [u'\u6628\u5929']), (u'2\u5929', [u'\u524d\u5929'])], u'no_word_spacing': True}, u'pt': {u'units': {u'week': (u'semana', u'semanas'), u'hour': (u'hora', u'horas'), u'month': (u'm\xeas', u'meses'), u'second': (u'segunda', u'segundos'), u'year': (u'ano', u'anos'), u'day': (u'dia', u'dias'), u'minute': (u'minuto', u'minutos')}, u'word_replacements': [(u'2 dias', [u'anteontem']), (u'1 dia', [u'ontem']), (u'0 dias', [u'hoje']), (u'1', [u'um', u'uma']), (u'44 segundos', [u'alguns segundos'])]}, u'cs': {u'units': {u'week': (u't\xfdden', u't\xfddn\u016f'), u'hour': (u'hodina', u'hodin', u'hodiny', u'hodinami'), u'month': (u'm\u011bs\xedc', u'm\u011bs\xedc\u016f', u'm\u011bs\xedce'), u'year': (u'rok', u'rok\u016f'), u'day': (u'den', u'dn\u016f', u'dny'), u'minute': (u'minuta', u'minut')}}, u'de': {u'units': {u'week': (u'Woche', u'Wochen'), u'hour': (u'Stunde', u'Stunden'), u'month': (u'Monat', u'Monate'), u'year': (u'Jahr', u'Jahre'), u'day': (u'Tag', u'Tage'), u'minute': (u'Minute', u'Minuten')}, u'word_replacements': [(u'2 Tag', [u'vorgestern']), (u'1 Tag', [u'gestern']), (u'0 Tage', [u'Heute']), (u'vor \\1 Stunden', [u'vor (\\d+)\\s*h']), (u'vor \\1 
Minuten', [u'vor (\\d+)\\s*m']), (u'1', [u'einer', u'einem'])]}, u'tr': {u'units': {u'week': (u'hafta', u'hafta'), u'hour': (u'saat', u'saat'), u'month': (u'ay', u'ay'), u'year': (u'y\u0131l', u'y\u0131l'), u'day': (u'g\xfcn', u'g\xfcn'), u'minute': (u'dakika', u'dakika')}, u'word_replacements': [(u'1 g\xfcn', [u'd\xfcn'])]}, u'it': {u'units': {u'week': (u'settimana', u'settimane'), u'hour': (u'ora', u'ore'), u'month': (u'mese', u'mesi'), u'year': (u'anno', u'anni'), u'day': (u'giorno', u'giorni'), u'minute': (u'minuto', u'minuti')}, u'word_replacements': [(u'0 giorni', [u'oggi']), (u'1 giorno', [u'ieri'])]}, u'es': {u'units': {u'week': (u'semana', u'semanas'), u'hour': (u'hora', u'horas'), u'month': (u'mes', u'meses'), u'year': (u'a\xf1o', u'a\xf1os'), u'day': (u'd\xeda', u'd\xedas'), u'minute': (u'minuto', u'minutos')}, u'word_replacements': [(u'2 d\xeda', [u'anteayer']), (u'1 d\xeda', [u'ayer']), (u'0 d\xeda', [u'hoy']), (u'1', [u'un', u'una'])]}}¶
-