It might be useful to set up some intermediate variables.
import re
# Month names, abbreviated and in full, used to build the alternations below.
short_month_names = (
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
)
long_month_names = (
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
)

# Reusable non-capturing regex fragments. Each one is wrapped in (?:...) so
# it can be dropped into a larger alternation without changing precedence.
short_month_cap = '(?:' + '|'.join(short_month_names) + ')'
long_month_cap = '(?:' + '|'.join(long_month_names) + ')'
# Numeric months 1-12. The two-digit branch is 1[0-2] (not 1[12]) so that
# October (10) is accepted along with 11 and 12.
short_num_month_cap = '(?:[1-9]|1[0-2])'
long_num_month_cap = '(?:0[1-9]|1[0-2])'
# Days 01-31 (zero-padded) and 1-31 (unpadded).
long_day_cap = '(?:0[1-9]|[12][0-9]|3[01])'
short_day_cap = '(?:[1-9]|[12][0-9]|3[01])'
# Any four-digit year except 0000: every alternative forces at least one
# non-zero digit in a different position.
long_year_cap = '(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3})'
# Any two-digit year.
short_year_cap = '(?:[0-9][0-9])'
# Day number followed by its English ordinal suffix (1st ... 31st). It is
# only used inside a lookahead, to validate the suffix without capturing it.
ordinal_day = '(?:2?1st|2?2nd|2?3rd|[12]?[4-9]th|1[123]th|[123]0th|31st)'

# One template per supported date layout. A group name may occur only once
# in a compiled pattern, so every group carries a numeric suffix; get_fields
# removes that suffix again.
formats = (
    # 04/20/2009, 4/3/09
    r'(?P<month_0>{lnm}|{snm})/(?P<day_0>{ld}|{sd})/(?P<year_0>{sy}|{ly})',
    # Mar-20-2009
    r'(?P<month_1>{sm})\-(?P<day_1>{ld}|{sd})\-(?P<year_1>{ly})',
    # Mar 20, 2009; March 20, 2009; Mar. 20, 2009; Mar 20 2009
    r'(?P<month_2>{sm}|{lm})(?:\.\s+|\s*)(?P<day_2>{ld}|{sd})(?:,\s+|\s*)(?P<year_2>{ly})',
    # 20 Mar 2009; 20 March 2009; 20 Mar. 2009; 20 March, 2009
    r'(?P<day_3>{ld}|{sd})(?:[\.,]\s+|\s*)(?P<month_3>{lm}|{sm})(?:[\.,]\s+|\s*)(?P<year_3>{ly})',
    # Feb 2009
    r'(?P<month_4>{lm}|{sm})\s+(?P<year_4>{ly})',
    # 6/2008
    r'(?P<month_5>{lnm}|{snm})/(?P<year_5>{ly})',
    # 2009
    r'(?P<year_6>{ly})',
    # Mar 20th, 2009 -- the lookahead checks the ordinal suffix, the group
    # captures only the digits, and '..' then consumes the two suffix letters.
    r'(?P<month_6>{sm})\s+(?P<day_4>(?={od})[0-9][0-9]?)..,\s*(?P<year_7>{ly})',
)

# Fill the placeholders in every template and join them into one alternation.
_pattern = '|'.join(
    i.format(
        sm=short_month_cap, lm=long_month_cap, snm=short_num_month_cap,
        lnm=long_num_month_cap, ld=long_day_cap, sd=short_day_cap,
        ly=long_year_cap, sy=short_year_cap, od=ordinal_day
    ) for i in formats
)
pattern = re.compile(_pattern)
def get_fields(match):
    """Return the date fields captured by a regex match.

    Group names are expected to look like ``<field>_<index>`` (for example
    ``month_0``). The index only keeps names unique inside the combined
    pattern, so it is removed from the returned keys.

    Args:
        match: A match object, or None when the pattern did not match.

    Returns:
        A dict mapping each field name to its captured text, containing only
        the groups that took part in the match, or None if ``match`` is falsy.
    """
    if not match:
        return None
    return {
        # Drop everything from the last underscore on, so that multi-digit
        # suffixes such as ``_10`` are removed in full.
        name.rsplit('_', 1)[0]: value
        for name, value in match.groupdict().items()
        if value is not None
    }
# Sample inputs: one line per date layout, individual cases separated by '; '.
tests = r'''04/20/2009; 04/20/09; 4/20/09; 4/3/09
Mar-20-2009; Mar 20, 2009; March 20, 2009; Mar. 20, 2009; Mar 20 2009
20 Mar 2009; 20 March 2009; 20 Mar. 2009; 20 March, 2009
Mar 20th, 2009; Mar 21st, 2009; Mar 22nd, 2009
Feb 2009; Sep 2009; Oct 2010
6/2008; 12/2009
2009; 2010'''

# Print the extracted fields for every sample, with a blank line after each
# group of related layouts.
for group in tests.split('\n'):
    for sample in group.split('; '):
        fields = get_fields(pattern.fullmatch(sample))
        print('{!r}: {!r}'.format(sample, fields))
    print('')
Which outputs:
'04/20/2009': {'month': '04', 'day': '20', 'year': '2009'}
'04/20/09': {'month': '04', 'day': '20', 'year': '09'}
'4/20/09': {'month': '4', 'day': '20', 'year': '09'}
'4/3/09': {'month': '4', 'day': '3', 'year': '09'}
'Mar-20-2009': {'month': 'Mar', 'day': '20', 'year': '2009'}
'Mar 20, 2009': {'month': 'Mar', 'day': '20', 'year': '2009'}
'March 20, 2009': {'month': 'March', 'day': '20', 'year': '2009'}
'Mar. 20, 2009': {'month': 'Mar', 'day': '20', 'year': '2009'}
'Mar 20 2009': {'month': 'Mar', 'day': '20', 'year': '2009'}
'20 Mar 2009': {'day': '20', 'month': 'Mar', 'year': '2009'}
'20 March 2009': {'day': '20', 'month': 'March', 'year': '2009'}
'20 Mar. 2009': {'day': '20', 'month': 'Mar', 'year': '2009'}
'20 March, 2009': {'day': '20', 'month': 'March', 'year': '2009'}
'Mar 20th, 2009': {'month': 'Mar', 'day': '20', 'year': '2009'}
'Mar 21st, 2009': {'month': 'Mar', 'day': '21', 'year': '2009'}
'Mar 22nd, 2009': {'month': 'Mar', 'day': '22', 'year': '2009'}
'Feb 2009': {'month': 'Feb', 'year': '2009'}
'Sep 2009': {'month': 'Sep', 'year': '2009'}
'Oct 2010': {'month': 'Oct', 'year': '2010'}
'6/2008': {'month': '6', 'year': '2008'}
'12/2009': {'month': '12', 'year': '2009'}
'2009': {'year': '2009'}
'2010': {'year': '2010'}
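The main part is the formats variable, where each supported date layout is defined as its own alternative. The combined pattern accepts slightly more than the test cases above (for example, whitespace and punctuation between the parts are optional in some layouts), and it can easily be extended by adding further entries to formats, as long as every group name stays unique.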
The overall pattern ends up being:
'(?P<month_0>(?:0[1-9]|1[12])|(?:[1-9]|1[12]))/(?P<day_0>(?:0[1-9]|[12][0-9]|3[01])|(?:[1-9]|[12][0-9]|3[01]))/(?P<year_0>(?:[0-9][0-9])|(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<month_1>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\-(?P<day_1>(?:0[1-9]|[12][0-9]|3[01])|(?:[1-9]|[12][0-9]|3[01]))\\-(?P<year_1>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<month_2>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(?:January|February|March|April|May|June|July|August|September|October|November|December))(?:\\.\\s+|\\s*)(?P<day_2>(?:0[1-9]|[12][0-9]|3[01])|(?:[1-9]|[12][0-9]|3[01]))(?:,\\s+|\\s*)(?P<year_2>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<day_3>(?:0[1-9]|[12][0-9]|3[01])|(?:[1-9]|[12][0-9]|3[01]))(?:[\\.,]\\s+|\\s*)(?P<month_3>(?:January|February|March|April|May|June|July|August|September|October|November|December)|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))(?:[\\.,]\\s+|\\s*)(?P<year_3>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<month_4>(?:January|February|March|April|May|June|July|August|September|October|November|December)|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\s+(?P<year_4>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<month_5>(?:0[1-9]|1[12])|(?:[1-9]|1[12]))/(?P<year_5>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<year_6>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))|(?P<month_6>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\s+(?P<day_4>(?=(?:2?1st|2?2nd|2?3rd|[12]?[4-9]th|1[123]th|[123]0th|31st))[0-9][0-9]?)..,\\s*(?P<year_7>(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3}))'
Which would have been virtually impossible to write by hand.
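To find dates embedded in surrounding text (the "between random text" case), add word boundaries around _pattern before compiling it, and use pattern.search or pattern.finditer instead of pattern.fullmatch.
I would suggest _pattern = r'\b(?:{})\b'.format(_pattern).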