#! /usr/bin/env python
from operator import itemgetter
from string import punctuation
from typing import Optional, Union
import numpy as np
REGEX_ILLEGAL_CHARS = ['[',']','\\','^','*','+','?','{',
'}','|','(',')','$','.', '"']
REGEX_WHITE_SPACE = "\s"
REGEX_LOWER = "[a-z]"
REGEX_UPPER = "[A-Z]"
REGEX_DIGIT = "[0-9]"
REGEX_PUNCT = "[(\.|\?|!)]" # subset
REGEX_OTHER = "[^a-zA-Z0-9\.\?! ]"
[docs]class RegexCharSet():
def __init__(self, complement:bool = False) -> None:
""" A regular expression character set, representing a solitary
character (eg '\\s'), a group with optional alternatives (eg
'(a|b|c)'), or a range (eg '[a-z]').
:param complement:
:type complement: bool
:rtype: None
"""
self.complement = complement
self._charset = np.empty(0, dtype=str)
self._values = set()
self._count = [[0,0]]
[docs] def exact(self) -> str:
""" Return a reduced character set that exactly matches the input, but
nothing beyond it.
:rtype: str
"""
return str(self)
[docs] def merge(self, other:Union['RegexCharRange',
'RegexCharAtom']) -> Union['RegexCharRange',
'RegexCharAtom']:
""" Return a new character set object which encompasses both self and
other, by merging the sets and adjusting the quantifiers.
:param other:
:type other: Union['RegexCharRange', 'RegexCharAtom']
:rtype: Union['RegexCharRange','RegexCharAtom']
"""
# assume that order matters
charsets = [cs for cs in self._charset]
for cs in other._charset:
if cs not in charsets:
charsets.append(cs)
charsets = _collapse_charsets(charsets)
charsets = np.array(charsets, dtype=str)
count = [[0,0] for _ in range(len(charsets))]
charset_map = {v:i for i,v in enumerate(charsets)}
for i, charset in enumerate(self._charset):
idx = charset_map[charset]
count[idx][0] = self._count[i][0]
count[idx][1] = self._count[i][1]
for i, charset in enumerate(other._charset):
idx = charset_map[charset]
if count[idx][0] == 0:
# this charset is unique to other
count[idx][0] = other._count[i][0]
count[idx][1] = other._count[i][1]
else:
# this charset is merged
count[idx][0] = min(count[idx][0],
other._count[i][0])
count[idx][1] = max(count[idx][1],
other._count[i][1])
if type(self) is RegexCharAtom and type(other) is RegexCharAtom:
if len(charsets) == 1:
# same charset
merged = RegexCharAtom(charsets[0])
else:
# different charset
begin = min(ord(self.charset()[-1]), ord(other.charset()[-1]))
end = max(ord(self.charset()[-1]), ord(other.charset()[-1]))
charset = np.array([f'[{begin}-{end}]'], dtype=str)
merged = RegexCharRange(charset = charset)
else:
merged = RegexCharRange(charset=charsets)
merged._values = set.union(self._values, other._values)
merged._count = count
return merged
[docs] def charset(self) -> str:
""" Return the full character set without quantifiers.
:rtype: str
"""
out = ''
if len(self) > 0:
out = ''.join(self._charset)
if type(self) is RegexCharRange:
pre = '[' if not self.complement else '[^'
out = pre + out + ']'
return out
[docs] def equiv(self, other:Union['RegexCharAtom', 'RegexCharRange']) -> bool:
""" Return true if self and other have the same full character set,
ignoring any quantifiers.
:param other:
:type other: Union['RegexCharAtom', 'RegexCharRange']
:rtype: bool
"""
return self.charset() == other.charset()
[docs] def weak_match(self, other:Union['RegexCharAtom', 'RegexCharRange']) -> bool:
""" Return true if self and other have the same reduced character set
and quantifiers.
:param other:
:type other: Union['RegexCharAtom', 'RegexCharRange']
:rtype: bool
"""
return str(self) == str(other)
def __repr__(self) -> str:
""" Return a string representation of this object.
:rtype: str
"""
return str(self)
def __lt__(self, other:Union['RegexCharAtom', 'RegexCharRange']) -> bool:
""" Return true if self has a less complex character class or has less
characters associated with it.
:param other:
:type other: Union['RegexCharAtom', 'RegexCharRange']
:rtype: bool
"""
if len(self._charset) < len(other._charset):
return True
if len(self._charset) == len(other._charset):
for i in range(len(self._charset)):
if self._charset[i] < other._charset[i]:
return True
if self._count < other._count:
return True
return False
def __eq__(self, other:Union['RegexCharAtom', 'RegexCharRange']) -> bool:
""" Return true if self and other have the same full character set,
including any quantifiers.
:param other:
:type other: Union['RegexCharAtom', 'RegexCharRange']
:rtype: bool
"""
return self.exact() == other.exact()
def __len__(self) -> int:
""" Return the shortest string length that would match the character
set.
:rtype: int
"""
return sum(c[0] for c in self._count)
def __str__(self) -> str:
""" Print the full character class, including quantifiers.
:rtype: str
"""
out = ''.join(self._charset)
if type(self) is RegexCharRange:
pre = '[' if not self.complement else '[^'
out = pre + out + ']'
q_min = sum([c[0] for c in self._count])
q_max = sum([c[1] for c in self._count])
if q_min > 1 or q_max > q_min:
out += '{' + str(q_min)
if q_max > q_min:
out += f',{q_max}'
out += '}'
return out
def __hash__(self) -> int:
""" Return a hash values based on the full character class, including
quantifiers.
:rtype: int
"""
return hash(self.exact())
[docs]class RegexCharAtom(RegexCharSet):
def __init__(self, charset_str:str) -> None:
""" A regular expression character set, representing a solitary
character (eg '\\s').
:param value:
:type value: str
:rtype: None
"""
super().__init__()
assert len(charset_str) == 1 or (len(charset_str) == 2 and
charset_str.startswith('\\'))
self._charset = np.array([charset_str], dtype=str)
self._count = [[0,0] for _ in range(len(self._charset))]
[docs] def add(self, char:Optional[str]) -> None:
""" Add character to character set count.
:param char:
:type char: str
:rtype: None
"""
self._values.add(char)
self._count[0][0] += 1
self._count[0][1] += 1
[docs] def exact(self) -> str:
""" Return true if self and other have the same reduced character set
and quantifiers.
:rtype: str
"""
out = str(self._charset[0])
if self._count[0][0] > 1\
or self._count[0][1] > self._count[0][0]:
out += '{' + str(self._count[0][0])
if self._count[0][1] > self._count[0][0]:
out += ',' + str(self._count[0][1])
out += '}'
return out
[docs] def copy(self) -> 'RegexCharAtom':
""" Return a deep copy of this character set.
:rtype: 'RegexCharAtom'
"""
c = RegexCharAtom(str(self._charset[0]))
c._values = {v for v in self._values}
c._count = [[i for i in c] for c in self._count]
return c
[docs]class RegexCharRange(RegexCharSet):
def __init__(self, charset:Optional[np.ndarray] = None,
charset_str:Optional[str] = None,
complement:bool = False) -> None:
""" A regular expression character set, representing a group with
optional alternatives (eg '(a|b|c)'), or a range (eg '[a-z]').
:param charset:
:type charset: Optional[np.ndarray]
:param charset_str:
:type charset_str: Optional[str]
:param complement:
:type complement: bool
:rtype: None
"""
super().__init__()
self.complement = complement
assert charset is not None or charset_str is not None
if charset is not None:
assert len(charset) > 0
self._charset = charset
elif charset_str is not None:
assert charset_str[0] == '[' and charset_str[-1] == ']'
self._update_charset(charset_str)
self._count = [[0,0] for _ in range(len(self._charset))]
[docs] def add(self, char:str) -> None:
""" Add character to character set count.
:param char:
:type char: str
:rtype: None
"""
cs_idx = self._isidx(char)
if cs_idx < 0:
return
self._values.add(char)
self._count[cs_idx][0] += 1
self._count[cs_idx][1] += 1
[docs] def exact(self) -> str:
""" Return true if self and other have the same reduced character set
and quantifiers.
:rtype: str
"""
if len(self) <= 0 or self.complement:
return str(self)
out = ''
for i, cs in enumerate(self._charset):
# range [x-y]
if '-' in cs:
begin, end = cs.split('-')
members = [ord(v) for v in self._values\
if ord(v) in range(ord(begin), ord(end) + 1)]
if len(members) <= 0:
continue
begin = chr(min(members))
end = chr(max(members))
if begin == end:
out += f'{begin}'
else:
out += f'[{begin}-{end}]'
if self._count[i][0] > 1\
or self._count[i][1] > self._count[i][0]:
out += '{' + str(self._count[i][0])
if self._count[i][1] > self._count[i][0]:
out += ',' + str(self._count[i][1])
out += '}'
else:
if cs[0] == '(' and cs[-1] == ")":
# group (...)
valueset = cs[1:-1].split('|')
else: # fallback check
valueset = cs
members = list()
for v in self._values:
if v in REGEX_ILLEGAL_CHARS:
v = '\\' + v
if v in valueset:
members.append(v)
if len(members) <= 0:
continue
elif len(members) == 1:
v = members[0]
out += f"{v}"
else:
values = '|'.join(members)
out += f"({values})"
if self._count[i][0] > 1\
or self._count[i][1] > self._count[i][0]:
out += '{' + str(self._count[i][0])
if self._count[i][1] > self._count[i][0]:
out += ',' + str(self._count[i][1])
out += '}'
return out
[docs] def copy(self) -> 'RegexCharRange':
""" Return a deep copy of this object.
:rtype: 'RegexCharRange'
"""
c = RegexCharRange(charset = np.copy(self._charset))
c._values = {v for v in self._values}
c._count = [[i for i in c] for c in self._count]
c.complement = self.complement
return c
def _update_charset(self, value:str) -> None:
""" Parse a string representation of a character set.
:param value:
:type value: str
:rtype: None
"""
if value[1] == '^':
self.complement = True
charset_lst = list()
for cs in [REGEX_LOWER, REGEX_UPPER, REGEX_DIGIT,
REGEX_OTHER]:
if cs[1:-1] in value[1:-1]:
charset_lst.append(cs[1:-1])
if value[1] == '(' and value[-2] == ")":
# group (...)
charset_lst.append(value[1:-1])
self._charset = np.array(charset_lst, dtype=str)
def _isidx(self, char:str) -> int:
""" Return the index of the corresponding character class if it exists,
or -1 otherwise.
:param char:
:type char: str
:rtype: int
"""
idx = -1 # non found code
for i, cs in enumerate(self._charset):
# range [x-y]
if '-' in cs:
assert len(cs) == 3
begin, end = cs.split('-')
if ord(char) in range(ord(begin), ord(end) + 1):
idx = i
break
else:
if cs[0] == '(' and cs[-1] == ")":
# group (...)
values = cs[1:-1].split('|')
else: # fallback check
values = cs
if char in values:
idx = i
break
if self.complement and idx >= 0:
# return non found code if a match is found
return -1
return idx
[docs]class RegexPattern():
def __init__(self) -> None:
""" A regular expression consisting of solitary characters (eg '\\s'),
groups with optional alternatives (eg '(a|b|c)'), and ranges (eg
'[a-z]').
:rtype: None
"""
self.pattern = list() # type: list[Union[RegexCharAtom, RegexCharRange]]
[docs] def add(self, charset:Union[RegexCharAtom, RegexCharRange]) -> None:
""" Add a character set to this regular expression. Character sets are
assumed ordered sequentially.
:param value:
:type value: Union[RegexCharAtom, RegexCharRange]
:rtype: None
"""
self.pattern.append(charset)
[docs] def exact(self) -> str:
""" Return a reduced pattern that exactly matches the input, but
nothing beyond it.
:rtype: str
"""
out = ''
for char in self.pattern:
out += char.exact()
return '^' + out + '$'
[docs] def generalize(self) -> 'RegexPattern':
""" Generalize character sets on word level. For example,
'[A-Z][a-z]{2}\\s' would yield '[A-Za-z]{3}'. The generalized
expression is returned as a new instance.
:rtype: 'RegexPattern'
"""
p = self.copy()
if len(p) <= 1:
return p
pattern = list()
cs_prev = p.pattern[0] if len(self.pattern) > 0 else None
for i in range(1, len(p.pattern)):
cs = p.pattern[i]
if type(cs) is RegexCharRange\
and type(cs_prev) is RegexCharRange:
cs_prev = cs_prev.merge(cs)
if i >= len(p.pattern) - 1:
# account for last member
pattern.append(cs_prev)
continue
pattern.append(cs_prev)
if i <= 0:
# account for first member
pattern.append(cs_prev)
if i >= len(p.pattern) - 1:
# account for last member
pattern.append(cs)
cs_prev = cs
p.pattern = pattern
return p
[docs] def copy(self) -> 'RegexPattern':
""" Return a deep copy of this object.
:rtype: 'RegexPattern'
"""
p = RegexPattern()
for cs in self.pattern:
p.pattern.append(cs.copy())
return p
[docs] def weak_match(self, other:'RegexPattern') -> bool:
""" Return a reduced regular expresion that exactly matches the input,
but nothing beyond it.
:param other:
:type other: 'RegexPattern'
:rtype: bool
"""
return str(self) == str(other)
[docs] def equiv(self, other:'RegexPattern') -> bool:
""" Return true if self has the same full expression as other,
excluding quantifiers.
:param other:
:type other: 'RegexPattern'
:rtype: bool
"""
return len(self) == len(other) and not False in\
[cs.charset() == ocs.charset()\
for cs, ocs in zip(self.pattern, other.pattern)]
def __repr__(self) -> str:
""" Return a string representation of this object, including
quantifiers.
:rtype: str
"""
return str(self)
def __str__(self) -> str:
""" Return a string representation of the full expression,
including quantifiers.
:rtype: str
"""
out = ''
for char in self.pattern:
out += str(char)
return '^' + out + '$'
def __len__(self) -> int:
""" Return the number of unique character sets in this pattern.
:rtype: int
"""
return len(self.pattern)
def __lt__(self, other:'RegexPattern') -> bool:
""" Return true if self has a less complex pattern than other.
:param other:
:type other: 'RegexPattern'
:rtype: bool
"""
return len(self) < len(other) or (len(self) == len(other)\
and True in [cs < ocs for cs, ocs in zip(self.pattern,
other.pattern)])
def __eq__(self, other:'RegexPattern') -> bool:
""" Return true if self has the same full expression as other,
including quantifiers.
:param other:
:type other: 'RegexPattern'
:rtype: bool
"""
return self.exact() == other.exact()
def __hash__(self) -> int:
""" Return a hash value based on the full expression, including
quantifiers.
:rtype: int
"""
return hash(self.exact())
[docs]def generalize_patterns(patterns:dict[RegexPattern,set[int]],
num_recursions:int = 1) -> dict[RegexPattern,set[int]]:
""" Generalize dictionary of regular expressions inplace, by merging
similar patterns and by aligning quantifiers.
:param patterns:
:type patterns: dict[RegexPattern,int]
:param num_recursions:
:type num_recursions: int
:rtype: dict[RegexPattern,int]
"""
if len(patterns) <= 0:
return patterns
generalized_patterns = dict() #type: dict[RegexPattern, set[int]]
for l in {len(p) for p in patterns.keys()}:
# pattern with same number of subpatterns
subset = {p:m for p,m in patterns.items() if len(p) == l}
for p, members in _generalize_patterns_uniform(subset).items():
if p not in generalized_patterns.keys():
generalized_patterns[p] = set()
generalized_patterns[p] = generalized_patterns[p].union(members)
if num_recursions > 0:
nrec = num_recursions - 1
for p, members in generalize_patterns(generalized_patterns,
num_recursions = nrec).items():
# don't sum here since that is already done in the called function
generalized_patterns[p] = members
return generalized_patterns
def _generalize_patterns_uniform(patterns:dict[RegexPattern,set[int]])\
-> dict[RegexPattern,set[int]]:
""" Generalize patterns of same length by updating quantifiers.
:param patterns:
:type patterns: dict[RegexPattern,int]
:rtype: dict[RegexPattern,int]
"""
if len(patterns) <= 1:
# nothing to merge
return dict()
# deterministic order
i2p = list(patterns.keys())
# match on unquantified character (set)
pattern_mat = np.array([[cs.charset() for cs in p.pattern]
for p in i2p], dtype=str)
eq_mat = pattern_mat[:, None] == pattern_mat # elementwise matching
# find matching patterns
matches = set()
nonmatches = list()
for i in range(eq_mat.shape[0]):
match_idx = np.where(eq_mat[i].all(axis = -1))[0]
if len(match_idx) >= 2:
# two or more patterns are similar
matches.add(tuple(match_idx))
continue
nonmatches.append(i)
# merge matching patterns
out = dict() #type: dict[RegexPattern, set[int]]
for match_idx in matches:
siblings = list(itemgetter(*match_idx)(i2p))
merged_pattern = _merge_patterns(siblings)
members = set.union(*[patterns[i2p[i]] for i in match_idx])
if merged_pattern not in patterns.keys():
out[merged_pattern] = members
continue
out[merged_pattern] = patterns[merged_pattern].union(members)
return out
def _merge_patterns(patterns:list[RegexPattern]) -> RegexPattern:
""" Merge two or more patterns
:param patterns:
:type patterns: list[RegexPattern]
:rtype: RegexPattern
"""
num_charsets = len(patterns[0])
merged_pattern = RegexPattern()
for i in range(num_charsets):
# merge character sets
charsets = [rp.pattern[i] for rp in patterns]
merged_cs = _merge_charsets(charsets)
merged_pattern.add(merged_cs)
return merged_pattern
def _merge_charsets(charsets:list) -> Union[RegexCharAtom, RegexCharRange]:
""" Merge two or more character sets
:param charsets:
:type charsets: list
:rtype: Union[RegexCharAtom, RegexCharRange]
"""
charset = charsets[0]
for charset_next in charsets[1:]:
charset = charset.merge(charset_next)
return charset
[docs]def generate_regex(s:str, strip_punctuation:bool = True) -> RegexPattern:
""" Generate regular expresion that fits string.
:param s:
:type s: str
:param strip_punctuation:
:type strip_punctuation: bool
:rtype: RegexPattern
"""
# remove unecessary white space and punctuation
s = ' '.join(s.split())
if strip_punctuation:
s.translate(str.maketrans('', '', punctuation))
slen = len(s)
pattern = RegexPattern()
if slen <= 0:
# empty string
return pattern
char_set = _mkcharset(_char_to_regex(s[0]))
char_set.add(s[0])
for i in range(1, slen):
symbol_char_set = _char_to_regex(s[i])
if symbol_char_set == char_set.charset():
# pattern continues still
char_set.add(s[i])
# account for final character
if i >= slen - 1:
pattern.add(char_set)
continue
# add interupted pattern to output
pattern.add(char_set)
# start next char set
char_set = _mkcharset(symbol_char_set)
char_set.add(s[i])
# account for final character
if i >= slen - 1:
pattern.add(char_set)
return pattern
def _mkcharset(char_set:str) -> Union[RegexCharAtom, RegexCharRange]:
""" Wrap a character set string in a dedicated object.
:param char_set:
:type char_set: str
:rtype: Union[RegexCharAtom, RegexCharRange]
"""
if char_set in {REGEX_DIGIT, REGEX_LOWER, REGEX_UPPER, REGEX_PUNCT}:
char_set_object = RegexCharRange(charset_str = char_set)
else: # white space and other non-word chars
char_set_object = RegexCharAtom(char_set)
return char_set_object
def _char_to_regex(c:str) -> str:
""" Return suitable character class or (escaped) char if none fit.
:param c:
:type c: str
:rtype: str
"""
char_class = _character_class(c)
if char_class == REGEX_OTHER:
if c in REGEX_ILLEGAL_CHARS:
# escape char
c = "\\" + c
return c
return char_class
def _character_class(c:str) -> str:
""" Infer character class.
:param c:
:type c: str
:rtype: str
"""
if c.isalpha():
char_class = REGEX_LOWER if c.islower() else REGEX_UPPER
elif c.isdigit():
char_class = REGEX_DIGIT
elif c.isspace():
char_class = REGEX_WHITE_SPACE
elif c == "." or c == "?" or c == "!":
char_class = REGEX_PUNCT
else:
char_class = REGEX_OTHER
return char_class
def _inrange(charset:str, char:str, complement:bool = False) -> bool:
""" Return true if character is part of a character set, or false if
the complement is asked.
:param charset:
:type charset: str
:param char:
:type char: str
:param complement:
:type complement: bool
:rtype: bool
"""
isin = False
if charset[0] == '[' and charset[-1] == ']':
charset = charset[1:-1]
# range [x-y]
if '-' in charset:
assert len(charset) == 3
begin, end = charset.split('-')
if ord(char) in range(ord(begin), ord(end) + 1):
isin = True
else:
if charset[0] == '(' and charset[-1] == ")":
# group (...)
values = charset[1:-1].split('|')
else: # fallback check
values = charset
if char in values:
isin = True
if complement:
isin = not isin
return isin
def _collapse_charsets(charsets:list[str]) -> list[str]:
""" Remove unecessary character sets, by removing characters which are
already represented by a character set.
:param charsets:
:type charsets: list[str]
:rtype: list[str]
"""
out = list()
for i in range(len(charsets)):
cs = charsets[i]
if not(len(cs) == 1 or len(cs) == 2 and cs.startswith('\\')):
# not a character
out.append(cs)
continue
is_contained = False
for j in range(len(charsets)):
if i == j:
continue
if _inrange(charsets[j], cs[-1]):
is_contained = True
break
if not is_contained:
out.append(cs)
return out