Spaces:
Build error
Build error
#======================================================================= | |
# | |
# Python Lexical Analyser | |
# | |
# Regular Expressions | |
# | |
#======================================================================= | |
from __future__ import absolute_import | |
import types | |
try: | |
from sys import maxsize as maxint | |
except ImportError: | |
from sys import maxint | |
from . import Errors | |
# | |
# Constants | |
# | |
BOL = 'bol' | |
EOL = 'eol' | |
EOF = 'eof' | |
nl_code = ord('\n') | |
# | |
# Helper functions | |
# | |
def chars_to_ranges(s): | |
""" | |
Return a list of character codes consisting of pairs | |
[code1a, code1b, code2a, code2b,...] which cover all | |
the characters in |s|. | |
""" | |
char_list = list(s) | |
char_list.sort() | |
i = 0 | |
n = len(char_list) | |
result = [] | |
while i < n: | |
code1 = ord(char_list[i]) | |
code2 = code1 + 1 | |
i += 1 | |
while i < n and code2 >= ord(char_list[i]): | |
code2 += 1 | |
i += 1 | |
result.append(code1) | |
result.append(code2) | |
return result | |
def uppercase_range(code1, code2): | |
""" | |
If the range of characters from code1 to code2-1 includes any | |
lower case letters, return the corresponding upper case range. | |
""" | |
code3 = max(code1, ord('a')) | |
code4 = min(code2, ord('z') + 1) | |
if code3 < code4: | |
d = ord('A') - ord('a') | |
return (code3 + d, code4 + d) | |
else: | |
return None | |
def lowercase_range(code1, code2): | |
""" | |
If the range of characters from code1 to code2-1 includes any | |
upper case letters, return the corresponding lower case range. | |
""" | |
code3 = max(code1, ord('A')) | |
code4 = min(code2, ord('Z') + 1) | |
if code3 < code4: | |
d = ord('a') - ord('A') | |
return (code3 + d, code4 + d) | |
else: | |
return None | |
def CodeRanges(code_list): | |
""" | |
Given a list of codes as returned by chars_to_ranges, return | |
an RE which will match a character in any of the ranges. | |
""" | |
re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)] | |
return Alt(*re_list) | |
def CodeRange(code1, code2): | |
""" | |
CodeRange(code1, code2) is an RE which matches any character | |
with a code |c| in the range |code1| <= |c| < |code2|. | |
""" | |
if code1 <= nl_code < code2: | |
return Alt(RawCodeRange(code1, nl_code), | |
RawNewline, | |
RawCodeRange(nl_code + 1, code2)) | |
else: | |
return RawCodeRange(code1, code2) | |
# | |
# Abstract classes | |
# | |
class RE(object): | |
"""RE is the base class for regular expression constructors. | |
The following operators are defined on REs: | |
re1 + re2 is an RE which matches |re1| followed by |re2| | |
re1 | re2 is an RE which matches either |re1| or |re2| | |
""" | |
nullable = 1 # True if this RE can match 0 input symbols | |
match_nl = 1 # True if this RE can match a string ending with '\n' | |
str = None # Set to a string to override the class's __str__ result | |
def build_machine(self, machine, initial_state, final_state, | |
match_bol, nocase): | |
""" | |
This method should add states to |machine| to implement this | |
RE, starting at |initial_state| and ending at |final_state|. | |
If |match_bol| is true, the RE must be able to match at the | |
beginning of a line. If nocase is true, upper and lower case | |
letters should be treated as equivalent. | |
""" | |
raise NotImplementedError("%s.build_machine not implemented" % | |
self.__class__.__name__) | |
def build_opt(self, m, initial_state, c): | |
""" | |
Given a state |s| of machine |m|, return a new state | |
reachable from |s| on character |c| or epsilon. | |
""" | |
s = m.new_state() | |
initial_state.link_to(s) | |
initial_state.add_transition(c, s) | |
return s | |
def __add__(self, other): | |
return Seq(self, other) | |
def __or__(self, other): | |
return Alt(self, other) | |
def __str__(self): | |
if self.str: | |
return self.str | |
else: | |
return self.calc_str() | |
def check_re(self, num, value): | |
if not isinstance(value, RE): | |
self.wrong_type(num, value, "Plex.RE instance") | |
def check_string(self, num, value): | |
if type(value) != type(''): | |
self.wrong_type(num, value, "string") | |
def check_char(self, num, value): | |
self.check_string(num, value) | |
if len(value) != 1: | |
raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." | |
"Expected a string of length 1, got: %s" % ( | |
num, self.__class__.__name__, repr(value))) | |
def wrong_type(self, num, value, expected): | |
if type(value) == types.InstanceType: | |
got = "%s.%s instance" % ( | |
value.__class__.__module__, value.__class__.__name__) | |
else: | |
got = type(value).__name__ | |
raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " | |
"(expected %s, got %s" % ( | |
num, self.__class__.__name__, expected, got)) | |
# | |
# Primitive RE constructors | |
# ------------------------- | |
# | |
# These are the basic REs from which all others are built. | |
# | |
## class Char(RE): | |
## """ | |
## Char(c) is an RE which matches the character |c|. | |
## """ | |
## nullable = 0 | |
## def __init__(self, char): | |
## self.char = char | |
## self.match_nl = char == '\n' | |
## def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
## c = self.char | |
## if match_bol and c != BOL: | |
## s1 = self.build_opt(m, initial_state, BOL) | |
## else: | |
## s1 = initial_state | |
## if c == '\n' or c == EOF: | |
## s1 = self.build_opt(m, s1, EOL) | |
## if len(c) == 1: | |
## code = ord(self.char) | |
## s1.add_transition((code, code+1), final_state) | |
## if nocase and is_letter_code(code): | |
## code2 = other_case_code(code) | |
## s1.add_transition((code2, code2+1), final_state) | |
## else: | |
## s1.add_transition(c, final_state) | |
## def calc_str(self): | |
## return "Char(%s)" % repr(self.char) | |
def Char(c): | |
""" | |
Char(c) is an RE which matches the character |c|. | |
""" | |
if len(c) == 1: | |
result = CodeRange(ord(c), ord(c) + 1) | |
else: | |
result = SpecialSymbol(c) | |
result.str = "Char(%s)" % repr(c) | |
return result | |
class RawCodeRange(RE): | |
""" | |
RawCodeRange(code1, code2) is a low-level RE which matches any character | |
with a code |c| in the range |code1| <= |c| < |code2|, where the range | |
does not include newline. For internal use only. | |
""" | |
nullable = 0 | |
match_nl = 0 | |
range = None # (code, code) | |
uppercase_range = None # (code, code) or None | |
lowercase_range = None # (code, code) or None | |
def __init__(self, code1, code2): | |
self.range = (code1, code2) | |
self.uppercase_range = uppercase_range(code1, code2) | |
self.lowercase_range = lowercase_range(code1, code2) | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
if match_bol: | |
initial_state = self.build_opt(m, initial_state, BOL) | |
initial_state.add_transition(self.range, final_state) | |
if nocase: | |
if self.uppercase_range: | |
initial_state.add_transition(self.uppercase_range, final_state) | |
if self.lowercase_range: | |
initial_state.add_transition(self.lowercase_range, final_state) | |
def calc_str(self): | |
return "CodeRange(%d,%d)" % (self.code1, self.code2) | |
class _RawNewline(RE): | |
""" | |
RawNewline is a low-level RE which matches a newline character. | |
For internal use only. | |
""" | |
nullable = 0 | |
match_nl = 1 | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
if match_bol: | |
initial_state = self.build_opt(m, initial_state, BOL) | |
s = self.build_opt(m, initial_state, EOL) | |
s.add_transition((nl_code, nl_code + 1), final_state) | |
RawNewline = _RawNewline() | |
class SpecialSymbol(RE): | |
""" | |
SpecialSymbol(sym) is an RE which matches the special input | |
symbol |sym|, which is one of BOL, EOL or EOF. | |
""" | |
nullable = 0 | |
match_nl = 0 | |
sym = None | |
def __init__(self, sym): | |
self.sym = sym | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
# Sequences 'bol bol' and 'bol eof' are impossible, so only need | |
# to allow for bol if sym is eol | |
if match_bol and self.sym == EOL: | |
initial_state = self.build_opt(m, initial_state, BOL) | |
initial_state.add_transition(self.sym, final_state) | |
class Seq(RE): | |
"""Seq(re1, re2, re3...) is an RE which matches |re1| followed by | |
|re2| followed by |re3|...""" | |
def __init__(self, *re_list): | |
nullable = 1 | |
for i, re in enumerate(re_list): | |
self.check_re(i, re) | |
nullable = nullable and re.nullable | |
self.re_list = re_list | |
self.nullable = nullable | |
i = len(re_list) | |
match_nl = 0 | |
while i: | |
i -= 1 | |
re = re_list[i] | |
if re.match_nl: | |
match_nl = 1 | |
break | |
if not re.nullable: | |
break | |
self.match_nl = match_nl | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
re_list = self.re_list | |
if len(re_list) == 0: | |
initial_state.link_to(final_state) | |
else: | |
s1 = initial_state | |
n = len(re_list) | |
for i, re in enumerate(re_list): | |
if i < n - 1: | |
s2 = m.new_state() | |
else: | |
s2 = final_state | |
re.build_machine(m, s1, s2, match_bol, nocase) | |
s1 = s2 | |
match_bol = re.match_nl or (match_bol and re.nullable) | |
def calc_str(self): | |
return "Seq(%s)" % ','.join(map(str, self.re_list)) | |
class Alt(RE): | |
"""Alt(re1, re2, re3...) is an RE which matches either |re1| or | |
|re2| or |re3|...""" | |
def __init__(self, *re_list): | |
self.re_list = re_list | |
nullable = 0 | |
match_nl = 0 | |
nullable_res = [] | |
non_nullable_res = [] | |
i = 1 | |
for re in re_list: | |
self.check_re(i, re) | |
if re.nullable: | |
nullable_res.append(re) | |
nullable = 1 | |
else: | |
non_nullable_res.append(re) | |
if re.match_nl: | |
match_nl = 1 | |
i += 1 | |
self.nullable_res = nullable_res | |
self.non_nullable_res = non_nullable_res | |
self.nullable = nullable | |
self.match_nl = match_nl | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
for re in self.nullable_res: | |
re.build_machine(m, initial_state, final_state, match_bol, nocase) | |
if self.non_nullable_res: | |
if match_bol: | |
initial_state = self.build_opt(m, initial_state, BOL) | |
for re in self.non_nullable_res: | |
re.build_machine(m, initial_state, final_state, 0, nocase) | |
def calc_str(self): | |
return "Alt(%s)" % ','.join(map(str, self.re_list)) | |
class Rep1(RE): | |
"""Rep1(re) is an RE which matches one or more repetitions of |re|.""" | |
def __init__(self, re): | |
self.check_re(1, re) | |
self.re = re | |
self.nullable = re.nullable | |
self.match_nl = re.match_nl | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
s1 = m.new_state() | |
s2 = m.new_state() | |
initial_state.link_to(s1) | |
self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) | |
s2.link_to(s1) | |
s2.link_to(final_state) | |
def calc_str(self): | |
return "Rep1(%s)" % self.re | |
class SwitchCase(RE): | |
""" | |
SwitchCase(re, nocase) is an RE which matches the same strings as RE, | |
but treating upper and lower case letters according to |nocase|. If | |
|nocase| is true, case is ignored, otherwise it is not. | |
""" | |
re = None | |
nocase = None | |
def __init__(self, re, nocase): | |
self.re = re | |
self.nocase = nocase | |
self.nullable = re.nullable | |
self.match_nl = re.match_nl | |
def build_machine(self, m, initial_state, final_state, match_bol, nocase): | |
self.re.build_machine(m, initial_state, final_state, match_bol, | |
self.nocase) | |
def calc_str(self): | |
if self.nocase: | |
name = "NoCase" | |
else: | |
name = "Case" | |
return "%s(%s)" % (name, self.re) | |
# | |
# Composite RE constructors | |
# ------------------------- | |
# | |
# These REs are defined in terms of the primitive REs. | |
# | |
Empty = Seq() | |
Empty.__doc__ = \ | |
""" | |
Empty is an RE which matches the empty string. | |
""" | |
Empty.str = "Empty" | |
def Str1(s): | |
""" | |
Str1(s) is an RE which matches the literal string |s|. | |
""" | |
result = Seq(*tuple(map(Char, s))) | |
result.str = "Str(%s)" % repr(s) | |
return result | |
def Str(*strs): | |
""" | |
Str(s) is an RE which matches the literal string |s|. | |
Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... | |
""" | |
if len(strs) == 1: | |
return Str1(strs[0]) | |
else: | |
result = Alt(*tuple(map(Str1, strs))) | |
result.str = "Str(%s)" % ','.join(map(repr, strs)) | |
return result | |
def Any(s): | |
""" | |
Any(s) is an RE which matches any character in the string |s|. | |
""" | |
#result = apply(Alt, tuple(map(Char, s))) | |
result = CodeRanges(chars_to_ranges(s)) | |
result.str = "Any(%s)" % repr(s) | |
return result | |
def AnyBut(s): | |
""" | |
AnyBut(s) is an RE which matches any character (including | |
newline) which is not in the string |s|. | |
""" | |
ranges = chars_to_ranges(s) | |
ranges.insert(0, -maxint) | |
ranges.append(maxint) | |
result = CodeRanges(ranges) | |
result.str = "AnyBut(%s)" % repr(s) | |
return result | |
AnyChar = AnyBut("") | |
AnyChar.__doc__ = \ | |
""" | |
AnyChar is an RE which matches any single character (including a newline). | |
""" | |
AnyChar.str = "AnyChar" | |
def Range(s1, s2=None): | |
""" | |
Range(c1, c2) is an RE which matches any single character in the range | |
|c1| to |c2| inclusive. | |
Range(s) where |s| is a string of even length is an RE which matches | |
any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... | |
""" | |
if s2: | |
result = CodeRange(ord(s1), ord(s2) + 1) | |
result.str = "Range(%s,%s)" % (s1, s2) | |
else: | |
ranges = [] | |
for i in range(0, len(s1), 2): | |
ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1)) | |
result = Alt(*ranges) | |
result.str = "Range(%s)" % repr(s1) | |
return result | |
def Opt(re): | |
""" | |
Opt(re) is an RE which matches either |re| or the empty string. | |
""" | |
result = Alt(re, Empty) | |
result.str = "Opt(%s)" % re | |
return result | |
def Rep(re): | |
""" | |
Rep(re) is an RE which matches zero or more repetitions of |re|. | |
""" | |
result = Opt(Rep1(re)) | |
result.str = "Rep(%s)" % re | |
return result | |
def NoCase(re): | |
""" | |
NoCase(re) is an RE which matches the same strings as RE, but treating | |
upper and lower case letters as equivalent. | |
""" | |
return SwitchCase(re, nocase=1) | |
def Case(re): | |
""" | |
Case(re) is an RE which matches the same strings as RE, but treating | |
upper and lower case letters as distinct, i.e. it cancels the effect | |
of any enclosing NoCase(). | |
""" | |
return SwitchCase(re, nocase=0) | |
# | |
# RE Constants | |
# | |
Bol = Char(BOL) | |
Bol.__doc__ = \ | |
""" | |
Bol is an RE which matches the beginning of a line. | |
""" | |
Bol.str = "Bol" | |
Eol = Char(EOL) | |
Eol.__doc__ = \ | |
""" | |
Eol is an RE which matches the end of a line. | |
""" | |
Eol.str = "Eol" | |
Eof = Char(EOF) | |
Eof.__doc__ = \ | |
""" | |
Eof is an RE which matches the end of the file. | |
""" | |
Eof.str = "Eof" | |