Spaces:
Build error
Build error
#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================
from __future__ import absolute_import | |
from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char | |
from .Errors import PlexError | |
class RegexpSyntaxError(PlexError):
    """Signals a syntax error found while parsing a traditional regexp string."""
def re(s):
    """
    Translate |s|, a regular expression written in traditional string
    syntax, into the equivalent Plex representation.

    The work is delegated to REParser; a malformed string makes the
    parser raise RegexpSyntaxError.
    """
    parser = REParser(s)
    return parser.parse_re()
class REParser(object):
    """
    Recursive-descent parser for traditional regular expression syntax.

    Grammar handled, loosest binding first:

        alt  ::= seq ('|' seq)*
        seq  ::= mod*
        mod  ::= prim ('*' | '+' | '?')*
        prim ::= '.' | '^' | '$' | '(' alt ')' | '[' charset ']'
               | backslash char | char

    Parser state:

        s    -- the string being parsed
        i    -- index of the current character within |s|
        c    -- the current character, or '' once the end is reached
        end  -- true once the parser has moved past the last character
    """

    def __init__(self, s):
        self.s = s
        # next() increments before reading, so start one before index 0.
        self.i = -1
        self.end = False
        # Load the first character (or set the end flag for an empty string).
        self.next()

    def parse_re(self):
        """
        Parse the complete string and return its Plex representation.

        Raises RegexpSyntaxError if input remains after a full
        alternation has been parsed (e.g. an unmatched ')').
        """
        result = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse a set of alternative regexps separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # A single alternative is returned as-is, not wrapped in Alt.
            return first
        alternatives = [first]
        while self.c == '|':
            self.next()
            alternatives.append(self.parse_seq())
        return Alt(*alternatives)

    def parse_seq(self):
        """Parse a sequence of regexps, stopping at '|', ')' or the end."""
        items = []
        # The end check comes first: at the end self.c is '', and the
        # empty string is "in" every string.
        while not self.end and self.c not in "|)":
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive regexp followed by *, +, ? modifiers."""
        result = self.parse_prim()
        while not self.end and self.c in "*+?":
            if self.c == '*':
                result = Rep(result)
            elif self.c == '+':
                result = Rep1(result)
            else:  # self.c == '?'
                result = Opt(result)
            self.next()
        return result

    def parse_prim(self):
        """
        Parse a primitive regexp: '.', '^', '$', a parenthesised group,
        a bracketed charset, or a single (possibly backslash-escaped)
        literal character.
        """
        c = self.get()
        if c == '.':
            result = AnyBut("\n")
        elif c == '^':
            result = Bol
        elif c == '$':
            result = Eol
        elif c == '(':
            result = self.parse_alt()
            self.expect(')')
        elif c == '[':
            result = self.parse_charset()
            self.expect(']')
        else:
            if c == '\\':
                # Backslash makes the following character a literal.
                c = self.get()
            result = Char(c)
        return result

    def parse_charset(self):
        """
        Parse a charset. Does not include the surrounding [].

        A leading '^' inverts the set, and a ']' in first position
        (after any '^') is taken literally. 'a-z' denotes an inclusive
        range; a '-' immediately before the closing ']' is a literal.

        Raises RegexpSyntaxError for a reversed range such as 'z-a',
        which would otherwise silently contribute no characters.
        """
        chars = []
        invert = False
        if self.c == '^':
            invert = True
            self.next()
        if self.c == ']':
            chars.append(']')
            self.next()
        while not self.end and self.c != ']':
            c1 = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                self.next()
                c2 = self.get()
                low, high = ord(c1), ord(c2)
                if low > high:
                    self.error("Invalid character range %s-%s" % (
                        repr(c1), repr(c2)))
                chars.extend(chr(code) for code in range(low, high + 1))
            else:
                chars.append(c1)
        charset = ''.join(chars)
        if invert:
            return AnyBut(charset)
        return Any(charset)

    def next(self):
        """Advance to the next char, setting the end flag when exhausted."""
        s = self.s
        i = self.i = self.i + 1
        if i < len(s):
            self.c = s[i]
        else:
            self.c = ''
            self.end = True

    def get(self):
        """
        Return the current char and advance past it.

        Raises RegexpSyntaxError if the end of the string has already
        been reached.
        """
        if self.end:
            self.error("Premature end of string")
        c = self.c
        self.next()
        return c

    def lookahead(self, n):
        """Look ahead n chars; return '' if that is beyond the end."""
        j = self.i + n
        if j < len(self.s):
            return self.s[j]
        return ''

    def expect(self, c):
        """
        Expect to find character |c| at current position.
        Raises an exception otherwise.
        """
        if self.c == c:
            self.next()
        else:
            self.error("Missing %s" % repr(c))

    def error(self, mess):
        """Raise exception to signal syntax error in regexp."""
        raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess))