xfys's picture
Upload 432 files
f07f089
raw
history blame
7.64 kB
"""
A simple XPath-like language for tree traversal.
This works by creating a filter chain of generator functions. Each
function selects a part of the expression, e.g. a child node, a
specific descendant or a node that holds an attribute.
"""
from __future__ import absolute_import
import re
import operator
import sys
if sys.version_info[0] >= 3:
_unicode = str
else:
_unicode = unicode
path_tokenizer = re.compile(
r"("
r"'[^']*'|\"[^\"]*\"|"
r"//?|"
r"\(\)|"
r"==?|"
r"[/.*\[\]()@])|"
r"([^/\[\]()@=\s]+)|"
r"\s+"
).findall
def iterchildren(node, attr_name):
# returns an iterable of all child nodes of that name
child = getattr(node, attr_name)
if child is not None:
if type(child) is list:
return child
else:
return [child]
else:
return ()
def _get_first_or_none(it):
try:
try:
_next = it.next
except AttributeError:
return next(it)
else:
return _next()
except StopIteration:
return None
def type_name(node):
return node.__class__.__name__.split('.')[-1]
def parse_func(next, token):
name = token[1]
token = next()
if token[0] != '(':
raise ValueError("Expected '(' after function name '%s'" % name)
predicate = handle_predicate(next, token)
return name, predicate
def handle_func_not(next, token):
"""
not(...)
"""
name, predicate = parse_func(next, token)
def select(result):
for node in result:
if _get_first_or_none(predicate([node])) is None:
yield node
return select
def handle_name(next, token):
"""
/NodeName/
or
func(...)
"""
name = token[1]
if name in functions:
return functions[name](next, token)
def select(result):
for node in result:
for attr_name in node.child_attrs:
for child in iterchildren(node, attr_name):
if type_name(child) == name:
yield child
return select
def handle_star(next, token):
"""
/*/
"""
def select(result):
for node in result:
for name in node.child_attrs:
for child in iterchildren(node, name):
yield child
return select
def handle_dot(next, token):
"""
/./
"""
def select(result):
return result
return select
def handle_descendants(next, token):
"""
//...
"""
token = next()
if token[0] == "*":
def iter_recursive(node):
for name in node.child_attrs:
for child in iterchildren(node, name):
yield child
for c in iter_recursive(child):
yield c
elif not token[0]:
node_name = token[1]
def iter_recursive(node):
for name in node.child_attrs:
for child in iterchildren(node, name):
if type_name(child) == node_name:
yield child
for c in iter_recursive(child):
yield c
else:
raise ValueError("Expected node name after '//'")
def select(result):
for node in result:
for child in iter_recursive(node):
yield child
return select
def handle_attribute(next, token):
token = next()
if token[0]:
raise ValueError("Expected attribute name")
name = token[1]
value = None
try:
token = next()
except StopIteration:
pass
else:
if token[0] == '=':
value = parse_path_value(next)
readattr = operator.attrgetter(name)
if value is None:
def select(result):
for node in result:
try:
attr_value = readattr(node)
except AttributeError:
continue
if attr_value is not None:
yield attr_value
else:
def select(result):
for node in result:
try:
attr_value = readattr(node)
except AttributeError:
continue
if attr_value == value:
yield attr_value
elif (isinstance(attr_value, bytes) and isinstance(value, _unicode) and
attr_value == value.encode()):
# allow a bytes-to-string comparison too
yield attr_value
return select
def parse_path_value(next):
token = next()
value = token[0]
if value:
if value[:1] == "'" or value[:1] == '"':
return value[1:-1]
try:
return int(value)
except ValueError:
pass
elif token[1].isdigit():
return int(token[1])
else:
name = token[1].lower()
if name == 'true':
return True
elif name == 'false':
return False
raise ValueError("Invalid attribute predicate: '%s'" % value)
def handle_predicate(next, token):
token = next()
selector = []
while token[0] != ']':
selector.append( operations[token[0]](next, token) )
try:
token = next()
except StopIteration:
break
else:
if token[0] == "/":
token = next()
if not token[0] and token[1] == 'and':
return logical_and(selector, handle_predicate(next, token))
def select(result):
for node in result:
subresult = iter((node,))
for select in selector:
subresult = select(subresult)
predicate_result = _get_first_or_none(subresult)
if predicate_result is not None:
yield node
return select
def logical_and(lhs_selects, rhs_select):
def select(result):
for node in result:
subresult = iter((node,))
for select in lhs_selects:
subresult = select(subresult)
predicate_result = _get_first_or_none(subresult)
subresult = iter((node,))
if predicate_result is not None:
for result_node in rhs_select(subresult):
yield node
return select
operations = {
"@": handle_attribute,
"": handle_name,
"*": handle_star,
".": handle_dot,
"//": handle_descendants,
"[": handle_predicate,
}
functions = {
'not' : handle_func_not
}
def _build_path_iterator(path):
# parse pattern
stream = iter([ (special,text)
for (special,text) in path_tokenizer(path)
if special or text ])
try:
_next = stream.next
except AttributeError:
# Python 3
def _next():
return next(stream)
token = _next()
selector = []
while 1:
try:
selector.append(operations[token[0]](_next, token))
except StopIteration:
raise ValueError("invalid path")
try:
token = _next()
if token[0] == "/":
token = _next()
except StopIteration:
break
return selector
# main module API
def iterfind(node, path):
selector_chain = _build_path_iterator(path)
result = iter((node,))
for select in selector_chain:
result = select(result)
return result
def find_first(node, path):
return _get_first_or_none(iterfind(node, path))
def find_all(node, path):
return list(iterfind(node, path))