From 742a86af2a7e15cebbed6e2ca4f56b87170b9f7d Mon Sep 17 00:00:00 2001 From: dakkar Date: Sat, 27 Jun 2009 13:41:41 +0200 Subject: version 1.4, from CPAN --- XPath.pm | 619 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 619 insertions(+) create mode 100644 XPath.pm (limited to 'XPath.pm') diff --git a/XPath.pm b/XPath.pm new file mode 100644 index 0000000..0c442d0 --- /dev/null +++ b/XPath.pm @@ -0,0 +1,619 @@ +package Class::XPath; + +use 5.006; +use strict; +use warnings; + +our $VERSION = '1.4'; +use Carp qw(croak); +use constant DEBUG => 0; + +# regex fragment for names in XPath expressions +our $NAME = qr/[\w:]+/; + +# declare prototypes +sub foreach_node (&@); + +# handle request to build methods from 'use Class::XPath'. +sub import { + my $pkg = shift; + return unless @_; + my $target = (caller())[0]; + # hand off to add_methods + $pkg->add_methods(@_, target => $target, from_import => 1); +} + +{ + # setup lists of required params + my %required = map { ($_,1) } + qw(get_name get_parent get_children + get_attr_names get_attr_value get_content + get_root call_match call_xpath); + +# add the xpath and match methods to +sub add_methods { + my $pkg = shift; + my %args = (call_match => 'match', + call_xpath => 'xpath', + @_); + my $from_import = delete $args{from_import}; + my $target = delete $args{target}; + croak("Missing 'target' parameter to ${pkg}->add_methods()") + unless defined $target; + + # check args + local $_; + for (keys %args) { + croak("Unrecognized parameter '$_' " . + ($from_import ? " on 'use $pkg' line. " : + "passed to ${pkg}->add_methods()")) + unless $required{$_}; + } + for (keys %required) { + croak("Missing required parameter '$_' " . + ($from_import ? " on 'use $pkg' line. " : + "in call to ${pkg}->add_methods()")) + unless exists $args{$_}; + } + + # translate get_* method names to sub-refs + for (grep { /^get_/ } keys %args) { + next if ref $args{$_} and ref $args{$_} eq 'CODE'; + $args{$_} = eval "sub { shift->$args{$_}(\@_) };"; + croak("Unable to compile sub for '$_' : $@") if $@; + } + + # install code into requested names to call real match/xpath with + # supplied %args + { + no strict 'refs'; + *{"${target}::$args{call_match}"} = + sub { $pkg->match($_[0], \%args, $_[1]) }; + *{"${target}::$args{call_xpath}"} = + sub { $pkg->xpath($_[0], \%args) } + } +}} + +sub match { + my ($pkg, $self, $args, $xpath) = @_; + my ($get_root, $get_parent, $get_children, $get_name) = + @{$args}{qw(get_root get_parent get_children get_name)}; + + croak("Bad call to $args->{call_match}: missing xpath argument.") + unless defined $xpath; + + print STDERR "match('$xpath') called.\n" if DEBUG; + + # / is the root. This should probably work as part of the + # algorithm, but it doesn't. + return $get_root->($self) if $xpath eq '/'; + + # . is self. This should also work as part of the algorithm, + # but it doesn't. + return $self if $xpath eq '.'; + + # break up an incoming xpath into a set of @patterns to match + # against a list of @target elements + my (@patterns, @targets); + + # target aquisition + if ($xpath =~ m!^//(.*)$!) { + $xpath = $1; + # this is a match-anywhere pattern, which should be tried on + # all nodes + foreach_node { push(@targets, $_) } $get_root->($self), $get_children; + } elsif ($xpath =~ m!^/(.*)$!) { + $xpath = $1; + # this match starts at the root + @targets = ($get_root->($self)); + } elsif ($xpath =~ m!^\.\./(.*)$!) { + $xpath = $1; + # this match starts at the parent + @targets = ($get_parent->($self)); + } elsif ($xpath =~ m!^\./(.*)$!) { + $xpath = $1; + @targets = ($self); + } else { + # this match starts here + @targets = ($self); + } + + # pattern breakdown + my @parts = split('/', $xpath); + my $count = 0; + for (@parts) { + $count++; + if (/^$NAME$/) { + # it's a straight name match + push(@patterns, { name => $_ }); + } elsif (/^($NAME)\[(-?\d+)\]$/o) { + # it's an indexed name + push(@patterns, { name => $1, index => $2 }); + } elsif (/^($NAME)\[\@($NAME)\s*=\s*"([^"]+)"\]$/o or + /^($NAME)\[\@($NAME)\s*=\s*'([^']+)'\]$/o) { + # it's a string attribute match + push(@patterns, { name => $1, attr => $2, value => $3 }); + } elsif (/^($NAME)\[\@($NAME)\s*(=|>|<|<=|>=|!=)\s*(\d+)\]$/o) { + # it's a numeric attribute match + push(@patterns, { name => $1, attr => $2, op => $3, value => $4 }); + } elsif (/^($NAME)\[($NAME|\.)\s*=\s*"([^"]+)"\]$/o or + /^($NAME)\[($NAME|\.)\s*=\s*'([^']+)'\]$/o) { + # it's a string child match + push(@patterns, { name => $1, child => $2, value => $3 }); + } elsif (/^($NAME)\[($NAME|\.)\s*(=|>|<|<=|>=|!=)\s*(\d+)\]$/) { + # it's a numeric child match + push(@patterns, { name => $1, child => $2, op => $3, value => $4 }); + } elsif (/^\@($NAME)$/) { + # it's an attribute name + push(@patterns, { attr => $1 }); + + # it better be last + croak("Bad call to $args->{call_match}: '$xpath' contains an attribute selector in the middle of the expression.") + if $count != @parts; + } else { + # unrecognized token + croak("Bad call to $args->{call_match}: '$xpath' contains unknown token '$_'"); + } + } + + croak("Bad call to $args->{call_match}: '$xpath' contains no search tokens.") + unless @patterns; + + # apply the patterns to all available targets and collect results + my @results = map { $pkg->_do_match($_, $args, @patterns) } @targets; + + return @results; +} + +# the underlying match engine. this takes a list of patterns and +# applies them to child elements +sub _do_match { + my ($pkg, $self, $args, @patterns) = @_; + my ($get_parent, $get_children, $get_name, $get_attr_value, $get_attr_names, $get_content) = + @{$args}{qw(get_parent get_children get_name get_attr_value get_attr_names get_content)}; + local $_; + + print STDERR "_do_match(" . $get_name->($self) . " => " . + join(', ', map { '{' . join(',', %$_) . '}' } @patterns) . + ") called.\n" + if DEBUG; + + # get pattern to apply to direct descendants + my $pat = shift @patterns; + + # find matches and put in @results + my @results; + my @kids; + + { no warnings 'uninitialized'; + @kids = grep { $get_name->($_) eq $pat->{name} } $get_children->($self); + } + + if (defined $pat->{index}) { + # get a child by index + push @results, $kids[$pat->{index}] + if (abs($pat->{index}) <= $#kids); + } elsif (defined $pat->{attr}) { + if (defined $pat->{name}) { + # default op is 'eq' for string matching + my $op = $pat->{op} || 'eq'; + + # do attribute matching + foreach my $kid (@kids) { + my $value = $get_attr_value->($kid, $pat->{attr}); + push(@results, $kid) + if ($op eq 'eq' and $value eq $pat->{value}) or + ($op eq '=' and $value == $pat->{value}) or + ($op eq '!=' and $value != $pat->{value}) or + ($op eq '>' and $value > $pat->{value}) or + ($op eq '<' and $value < $pat->{value}) or + ($op eq '>=' and $value >= $pat->{value}) or + ($op eq '<=' and $value <= $pat->{value}); + } + } + else { + my $attr = $pat->{attr}; + push(@results, $get_attr_value->($self, $attr)) + if grep { $_ eq $attr } $get_attr_names->($self); + } + } elsif (defined $pat->{child}) { + croak("Can't process child pattern without name") + unless defined $pat->{name}; + # default op is 'eq' for string matching + my $op = $pat->{op} || 'eq'; + # do attribute matching + foreach my $kid (@kids) { + foreach ( + $pat->{child} eq "." ? $kid + : grep {$get_name->($_) eq $pat->{child}} $get_children->($kid) + ) { + my $value; + foreach_node { + my $txt = $get_content->($_); + $value .= $txt if defined $txt; + } $_, $get_children; + next unless defined $value; + push(@results, $kid) + if ($op eq 'eq' and $value eq $pat->{value}) or + ($op eq '=' and $value == $pat->{value}) or + ($op eq '!=' and $value != $pat->{value}) or + ($op eq '>' and $value > $pat->{value}) or + ($op eq '<' and $value < $pat->{value}) or + ($op eq '>=' and $value >= $pat->{value}) or + ($op eq '<=' and $value <= $pat->{value}); + } + } + } else { + push @results, @kids; + } + + # all done? + return @results unless @patterns; + + # apply remaining patterns on matching kids + return map { $pkg->_do_match($_, $args, @patterns) } @results; +} + + +sub xpath { + my ($pkg, $self, $args) = @_; + my ($get_parent, $get_children, $get_name) = + @{$args}{qw(get_parent get_children get_name)}; + + my $parent = $get_parent->($self); + return '/' unless defined $parent; # root's xpath is / + + # get order within same-named nodes in the parent + my $name = $get_name->($self); + my $count = 0; + for my $kid ($get_children->($parent)) { + last if $kid == $self; + $count++ if $get_name->($kid) eq $name; + } + + # construct xpath using parent's xpath and our name and count + return $pkg->xpath($parent, $args) . + ($get_parent->($parent) ? '/' : '') . + $name . '[' . $count . ']'; +} + + +# does a depth first traversal in a stack +sub foreach_node (&@) { + my ($code, $node, $get_children) = @_; + my @stack = ($node); + while (@stack) { + local $_ = shift(@stack); + $code->(); + push(@stack, $get_children->($_)); + } +} + +1; +__END__ + +=head1 NAME + +Class::XPath - adds xpath matching to object trees + +=head1 SYNOPSIS + +In your node class, use Class::XPath: + + # generate xpath() and match() using Class::XPath + use Class::XPath + + get_name => 'name', # get the node name with the 'name' method + + get_parent => 'parent', # get parent with the 'parent' method + + get_root => \&get_root, # call get_root($node) to get the root + + get_children => 'kids', # get children with the 'kids' method + + get_attr_names => 'param', # get names and values of attributes + get_attr_value => 'param', # from param + + get_content => 'data', # get content from the 'data' method + + ; + +Now your objects support XPath-esque matching: + + # find all pages, anywhere in the tree + @nodes = $node->match('//page'); + + # returns an XPath like "/page[1]/paragraph[2]" + $xpath = $node->xpath(); + +=head1 DESCRIPTION + +This module adds XPath-style matching to your object trees. This +means that you can find nodes using an XPath-esque query with +C from anywhere in the tree. Also, the C method +returns a unique path to a given node which can be used as an +identifier. + +To use this module you must already have an OO implementation of a +tree. The tree must be a true tree - all nodes have a single parent +and the tree must have a single root node. Also, the order of +children within a node must be stable. + +B This module is not yet a complete XPath implementation. Over +time I expect the subset of XPath supported to grow. See the SYNTAX +documentation for details on the current level of support. + +=head1 USAGE + +This module is used by providing it with information about how your +class works. Class::XPath uses this information to build the +C and C methods for your class. The parameters +passed to 'use Class::XPath' may be set with strings, indicating +method names, or subroutine references. They are: + +=over + +=item get_name (required) + +Returns the name of this node. This will be used as the element name +when evaluating an XPath match. The value returned must matches +/^[\w:]+$/. + +=item get_parent (required) + +Returns the parent of this node. The root node must return undef from +the get_parent method. + +=item get_children (required) + +Returns a list of child nodes, in order. + +=item get_attr_names (required) + +Returns a list of available attribute names. The values returned must +match /^[\w:]+$/). + +=item get_attr_value (required) + +Called with a single parameter, the name of the attribute. Returns +the value associated with that attribute. The value returned must be +C if no value exists for the attribute. + +=item get_content (required) + +Returns the contents of the node. In XML this is text between start +and end tags. + +=item get_root (required) + +Returns the root node of this tree. + +=item call_match (optional) + +Set this to the name of the C method to generate. Defaults +to 'match'. + +=item call_xpath (optional) + +Set this to the name of the C method to generate. Defaults +to 'xpath'. + +=back + +=head2 ALTERNATE USAGE + +If you're using someone else's OO tree module, and you don't want to +subclass it, you can still use Class::XPath to add XPath matching to +it. This is done by calling Cadd_methods()> with all +the options usually passed to C and one extra one, C. +For example, to add xpath() and match() to HTML::Element (the node +class for HTML::TreeBuilder): + + # add Class::XPath routines to HTML::Element + Class::XPath->add_methods(target => 'HTML::Element', + get_parent => 'parent', + get_name => 'tag', + get_attr_names => + sub { my %attr = shift->all_external_attr; + return keys %attr; }, + get_attr_value => + sub { my %attr = shift->all_external_attr; + return $attr{$_[0]}; }, + get_children => + sub { grep { ref $_ } shift->content_list }, + get_content => + sub { grep { not ref $_ } shift->content_list }, + get_root => + sub { local $_=shift; + while($_->parent) { $_ = $_->parent } + return $_; }); + + +Now you can load up an HTML file and do XPath matching on it: + + my $root = HTML::TreeBuilder->new; + $root->parse_file("foo.html");1 + + # get a list of all paragraphs + my @paragraphs = $root->match('//p'); + + # get the title element + my ($title) = $root->match('/head/title'); + +=head1 GENERATED METHODS + +This module generates two public methods for your class: + +=over + +=item C<< @results = $node->match('/xpath/expression') >> + +This method performs an XPath match against the tree to which this +node belongs. See the SYNTAX documentation for the range of supported +expressions. The return value is either a list of node objects, a list +of values (when retrieving specific attributes) or an empty list if no +matches could be found. If your XPath expression cannot be parsed then +the method will die. + +You can change the name of this method with the 'call_match' option +described above. + +=item C<< $xpath = $node->xpath() >> + +Get an xpath to uniquely identify this node. Can be used with match() +to find the element later. The xpath returned is guaranteed to be +unqiue within the element tree. For example, the third node named +"paragraph" inside node named "page" has the xpath +"/page[1]/paragraph[2]". + +You can change the name of this method with the 'call_xpath' option +described above. + +=back + +=head1 SYNTAX + +This module supports a small subset of XPath at the moment. Here is a +list of the type of expressions it knows about: + +=over + +=item . + +Selects and returns the current node. + +=item name + +=item ./name + +Selects a list of nodes called 'name' in the tree below the current +node. + +=item /name + +Selects a list of nodes called 'name' directly below the root of the +tree. + +=item //name + +Selects all nodes with a matching name, anywhere in the tree. + +=item parent/child/grandchild + +Selects a list of grandchildren for all children of all parents. + +=item parent[1]/child[2] + +Selects a single child by indexing into the children lists. + +=item parent[-1]/child[0] + +Selects the first child of the last parent. In the real XPath they +spell this 'parent[last()]/child[0]' but supporting the Perl syntax is +practically free here. Eventually I'll support the XPath style too. + +=item ../child[2] + +Selects the second child from the parent of the current node. +Currently .. only works at the start of an XPath, mostly because I +can't imagine using it anywhere else. + +=item child[@id=10] + +Selects the child node with an 'id' attribute of 10. + +=item child[@id>10] + +Selects all the child nodes with an 'id' attribute greater than 10. +Other supported operators are '<', '<=', '>=' and '!='. + +=item child[@category="sports"] + +Selects the child with an 'category' attribute of "sports". The value +must be a quoted string (single or double) and no escaping is allowed. + +=item child[title="Hello World"] + +Selects the child with a 'title' child element whose content is "Hello World". +The value must be a quoted string (single or double) and no escaping is allowed. +e.g. + + + Hello World + + +=item //title[.="Hello World"] + +Selects all 'title' elements whose content is "Hello World". + +=item child/@attr + +Returns the list of values for all attributes "attr" within each child. + +=item //@attr + +Returns the list of values for all attributes "attr" within each node. + +=back + +B this module has no support for Unicode. If this is a problem +for you please consider sending me a patch. I'm certain that I don't +know enough about Unicode to do it right myself. + +=head1 BUGS + +I know of no bugs in this module. If you find one, please file a bug +report at: + + http://rt.cpan.org + +Alternately you can email me directly at sam@tregar.com. Please +include the version of the module and a complete test case that +demonstrates the bug. + +=head1 TODO + +Planned future work: + +=over + +=item * + +Support more of XPath! + +=item * + +Do more to detect broken get_* functions. Maybe use Carp::Assert and +a special mode for use during development? + +=back + +=head1 ACKNOWLEDGMENTS + +I would like to thank the creators of XPath for their fine work and +the W3C for supporting them in their efforts. + +The following people have sent me patches and/or suggestions: + + Tim Peoples + Mark Addison + Timothy Appnel + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2002 Sam Tregar + +This program is free software; you can redistribute it and/or modify +it under the same terms as Perl 5 itself. + +=head1 AUTHOR + +Sam Tregar + +=head1 SEE ALSO + +The XPath W3C Recommendation: + + http://www.w3.org/TR/xpath + +=cut -- cgit v1.2.3