/usr/bin/stag-diff is in libdata-stag-perl 0.14-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | #!/usr/bin/perl -w
# POD docs at bottom of file
use strict;
use Data::Stag qw(:all);
use Getopt::Long;
my $parser = "";
my $mapf;
my $help;
my @ignore = ();
my @report = ();
my $trace;
my $verbose;
GetOptions(
"help|h"=>\$help,
"parser|format|p=s"=>\$parser,
"ignore|s=s@"=>\@ignore,
"report|r=s@"=>\@report,
"trace|t"=>\$trace,
"verbose|v"=>\$verbose,
);
our %REPORT = map {$_=>1} @report;
if ($help) {
system("perldoc $0");
exit 0;
}
my @files = @ARGV;
my $file1 = shift @files;
if (!@files) {
die "you must pass in at least two files";
}
my $stag1 = Data::Stag->parse($file1, $parser);
$stag1->unset($_) foreach @ignore;
foreach my $file2 (@files) {
my $stag2 = Data::Stag->parse($file2, $parser);
$stag2->unset($_) foreach @ignore;
my ($match, $reason) = match($stag1, $stag2);
if ($match) {
printf "SAME: $file1 $file2\n";
}
else {
printf "DIFF: $file1 $file2\n";
printf "REASON:\n";
showreason($reason);
}
}
exit 0;
sub showreason {
my $reason = shift;
my $indent = shift || 0;
print ' ' x $indent;
my ($msg, @children) = @$reason;
printf $msg;
print "\n";
showreason($_, $indent+1) foreach @children;
}
sub match {
my $stag1 = shift;
my $stag2 = shift;
if ($stag1->name ne $stag2->name) {
return(0, mismatch("name_mismatch", [$stag1, $stag2]));
}
trace("comparing %s", $stag1->name, $stag2->name);
my $t1 = $stag1->isterminal || 0;
my $t2 = $stag2->isterminal || 0;
if ($t1 != $t2) {
return(0, mismatch("different_node_types", [$stag1, $stag2]));
}
if ($t1 && $t2) {
if ($stag1->data eq $stag2->data) {
return 1;
}
else {
return(0, mismatch(sprintf("data_mismatch(%s ne %s)",
smallstr($stag1->data),
smallstr($stag2->data),
),
[$stag1, $stag2]));
}
}
# both nodes nonterminal
if ($t1 || $t2) {
die "assertion error";
}
trace(" ..looking at kids\n");
my @kids1 = $stag1->kids;
my @kids2 = $stag2->kids;
# must match exactly
if (@kids1 != @kids2) {
return(0, mismatch(sprintf("subelement_count_mismatch [%s <=VS=> %s]",
names(@kids1), names(@kids2)),
[$stag1, $stag2]));
}
# null always matches
if (!@kids1) {
# both must be null
die "assertion error" unless !@kids2;
return 1;
}
trace(" ..matrix:\n");
my @filled = ();
for (my $i=0; $i<@kids1; $i++) {
my $kid1 = $kids1[$i];
my $matched;
my @reasons = ();
for (my $j=0; $j<@kids2; $j++) {
next if $filled[$j];
my $kid2 = $kids2[$j];
next unless $kid1->name eq $kid2->name;
my ($match, $reason) = match($kid1, $kid2);
if ($match) {
$filled[$j] = 1;
$matched = 1;
last;
}
else {
push(@reasons, $reason);
}
}
if (!$matched) {
my $mismatch =
mismatch("no_matching_node", [$kid1]);
push(@$mismatch, @reasons);
return(0, $mismatch);
}
}
trace(" ..match!\n");
return 1;
}
sub names {
join(', ', map {$_->name} @_);
}
sub mismatch {
my $msg = shift;
my @stags = @{shift || []};
my @names = map {$_->name} @stags;
my $reason =
sprintf "$msg: %s",
join(' AND ', @names);
if (grep {$REPORT{$_}} @names) {
printf "$reason\n";
if ($verbose) {
print $_->sxpr foreach @stags;
}
}
return [$reason];
}
sub smallstr {
my $str = shift;
return $str if length($str) < 50;
return substr($str, 0, 50) ."...";
}
sub trace {
return unless $trace;
my $fmt = shift;
printf $fmt, @_;
print "\n";
}
__END__
=head1 NAME
stag-diff - finds the difference between two stag files
=head1 SYNOPSIS
stag-diff -ignore foo-id -ignore bar-id file1.xml file2.xml
=head1 DESCRIPTION
Compares two data trees and reports whether they match. If they do not
match, the mismatch is reported.
=head2 ARGUMENTS
=over
=item -help|h
shows this document
=item -ignore|i ELEMENT
these nodes are ignored for the purposes of comparison. Note that
attributes are treated as elements, prefixed by the containing element
id. For example, if you have
<foo ID="wibble">
And you wish to ignore the ID attribute, then you would use the switch
-ignore foo-ID
You can specify multiple elements to ignore like this
-i foo -i bar -i baz
You can also specify paths
-i foo/bar/bar-id
=item -parser|p FORMAT
which parser to use. The default is XML. This can also be autodetected
by the file suffix. Other alternatives are B<sxpr> and B<itext>. See
L<Data::Stag> for details.
=item -report|r ELEMENT
report mismatches as they occur on each element of type ELEMENT
multiple elements can be specified
=item -verbose|v
used in conjunction with the B<-report> switch
shows the tree of the mismatching element
=back
=head2 OUTPUT
If a mismatch is reported, a report is generated displaying the
subpart of the tree that could not be matched. This will look like
this:
REASON:
no_matching_node: annotation
no_matching_node: feature_set
no_matching_node: feature_span
no_matching_node: evidence
no_matching_node: evidence-id
data_mismatch(:15077290 ne :15077291): evidence-id AND evidence-id
Due to the nature of tree matching, it can be difficult to specify
exactly how trees do not match. To investigate this, you may need to
use the B<-r> and B<-v> options. For the above output, I would
recommend using
stag-diff -r feature_span -v
=head2 ALGORITHM
Both trees are recursively traversed... see the actual code for how this works
The order of elements is not important; eg
<foo>
<bar>
<baz>1</baz>
</bar>
<bar>
<baz>2</baz>
</bar>
</foo>
matches
<foo>
<bar>
<baz>2</baz>
</bar>
<bar>
<baz>1</baz>
</bar>
</foo>
The recursive nature of this algorithm means that certain tree
comparisons will explode wrt time and memory. I think this will only
happen with very deep trees where nodes high up in the tree can only
be differentiated by nodes low down in the tree.
Both trees are loaded into memory to begin with, so it may thrash with
very large documents
=head2 AUTHOR
Chris Mungall
cjm at fruitfly dot org
=head1 SEE ALSO
L<Data::Stag>
=cut
|