/usr/bin/xmlformat

#! /usr/bin/perl -w 
# vim:set ts=2 sw=2 expandtab:

# xmlformat - configurable XML file formatter/pretty-printer

# Copyright (c) 2004, 2005 Kitebird, LLC.  All rights reserved.
# Some portions are based on the REX shallow XML parser, which
# is Copyright (c) 1998, Robert D. Cameron. These include the
# regular expression parsing variables and the shallow_parse()
# method.
# This software is licensed as described in the file LICENSE,
# which you should have received as part of this distribution.

# Syntax: xmlformat [config-file] xml-file

# Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that
# order.

# Paul DuBois
# paul@kitebird.com
# 2003-12-14

# The input document first is parsed into a list of strings.  Each string
# represents one of the following:
# - text node
# - processing instruction (the XML declaration is treated as a PI)
# - comment
# - CDATA section
# - DOCTYPE declaration
# - element tag (either <abc>, </abc>, or <abc/>), *including attributes*

# Entities are left untouched. They appear in their original form as part
# of the text node in which they occur.

# The list of strings then is converted to a hierarchical structure.
# The document top level is represented by a reference to a list.
# Each list element is a reference to a node -- a hash that has "type"
# and "content" key/value pairs. The "type" key indicates the node
# type and has one of the following values:

# "text"    - text node
# "pi"      - processing instruction node
# "comment" - comment node
# "CDATA"   - CDATA section node
# "DOCTYPE" - DOCTYPE node
# "elt"     - element node

# (For purposes of this program, it's really only necessary to have "text",
# "elt", and "other".  The types other than "text" and "elt" currently are
# all treated the same way.)

# For all but element nodes, the "content" value is the text of the node.

# For element nodes, the "content" hash is a reference to a list of
# nodes for the element's children. In addition, an element node has
# three additional key/value pairs:
# - The "name" value is the tag name within the opening tag, minus angle
#   brackets or attributes.
# - The "open_tag" value is the full opening tag, which may also be the
#   closing tag.
# - The "close_tag" value depends on the opening tag.  If the open tag is
#   "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the
#   close tag is the empty string.

# If the tree structure is converted back into a string with
# tree_stringify(), the result can be compared to the input file
# as a regression test. The string should be identical to the original
# input document.

use strict;

use Getopt::Long;
$Getopt::Long::ignorecase = 0; # options are case sensitive
$Getopt::Long::bundling = 1;   # allow short options to be bundled

my $PROG_NAME = "xmlformat";
my $PROG_VERSION = "1.04";
my $PROG_LANG = "Perl";

# ----------------------------------------------------------------------

package XMLFormat;

use strict;

# ----------------------------------------------------------------------

# Regular expressions for parsing document components. Based on REX.

# SPE = shallow parsing expression
# SE = scanning expression
# CE = completion expression
# RSB = right square brackets
# QM = question mark

my $TextSE = "[^<]+";
my $UntilHyphen = "[^-]*-";
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
my $CommentCE = "$Until2Hyphens>?";
my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
my $S = "[ \\n\\t\\r]+";
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1 = "[\\n\\r\\t ]";
my $UntilQMs = "[^?]*\\?+";
my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
my $DT_ItemSE =
"<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
my $DeclCE =
"--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
my $PI_CE = "$Name(?:$PI_Tail)?";
my $EndTagCE = "$Name(?:$S)?>?";
my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
my $MarkupSPE =
"<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
my $XML_SPE = "$TextSE|$MarkupSPE";

# ----------------------------------------------------------------------

# Allowable options and their possible values:
# - The keys of this hash are the allowable option names
# - The value for each key is list of allowable option values
# - If the value is undef, the option value must be numeric
# If any new formatting option is added to this program, it
# must be specified here, *and* a default value for it should
# be listed in the *DOCUMENT and *DEFAULT pseudo-element
# option hashes.

my %opt_list = (
  "format"    => [ "block", "inline", "verbatim" ],
  "normalize"   => [ "yes", "no" ],
  "subindent"   => undef,
  "wrap-length" => undef,
  "entry-break" => undef,
  "exit-break"  => undef,
  "element-break" => undef
);

# Object creation: set up the default formatting configuration
# and variables for maintaining input and output document.

sub new
{
my $type = shift;

  my $self = {};

  # Formatting options for each element.

  $self->{elt_opts} = { };

  # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can
  # be overridden in the configuration file, but the options must also be
  # built in to make sure they exist if not specified in the configuration
  # file.  Each of the structures must have a value for every option.

  # Options for top-level document children.
  # - Do not change entry-break: 0 ensures no extra newlines before
  #   first element of output.
  # - Do not change exit-break: 1 ensures a newline after final element
  #   of output document.
  # - It's probably best not to change any of the others, except perhaps
  #   if you want to increase the element-break.

  $self->{elt_opts}->{"*DOCUMENT"} = {
    "format"    => "block",
    "normalize"   => "no",
    "subindent"   => 0,
    "wrap-length" => 0,
    "entry-break" => 0, # do not change
    "exit-break"  => 1, # do not change
    "element-break" => 1
  };

  # Default options. These are used for any elements in the document
  # that are not specified explicitly in the configuration file.

  $self->{elt_opts}->{"*DEFAULT"} = {
    "format"    => "block",
    "normalize"   => "no",
    "subindent"   => 1,
    "wrap-length" => 0,
    "entry-break" => 1,
    "exit-break"  => 1,
    "element-break" => 1
  };

  # Run the *DOCUMENT and *DEFAULT options through the option-checker
  # to verify that the built-in values are legal.

  my $err_count = 0;

  foreach my $elt_name (keys (%{$self->{elt_opts}}))  # ... for each element
  {
    # Check each option for element
    while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts}->{$elt_name}}))
    {
      my $err_msg;

      ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
      if (!defined ($err_msg))
      {
        $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
      }
      else
      {
        warn "LOGIC ERROR: $elt_name default option is invalid\n";
        warn "$err_msg\n";
        ++$err_count;
      }
    }
  }

  # Make sure that the every option is represented in the
  # *DOCUMENT and *DEFAULT structures.

  foreach my $opt_name (keys (%opt_list))
  {
    foreach my $elt_name (keys (%{$self->{elt_opts}}))
    {
      if (!exists ($self->{elt_opts}->{$elt_name}->{$opt_name}))
      {
        warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
        ++$err_count;
      }
    }
  }

  die "Cannot continue; internal default formatting options must be fixed\n"
    if $err_count > 0;

  bless $self, $type;    # bless object and return it
}

# Initialize the variables that are used per-document

sub init_doc_vars
{
my $self = shift;

  # Elements that are used in the document but not named explicitly
  # in the configuration file.

  $self->{unconf_elts} = { };

  # List of tokens for current document.

  $self->{tokens} = [ ];

  # List of line numbers for each token

  $self->{line_num} = [ ];

  # Document node tree (constructed from the token list).

  $self->{tree} = [ ];

  # Variables for formatting operations:
  # out_doc = resulting output document (constructed from document tree)
  # pending = array of pending tokens being held until flushed

  $self->{out_doc} = "";
  $self->{pending} = [ ];

  # Inline elements within block elements are processed using the
  # text normalization (and possible line-wrapping) values of their
  # enclosing block. Blocks and inlines may be nested, so we maintain
  # a stack that allows the normalize/wrap-length values of the current
  # block to be determined.

  $self->{block_name_stack} = [ ];  # for debugging
  $self->{block_opts_stack} = [ ];

  # A similar stack for maintaining each block's current break type.

  $self->{block_break_type_stack} = [ ];
}

# Accessors for token list and resulting output document

sub tokens
{
my $self = shift;

  return $self->{tokens};
}

sub out_doc
{
my $self = shift;

  return $self->{out_doc};
}


# Methods for adding strings to output document or
# to the pending output array

sub add_to_doc
{
my ($self, $str) = @_;

  $self->{out_doc} .= $str;
}

sub add_to_pending
{
my ($self, $str) = @_;

  push (@{$self->{pending}}, $str);
}


# Block stack mainenance methods

# Push options onto or pop options off from the stack.  When doing
# this, also push or pop an element onto the break-level stack.

sub begin_block
{
my ($self, $name, $opts) = @_;

  push (@{$self->{block_name_stack}}, $name);
  push (@{$self->{block_opts_stack}}, $opts);
  push (@{$self->{block_break_type_stack}}, "entry-break");
}

sub end_block
{
my $self = shift;

  pop (@{$self->{block_name_stack}});
  pop (@{$self->{block_opts_stack}});
  pop (@{$self->{block_break_type_stack}});
}

# Return the current block's normalization status or wrap length

sub block_normalize
{
my $self = shift;

  my $size = @{$self->{block_opts_stack}};
  my $opts = $self->{block_opts_stack}->[$size-1];
  return $opts->{normalize} eq "yes";
}

sub block_wrap_length
{
my $self = shift;

  my $size = @{$self->{block_opts_stack}};
  my $opts = $self->{block_opts_stack}->[$size-1];
  return $opts->{"wrap-length"};
}

# Set the current block's break type, or return the number of newlines
# for the block's break type

sub set_block_break_type
{
my ($self, $type) = @_;

  my $size = @{$self->{block_break_type_stack}};
  $self->{block_break_type_stack}->[$size-1] = $type;
}

sub block_break_value
{
my $self = shift;

  my $size = @{$self->{block_opts_stack}};
  my $opts = $self->{block_opts_stack}->[$size-1];
  $size = @{$self->{block_break_type_stack}};
  my $type = $self->{block_break_type_stack}->[$size-1];
  return $opts->{$type};
}


# ----------------------------------------------------------------------

# Read configuration information.  For each element, construct a hash
# containing a hash key and value for each option name and value.
# After reading the file, fill in missing option values for
# incomplete option structures using the *DEFAULT options.

sub read_config
{
my $self = shift;
my $conf_file = shift;
my @elt_names = ();
my $err_msg;
my $in_continuation = 0;
my $saved_line = "";

  open (FH, $conf_file) or die "Cannot read config file $conf_file: $!\n";
  while (<FH>)
  {
    chomp;

    next if /^\s*($|#)/;  # skip blank lines, comments
    if ($in_continuation)
    {
      $_ = $saved_line . " " . $_;
      $saved_line = "";
      $in_continuation = 0;
    }
    if (!/^\s/)
    {
      # Line doesn't begin with whitespace, so it lists element names.
      # Names are separated by whitespace or commas, possibly followed
      # by a continuation character or a comment.
      if (/\\$/)
      {
        s/\\$//;                          # remove continuation character
        $saved_line = $_;
        $in_continuation = 1;
        next;
      }
      s/\s*#.*$//;                        # remove any trailing comment
      @elt_names = split (/[\s,]+/, $_);
      # make sure each name has an entry in the elt_opts structure
      foreach my $elt_name (@elt_names)
      {
        $self->{elt_opts}->{$elt_name} = { }
          unless exists ($self->{elt_opts}->{$elt_name});
      }
    }
    else
    {
      # Line begins with whitespace, so it contains an option
      # to apply to the current element list, possibly followed by
      # a comment.  First check that there is a current list.
      # Then parse the option name/value.

      die "$conf_file:$.: Option setting found before any "
          . "elements were named.\n"
        if !@elt_names;
      s/\s*#.*$//;
      my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
      die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val);

      # Check option. If illegal, die with message. Otherwise,
      # add option to each element in current element list

      ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
      die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
      foreach my $elt_name (@elt_names)
      {
        $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
      }
    }
  }
  close (FH);

  # For any element that has missing option values, fill in the values
  # using the options for the *DEFAULT pseudo-element.  This speeds up
  # element option lookups later.  It also makes it unnecessary to test
  # each option to see if it's defined: All element option structures
  # will have every option defined.

  my $def_opts = $self->{elt_opts}->{"*DEFAULT"};

  foreach my $elt_name (keys (%{$self->{elt_opts}}))
  {
    next if $elt_name eq "*DEFAULT";
    foreach my $opt_name (keys (%{$def_opts}))
    {
      next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name}); # already set
      $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
    }
  }
}


# Check option name to make sure it's legal. Check the value to make sure
# that it's legal for the name.  Return a two-element array:
# (value, undef) if the option name and value are legal.
# (undef, message) if an error was found; message contains error message.
# For legal values, the returned value should be assigned to the option,
# because it may get type-converted here.

sub check_option
{
my ($opt_name, $opt_val) = @_;

  # - Check option name to make sure it's a legal option
  # - Then check the value.  If there is a list of values
  #   the value must be one of them.  Otherwise, the value
  #   must be an integer.

  return (undef, "Unknown option name: $opt_name")
    unless exists ($opt_list{$opt_name});
  my $allowable_val = $opt_list{$opt_name};
  if (defined ($allowable_val))
  {
    return (undef, "Unknown '$opt_name' value: $opt_val")
      unless grep (/^$opt_val$/, @{$allowable_val});
  }
  else  # other options should be numeric
  {
    # "$opt_val" converts $opt_val to string for pattern match
    return (undef, "'$opt_name' value ($opt_val) should be an integer")
      unless "$opt_val" =~ /^\d+$/;
  }
  return ($opt_val, undef);
}


# Return hash of option values for a given element.  If no options are found:
# - Add the element name to the list of unconfigured options.
# - Assign the default options to the element.  (This way the test for the
#   option fails only once.)

sub get_opts
{
my $self = shift;
my $elt_name = shift;

  my $opts = $self->{elt_opts}->{$elt_name};
  if (!defined ($opts))
  {
    $self->{unconf_elts}->{$elt_name} = 1;
    $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"};
  }
  return $opts;
}


# Display contents of configuration options to be used to process document.
# For each element named in the elt_opts structure, display its format
# type, and those options that apply to the type.

sub display_config
{
my $self = shift;
# Format types and the additional options that apply to each type
my $format_opts = {
  "block" => [
              "entry-break",
              "element-break",
              "exit-break",
              "subindent",
              "normalize",
              "wrap-length"
              ],
  "inline" => [ ],
  "verbatim" => [ ]
};

  foreach my $elt_name (sort (keys (%{$self->{elt_opts}})))
  {
    print "$elt_name\n";
    my %opts = %{$self->{elt_opts}->{$elt_name}};
    my $format = $opts{format};
    # Write out format type, then options that apply to the format type
    print "  format = $format\n";
    foreach my $opt_name (@{$format_opts->{$format}})
    {
      print "  $opt_name = $opts{$opt_name}\n";
    }
    print "\n";
  }
}


# Display the list of elements that are used in the document but not
# configured in the configuration file.

# Then re-unconfigure the elements so that they won't be considered
# as configured for the next document, if there is one.

sub display_unconfigured_elements
{
my $self = shift;

  my @elts = keys (%{$self->{unconf_elts}});
  if (@elts == 0)
  {
    print "The document contains no unconfigured elements.\n";
  }
  else
  {
    print "The following document elements were assigned no formatting options:\n";
    foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65))
    {
      print "$line\n";
    }
  }

  foreach my $elt_name (@elts)
  {
    delete ($self->{elt_opts}->{$elt_name});
  }
}

# ----------------------------------------------------------------------

# Main document processing routine.
# - Argument is a string representing an input document
# - Return value is the reformatted document, or undef. An undef return
#   signifies either that an error occurred, or that some option was
#   given that suppresses document output. In either case, don't write
#   any output for the document.  Any error messages will already have
#   been printed when this returns.

sub process_doc
{
my $self = shift;
my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_;
my $str;

  $self->init_doc_vars ();

  # Perform lexical parse to split document into list of tokens
  warn "Parsing document...\n" if $verbose;
  $self->shallow_parse ($doc);

  if ($check_parser)
  {
    warn "Checking parser...\n" if $verbose;
    # concatentation of tokens should be identical to original document
    if ($doc eq join ("", @{$self->tokens ()}))
    {
      print "Parser is okay\n";
    }
    else
    {
      print "PARSER ERROR: document token concatenation differs from document\n";
    }
    return undef;
  }

  # Assign input line number to each token
  $self->assign_line_numbers ();

  # Look for and report any error tokens returned by parser
  warn "Checking document for errors...\n" if $verbose;
  if ($self->report_errors () > 0)
  {
    warn "Cannot continue processing document.\n";
    return undef;
  }

  # Convert the token list to a tree structure
  warn "Converting document tokens to tree...\n" if $verbose;
  if ($self->tokens_to_tree () > 0)
  {
    warn "Cannot continue processing document.\n";
    return undef;
  }

  # Check: Stringify the tree to convert it back to a single string,
  # then compare to original document string (should be identical)
  # (This is an integrity check on the validity of the to-tree and stringify
  # operations; if one or both do not work properly, a mismatch should occur.)
  #$str = $self->tree_stringify ();
  #print $str;
  #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str;

  # Canonize tree to remove extraneous whitespace
  warn "Canonizing document tree...\n" if $verbose;
  $self->tree_canonize ();

  if ($canonize_only)
  {
    print $self->tree_stringify () . "\n";
    return undef;
  }

  # One side-effect of canonizing the tree is that the formatting
  # options are looked up for each element in the document.  That
  # causes the list of elements that have no explicit configuration
  # to be built.  Display the list and return if user requested it.

  if ($show_unconf_elts)
  {
    $self->display_unconfigured_elements ();
    return undef;
  }

  # Format the tree to produce formatted XML as a single string
  warn "Formatting document tree...\n" if $verbose;
  $self->tree_format ();

  # If the document is not empty, add a newline and emit a warning if
  # reformatting failed to add a trailing newline.  This shouldn't
  # happen if the *DOCUMENT options are set up with exit-break = 1,
  # which is the reason for the warning rather than just silently
  # adding the newline.

  $str = $self->out_doc ();
  if ($str ne "" && $str !~ /\n$/)
  {
    warn "LOGIC ERROR: trailing newline had to be added\n";
    $str .= "\n";
  }

  return $str;
}

# ----------------------------------------------------------------------

# Parse XML document into array of tokens and store array

sub shallow_parse
{ 
my ($self, $xml_document) = @_;

  $self->{tokens} = [ $xml_document =~ /$XML_SPE/g ];
}

# ----------------------------------------------------------------------

# Extract a tag name from a tag and return it.

# Dies if the tag cannot be found, because this is supposed to be
# called only with a legal tag.

sub extract_tag_name
{
my $tag = shift;

  die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?($Name)/;
  return $1;
}

# ----------------------------------------------------------------------

# Assign an input line number to each token.  The number indicates
# the line number on which the token begins.

sub assign_line_numbers
{
my $self = shift;
my $line_num = 1;

  $self->{line_num} = [ ];
  for (my $i = 0; $i < @{$self->{tokens}}; $i++)
  {
    my $token = $self->{tokens}->[$i];
    push (@{$self->{line_num}}, $line_num);
    # count newlines and increment line counter (tr returns no. of matches)
    $line_num += ($token =~ tr/\n/\n/);
  }
}

# ----------------------------------------------------------------------

# Check token list for errors and report any that are found. Error
# tokens are those that begin with "<" but do not end with ">".

# Returns the error count.

# Does not modify the original token list.

sub report_errors
{
my $self = shift;
my $err_count = 0;

  for (my $i = 0; $i < @{$self->{tokens}}; $i++)
  {
    my $token = $self->{tokens}->[$i];
    if ($token =~ /^</ && $token !~ />$/)
    {
      my $line_num = $self->{line_num}->[$i];
      warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n";
      ++$err_count;
    }
  }
  warn "Number of errors found: $err_count\n" if $err_count > 0;
  return $err_count;
}

# ----------------------------------------------------------------------

# Helper routine to print tag stack for tokens_to_tree

sub print_tag_stack
{
my ($label, @stack) = @_;
  if (@stack < 1)
  {
    warn "  $label: none\n";
  }
  else
  {
    warn "  $label:\n";
    for (my $i = 0; $i < @stack; $i++)
    {
      warn "  ", ($i+1), ": ", $stack[$i], "\n";
    }
  }
}

# Convert the list of XML document tokens to a tree representation.
# The implementation uses a loop and a stack rather than recursion.

# Does not modify the original token list.

# Returns an error count.

sub tokens_to_tree
{
my $self = shift;

  my @tag_stack = ();     # stack for element tags
  my @children_stack = ();  # stack for lists of children
  my $children = [ ];     # current list of children
  my $err_count = 0;

  for (my $i = 0; $i < @{$self->{tokens}}; $i++)
  {
    my $token = $self->{tokens}->[$i];
    my $line_num = $self->{line_num}->[$i];
    my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)";
    if ($token !~ /^</)           # text
    {
      push (@{$children}, text_node ($token));
    }
    elsif ($token =~ /^<!--/)       # comment
    {
      push (@{$children}, comment_node ($token));
    }
    elsif ($token =~ /^<\?/)        # processing instruction
    {
      push (@{$children}, pi_node ($token));
    }
    elsif ($token =~ /^<!DOCTYPE/)      # DOCTYPE
    {
      push (@{$children}, doctype_node ($token));
    }
    elsif ($token =~ /^<!\[/)       # CDATA
    {
      push (@{$children}, cdata_node ($token));
    }
    elsif ($token =~ /^<\//)        # element close tag
    {
      if (!@tag_stack)
      {
        warn "$tok_err: Close tag w/o preceding open tag; malformed document?\n";
        ++$err_count;
        next;
      }
      if (!@children_stack)
      {
        warn "$tok_err: Empty children stack; malformed document?\n";
        ++$err_count;
        next;
      }
      my $tag = pop (@tag_stack);
      my $open_tag_name = extract_tag_name ($tag);
      my $close_tag_name = extract_tag_name ($token);
      if ($open_tag_name ne $close_tag_name)
      {
        warn "$tok_err: Tag mismatch; malformed document?\n";
        warn "  open tag: $tag\n";
        warn "  close tag: $token\n";
        print_tag_stack ("enclosing tags", @tag_stack);
        ++$err_count;
        next;
      }
      my $elt = element_node ($tag, $token, $children);
      $children = pop (@children_stack);
      push (@{$children}, $elt);
    }
    else                  # element open tag
    {
      # If we reach here, we're seeing the open tag for an element:
      # - If the tag is also the close tag (e.g., <abc/>), close the
      #   element immediately, giving it an empty child list.
      # - Otherwise, push tag and child list on stacks, begin new child
      #   list for element body.
      if ($token =~ /\/>$/)     # tag is of form <abc/>
      {
        push (@{$children}, element_node ($token, "", [ ]));
      }
      else              # tag is of form <abc>
      {
        push (@tag_stack, $token);
        push (@children_stack, $children);
        $children = [ ];
      }
    }
  }

  # At this point, the stacks should be empty if the document is
  # well-formed.

  if (@tag_stack)
  {
    warn "Error at EOF: Unclosed tags; malformed document?\n";
    print_tag_stack ("unclosed tags", @tag_stack);
    ++$err_count;
  }
  if (@children_stack)
  {
    warn "Error at EOF: Unprocessed child elements; malformed document?\n";
# TODO: print out info about them
    ++$err_count;
  }

  $self->{tree} = $children;
  return $err_count;
}


# Node-generating helper methods for tokens_to_tree

# Generic node generator

sub node         { return { "type" => $_[0], "content" => $_[1] }; }

# Generators for specific non-element nodes

sub text_node    { return node ("text",    $_[0]); }
sub comment_node { return node ("comment", $_[0]); }
sub pi_node      { return node ("pi",      $_[0]); }
sub doctype_node { return node ("DOCTYPE", $_[0]); }
sub cdata_node   { return node ("CDATA",   $_[0]); }

# For an element node, create a standard node with the type and content
# key/value pairs. Then add pairs for the "name", "open_tag", and
# "close_tag" hash keys.

sub element_node
{
my ($open_tag, $close_tag, $children) = @_;

  my $elt = node ("elt", $children);
  # name is the open tag with angle brackets and attibutes stripped
  $elt->{name} = extract_tag_name ($open_tag);
  $elt->{open_tag} = $open_tag;
  $elt->{close_tag} = $close_tag;
  return $elt;
}

# ----------------------------------------------------------------------

# Convert the given XML document tree (or subtree) to string form by
# concatentating all of its components.  Argument is a reference
# to a list of nodes at a given level of the tree.

# Does not modify the node list.

sub tree_stringify
{
my $self = shift;
my $children = shift || $self->{tree}; # use entire tree if no arg;
my $str = "";

  for (my $i = 0; $i < @{$children}; $i++)
  {
    my $child = $children->[$i];

    # - Elements have list of child nodes as content (process recursively)
    # - All other node types have text content

    if ($child->{type} eq "elt")
    {
      $str .= $child->{open_tag}
          . $self->tree_stringify ($child->{content})
          . $child->{close_tag};
    }
    else
    {
      $str .= $child->{content};
    }
  }
  return $str;
}

# ----------------------------------------------------------------------


# Put tree in "canonical" form by eliminating extraneous whitespace
# from element text content.

# $children is a list of child nodes

# This function modifies the node list.

# Canonizing occurs as follows:
# - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
# - Verbatim elements and their descendants remain untouched
# - Within non-normalized block elements:
#   - Delete all-whitespace text node children
#   - Leave other text node children untouched
# - Within normalized block elements:
#   - Convert runs of whitespace (including line-endings) to single spaces
#   - Trim leading whitespace of first text node
#   - Trim trailing whitespace of last text node
#   - Trim whitespace that is adjacent to a verbatim or non-normalized
#     sub-element.  (For example, if a <programlisting> is followed by
#     more text, delete any whitespace at beginning of that text.)
# - Within inline elements:
#   - Normalize the same way as the enclosing block element, with the
#     exception that a space at the beginning or end is not removed.
#     (Otherwise, <para>three<literal> blind </literal>mice</para>
#     would become <para>three<literal>blind</literal>mice</para>.)

sub tree_canonize
{
my $self = shift;

  $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT");
}


sub tree_canonize2
{
my $self = shift;
my $children = shift;
my $par_name = shift;

  # Formatting options for parent
  my $par_opts = $self->get_opts ($par_name);

  # If parent is a block element, remember its formatting options on
  # the block stack so they can be used to control canonization of
  # inline child elements.

  $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block";

  # Iterate through list of child nodes to preserve, modify, or
  # discard whitespace.  Return resulting list of children.

  # Canonize element and text nodes. Leave everything else (comments,
  # processing instructions, etc.) untouched.

  my @new_children = ();

  while (@{$children})
  {
    my $child = shift (@{$children});

    if ($child->{type} eq "elt")
    {
      # Leave verbatim elements untouched. For other element nodes,
      # canonize child list using options appropriate to element.

      if ($self->get_opts ($child->{name})->{format} ne "verbatim")
      {
        $child->{content} = $self->tree_canonize2 ($child->{content},
                            $child->{name});
      }
    }
    elsif ($child->{type} eq "text")
    {
      # Delete all-whitespace node or strip whitespace as appropriate.

      # Paranoia check: We should never get here for verbatim elements,
      # because normalization is irrelevant for them.

      die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n"
        if $par_opts->{format} eq "verbatim";

      if (!$self->block_normalize ())
      {
        # Enclosing block is not normalized:
        # - Delete child all-whitespace text nodes.
        # - Leave other text nodes untouched.

        next if $child->{content} =~ /^\s*$/;
      }
      else
      {
        # Enclosing block is normalized, so normalize this text node:
        # - Convert runs of whitespace characters (including
        #   line-endings characters) to single spaces.
        # - Trim leading whitespace if this node is the first child
        #   of a block element or it follows a non-normalized node.
        # - Trim leading whitespace if this node is the last child
        #   of a block element or it precedes a non-normalized node.

        # These are nil if there is no prev or next child
        my $prev_child = $new_children[$#new_children];
        my $next_child = $children->[0];

        $child->{content} =~ s/\s+/ /g;
        $child->{content} =~ s/^ //
          if (!defined ($prev_child) && $par_opts->{format} eq "block")
            || $self->non_normalized_node ($prev_child);
        $child->{content} =~ s/ $//
          if (!defined ($next_child) && $par_opts->{format} eq "block")
            || $self->non_normalized_node ($next_child);

        # If resulting text is empty, discard the node.
        next if $child->{content} =~ /^$/;
      }
    }
    push (@new_children, $child);
  }

  # Pop block stack if parent was a block element
  $self->end_block () if $par_opts->{format} eq "block";

  return \@new_children;
}


# Helper function for tree_canonize().

# Determine whether a node is normalized.  This is used to check
# the node that is adjacent to a given text node (either previous
# or following).
# - No is node is nil
# - No if the node is a verbatim element
# - If the node is a block element, yes or no according to its
#   normalize option
# - No if the node is an inline element.  Inlines are normalized
#   if the parent block is normalized, but this method is not called
#   except while examinine normalized blocks. So its inline children
#   are also normalized.
# - No if node is a comment, PI, DOCTYPE, or CDATA section. These are
#   treated like verbatim elements.

sub non_normalized_node
{
my $self = shift;
my $node = shift;

  return 0 if !$node;
  my $type = $node->{type};
  if ($type eq "elt")
  {
    my $node_opts = $self->get_opts ($node->{name});
    if ($node_opts->{format} eq "verbatim")
    {
      return 1;
    }
    if ($node_opts->{format} eq "block")
    {
      return $node_opts->{normalize} eq "no";
    }
    if ($node_opts->{format} eq "inline")
    {
      return 0;
    }
    die "LOGIC ERROR: non_normalized_node: unhandled node format.\n";
  }
  if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE"
            || $type eq "CDATA")
  {
    return 1;
  }
  if ($type eq "text")
  {
    die "LOGIC ERROR: non_normalized_node: got called for text node.\n";
  }
  die "LOGIC ERROR: non_normalized_node: unhandled node type.\n";
}

# ----------------------------------------------------------------------

# Format (pretty-print) the document tree

# Does not modify the node list.

# The class maintains two variables for storing output:
# - out_doc stores content that has been seen and "flushed".
# - pending stores an array of strings (content of text nodes and inline
#   element tags).  These are held until they need to be flushed, at
#   which point they are concatenated and possibly wrapped/indented.
#   Flushing occurs when a break needs to be written, which happens
#   when something other than a text node or inline element is seen.

# If parent name and children are not given, format the entire document.
# Assume prevailing indent = 0 if not given.

sub tree_format
{
my $self = shift;
my $par_name = shift || "*DOCUMENT";    # format entire document if no arg
my $children = shift || $self->{tree};  # use entire tree if no arg
my $indent = shift || 0;

  # Formatting options for parent element
  my $par_opts = $self->get_opts ($par_name);

  # If parent is a block element:
  # - Remember its formatting options on the block stack so they can
  #   be used to control formatting of inline child elements.
  # - Set initial break type to entry-break.
  # - Shift prevailing indent right before generating child content.

  if ($par_opts->{format} eq "block")
  {
    $self->begin_block ($par_name, $par_opts);
    $self->set_block_break_type ("entry-break");
    $indent += $par_opts->{"subindent"};
  }

  # Variables for keeping track of whether the previous child
  # was a text node. Used for controlling break behavior in
  # non-normalized block elements: No line breaks are added around
  # text in such elements, nor is indenting added.

  my $prev_child_is_text = 0;
  my $cur_child_is_text = 0;

  foreach my $child (@{$children})
  {
    $prev_child_is_text = $cur_child_is_text;

    # Text nodes: just add text to pending output

    if ($child->{type} eq "text")
    {
      $cur_child_is_text = 1;
      $self->add_to_pending ($child->{content});
      next;
    }

    $cur_child_is_text = 0;

    # Element nodes: handle depending on format type

    if ($child->{type} eq "elt")
    {
      my $child_opts = $self->get_opts ($child->{name});

      # Verbatim elements:
      # - Print literally without change (use _stringify).
      # - Do not line-wrap or add any indent.

      if ($child_opts->{format} eq "verbatim")
      {
        $self->flush_pending ($indent);
        $self->emit_break (0)
          unless $prev_child_is_text && !$self->block_normalize ();
        $self->set_block_break_type ("element-break");
        $self->add_to_doc ($child->{open_tag}
                          . $self->tree_stringify ($child->{content})
                          . $child->{close_tag});
        next;
      }

      # Inline elements:
      # - Do not break or indent.
      # - Do not line-wrap content; just add content to pending output
      #   and let it be wrapped as part of parent's content.

      if ($child_opts->{format} eq "inline")
      {
        $self->add_to_pending ($child->{open_tag});
        $self->tree_format ($child->{name}, $child->{content}, $indent);
        $self->add_to_pending ($child->{close_tag});
        next;
      }

      # If we get here, node is a block element.

      # - Break and flush any pending output
      # - Break and indent (no indent if break count is zero)
      # - Process element itself:
      #   - Put out opening tag
      #   - Put out element content
      #   - Put out any indent needed before closing tag. None needed if:
      #     - Element's exit-break is 0 (closing tag is not on new line,
      #       so don't indent it)
      #     - There is no separate closing tag (it was in <abc/> format)
      #     - Element has no children (tags will be written as
      #       <abc></abc>, so don't indent closing tag)
      #     - Element has children, but the block is not normalized and
      #       the last child is a text node
      #   - Put out closing tag

      $self->flush_pending ($indent);
      $self->emit_break ($indent)
        unless $prev_child_is_text && !$self->block_normalize ();
      $self->set_block_break_type ("element-break");
      $self->add_to_doc ($child->{open_tag});
      $self->tree_format ($child->{name}, $child->{content}, $indent);
      $self->add_to_doc (" " x $indent)
        unless $child_opts->{"exit-break"} <= 0
        || $child->{close_tag} eq ""
        || !@{$child->{content}}
        || (@{$child->{content}}
              && $child->{content}->[$#{$child->{content}}]->{type} eq "text"
              && $child_opts->{normalize} eq "no");
      $self->add_to_doc ($child->{close_tag});
      next;
    }

    # Comments, PIs, etc. (everything other than text and elements),
    # treat similarly to verbatim block:
    # - Flush any pending output
    # - Put out a break
    # - Add node content to collected output

    $self->flush_pending ($indent);
    $self->emit_break (0)
      unless $prev_child_is_text && !$self->block_normalize ();
    $self->set_block_break_type ("element-break");
    $self->add_to_doc ($child->{content});
  }

  $prev_child_is_text = $cur_child_is_text;

  # Done processing current element's children now.

  # If current element is a block element:
  # - If there were any children, flush any pending output and put
  #   out the exit break.
  # - Pop the block stack

  if ($par_opts->{format} eq "block")
  {
    if (@{$children})
    {
      $self->flush_pending ($indent);
      $self->set_block_break_type ("exit-break");
      $self->emit_break (0)
        unless $prev_child_is_text && !$self->block_normalize ();
    }
    $self->end_block ();
  }
}


# Emit a break - the appropriate number of newlines according to the
# enclosing block's current break type.

# In addition, emit the number of spaces indicated by indent.  (indent
# > 0 when breaking just before emitting an element tag that should
# be indented within its parent element.)

# Exception: Emit no indent if break count is zero. That indicates
# any following output will be written on the same output line, not
# indented on a new line.

# Initially, when processing a node's child list, the break type is
# set to entry-break. Each subsequent break is an element-break.
# (After child list has been processed, an exit-break is produced as well.)

sub emit_break
{
my ($self, $indent) = @_;

  # number of newlines to emit
  my $break_value = $self->block_break_value ();

  $self->add_to_doc ("\n" x $break_value);
  # add indent if there *was* a break
  $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0;
}


# Flush pending output to output document collected thus far:
# - Wrap pending contents as necessary, with indent before *each* line.
# - Add pending text to output document (thus "flushing" it)
# - Clear pending array.

sub flush_pending
{
my ($self, $indent) = @_;

  # Do nothing if nothing to flush
  return if !@{$self->{pending}};

  # If current block is not normalized:
  # - Text nodes cannot be modified (no wrapping or indent).  Flush
  #   text as is without adding a break or indent.
  # If current block is normalized:
  # - Add a break.
  # - If line wrap is disabled:
  #   - Add indent if there is a break. (If there isn't a break, text
  #     should immediately follow preceding tag, so don't add indent.)
  #   - Add text without wrapping
  # - If line wrap is enabled:
  #   - First line indent is 0 if there is no break. (Text immediately
  #     follows preceding tag.) Otherwise first line indent is same as
  #     prevailing indent.
  #   - Any subsequent lines get the prevailing indent.

  # After flushing text, advance break type to element-break.

  my $s = "";

  if (!$self->block_normalize ())
  {
    $s .= join ("", @{$self->{pending}});
  }
  else
  {
    $self->emit_break (0);
    my $wrap_len = $self->block_wrap_length ();
    my $break_value = $self->block_break_value ();
    if ($wrap_len <= 0)
    {
      $s .= " " x $indent if $break_value > 0;
      $s .= join ("", @{$self->{pending}});
    }
    else
    {
      my $first_indent = ($break_value > 0 ? $indent : 0);
      # Wrap lines, then join by newlines (don't add one at end)
      my @lines = $self->line_wrap ($self->{pending},
                  $first_indent,
                  $indent,
                  $wrap_len);
      $s .= join ("\n", @lines);
    }
  }

  $self->add_to_doc ($s);
  $self->{pending} = [ ];
  $self->set_block_break_type ("element-break");
}


# Perform line-wrapping of string array to lines no longer than given
# length (including indent).
# Any word longer than line length appears by itself on line.
# Return array of lines (not newline-terminated).

# $strs - reference to array of text items to be joined and line-wrapped.
# Each item may be:
# - A tag (such as <emphasis role="bold">). This should be treated as
#   an atomic unit, which is important for preserving inline tags intact.
# - A possibly multi-word string (such as "This is a string"). In this
#   latter case, line-wrapping preserves internal whitespace in the
#   string, with the exception that if whitespace would be placed at
#   the end of a line, it is discarded.

# $first_indent - indent for first line
# $rest_indent - indent for any remaining lines
# $max_len - maximum length of output lines (including indent)

sub line_wrap
{
my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;

  # First, tokenize the strings

  my @words = ();
  foreach my $str (@{$strs})
  {
    if ($str =~ /^</)
    {
      # String is a tag; treat as atomic unit and don't split
      push (@words, $str);
    }
    else
    {
      # String of white and non-white tokens.
      # Tokenize into white and non-white tokens.
      push (@words, ($str =~ /\S+|\s+/g));
    }
  }

  # Now merge tokens that are not separated by whitespace tokens. For
  # example, "<i>", "word", "</i>" gets merged to "<i>word</i>".  But
  # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.

  my @words2 = ();
  foreach my $word (@words)
  {
    # If there is a previous word that does not end with whitespace,
    # and the currrent word does not begin with whitespace, concatenate
    # current word to previous word.  Otherwise append current word to
    # end of list of words.
    if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
    {
      $words2[$#words2] .= $word;
    }
    else
    {
      push (@words2, $word);
    }
  }

  my @lines = ();
  my $line = "";
  my $llen = 0;
  # set the indent for the first line
  my $indent = $first_indent;
  # saved-up whitespace to put before next non-white word
  my $white = "";

  foreach my $word (@words2)   # ... while words remain to wrap
  {
    # If word is whitespace, save it. It gets added before next
    # word if no line-break occurs.
    if ($word =~ /^\s/)
    {
       $white .= $word;
      next;
    }
    my $wlen = length ($word);
    if ($llen == 0)
    {
      # New output line; it gets at least one word (discard any
      # saved whitespace)
      $line = " " x $indent . $word;
      $llen = $indent + $wlen;
      $indent = $rest_indent;
      $white = "";
      next;
    }
    if ($llen + length ($white) + $wlen > $max_len)
    {
      # Word (plus saved whitespace) won't fit on current line.
      # Begin new line (discard any saved whitespace).
      push (@lines, $line);
      $line = " " x $indent . $word;
      $llen = $indent + $wlen;
      $indent = $rest_indent;
      $white = "";
      next;
    }
    # add word to current line with saved whitespace between
    $line .= $white . $word;
    $llen += length ($white) + $wlen;
    $white = "";
  }

  # push remaining line, if any
  push (@lines, $line) if $line ne "";

  return @lines;
}

1;

# ----------------------------------------------------------------------

# Begin main program

package main;


my $usage = <<EOF;
Usage: $PROG_NAME [options] xml-file

Options:
--help, -h
    Print this message and exit.
--backup suffix -b suffix
    Back up the input document, adding suffix to the input
    filename to create the backup filename.
--canonized-output
    Proceed only as far as the document canonization stage,
    printing the result.
--check-parser
    Parse the document into tokens and verify that their
    concatenation is identical to the original input document.
    This option suppresses further document processing.
--config-file file_name, -f file_name
    Specify the configuration filename. If no file is named,
    xmlformat uses the file named by the environment variable
    XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
    exists. Otherwise, xmlformat uses built-in formatting
    options.
--in-place, -i
    Format the document in place, replacing the contents of
    the input file with the reformatted document. (It's a
    good idea to use --backup along with this option.)
--show-config
    Show configuration options after reading configuration
    file. This option suppresses document processing.
--show-unconfigured-elements
    Show elements that are used in the document but for
    which no options were specified in the configuration
    file. This option suppresses document output.
--verbose, -v
    Be verbose about processing stages.
--version, -V
    Show version information and exit.
EOF

# Variables for command line options; most are undefined initially.
my $help;
my $backup_suffix;
my $conf_file;
my $canonize_only;
my $check_parser;
my $in_place;
my $show_conf;
my $show_unconf_elts;
my $show_version;
my $verbose;

GetOptions (
  # =i means an integer argument is required after the option
  # =s means a string argument is required after the option
  # :s means a string argument is optional after the option
  "help|h"           => \$help,          # print help message
  "backup|b=s"       => \$backup_suffix, # make backup using suffix
  "canonized-output" => \$canonize_only, # print canonized document
  "check-parser"     => \$check_parser,  # verify parser integrity
  "config-file|f=s"  => \$conf_file,     # config file
  "in-place|i"       => \$in_place,      # format in place
  "show-config"      => \$show_conf,     # show configuration file
  # need better name
  "show-unconfigured-elements" => \$show_unconf_elts,   # show unconfigured elements
  "verbose|v"        => \$verbose,       # be verbose
  "version|V"        => \$show_version,  # show version info
) or do { print "$usage\n"; exit (1); };

if (defined ($help))
{
  print "$usage\n";
  exit (0);
}

if (defined ($show_version))
{
  print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n";
  exit (0);
}

# --in-place option requires a named file

warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
  if defined ($in_place) && @ARGV == 0;

# --backup/-b is meaningless without --in-place

if (defined ($backup_suffix))
{
  if (!defined ($in_place))
  {
    die "--backup/-b option meaningless without --in-place/-i option\n";
  }
}

# Save input filenames
my @in_file = @ARGV;

my $xf = XMLFormat->new ();

# If a configuration file was named explicitly, use it. An error occurs
# if the file does not exist.

# If no configuration file was named, fall back to:
# - The file named by the environment variable XMLFORMAT_CONF, if it exists
# - ./xmlformat.conf, if it exists

# If no configuration file can be found at all, the built-in default options
# are used. (These are set up in new().)

my $env_conf_file = $ENV{XMLFORMAT_CONF};
my $def_conf_file = "./xmlformat.conf";

# If no config file was named, but XMLFORMAT_CONF is set, use its value
# as the config file name.
if (!defined ($conf_file))
{
  $conf_file = $env_conf_file if defined ($env_conf_file);
}
# If config file still isn't defined, use the default file if it exists.
if (!defined ($conf_file))
{
  if (-r $def_conf_file && ! -d $def_conf_file)
  {
    $conf_file = $def_conf_file;
  }
}
if (defined ($conf_file))
{
  warn "Reading configuration file...\n" if $verbose;
  die "Configuration file '$conf_file' is not readable.\n" if ! -r $conf_file;
  die "Configuration file '$conf_file' is a directory.\n"  if -d $conf_file;
  $xf->read_config ($conf_file)
}

if ($show_conf)   # show configuration and exit
{
  $xf->display_config ();
  exit(0);
}

my ($in_doc, $out_doc);

# Process arguments.
# - If no files named, read string, write to stdout.
# - If files named, read and process each one. Write output to stdout
#   unless --in-place option was given.  Make backup of original file
#   if --backup option was given.

if (@ARGV == 0)
{
  warn "Reading document...\n" if $verbose;
  {
    local $/ = undef;
    $in_doc = <>;            # slurp input document as single string
  }

  $out_doc = $xf->process_doc ($in_doc,
              $verbose, $check_parser, $canonize_only, $show_unconf_elts);
  if (defined ($out_doc))
  {
    warn "Writing output document...\n" if $verbose;
    print $out_doc;
  }
}
else
{
  foreach my $file (@ARGV)
  {
    warn "Reading document $file...\n" if $verbose;
    open (IN, $file)
      or die "Cannot read $file: $!\n";
    {
      local $/ = undef;
      $in_doc = <IN>;            # slurp input document as single string
    }
    close (IN);
    $out_doc = $xf->process_doc ($in_doc,
                $verbose, $check_parser, $canonize_only, $show_unconf_elts);
    next unless defined ($out_doc);
    if (defined ($in_place))
    {
      if (defined ($backup_suffix))
      {
        warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
        rename ($file, $file . $backup_suffix)
          or die "Could not rename $file to $file$backup_suffix: $!\n";
      }
      warn "Writing output document to $file...\n" if $verbose;
      open (OUT, ">$file") or die "Cannot write to $file: $!\n";
      print OUT $out_doc;
      close (OUT);
    }
    else
    {
      warn "Writing output document...\n" if $verbose;
      print $out_doc;
    }
  }
}

warn "Done!\n" if $verbose;

exit (0);
xmlformat-perl 1.04-2 / usr / bin / xmlformat