/usr/share/perl5/MARC/Charset/Compiler.pm is in libmarc-charset-perl 1.35-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | package MARC::Charset::Compiler;
=head1 NAME
MARC::Charset::Compiler - compile XML mapping rules from LoC
=head1 SYNOPSIS
$compiler = MARC::Charset::Compiler->new();
$table = $compiler->compile('codetables.xml');
=head1 DESCRIPTION
MARC::Charset uses mapping rules from the Library of Congress for
generating a MARC::Charset::Table for looking up utf8 values based on the
source MARC-8 character set and the character.
=head1 METHODS
=cut
use strict;
use warnings;
use base qw( XML::SAX::Base );
use XML::SAX::ParserFactory;
use Unicode::UCD qw(charinfo);
use MARC::Charset::Table;
use MARC::Charset::Code;
=head1 new()
The constructor.
=cut
sub new
{
my $self = bless {}, 'MARC::Charset::Compiler';
$self->{table} = MARC::Charset::Table->brand_new();
$self->{current_code} = undef;
$self->{text} = '';
return $self;
}
=head1 compile()
Pass in the path to an XML file to compile.
=cut
sub compile
{
my ($self, $file) = @_;
my $factory = XML::SAX::ParserFactory->new();
my $parser = $factory->parser(Handler => $self);
$parser->parse_uri($file);
}
## SAX event handlers are below
sub start_element
{
my ($self, $data) = @_;
my $name = $data->{Name};
if ($name eq 'code')
{
$self->{current_code} = MARC::Charset::Code->new();
}
elsif ($name eq 'characterSet')
{
my $charset = $data->{Attributes}{'{}ISOcode'}{Value};
warn('missing ISOcode in characterSet element') unless $charset;
$self->{current_charset} = $charset;
}
}
sub end_element
{
my ($self, $data) = @_;
my $name = $data->{Name};
# normalize some names for method lookup
$name = 'is_combining' if $name eq 'isCombining';
# get the existing code if we have one
my $code = $self->{current_code};
# if we're ending a code element
if ($code and $name eq 'code')
{
# if there is no ucs code, use what's in alt
$code->ucs($code->alt()) unless $code->ucs;
# can't process a code point that lacks a unicode representation
die("invalid code: " . $code->to_string()) unless $code->ucs;
# set the charset code
$code->charset($self->{current_charset});
# lookup the name from perl's character db
my $info = charinfo(hex($code->ucs()));
$code->name($info->{name}) if $info;
# add it to the table
$self->{table}->add_code($code);
# start with a clean slate
$self->{current_code} = undef;
}
elsif ($code and $name eq 'marc')
{
my $codepoint = $self->text();
if ($self->{current_charset} eq '51' ||
$self->{current_charset} eq '34' ||
$self->{current_charset} eq '45')
{
# codetables.xml supplied by the Library of Congress mistakenly
# lists the G1 value of characters in the extended Latin, extended
# Cyrillic and extended Arabic sets rather than the G0 value.
# MARC::Charset's table uses the G0 value internally.
if (hex($codepoint) >= 0xa1 && hex($codepoint) <= 0xfe) {
$codepoint = sprintf("%x", hex($codepoint) - 128);
}
}
$code->marc($codepoint);
}
# add these elements
elsif ($code and $name =~ /^(marc|ucs|is_combining|alt|marc_right_half|marc_left_half)$/)
{
$code->$name($self->text());
}
# ending an element so forget all text
$self->{text} = '';
}
sub characters
{
my ($self, $data) = @_;
return unless $self->{current_code};
my $text = $data->{Data};
$self->{text} .= $data->{Data};
}
sub text
{
my $text = shift->{text};
# collapse whitespace
$text =~ s/\s\s+/ /g;
# strip new lines
$text =~ s/[\r\n]//g;
# strip leading whitespace
$text =~ s/^\s+//;
# strip trailing whitespace
$text =~ s/\s+$//;
return $text;
}
1;
|