← Index
Performance Profile   « block view • line view • sub view »
For t/test-parsing
  Run on Sun Nov 14 09:49:57 2010
Reported on Sun Nov 14 09:50:10 2010

File /usr/share/perl5/MARC/File/XML.pm
Statements Executed 52
Total Time 0.0028299 seconds
Subroutines — ordered by exclusive time
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
11120µs119µsMARC::File::XML::::importMARC::File::XML::import
0000s0sMARC::File::XML::::BEGINMARC::File::XML::BEGIN
0000s0sMARC::File::XML::::DESTROYMARC::File::XML::DESTROY
0000s0sMARC::File::XML::::_nextMARC::File::XML::_next
0000s0sMARC::File::XML::::_unimarc_encodingMARC::File::XML::_unimarc_encoding
0000s0sMARC::File::XML::::closeMARC::File::XML::close
0000s0sMARC::File::XML::::decideMARC8BinaryMARC::File::XML::decideMARC8Binary
0000s0sMARC::File::XML::::decodeMARC::File::XML::decode
0000s0sMARC::File::XML::::default_record_formatMARC::File::XML::default_record_format
0000s0sMARC::File::XML::::encodeMARC::File::XML::encode
0000s0sMARC::File::XML::::escapeMARC::File::XML::escape
0000s0sMARC::File::XML::::footerMARC::File::XML::footer
0000s0sMARC::File::XML::::headerMARC::File::XML::header
0000s0sMARC::File::XML::::outMARC::File::XML::out
0000s0sMARC::File::XML::::recordMARC::File::XML::record
0000s0sMARC::File::XML::::writeMARC::File::XML::write
0000s0sMARC::Record::::as_xml MARC::Record::as_xml
0000s0sMARC::Record::::as_xml_record MARC::Record::as_xml_record
0000s0sMARC::Record::::new_from_xml MARC::Record::new_from_xml
LineStmts.Exclusive
Time
Avg.Code
1package MARC::File::XML;
2
3330µs10µsuse warnings;
# spent 28µs making 1 call to warnings::import
4341µs14µsuse strict;
# spent 9µs making 1 call to strict::import
5339µs13µsuse vars qw( $VERSION %_load_args );
# spent 47µs making 1 call to vars::import
6336µs12µsuse base qw( MARC::File );
# spent 1.19ms making 1 call to base::import
73140µs47µsuse MARC::Record;
# spent 48µs making 1 call to Exporter::import
8328µs9µsuse MARC::Field;
# spent 4µs making 1 call to import
93147µs49µsuse MARC::File::SAX;
# spent 4µs making 1 call to import
10334µs11µsuse XML::SAX qw(Namespaces Validation);
# spent 47µs making 1 call to Exporter::import
11
12325µs8µsuse MARC::Charset qw( marc8_to_utf8 utf8_to_marc8 );
# spent 37µs making 1 call to Exporter::import
13338µs13µsuse IO::File;
# spent 175µs making 1 call to Exporter::import
14332µs11µsuse Carp qw( croak );
# spent 34µs making 1 call to Exporter::import
1532.09ms696µsuse Encode ();
16
171900ns900ns$VERSION = '0.88';
18
19116µs16µsmy $handler = MARC::File::SAX->new();
# spent 121µs making 1 call to XML::SAX::Base::new
20
2119µs9µsmy $factory = XML::SAX::ParserFactory->new();
# spent 591µs making 1 call to XML::SAX::ParserFactory::new
2218µs8µs$factory->require_feature(Namespaces);
# spent 12µs making 1 call to XML::SAX::ParserFactory::require_feature
23
2419µs9µsmy $parser = $factory->parser( Handler => $handler, ProtocolEncoding => 'UTF-8' );
# spent 25.2ms making 1 call to XML::SAX::ParserFactory::parser
25
26
# spent 119µs (20+99) within MARC::File::XML::import which was called # once (20µs+99µs) at line 9 of /home/tamil/util/marc-moose/t/test-parsing
sub import {
27519µs4µs my $class = shift;
28 %_load_args = @_;
29 $_load_args{ DefaultEncoding } ||= 'UTF-8';
30 $_load_args{ RecordFormat } ||= 'USMARC';
31
32 $parser = $factory->parser( Handler => $handler, ProtocolEncoding => $_load_args{DefaultEncoding} );
# spent 99µs making 1 call to XML::SAX::ParserFactory::parser
33}
34
35=head1 NAME
36
37MARC::File::XML - Work with MARC data encoded as XML
38
39=head1 SYNOPSIS
40
41 ## Loading with USE options
42 use MARC::File::XML ( BinaryEncoding => 'utf8', RecordFormat => 'UNIMARC' );
43
44 ## Setting the record format without USE options
45 MARC::File::XML->default_record_format('USMARC');
46
47 ## reading with MARC::Batch
48 my $batch = MARC::Batch->new( 'XML', $filename );
49 my $record = $batch->next();
50
51 ## or reading with MARC::File::XML explicitly
52 my $file = MARC::File::XML->in( $filename );
53 my $record = $file->next();
54
55 ## serialize a single MARC::Record object as XML
56 print $record->as_xml();
57
58 ## write a bunch of records to a file
59 my $file = MARC::File::XML->out( 'myfile.xml' );
60 $file->write( $record1 );
61 $file->write( $record2 );
62 $file->write( $record3 );
63 $file->close();
64
65 ## instead of writing to disk, get the xml directly
66 my $xml = join( "\n",
67 MARC::File::XML::header(),
68 MARC::File::XML::record( $record1 ),
69 MARC::File::XML::record( $record2 ),
70 MARC::File::XML::footer()
71 );
72
73=head1 DESCRIPTION
74
75The MARC-XML distribution is an extension to the MARC-Record distribution for
76working with MARC21 data that is encoded as XML. The XML encoding used is the
77MARC21slim schema supplied by the Library of Congress. More information may
78be obtained here: http://www.loc.gov/standards/marcxml/
79
80You must have MARC::Record installed to use MARC::File::XML. In fact
81once you install the MARC-XML distribution you will most likely not use it
82directly, but will have an additional file format available to you when you
83use MARC::Batch.
84
85This version of MARC-XML supersedes an the versions ending with 0.25 which
86were used with the MARC.pm framework. MARC-XML now uses MARC::Record
87exclusively.
88
89If you have any questions or would like to contribute to this module please
90sign on to the perl4lib list. More information about perl4lib is available
91at L<http://perl4lib.perl.org>.
92
93=head1 METHODS
94
95When you use MARC::File::XML your MARC::Record objects will have two new
96additional methods available to them:
97
98=head2 MARC::File::XML->default_record_format([$format])
99
100Sets or returns the default record format used by MARC::File::XML. Valid
101formats are B<MARC21>, B<USMARC>, B<UNIMARC> and B<UNIMARCAUTH>.
102
103 MARC::File::XML->default_record_format('UNIMARC');
104
105=cut
106
107sub default_record_format {
108 my $self = shift;
109 my $format = shift;
110
111 $_load_args{RecordFormat} = $format if ($format);
112
113 return $_load_args{RecordFormat};
114}
115
116
117=head2 as_xml()
118
119Returns a MARC::Record object serialized in XML. You can pass an optional format
120parameter to tell MARC::File::XML what type of record (USMARC, UNIMARC, UNIMARCAUTH) you are
121serializing.
122
123 print $record->as_xml([$format]);
124
125=cut
126
127sub MARC::Record::as_xml {
128 my $record = shift;
129 my $format = shift || $_load_args{RecordFormat};
130 return( MARC::File::XML::encode( $record, $format ) );
131}
132
133=head2 as_xml_record([$format])
134
135Returns a MARC::Record object serialized in XML without a collection wrapper.
136You can pass an optional format parameter to tell MARC::File::XML what type of
137record (USMARC, UNIMARC, UNIMARCAUTH) you are serializing.
138
139 print $record->as_xml_record('UNIMARC');
140
141=cut
142
143sub MARC::Record::as_xml_record {
144 my $record = shift;
145 my $format = shift || $_load_args{RecordFormat};
146 return( MARC::File::XML::encode( $record, $format, 1 ) );
147}
148
149=head2 new_from_xml([$encoding, $format])
150
151If you have a chunk of XML and you want a record object for it you can use
152this method to generate a MARC::Record object. You can pass an optional
153encoding parameter to specify which encoding (UTF-8 or MARC-8) you would like
154the resulting record to be in. You can also pass a format parameter to specify
155the source record type, such as UNIMARC, UNIMARCAUTH, USMARC or MARC21.
156
157 my $record = MARC::Record->new_from_xml( $xml, $encoding, $format );
158
159Note: only works for single record XML chunks.
160
161=cut
162
163sub MARC::Record::new_from_xml {
164 my $xml = shift;
165 ## to allow calling as MARC::Record::new_from_xml()
166 ## or MARC::Record->new_from_xml()
167 $xml = shift if ( ref($xml) || ($xml eq "MARC::Record") );
168
169 my $enc = shift || $_load_args{BinaryEncoding};
170 my $format = shift || $_load_args{RecordFormat};
171 return( MARC::File::XML::decode( $xml, $enc, $format ) );
172}
173
174=pod
175
176If you want to write records as XML to a file you can use out() with write()
177to serialize more than one record as XML.
178
179=head2 out()
180
181A constructor for creating a MARC::File::XML object that can write XML to a
182file. You must pass in the name of a file to write XML to. If the $encoding
183parameter or the DefaultEncoding (see above) is set to UTF-8 then the binmode
184of the output file will be set appropriately.
185
186 my $file = MARC::File::XML->out( $filename [, $encoding] );
187
188=cut
189
190sub out {
191 my ( $class, $filename, $enc ) = @_;
192 my $fh = IO::File->new( ">$filename" ) or croak( $! );
193 $enc ||= $_load_args{DefaultEncoding};
194
195 if ($enc =~ /^utf-?8$/oi) {
196 $fh->binmode(':utf8');
197 } else {
198 $fh->binmode(':raw');
199 }
200
201 my %self = (
202 filename => $filename,
203 fh => $fh,
204 header => 0,
205 encoding => $enc
206 );
207 return( bless \%self, ref( $class ) || $class );
208}
209
210=head2 write()
211
212Used in tandem with out() to write records to a file.
213
214 my $file = MARC::File::XML->out( $filename );
215 $file->write( $record1 );
216 $file->write( $record2 );
217
218=cut
219
220sub write {
221 my ( $self, $record, $enc ) = @_;
222 if ( ! $self->{ fh } ) {
223 croak( "MARC::File::XML object not open for writing" );
224 }
225 if ( ! $record ) {
226 croak( "must pass write() a MARC::Record object" );
227 }
228 ## print the XML header if we haven't already
229 if ( ! $self->{ header } ) {
230 $enc ||= $self->{ encoding } || $_load_args{DefaultEncoding};
231 $self->{ fh }->print( header( $enc ) );
232 $self->{ header } = 1;
233 }
234 ## print out the record
235 $self->{ fh }->print( record( $record ) ) || croak( $! );
236 return( 1 );
237}
238
239=head2 close()
240
241When writing records to disk the filehandle is automatically closed when you
242the MARC::File::XML object goes out of scope. If you want to close it explicitly
243use the close() method.
244
245=cut
246
247sub close {
248 my $self = shift;
249 if ( $self->{ fh } ) {
250 $self->{ fh }->print( footer() ) if $self->{ header };
251 $self->{ fh } = undef;
252 $self->{ filename } = undef;
253 $self->{ header } = undef;
254 }
255 return( 1 );
256}
257
258## makes sure that the XML file is closed off
259
260sub DESTROY {
261 shift->close();
262}
263
264=pod
265
266If you want to generate batches of records as XML, but don't want to write to
267disk you'll have to use header(), record() and footer() to generate the
268different portions.
269
270 $xml = join( "\n",
271 MARC::File::XML::header(),
272 MARC::File::XML::record( $record1 ),
273 MARC::File::XML::record( $record2 ),
274 MARC::File::XML::record( $record3 ),
275 MARC::File::XML::footer()
276 );
277
278=head2 header()
279
280Returns a string of XML to use as the header to your XML file.
281
282=cut
283
284sub header {
285 my $enc = shift;
286 $enc = shift if ( $enc && (ref($enc) || ($enc eq "MARC::File::XML")) );
287 $enc ||= 'UTF-8';
288 return( <<MARC_XML_HEADER );
289<?xml version="1.0" encoding="$enc"?>
290<collection
291 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
292 xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
293 xmlns="http://www.loc.gov/MARC21/slim">
294MARC_XML_HEADER
295}
296
297=head2 footer()
298
299Returns a string of XML to use at the end of your XML file.
300
301=cut
302
303sub footer {
304 return( "</collection>" );
305}
306
307=head2 record()
308
309Returns a chunk of XML suitable for placement between the header and the footer.
310
311=cut
312
313sub record {
314 my $record = shift;
315 my $format = shift;
316 my $without_header = shift;
317 my $enc = shift;
318
319 $format ||= $_load_args{RecordFormat};
320
321 my $_transcode = 0;
322 my $ldr = $record->leader;
323 my $original_encoding = substr($ldr,9,1);
324
325 # Does the record think it is already Unicode?
326 if ($original_encoding ne 'a' && lc($format) !~ /^unimarc/o) {
327 # If not, we'll make it so
328 $_transcode++;
329 }
330
331 my @xml = ();
332
333 if ($without_header) {
334 push @xml, <<HEADER
335<?xml version="1.0" encoding="$enc"?>
336<record
337 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
338 xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/ standards/marcxml/schema/MARC21slim.xsd"
339 xmlns="http://www.loc.gov/MARC21/slim">
340HEADER
341
342 } else {
343 push( @xml, "<record>" );
344 }
345
346 push( @xml, " <leader>" . escape( $record->leader ) . "</leader>" );
347
348 foreach my $field ( $record->fields() ) {
349 my $tag = $field->tag();
350 if ( $field->is_control_field() ) {
351 my $data = $field->data;
352 push( @xml, qq( <controlfield tag="$tag">) .
353 escape( ($_transcode ? marc8_to_utf8($data) : $data) ). qq(</controlfield>) );
354 } else {
355 my $i1 = $field->indicator( 1 );
356 my $i2 = $field->indicator( 2 );
357 push( @xml, qq( <datafield tag="$tag" ind1="$i1" ind2="$i2">) );
358 foreach my $subfield ( $field->subfields() ) {
359 my ( $code, $data ) = @$subfield;
360 push( @xml, qq( <subfield code="$code">).
361 escape( ($_transcode ? marc8_to_utf8($data) : $data) ).qq(</subfield>) );
362 }
363 push( @xml, " </datafield>" );
364 }
365 }
366 push( @xml, "</record>\n" );
367
368 if ($_transcode) {
369 substr($ldr,9,1,$original_encoding);
370 $record->leader( $ldr );
371 }
372
373 return( join( "\n", @xml ) );
374}
375
37615µs5µsmy %ESCAPES = (
377 '&' => '&amp;',
378 '<' => '&lt;',
379 '>' => '&gt;',
380);
381my $ESCAPE_REGEX =
382 eval 'qr/' .
383462µs16µs join( '|', map { $_ = "\Q$_\E" } keys %ESCAPES ) .
384 '/;'
385 ;
386
387sub escape {
388 my $string = shift;
389 return '' if ! defined $string or $string eq '';
390 $string =~ s/($ESCAPE_REGEX)/$ESCAPES{$1}/oge;
391 return( $string );
392}
393
394sub _next {
395 my $self = shift;
396 my $fh = $self->{ fh };
397
398 ## return undef at the end of the file
399 return if eof($fh);
400
401 ## get a chunk of xml for a record
402 local $/ = '</record>';
403 my $xml = <$fh>;
404
405 ## trim stuff before the start record element
406 $xml =~ s/.*<record.*?>/<record>/s;
407
408 ## return undef if there isn't a good chunk of xml
409 return if ( $xml !~ m|<record>.*</record>|s );
410
411 ## return the chunk of xml
412 return( $xml );
413}
414
415=head2 decode()
416
417You probably don't ever want to call this method directly. If you do
418you should pass in a chunk of XML as the argument.
419
420It is normally invoked by a call to next(), see L<MARC::Batch> or L<MARC::File>.
421
422=cut
423
424sub decode {
425
426 my $text;
427 my $location = '';
428 my $self = shift;
429
430 ## see MARC::File::USMARC::decode for explanation of what's going on
431 ## here
432 if ( ref($self) =~ /^MARC::File/ ) {
433 $location = 'in record '.$self->{recnum};
434 $text = shift;
435 } else {
436 $location = 'in record 1';
437 $text = $self=~/MARC::File/ ? shift : $self;
438 }
439
440 my $enc = shift || $_load_args{BinaryEncoding};
441 my $format = shift || $_load_args{RecordFormat};
442
443 $parser->{ tagStack } = [];
444 $parser->{ subfields } = [];
445 $parser->{ Handler }{ record } = MARC::Record->new();
446 $parser->{ Handler }{ toMARC8 } = decideMARC8Binary($format,$enc);
447
448 $parser->parse_string( $text );
449
450 return( $parser->{ Handler }{ record } );
451
452}
453
454sub decideMARC8Binary {
455 my $format = shift;
456 my $enc = shift;
457
458 return 0 if (defined($format) && lc($format) =~ /^unimarc/o);
459 return 0 if (defined($enc) && lc($enc) =~ /^utf-?8/o);
460 return 1;
461}
462
463
464=head2 encode()
465
466You probably want to use the as_xml() method on your MARC::Record object
467instead of calling this directly. But if you want to you just need to
468pass in the MARC::Record object you wish to encode as XML, and you will be
469returned the XML as a scalar.
470
471=cut
472
473sub encode {
474 my $record = shift;
475 my $format = shift || $_load_args{RecordFormat};
476 my $without_header = shift;
477 my $enc = shift || $_load_args{DefaultEncoding};
478
479 if (lc($format) =~ /^unimarc/o) {
480 $enc = _unimarc_encoding( $format => $record );
481 }
482
483 my @xml = ();
484 push( @xml, header( $enc ) ) unless ($without_header);
485 push( @xml, record( $record, $format, $without_header, $enc ) );
486 push( @xml, footer() ) unless ($without_header);
487
488 return( join( "\n", @xml ) );
489}
490
491sub _unimarc_encoding {
492 my $f = shift;
493 my $r = shift;
494
495 my $pos = 26;
496 $pos = 13 if (lc($f) eq 'unimarcauth');
497
498 my $enc = substr( $r->subfield(100 => 'a'), $pos, 2 );
499
500 if ($enc eq '01' || $enc eq '03') {
501 return 'ISO-8859-1';
502 } elsif ($enc eq '50') {
503 return 'UTF-8';
504 } else {
505 die "Unsupported UNIMARC character encoding [$enc] for XML output for $f; 100$a -> " . $r->subfield(100 => 'a');
506 }
507}
508
509=head1 TODO
510
511=over 4
512
513=item * Support for callback filters in decode().
514
515=item * Command line utilities marc2xml, etc.
516
517=back
518
519=head1 SEE ALSO
520
521=over 4
522
523=item L<http://www.loc.gov/standards/marcxml/>
524
525=item L<MARC::File::USMARC>
526
527=item L<MARC::Batch>
528
529=item L<MARC::Record>
530
531=back
532
533=head1 AUTHORS
534
535=over 4
536
537=item * Ed Summers <ehs@pobox.com>
538
539=back
540
541=cut
542
543120µs20µs1;