← Index
Performance Profile   « block view • line view • sub view »
For t/test-parsing
  Run on Sun Nov 14 09:49:57 2010
Reported on Sun Nov 14 09:50:09 2010

File /usr/share/perl5/MARC/Charset.pm
Statements Executed 34
Total Time 0.0024698 seconds
Subroutines — ordered by exclusive time
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
0000s0sMARC::Charset::::BEGINMARC::Charset::BEGIN
0000s0sMARC::Charset::::_process_escapeMARC::Charset::_process_escape
0000s0sMARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0000s0sMARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0000s0sMARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0000s0sMARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0000s0sMARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0000s0sMARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8
LineStmts.Exclusive
Time
Avg.Code
1package MARC::Charset;
2
31900ns900nsour $VERSION = '0.98';
4334µs11µsuse strict;
# spent 15µs making 1 call to strict::import
5332µs11µsuse warnings;
# spent 30µs making 1 call to warnings::import
6
7342µs14µsuse base qw(Exporter);
# spent 74µs making 1 call to base::import
812µs2µsour @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
9
103170µs57µsuse Unicode::Normalize;
# spent 70µs making 1 call to Exporter::import
113151µs50µsuse Encode 'decode';
# spent 95µs making 1 call to Exporter::import
123131µs44µsuse charnames ':full';
# spent 216µs making 1 call to charnames::import
133156µs52µsuse MARC::Charset::Table;
# spent 4µs making 1 call to import
1431.72ms575µsuse MARC::Charset::Constants qw(:all);
# spent 266µs making 1 call to Exporter::import
15
16=head1 NAME
17
18MARC::Charset - convert MARC-8 encoded strings to UTF-8
19
20=head1 SYNOPSIS
21
22 # import the marc8_to_utf8 function
23 use MARC::Charset 'marc8_to_utf8';
24
25 # prepare STDOUT for utf8
26 binmode(STDOUT, 'utf8');
27
28 # print out some marc8 as utf8
29 print marc8_to_utf8($marc8_string);
30
31=head1 DESCRIPTION
32
33MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8
34strings. MARC-8 is a single byte character encoding that predates unicode, and
35allows you to put non-Roman scripts in MARC bibliographic records.
36
37 http://www.loc.gov/marc/specifications/spechome.html
38
39=head1 EXPORTS
40
41=cut
42
43# get the mapping table
44110µs10µsour $table = MARC::Charset::Table->new();
# spent 117µs making 1 call to MARC::Charset::Table::new
45
46# set default character sets
47# these are viewable at the package level
48# in case someone wants to set them
491800ns800nsour $DEFAULT_G0 = ASCII_DEFAULT;
501600ns600nsour $DEFAULT_G1 = EXTENDED_LATIN;
51
52=head2 ignore_errors()
53
54Tells MARC::Charset whether or not to ignore all encoding errors, and
55returns the current setting. This is helepfuli if you have records that
56contain both MARC8 and UNICODE characters.
57
58 my $ignore = MARC::Charset->ignore_errors();
59
60 MARC::Charset->ignore_errors(1); # ignore errors
61 MARC::Charset->ignore_errors(0); # DO NOT ignore errors
62
63=cut
64
65
661400ns400nsour $_ignore_errors = 0;
67sub ignore_errors {
68 my ($self,$i) = @_;
69 $_ignore_errors = $i if (defined($i));
70 return $_ignore_errors;
71}
72
73
74=head2 assume_unicode()
75
76Tells MARC::Charset whether or not to assume UNICODE when an error is
77encountered in ignore_errors mode and returns the current setting.
78This is helepfuli if you have records that contain both MARC8 and UNICODE
79characters.
80
81 my $setting = MARC::Charset->assume_unicode();
82
83 MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8)
84 MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode
85
86=cut
87
88
891500ns500nsour $_assume = '';
90sub assume_unicode {
91 my ($self,$i) = @_;
92 $_assume = 'utf8' if (defined($i) and $i);
93 return 1 if ($_assume eq 'utf8');
94}
95
96
97=head2 assume_encoding()
98
99Tells MARC::Charset whether or not to assume a specific encoding when an error
100is encountered in ignore_errors mode and returns the current setting. This
101is helpful if you have records that contain both MARC8 and other characters.
102
103 my $setting = MARC::Charset->assume_encoding();
104
105 MARC::Charset->assume_encoding('cp850'); # assume characters are cp850
106 MARC::Charset->assume_encoding(''); # DO NOT assume any encoding
107
108=cut
109
110
111sub assume_encoding {
112 my ($self,$i) = @_;
113 $_assume = $i if (defined($i));
114 return $_assume;
115}
116
117
118# place holders for working graphical character sets
1191300ns300nsmy $G0;
1201300ns300nsmy $G1;
121
122=head2 marc8_to_utf8()
123
124Converts a MARC-8 encoded string to UTF-8.
125
126 my $utf8 = marc8_to_utf8($marc8);
127
128If you'd like to ignore errors pass in a true value as the 2nd
129parameter or call MARC::Charset->ignore_errors() with a true
130value:
131
132 my $utf8 = marc8_to_utf8($marc8, 'ignore-errors');
133
134 or
135
136 MARC::Charset->ignore_errors(1);
137 my $utf8 = marc8_to_utf8($marc8);
138
139=cut
140
141
142sub marc8_to_utf8
143{
144 my ($marc8, $ignore_errors) = @_;
145 reset_charsets();
146
147 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
148
149 # holder for our utf8
150 my $utf8 = '';
151
152 my $index = 0;
153 my $length = length($marc8);
154 my $combining = '';
155 CHAR_LOOP: while ($index < $length)
156 {
157 # whitespace, line feeds and carriage returns just get added on unmolested
158 if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so)
159 {
160 $utf8 .= $1;
161 $index += 1;
162 next CHAR_LOOP;
163 }
164
165 # look for any escape sequences
166 my $new_index = _process_escape(\$marc8, $index, $length);
167 if ($new_index > $index)
168 {
169 $index = $new_index;
170 next CHAR_LOOP;
171 }
172
173 my $found;
174 CHARSET_LOOP: foreach my $charset ($G0, $G1)
175 {
176
177 # cjk characters are a string of three chars
178 my $char_size = $charset eq CJK ? 3 : 1;
179
180 # extract the next code point to examine
181 my $chunk = substr($marc8, $index, $char_size);
182
183 # look up the character to see if it's in our mapping
184 my $code = $table->lookup_by_marc8($charset, $chunk);
185
186 # try the next character set if no mapping was found
187 next CHARSET_LOOP if ! $code;
188 $found = 1;
189
190 # gobble up all combining characters for appending later
191 # this is necessary because combinging characters precede
192 # the character they modifiy in MARC-8, whereas they follow
193 # the character they modify in UTF-8.
194 if ($code->is_combining())
195 {
196 $combining .= $code->char_value();
197 }
198 else
199 {
200 $utf8 .= $code->char_value() . $combining;
201 $combining = '';
202 }
203
204 $index += $char_size;
205 next CHAR_LOOP;
206 }
207
208 if (!$found)
209 {
210 warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
211 "g0=".MARC::Charset::Constants::charset_name($G0) . " " .
212 "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
213 if (!$ignore_errors)
214 {
215 reset_charsets();
216 return;
217 }
218 if ($_assume)
219 {
220 reset_charsets();
221 return NFC(decode($_assume => $marc8));
222 }
223 $index += 1;
224 }
225
226 }
227
228 # return the utf8
229 reset_charsets();
230 return $utf8;
231}
232
233
234
235=head2 utf8_to_marc8()
236
237Will attempt to translate utf8 into marc8.
238
239 my $marc8 = utf8_to_marc8($utf8);
240
241If you'd like to ignore errors, or characters that can't be
242converted to marc8 then pass in a true value as the second
243parameter:
244
245 my $marc8 = utf8_to_marc8($utf8, 'ignore-errors');
246
247 or
248
249 MARC::Charset->ignore_errors(1);
250 my $utf8 = marc8_to_utf8($marc8);
251
252=cut
253
254sub utf8_to_marc8
255{
256 my ($utf8, $ignore_errors) = @_;
257 reset_charsets();
258
259 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
260
261 # decompose combined characters
262 $utf8 = NFD($utf8);
263
264 my $len = length($utf8);
265 my $marc8 = '';
266 for (my $i=0; $i<$len; $i++)
267 {
268 my $slice = substr($utf8, $i, 1);
269
270 # spaces are copied from utf8 into marc8
271 if ($slice eq ' ')
272 {
273 $marc8 .= ' ';
274 next;
275 }
276
277 # try to find the code point in our mapping table
278 my $code = $table->lookup_by_utf8($slice);
279
280 if (! $code)
281 {
282 warn("no mapping found at position $i in $utf8");
283 reset_charsets() and return unless $ignore_errors;
284 }
285
286 # if it's a combining character move it around
287 if ($code->is_combining())
288 {
289 my $prev = chop($marc8);
290 $marc8 .= $code->marc_value() . $prev;
291 next;
292 }
293
294 # look to see if we need to escape to a new G0 charset
295 my $charset_value = $code->charset_value();
296
297 if ($code->default_charset_group() eq 'G0'
298 and $G0 ne $charset_value)
299 {
300 if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
301 {
302 # don't bother escaping, they're functionally the same
303 }
304 else
305 {
306 $marc8 .= $code->get_escape();
307 $G0 = $charset_value;
308 }
309 }
310
311 # look to see if we need to escape to a new G1 charset
312 elsif ($code->default_charset_group() eq 'G1'
313 and $G1 ne $charset_value)
314 {
315 $marc8 .= $code->get_escape();
316 $G1 = $charset_value;
317 }
318
319 $marc8 .= $code->marc_value();
320 }
321
322 # escape back to default G0 if necessary
323 if ($G0 ne $DEFAULT_G0)
324 {
325 if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
326 elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
327 else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
328 }
329
330 # escape back to default G1 if necessary
331 if ($G1 ne $DEFAULT_G1)
332 {
333 if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
334 else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
335 }
336
337 return $marc8;
338}
339
340
341
342=head1 DEFAULT CHARACTER SETS
343
344If you need to alter the default character sets you can set the
345$MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the
346appropriate character set code:
347
348 use MARC::Charset::Constants qw(:all);
349 $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC;
350 $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC;
351
352=head1 SEE ALSO
353
354=over 4
355
356=item * L<MARC::Charset::Constant>
357
358=item * L<MARC::Charset::Table>
359
360=item * L<MARC::Charset::Code>
361
362=item * L<MARC::Charset::Compiler>
363
364=item * L<MARC::Record>
365
366=item * L<MARC::XML>
367
368=back
369
370=head1 AUTHOR
371
372Ed Summers (ehs@pobox.com)
373
374=cut
375
376
377sub _process_escape
378{
379 ## this stuff is kind of scary ... for an explanation of what is
380 ## going on here check out the MARC-8 specs at LC.
381 ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
382 my ($str_ref, $left, $right) = @_;
383
384 # first char needs to be an escape or else this isn't an escape sequence
385 return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
386
387 ## if we don't have at least one character after the escape
388 ## then this can't be a character escape sequence
389 return $left if ($left+1 >= $right);
390
391 ## pull off the first escape
392 my $esc_char_1 = substr($$str_ref, $left+1, 1);
393
394 ## the first method of escaping to small character sets
395 if ( $esc_char_1 eq GREEK_SYMBOLS
396 or $esc_char_1 eq SUBSCRIPTS
397 or $esc_char_1 eq SUPERSCRIPTS
398 or $esc_char_1 eq ASCII_DEFAULT)
399 {
400 $G0 = $esc_char_1;
401 return $left+2;
402 }
403
404 ## the second more complicated method of escaping to bigger charsets
405 return $left if $left+2 >= $right;
406
407 my $esc_char_2 = substr($$str_ref, $left+2, 1);
408 my $esc_chars = $esc_char_1 . $esc_char_2;
409
410 if ($esc_char_1 eq SINGLE_G0_A
411 or $esc_char_1 eq SINGLE_G0_B)
412 {
413 $G0 = $esc_char_2;
414 return $left+3;
415 }
416
417 elsif ($esc_char_1 eq SINGLE_G1_A
418 or $esc_char_1 eq SINGLE_G1_B)
419 {
420 $G1 = $esc_char_2;
421 return $left+3;
422 }
423
424 elsif ( $esc_char_1 eq MULTI_G0_A ) {
425 $G0 = $esc_char_2;
426 return $left+3;
427 }
428
429 elsif ($esc_chars eq MULTI_G0_B
430 and ($left+3 < $right))
431 {
432 $G0 = substr($$str_ref, $left+3, 1);
433 return $left+4;
434 }
435
436 elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
437 and ($left + 3 < $right))
438 {
439 $G1 = substr($$str_ref, $left+3, 1);
440 return $left+4;
441 }
442
443 # we should never get here
444 warn("seem to have fallen through in _process_escape()");
445 return $left;
446}
447
448sub reset_charsets
449{
450 $G0 = $DEFAULT_G0;
451 $G1 = $DEFAULT_G1;
452}
453
454112µs12µs1;