File | /usr/share/perl5/MARC/Charset.pm |
Statements Executed | 34 |
Total Time | 0.0024698 seconds |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
0 | 0 | 0 | 0s | 0s | BEGIN | MARC::Charset::
0 | 0 | 0 | 0s | 0s | _process_escape | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_encoding | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_unicode | MARC::Charset::
0 | 0 | 0 | 0s | 0s | ignore_errors | MARC::Charset::
0 | 0 | 0 | 0s | 0s | marc8_to_utf8 | MARC::Charset::
0 | 0 | 0 | 0s | 0s | reset_charsets | MARC::Charset::
0 | 0 | 0 | 0s | 0s | utf8_to_marc8 | MARC::Charset::
Line | Stmts. | Exclusive Time | Avg. | Code |
---|---|---|---|---|
1 | package MARC::Charset; | |||
2 | ||||
3 | 1 | 900ns | 900ns | our $VERSION = '0.98'; |
4 | 3 | 34µs | 11µs | use strict; # spent 15µs making 1 call to strict::import |
5 | 3 | 32µs | 11µs | use warnings; # spent 30µs making 1 call to warnings::import |
6 | ||||
7 | 3 | 42µs | 14µs | use base qw(Exporter); # spent 74µs making 1 call to base::import |
8 | 1 | 2µs | 2µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); |
9 | ||||
10 | 3 | 170µs | 57µs | use Unicode::Normalize; # spent 70µs making 1 call to Exporter::import |
11 | 3 | 151µs | 50µs | use Encode 'decode'; # spent 95µs making 1 call to Exporter::import |
12 | 3 | 131µs | 44µs | use charnames ':full'; # spent 216µs making 1 call to charnames::import |
13 | 3 | 156µs | 52µs | use MARC::Charset::Table; # spent 4µs making 1 call to import |
14 | 3 | 1.72ms | 575µs | use MARC::Charset::Constants qw(:all); # spent 266µs making 1 call to Exporter::import |
15 | ||||
16 | =head1 NAME | |||
17 | ||||
18 | MARC::Charset - convert MARC-8 encoded strings to UTF-8 | |||
19 | ||||
20 | =head1 SYNOPSIS | |||
21 | ||||
22 | # import the marc8_to_utf8 function | |||
23 | use MARC::Charset 'marc8_to_utf8'; | |||
24 | ||||
25 | # prepare STDOUT for utf8 | |||
26 | binmode(STDOUT, 'utf8'); | |||
27 | ||||
28 | # print out some marc8 as utf8 | |||
29 | print marc8_to_utf8($marc8_string); | |||
30 | ||||
31 | =head1 DESCRIPTION | |||
32 | ||||
33 | MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8 | |||
34 | strings. MARC-8 is a single byte character encoding that predates unicode, and | |||
35 | allows you to put non-Roman scripts in MARC bibliographic records. | |||
36 | ||||
37 | http://www.loc.gov/marc/specifications/spechome.html | |||
38 | ||||
39 | =head1 EXPORTS | |||
40 | ||||
41 | =cut | |||
42 | ||||
43 | # get the mapping table | |||
44 | 1 | 10µs | 10µs | our $table = MARC::Charset::Table->new(); # spent 117µs making 1 call to MARC::Charset::Table::new |
45 | ||||
46 | # set default character sets | |||
47 | # these are viewable at the package level | |||
48 | # in case someone wants to set them | |||
49 | 1 | 800ns | 800ns | our $DEFAULT_G0 = ASCII_DEFAULT; |
50 | 1 | 600ns | 600ns | our $DEFAULT_G1 = EXTENDED_LATIN; |
51 | ||||
52 | =head2 ignore_errors() | |||
53 | ||||
54 | Tells MARC::Charset whether or not to ignore all encoding errors, and | |||
55 | returns the current setting. This is helepfuli if you have records that | |||
56 | contain both MARC8 and UNICODE characters. | |||
57 | ||||
58 | my $ignore = MARC::Charset->ignore_errors(); | |||
59 | ||||
60 | MARC::Charset->ignore_errors(1); # ignore errors | |||
61 | MARC::Charset->ignore_errors(0); # DO NOT ignore errors | |||
62 | ||||
63 | =cut | |||
64 | ||||
65 | ||||
66 | 1 | 400ns | 400ns | our $_ignore_errors = 0; |
67 | sub ignore_errors { | |||
68 | my ($self,$i) = @_; | |||
69 | $_ignore_errors = $i if (defined($i)); | |||
70 | return $_ignore_errors; | |||
71 | } | |||
72 | ||||
73 | ||||
74 | =head2 assume_unicode() | |||
75 | ||||
76 | Tells MARC::Charset whether or not to assume UNICODE when an error is | |||
77 | encountered in ignore_errors mode and returns the current setting. | |||
78 | This is helepfuli if you have records that contain both MARC8 and UNICODE | |||
79 | characters. | |||
80 | ||||
81 | my $setting = MARC::Charset->assume_unicode(); | |||
82 | ||||
83 | MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8) | |||
84 | MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode | |||
85 | ||||
86 | =cut | |||
87 | ||||
88 | ||||
89 | 1 | 500ns | 500ns | our $_assume = ''; |
90 | sub assume_unicode { | |||
91 | my ($self,$i) = @_; | |||
92 | $_assume = 'utf8' if (defined($i) and $i); | |||
93 | return 1 if ($_assume eq 'utf8'); | |||
94 | } | |||
95 | ||||
96 | ||||
97 | =head2 assume_encoding() | |||
98 | ||||
99 | Tells MARC::Charset whether or not to assume a specific encoding when an error | |||
100 | is encountered in ignore_errors mode and returns the current setting. This | |||
101 | is helpful if you have records that contain both MARC8 and other characters. | |||
102 | ||||
103 | my $setting = MARC::Charset->assume_encoding(); | |||
104 | ||||
105 | MARC::Charset->assume_encoding('cp850'); # assume characters are cp850 | |||
106 | MARC::Charset->assume_encoding(''); # DO NOT assume any encoding | |||
107 | ||||
108 | =cut | |||
109 | ||||
110 | ||||
111 | sub assume_encoding { | |||
112 | my ($self,$i) = @_; | |||
113 | $_assume = $i if (defined($i)); | |||
114 | return $_assume; | |||
115 | } | |||
116 | ||||
117 | ||||
118 | # place holders for working graphical character sets | |||
119 | 1 | 300ns | 300ns | my $G0; |
120 | 1 | 300ns | 300ns | my $G1; |
121 | ||||
122 | =head2 marc8_to_utf8() | |||
123 | ||||
124 | Converts a MARC-8 encoded string to UTF-8. | |||
125 | ||||
126 | my $utf8 = marc8_to_utf8($marc8); | |||
127 | ||||
128 | If you'd like to ignore errors pass in a true value as the 2nd | |||
129 | parameter or call MARC::Charset->ignore_errors() with a true | |||
130 | value: | |||
131 | ||||
132 | my $utf8 = marc8_to_utf8($marc8, 'ignore-errors'); | |||
133 | ||||
134 | or | |||
135 | ||||
136 | MARC::Charset->ignore_errors(1); | |||
137 | my $utf8 = marc8_to_utf8($marc8); | |||
138 | ||||
139 | =cut | |||
140 | ||||
141 | ||||
142 | sub marc8_to_utf8 | |||
143 | { | |||
144 | my ($marc8, $ignore_errors) = @_; | |||
145 | reset_charsets(); | |||
146 | ||||
147 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | |||
148 | ||||
149 | # holder for our utf8 | |||
150 | my $utf8 = ''; | |||
151 | ||||
152 | my $index = 0; | |||
153 | my $length = length($marc8); | |||
154 | my $combining = ''; | |||
155 | CHAR_LOOP: while ($index < $length) | |||
156 | { | |||
157 | # whitespace, line feeds and carriage returns just get added on unmolested | |||
158 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | |||
159 | { | |||
160 | $utf8 .= $1; | |||
161 | $index += 1; | |||
162 | next CHAR_LOOP; | |||
163 | } | |||
164 | ||||
165 | # look for any escape sequences | |||
166 | my $new_index = _process_escape(\$marc8, $index, $length); | |||
167 | if ($new_index > $index) | |||
168 | { | |||
169 | $index = $new_index; | |||
170 | next CHAR_LOOP; | |||
171 | } | |||
172 | ||||
173 | my $found; | |||
174 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | |||
175 | { | |||
176 | ||||
177 | # cjk characters are a string of three chars | |||
178 | my $char_size = $charset eq CJK ? 3 : 1; | |||
179 | ||||
180 | # extract the next code point to examine | |||
181 | my $chunk = substr($marc8, $index, $char_size); | |||
182 | ||||
183 | # look up the character to see if it's in our mapping | |||
184 | my $code = $table->lookup_by_marc8($charset, $chunk); | |||
185 | ||||
186 | # try the next character set if no mapping was found | |||
187 | next CHARSET_LOOP if ! $code; | |||
188 | $found = 1; | |||
189 | ||||
190 | # gobble up all combining characters for appending later | |||
191 | # this is necessary because combinging characters precede | |||
192 | # the character they modifiy in MARC-8, whereas they follow | |||
193 | # the character they modify in UTF-8. | |||
194 | if ($code->is_combining()) | |||
195 | { | |||
196 | $combining .= $code->char_value(); | |||
197 | } | |||
198 | else | |||
199 | { | |||
200 | $utf8 .= $code->char_value() . $combining; | |||
201 | $combining = ''; | |||
202 | } | |||
203 | ||||
204 | $index += $char_size; | |||
205 | next CHAR_LOOP; | |||
206 | } | |||
207 | ||||
208 | if (!$found) | |||
209 | { | |||
210 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | |||
211 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | |||
212 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | |||
213 | if (!$ignore_errors) | |||
214 | { | |||
215 | reset_charsets(); | |||
216 | return; | |||
217 | } | |||
218 | if ($_assume) | |||
219 | { | |||
220 | reset_charsets(); | |||
221 | return NFC(decode($_assume => $marc8)); | |||
222 | } | |||
223 | $index += 1; | |||
224 | } | |||
225 | ||||
226 | } | |||
227 | ||||
228 | # return the utf8 | |||
229 | reset_charsets(); | |||
230 | return $utf8; | |||
231 | } | |||
232 | ||||
233 | ||||
234 | ||||
235 | =head2 utf8_to_marc8() | |||
236 | ||||
237 | Will attempt to translate utf8 into marc8. | |||
238 | ||||
239 | my $marc8 = utf8_to_marc8($utf8); | |||
240 | ||||
241 | If you'd like to ignore errors, or characters that can't be | |||
242 | converted to marc8 then pass in a true value as the second | |||
243 | parameter: | |||
244 | ||||
245 | my $marc8 = utf8_to_marc8($utf8, 'ignore-errors'); | |||
246 | ||||
247 | or | |||
248 | ||||
249 | MARC::Charset->ignore_errors(1); | |||
250 | my $utf8 = marc8_to_utf8($marc8); | |||
251 | ||||
252 | =cut | |||
253 | ||||
254 | sub utf8_to_marc8 | |||
255 | { | |||
256 | my ($utf8, $ignore_errors) = @_; | |||
257 | reset_charsets(); | |||
258 | ||||
259 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | |||
260 | ||||
261 | # decompose combined characters | |||
262 | $utf8 = NFD($utf8); | |||
263 | ||||
264 | my $len = length($utf8); | |||
265 | my $marc8 = ''; | |||
266 | for (my $i=0; $i<$len; $i++) | |||
267 | { | |||
268 | my $slice = substr($utf8, $i, 1); | |||
269 | ||||
270 | # spaces are copied from utf8 into marc8 | |||
271 | if ($slice eq ' ') | |||
272 | { | |||
273 | $marc8 .= ' '; | |||
274 | next; | |||
275 | } | |||
276 | ||||
277 | # try to find the code point in our mapping table | |||
278 | my $code = $table->lookup_by_utf8($slice); | |||
279 | ||||
280 | if (! $code) | |||
281 | { | |||
282 | warn("no mapping found at position $i in $utf8"); | |||
283 | reset_charsets() and return unless $ignore_errors; | |||
284 | } | |||
285 | ||||
286 | # if it's a combining character move it around | |||
287 | if ($code->is_combining()) | |||
288 | { | |||
289 | my $prev = chop($marc8); | |||
290 | $marc8 .= $code->marc_value() . $prev; | |||
291 | next; | |||
292 | } | |||
293 | ||||
294 | # look to see if we need to escape to a new G0 charset | |||
295 | my $charset_value = $code->charset_value(); | |||
296 | ||||
297 | if ($code->default_charset_group() eq 'G0' | |||
298 | and $G0 ne $charset_value) | |||
299 | { | |||
300 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | |||
301 | { | |||
302 | # don't bother escaping, they're functionally the same | |||
303 | } | |||
304 | else | |||
305 | { | |||
306 | $marc8 .= $code->get_escape(); | |||
307 | $G0 = $charset_value; | |||
308 | } | |||
309 | } | |||
310 | ||||
311 | # look to see if we need to escape to a new G1 charset | |||
312 | elsif ($code->default_charset_group() eq 'G1' | |||
313 | and $G1 ne $charset_value) | |||
314 | { | |||
315 | $marc8 .= $code->get_escape(); | |||
316 | $G1 = $charset_value; | |||
317 | } | |||
318 | ||||
319 | $marc8 .= $code->marc_value(); | |||
320 | } | |||
321 | ||||
322 | # escape back to default G0 if necessary | |||
323 | if ($G0 ne $DEFAULT_G0) | |||
324 | { | |||
325 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | |||
326 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | |||
327 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | |||
328 | } | |||
329 | ||||
330 | # escape back to default G1 if necessary | |||
331 | if ($G1 ne $DEFAULT_G1) | |||
332 | { | |||
333 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | |||
334 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | |||
335 | } | |||
336 | ||||
337 | return $marc8; | |||
338 | } | |||
339 | ||||
340 | ||||
341 | ||||
342 | =head1 DEFAULT CHARACTER SETS | |||
343 | ||||
344 | If you need to alter the default character sets you can set the | |||
345 | $MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the | |||
346 | appropriate character set code: | |||
347 | ||||
348 | use MARC::Charset::Constants qw(:all); | |||
349 | $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC; | |||
350 | $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC; | |||
351 | ||||
352 | =head1 SEE ALSO | |||
353 | ||||
354 | =over 4 | |||
355 | ||||
356 | =item * L<MARC::Charset::Constant> | |||
357 | ||||
358 | =item * L<MARC::Charset::Table> | |||
359 | ||||
360 | =item * L<MARC::Charset::Code> | |||
361 | ||||
362 | =item * L<MARC::Charset::Compiler> | |||
363 | ||||
364 | =item * L<MARC::Record> | |||
365 | ||||
366 | =item * L<MARC::XML> | |||
367 | ||||
368 | =back | |||
369 | ||||
370 | =head1 AUTHOR | |||
371 | ||||
372 | Ed Summers (ehs@pobox.com) | |||
373 | ||||
374 | =cut | |||
375 | ||||
376 | ||||
377 | sub _process_escape | |||
378 | { | |||
379 | ## this stuff is kind of scary ... for an explanation of what is | |||
380 | ## going on here check out the MARC-8 specs at LC. | |||
381 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | |||
382 | my ($str_ref, $left, $right) = @_; | |||
383 | ||||
384 | # first char needs to be an escape or else this isn't an escape sequence | |||
385 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | |||
386 | ||||
387 | ## if we don't have at least one character after the escape | |||
388 | ## then this can't be a character escape sequence | |||
389 | return $left if ($left+1 >= $right); | |||
390 | ||||
391 | ## pull off the first escape | |||
392 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | |||
393 | ||||
394 | ## the first method of escaping to small character sets | |||
395 | if ( $esc_char_1 eq GREEK_SYMBOLS | |||
396 | or $esc_char_1 eq SUBSCRIPTS | |||
397 | or $esc_char_1 eq SUPERSCRIPTS | |||
398 | or $esc_char_1 eq ASCII_DEFAULT) | |||
399 | { | |||
400 | $G0 = $esc_char_1; | |||
401 | return $left+2; | |||
402 | } | |||
403 | ||||
404 | ## the second more complicated method of escaping to bigger charsets | |||
405 | return $left if $left+2 >= $right; | |||
406 | ||||
407 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | |||
408 | my $esc_chars = $esc_char_1 . $esc_char_2; | |||
409 | ||||
410 | if ($esc_char_1 eq SINGLE_G0_A | |||
411 | or $esc_char_1 eq SINGLE_G0_B) | |||
412 | { | |||
413 | $G0 = $esc_char_2; | |||
414 | return $left+3; | |||
415 | } | |||
416 | ||||
417 | elsif ($esc_char_1 eq SINGLE_G1_A | |||
418 | or $esc_char_1 eq SINGLE_G1_B) | |||
419 | { | |||
420 | $G1 = $esc_char_2; | |||
421 | return $left+3; | |||
422 | } | |||
423 | ||||
424 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | |||
425 | $G0 = $esc_char_2; | |||
426 | return $left+3; | |||
427 | } | |||
428 | ||||
429 | elsif ($esc_chars eq MULTI_G0_B | |||
430 | and ($left+3 < $right)) | |||
431 | { | |||
432 | $G0 = substr($$str_ref, $left+3, 1); | |||
433 | return $left+4; | |||
434 | } | |||
435 | ||||
436 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | |||
437 | and ($left + 3 < $right)) | |||
438 | { | |||
439 | $G1 = substr($$str_ref, $left+3, 1); | |||
440 | return $left+4; | |||
441 | } | |||
442 | ||||
443 | # we should never get here | |||
444 | warn("seem to have fallen through in _process_escape()"); | |||
445 | return $left; | |||
446 | } | |||
447 | ||||
448 | sub reset_charsets | |||
449 | { | |||
450 | $G0 = $DEFAULT_G0; | |||
451 | $G1 = $DEFAULT_G1; | |||
452 | } | |||
453 | ||||
454 | 1 | 12µs | 12µs | 1; |