File | /usr/local/lib/perl/5.10.0/Encode.pm |
Statements Executed | 85 |
Total Time | 0.0034622 seconds |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 708µs | 748µs | predefine_encodings | Encode::
0 | 0 | 0 | 0s | 0s | BEGIN | Encode::
0 | 0 | 0 | 0s | 0s | __ANON__[:269] | Encode::Internal::
0 | 0 | 0 | 0s | 0s | __ANON__[:244] | Encode::UTF_EBCDIC::
0 | 0 | 0 | 0s | 0s | __ANON__[:256] | Encode::UTF_EBCDIC::
0 | 0 | 0 | 0s | 0s | clone_encoding | Encode::
0 | 0 | 0 | 0s | 0s | decode | Encode::
0 | 0 | 0 | 0s | 0s | decode_utf8 | Encode::
0 | 0 | 0 | 0s | 0s | define_encoding | Encode::
0 | 0 | 0 | 0s | 0s | encode | Encode::
0 | 0 | 0 | 0s | 0s | encode_utf8 | Encode::
0 | 0 | 0 | 0s | 0s | encodings | Encode::
0 | 0 | 0 | 0s | 0s | find_encoding | Encode::
0 | 0 | 0 | 0s | 0s | from_to | Encode::
0 | 0 | 0 | 0s | 0s | getEncoding | Encode::
0 | 0 | 0 | 0s | 0s | perlio_ok | Encode::
0 | 0 | 0 | 0s | 0s | resolve_alias | Encode::
0 | 0 | 0 | 0s | 0s | BEGIN | Encode::utf8::
0 | 0 | 0 | 0s | 0s | __ANON__[:297] | Encode::utf8::
0 | 0 | 0 | 0s | 0s | __ANON__[:303] | Encode::utf8::
0 | 0 | 0 | 0s | 0s | __ANON__[:319] | Encode::utf8::
Line | Stmts. | Exclusive Time | Avg. | Code |
---|---|---|---|---|
1 | # | |||
2 | # $Id: Encode.pm,v 2.33 2009/03/25 07:53:19 dankogai Exp $ | |||
3 | # | |||
4 | package Encode; | |||
5 | 3 | 29µs | 10µs | use strict; # spent 12µs making 1 call to strict::import |
6 | 3 | 81µs | 27µs | use warnings; # spent 28µs making 1 call to warnings::import |
7 | 1 | 8µs | 8µs | our $VERSION = sprintf "%d.%02d", q$Revision: 2.33 $ =~ /(\d+)/g; |
8 | sub DEBUG () { 0 } | |||
9 | 3 | 50µs | 16µs | use XSLoader (); |
10 | 1 | 301µs | 301µs | XSLoader::load( __PACKAGE__, $VERSION ); # spent 342µs making 1 call to XSLoader::load |
11 | ||||
12 | 1 | 1µs | 1µs | require Exporter; |
13 | 3 | 177µs | 59µs | use base qw/Exporter/; # spent 70µs making 1 call to base::import |
14 | ||||
15 | # Public, encouraged API is exported by default | |||
16 | ||||
17 | 1 | 4µs | 4µs | our @EXPORT = qw( |
18 | decode decode_utf8 encode encode_utf8 str2bytes bytes2str | |||
19 | encodings find_encoding clone_encoding | |||
20 | ); | |||
21 | 1 | 3µs | 3µs | our @FB_FLAGS = qw( |
22 | DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC | |||
23 | PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL | |||
24 | ); | |||
25 | 1 | 2µs | 2µs | our @FB_CONSTS = qw( |
26 | FB_DEFAULT FB_CROAK FB_QUIET FB_WARN | |||
27 | FB_PERLQQ FB_HTMLCREF FB_XMLCREF | |||
28 | ); | |||
29 | 1 | 10µs | 10µs | our @EXPORT_OK = ( |
30 | qw( | |||
31 | _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit | |||
32 | is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade | |||
33 | ), | |||
34 | @FB_FLAGS, @FB_CONSTS, | |||
35 | ); | |||
36 | ||||
37 | 1 | 28µs | 28µs | our %EXPORT_TAGS = ( |
38 | all => [ @EXPORT, @EXPORT_OK ], | |||
39 | default => [ @EXPORT ], | |||
40 | fallbacks => [ @FB_CONSTS ], | |||
41 | fallback_all => [ @FB_CONSTS, @FB_FLAGS ], | |||
42 | ); | |||
43 | ||||
44 | # Documentation moved after __END__ for speed - NI-S | |||
45 | ||||
46 | 1 | 700ns | 700ns | our $ON_EBCDIC = ( ord("A") == 193 ); |
47 | ||||
48 | 3 | 1.46ms | 488µs | use Encode::Alias; # spent 72µs making 1 call to Exporter::import |
49 | ||||
50 | # Make a %Encoding package variable to allow a certain amount of cheating | |||
51 | 1 | 300ns | 300ns | our %Encoding; |
52 | 1 | 200ns | 200ns | our %ExtModule; |
53 | 1 | 159µs | 159µs | require Encode::Config; |
54 | # See | |||
55 | # https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 | |||
56 | # to find why sig handers inside eval{} are disabled. | |||
57 | 4 | 80µs | 20µs | eval { |
58 | local $SIG{__DIE__}; | |||
59 | local $SIG{__WARN__}; | |||
60 | require Encode::ConfigLocal; | |||
61 | }; | |||
62 | ||||
63 | sub encodings { | |||
64 | my $class = shift; | |||
65 | my %enc; | |||
66 | if ( @_ and $_[0] eq ":all" ) { | |||
67 | %enc = ( %Encoding, %ExtModule ); | |||
68 | } | |||
69 | else { | |||
70 | %enc = %Encoding; | |||
71 | for my $mod ( map { m/::/o ? $_ : "Encode::$_" } @_ ) { | |||
72 | DEBUG and warn $mod; | |||
73 | for my $enc ( keys %ExtModule ) { | |||
74 | $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; | |||
75 | } | |||
76 | } | |||
77 | } | |||
78 | return sort { lc $a cmp lc $b } | |||
79 | grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc; | |||
80 | } | |||
81 | ||||
82 | sub perlio_ok { | |||
83 | my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); | |||
84 | $obj->can("perlio_ok") and return $obj->perlio_ok(); | |||
85 | return 0; # safety net | |||
86 | } | |||
87 | ||||
88 | sub define_encoding { | |||
89 | 28 | 23µs | 811ns | my $obj = shift; |
90 | my $name = shift; | |||
91 | $Encoding{$name} = $obj; | |||
92 | my $lc = lc($name); | |||
93 | define_alias( $lc => $obj ) unless $lc eq $name; | |||
94 | while (@_) { | |||
95 | my $alias = shift; | |||
96 | define_alias( $alias, $obj ); | |||
97 | } | |||
98 | return $obj; | |||
99 | } | |||
100 | ||||
101 | sub getEncoding { | |||
102 | my ( $class, $name, $skip_external ) = @_; | |||
103 | ||||
104 | ref($name) && $name->can('renew') and return $name; | |||
105 | exists $Encoding{$name} and return $Encoding{$name}; | |||
106 | my $lc = lc $name; | |||
107 | exists $Encoding{$lc} and return $Encoding{$lc}; | |||
108 | ||||
109 | my $oc = $class->find_alias($name); | |||
110 | defined($oc) and return $oc; | |||
111 | $lc ne $name and $oc = $class->find_alias($lc); | |||
112 | defined($oc) and return $oc; | |||
113 | ||||
114 | unless ($skip_external) { | |||
115 | if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { | |||
116 | $mod =~ s,::,/,g; | |||
117 | $mod .= '.pm'; | |||
118 | eval { require $mod; }; | |||
119 | exists $Encoding{$name} and return $Encoding{$name}; | |||
120 | } | |||
121 | } | |||
122 | return; | |||
123 | } | |||
124 | ||||
125 | sub find_encoding($;$) { | |||
126 | my ( $name, $skip_external ) = @_; | |||
127 | return __PACKAGE__->getEncoding( $name, $skip_external ); | |||
128 | } | |||
129 | ||||
130 | sub resolve_alias($) { | |||
131 | my $obj = find_encoding(shift); | |||
132 | defined $obj and return $obj->name; | |||
133 | return; | |||
134 | } | |||
135 | ||||
136 | sub clone_encoding($) { | |||
137 | my $obj = find_encoding(shift); | |||
138 | ref $obj or return; | |||
139 | eval { require Storable }; | |||
140 | $@ and return; | |||
141 | return Storable::dclone($obj); | |||
142 | } | |||
143 | ||||
144 | sub encode($$;$) { | |||
145 | my ( $name, $string, $check ) = @_; | |||
146 | return undef unless defined $string; | |||
147 | $string .= '' if ref $string; # stringify; | |||
148 | $check ||= 0; | |||
149 | unless ( defined $name ) { | |||
150 | require Carp; | |||
151 | Carp::croak("Encoding name should not be undef"); | |||
152 | } | |||
153 | my $enc = find_encoding($name); | |||
154 | unless ( defined $enc ) { | |||
155 | require Carp; | |||
156 | Carp::croak("Unknown encoding '$name'"); | |||
157 | } | |||
158 | my $octets = $enc->encode( $string, $check ); | |||
159 | $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); | |||
160 | return $octets; | |||
161 | } | |||
162 | 1 | 2µs | 2µs | *str2bytes = \&encode; |
163 | ||||
164 | sub decode($$;$) { | |||
165 | my ( $name, $octets, $check ) = @_; | |||
166 | return undef unless defined $octets; | |||
167 | $octets .= '' if ref $octets; | |||
168 | $check ||= 0; | |||
169 | my $enc = find_encoding($name); | |||
170 | unless ( defined $enc ) { | |||
171 | require Carp; | |||
172 | Carp::croak("Unknown encoding '$name'"); | |||
173 | } | |||
174 | my $string = $enc->decode( $octets, $check ); | |||
175 | $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); | |||
176 | return $string; | |||
177 | } | |||
178 | 1 | 1µs | 1µs | *bytes2str = \&decode; |
179 | ||||
180 | sub from_to($$$;$) { | |||
181 | my ( $string, $from, $to, $check ) = @_; | |||
182 | return undef unless defined $string; | |||
183 | $check ||= 0; | |||
184 | my $f = find_encoding($from); | |||
185 | unless ( defined $f ) { | |||
186 | require Carp; | |||
187 | Carp::croak("Unknown encoding '$from'"); | |||
188 | } | |||
189 | my $t = find_encoding($to); | |||
190 | unless ( defined $t ) { | |||
191 | require Carp; | |||
192 | Carp::croak("Unknown encoding '$to'"); | |||
193 | } | |||
194 | my $uni = $f->decode($string); | |||
195 | $_[0] = $string = $t->encode( $uni, $check ); | |||
196 | return undef if ( $check && length($uni) ); | |||
197 | return defined( $_[0] ) ? length($string) : undef; | |||
198 | } | |||
199 | ||||
200 | sub encode_utf8($) { | |||
201 | my ($str) = @_; | |||
202 | utf8::encode($str); | |||
203 | return $str; | |||
204 | } | |||
205 | ||||
206 | sub decode_utf8($;$) { | |||
207 | my ( $str, $check ) = @_; | |||
208 | return $str if is_utf8($str); | |||
209 | if ($check) { | |||
210 | return decode( "utf8", $str, $check ); | |||
211 | } | |||
212 | else { | |||
213 | return decode( "utf8", $str ); | |||
214 | return $str; | |||
215 | } | |||
216 | } | |||
217 | ||||
218 | 1 | 12µs | 12µs | predefine_encodings(1); # spent 748µs making 1 call to Encode::predefine_encodings |
219 | ||||
220 | # | |||
221 | # This is to restore %Encoding if really needed; | |||
222 | # | |||
223 | ||||
224 | # spent 748µs (708+40) within Encode::predefine_encodings which was called
# once (708µs+40µs) at line 218 | |||
225 | 4 | 86µs | 21µs | require Encode::Encoding; |
226 | 3 | 655µs | 218µs | no warnings 'redefine'; # spent 26µs making 1 call to warnings::unimport |
227 | my $use_xs = shift; | |||
228 | 4 | 26µs | 6µs | if ($ON_EBCDIC) { |
229 | ||||
230 | # was in Encode::UTF_EBCDIC | |||
231 | package Encode::UTF_EBCDIC; | |||
232 | push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; | |||
233 | *decode = sub { | |||
234 | my ( $obj, $str, $chk ) = @_; | |||
235 | my $res = ''; | |||
236 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | |||
237 | $res .= | |||
238 | chr( | |||
239 | utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) | |||
240 | ); | |||
241 | } | |||
242 | $_[1] = '' if $chk; | |||
243 | return $res; | |||
244 | }; | |||
245 | *encode = sub { | |||
246 | my ( $obj, $str, $chk ) = @_; | |||
247 | my $res = ''; | |||
248 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | |||
249 | $res .= | |||
250 | chr( | |||
251 | utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) | |||
252 | ); | |||
253 | } | |||
254 | $_[1] = '' if $chk; | |||
255 | return $res; | |||
256 | }; | |||
257 | $Encode::Encoding{Unicode} = | |||
258 | bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; | |||
259 | } | |||
260 | else { | |||
261 | ||||
262 | package Encode::Internal; | |||
263 | push @Encode::Internal::ISA, 'Encode::Encoding'; | |||
264 | *decode = sub { | |||
265 | my ( $obj, $str, $chk ) = @_; | |||
266 | utf8::upgrade($str); | |||
267 | $_[1] = '' if $chk; | |||
268 | return $str; | |||
269 | }; | |||
270 | *encode = \&decode; | |||
271 | $Encode::Encoding{Unicode} = | |||
272 | bless { Name => "Internal" } => "Encode::Internal"; | |||
273 | } | |||
274 | ||||
275 | { | |||
276 | ||||
277 | # was in Encode::utf8 | |||
278 | package Encode::utf8; | |||
279 | 5 | 23µs | 5µs | push @Encode::utf8::ISA, 'Encode::Encoding'; |
280 | ||||
281 | # | |||
282 | 3 | 5µs | 2µs | if ($use_xs) { |
283 | Encode::DEBUG and warn __PACKAGE__, " XS on"; | |||
284 | *decode = \&decode_xs; | |||
285 | *encode = \&encode_xs; | |||
286 | } | |||
287 | else { | |||
288 | Encode::DEBUG and warn __PACKAGE__, " XS off"; | |||
289 | *decode = sub { | |||
290 | my ( $obj, $octets, $chk ) = @_; | |||
291 | my $str = Encode::decode_utf8($octets); | |||
292 | if ( defined $str ) { | |||
293 | $_[1] = '' if $chk; | |||
294 | return $str; | |||
295 | } | |||
296 | return undef; | |||
297 | }; | |||
298 | *encode = sub { | |||
299 | my ( $obj, $string, $chk ) = @_; | |||
300 | my $octets = Encode::encode_utf8($string); | |||
301 | $_[1] = '' if $chk; | |||
302 | return $octets; | |||
303 | }; | |||
304 | } | |||
305 | *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) | |||
306 | # currently ignores $chk | |||
307 | my ( $obj, undef, undef, $pos, $trm ) = @_; | |||
308 | my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; | |||
309 | 3 | 200µs | 67µs | use bytes; # spent 9µs making 1 call to bytes::import |
310 | if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { | |||
311 | $$rdst .= | |||
312 | substr( $$rsrc, $pos, $npos - $pos + length($trm) ); | |||
313 | $$rpos = $npos + length($trm); | |||
314 | return 1; | |||
315 | } | |||
316 | $$rdst .= substr( $$rsrc, $pos ); | |||
317 | $$rpos = length($$rsrc); | |||
318 | return ''; | |||
319 | }; | |||
320 | $Encode::Encoding{utf8} = | |||
321 | bless { Name => "utf8" } => "Encode::utf8"; | |||
322 | $Encode::Encoding{"utf-8-strict"} = | |||
323 | bless { Name => "utf-8-strict", strict_utf8 => 1 } => | |||
324 | "Encode::utf8"; | |||
325 | } | |||
326 | } | |||
327 | ||||
328 | 1 | 31µs | 31µs | 1; |
329 | ||||
330 | __END__ | |||
331 | ||||
332 | =head1 NAME | |||
333 | ||||
334 | Encode - character encodings | |||
335 | ||||
336 | =head1 SYNOPSIS | |||
337 | ||||
338 | use Encode; | |||
339 | ||||
340 | =head2 Table of Contents | |||
341 | ||||
342 | Encode consists of a collection of modules whose details are too big | |||
343 | to fit in one document. This POD itself explains the top-level APIs | |||
344 | and general topics at a glance. For other topics and more details, | |||
345 | see the PODs below: | |||
346 | ||||
347 | Name Description | |||
348 | -------------------------------------------------------- | |||
349 | Encode::Alias Alias definitions to encodings | |||
350 | Encode::Encoding Encode Implementation Base Class | |||
351 | Encode::Supported List of Supported Encodings | |||
352 | Encode::CN Simplified Chinese Encodings | |||
353 | Encode::JP Japanese Encodings | |||
354 | Encode::KR Korean Encodings | |||
355 | Encode::TW Traditional Chinese Encodings | |||
356 | -------------------------------------------------------- | |||
357 | ||||
358 | =head1 DESCRIPTION | |||
359 | ||||
360 | The C<Encode> module provides the interfaces between Perl's strings | |||
361 | and the rest of the system. Perl strings are sequences of | |||
362 | B<characters>. | |||
363 | ||||
364 | The repertoire of characters that Perl can represent is at least that | |||
365 | defined by the Unicode Consortium. On most platforms the ordinal | |||
366 | values of the characters (as returned by C<ord(ch)>) is the "Unicode | |||
367 | codepoint" for the character (the exceptions are those platforms where | |||
368 | the legacy encoding is some variant of EBCDIC rather than a super-set | |||
369 | of ASCII - see L<perlebcdic>). | |||
370 | ||||
371 | Traditionally, computer data has been moved around in 8-bit chunks | |||
372 | often called "bytes". These chunks are also known as "octets" in | |||
373 | networking standards. Perl is widely used to manipulate data of many | |||
374 | types - not only strings of characters representing human or computer | |||
375 | languages but also "binary" data being the machine's representation of | |||
376 | numbers, pixels in an image - or just about anything. | |||
377 | ||||
378 | When Perl is processing "binary data", the programmer wants Perl to | |||
379 | process "sequences of bytes". This is not a problem for Perl - as a | |||
380 | byte has 256 possible values, it easily fits in Perl's much larger | |||
381 | "logical character". | |||
382 | ||||
383 | =head2 TERMINOLOGY | |||
384 | ||||
385 | =over 2 | |||
386 | ||||
387 | =item * | |||
388 | ||||
389 | I<character>: a character in the range 0..(2**32-1) (or more). | |||
390 | (What Perl's strings are made of.) | |||
391 | ||||
392 | =item * | |||
393 | ||||
394 | I<byte>: a character in the range 0..255 | |||
395 | (A special case of a Perl character.) | |||
396 | ||||
397 | =item * | |||
398 | ||||
399 | I<octet>: 8 bits of data, with ordinal values 0..255 | |||
400 | (Term for bytes passed to or from a non-Perl context, e.g. a disk file.) | |||
401 | ||||
402 | =back | |||
403 | ||||
404 | =head1 PERL ENCODING API | |||
405 | ||||
406 | =over 2 | |||
407 | ||||
408 | =item $octets = encode(ENCODING, $string [, CHECK]) | |||
409 | ||||
410 | Encodes a string from Perl's internal form into I<ENCODING> and returns | |||
411 | a sequence of octets. ENCODING can be either a canonical name or | |||
412 | an alias. For encoding names and aliases, see L</"Defining Aliases">. | |||
413 | For CHECK, see L</"Handling Malformed Data">. | |||
414 | ||||
415 | For example, to convert a string from Perl's internal format to | |||
416 | iso-8859-1 (also known as Latin1), | |||
417 | ||||
418 | $octets = encode("iso-8859-1", $string); | |||
419 | ||||
420 | B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then | |||
421 | $octets B<may not be equal to> $string. Though they both contain the | |||
422 | same data, the UTF8 flag for $octets is B<always> off. When you | |||
423 | encode anything, UTF8 flag of the result is always off, even when it | |||
424 | contains completely valid utf8 string. See L</"The UTF8 flag"> below. | |||
425 | ||||
426 | If the $string is C<undef> then C<undef> is returned. | |||
427 | ||||
428 | =item $string = decode(ENCODING, $octets [, CHECK]) | |||
429 | ||||
430 | Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's | |||
431 | internal form and returns the resulting string. As in encode(), | |||
432 | ENCODING can be either a canonical name or an alias. For encoding names | |||
433 | and aliases, see L</"Defining Aliases">. For CHECK, see | |||
434 | L</"Handling Malformed Data">. | |||
435 | ||||
436 | For example, to convert ISO-8859-1 data to a string in Perl's internal format: | |||
437 | ||||
438 | $string = decode("iso-8859-1", $octets); | |||
439 | ||||
440 | B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string | |||
441 | B<may not be equal to> $octets. Though they both contain the same data, | |||
442 | the UTF8 flag for $string is on unless $octets entirely consists of | |||
443 | ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag"> | |||
444 | below. | |||
445 | ||||
446 | If the $string is C<undef> then C<undef> is returned. | |||
447 | ||||
448 | =item [$obj =] find_encoding(ENCODING) | |||
449 | ||||
450 | Returns the I<encoding object> corresponding to ENCODING. Returns | |||
451 | undef if no matching ENCODING is find. | |||
452 | ||||
453 | This object is what actually does the actual (en|de)coding. | |||
454 | ||||
455 | $utf8 = decode($name, $bytes); | |||
456 | ||||
457 | is in fact | |||
458 | ||||
459 | $utf8 = do{ | |||
460 | $obj = find_encoding($name); | |||
461 | croak qq(encoding "$name" not found) unless ref $obj; | |||
462 | $obj->decode($bytes) | |||
463 | }; | |||
464 | ||||
465 | with more error checking. | |||
466 | ||||
467 | Therefore you can save time by reusing this object as follows; | |||
468 | ||||
469 | my $enc = find_encoding("iso-8859-1"); | |||
470 | while(<>){ | |||
471 | my $utf8 = $enc->decode($_); | |||
472 | # and do someting with $utf8; | |||
473 | } | |||
474 | ||||
475 | Besides C<< ->decode >> and C<< ->encode >>, other methods are | |||
476 | available as well. For instance, C<< -> name >> returns the canonical | |||
477 | name of the encoding object. | |||
478 | ||||
479 | find_encoding("latin1")->name; # iso-8859-1 | |||
480 | ||||
481 | See L<Encode::Encoding> for details. | |||
482 | ||||
483 | =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) | |||
484 | ||||
485 | Converts B<in-place> data between two encodings. The data in $octets | |||
486 | must be encoded as octets and not as characters in Perl's internal | |||
487 | format. For example, to convert ISO-8859-1 data to Microsoft's CP1250 | |||
488 | encoding: | |||
489 | ||||
490 | from_to($octets, "iso-8859-1", "cp1250"); | |||
491 | ||||
492 | and to convert it back: | |||
493 | ||||
494 | from_to($octets, "cp1250", "iso-8859-1"); | |||
495 | ||||
496 | Note that because the conversion happens in place, the data to be | |||
497 | converted cannot be a string constant; it must be a scalar variable. | |||
498 | ||||
499 | from_to() returns the length of the converted string in octets on | |||
500 | success, I<undef> on error. | |||
501 | ||||
502 | B<CAVEAT>: The following operations look the same but are not quite so; | |||
503 | ||||
504 | from_to($data, "iso-8859-1", "utf8"); #1 | |||
505 | $data = decode("iso-8859-1", $data); #2 | |||
506 | ||||
507 | Both #1 and #2 make $data consist of a completely valid UTF-8 string | |||
508 | but only #2 turns UTF8 flag on. #1 is equivalent to | |||
509 | ||||
510 | $data = encode("utf8", decode("iso-8859-1", $data)); | |||
511 | ||||
512 | See L</"The UTF8 flag"> below. | |||
513 | ||||
514 | Also note that | |||
515 | ||||
516 | from_to($octets, $from, $to, $check); | |||
517 | ||||
518 | is equivalent to | |||
519 | ||||
520 | $octets = encode($to, decode($from, $octets), $check); | |||
521 | ||||
522 | Yes, it does not respect the $check during decoding. It is | |||
523 | deliberately done that way. If you need minute control, C<decode> | |||
524 | then C<encode> as follows; | |||
525 | ||||
526 | $octets = encode($to, decode($from, $octets, $check_from), $check_to); | |||
527 | ||||
528 | =item $octets = encode_utf8($string); | |||
529 | ||||
530 | Equivalent to C<$octets = encode("utf8", $string);> The characters | |||
531 | that comprise $string are encoded in Perl's internal format and the | |||
532 | result is returned as a sequence of octets. All possible | |||
533 | characters have a UTF-8 representation so this function cannot fail. | |||
534 | ||||
535 | ||||
536 | =item $string = decode_utf8($octets [, CHECK]); | |||
537 | ||||
538 | equivalent to C<$string = decode("utf8", $octets [, CHECK])>. | |||
539 | The sequence of octets represented by | |||
540 | $octets is decoded from UTF-8 into a sequence of logical | |||
541 | characters. Not all sequences of octets form valid UTF-8 encodings, so | |||
542 | it is possible for this call to fail. For CHECK, see | |||
543 | L</"Handling Malformed Data">. | |||
544 | ||||
545 | =back | |||
546 | ||||
547 | =head2 Listing available encodings | |||
548 | ||||
549 | use Encode; | |||
550 | @list = Encode->encodings(); | |||
551 | ||||
552 | Returns a list of the canonical names of the available encodings that | |||
553 | are loaded. To get a list of all available encodings including the | |||
554 | ones that are not loaded yet, say | |||
555 | ||||
556 | @all_encodings = Encode->encodings(":all"); | |||
557 | ||||
558 | Or you can give the name of a specific module. | |||
559 | ||||
560 | @with_jp = Encode->encodings("Encode::JP"); | |||
561 | ||||
562 | When "::" is not in the name, "Encode::" is assumed. | |||
563 | ||||
564 | @ebcdic = Encode->encodings("EBCDIC"); | |||
565 | ||||
566 | To find out in detail which encodings are supported by this package, | |||
567 | see L<Encode::Supported>. | |||
568 | ||||
569 | =head2 Defining Aliases | |||
570 | ||||
571 | To add a new alias to a given encoding, use: | |||
572 | ||||
573 | use Encode; | |||
574 | use Encode::Alias; | |||
575 | define_alias(newName => ENCODING); | |||
576 | ||||
577 | After that, newName can be used as an alias for ENCODING. | |||
578 | ENCODING may be either the name of an encoding or an | |||
579 | I<encoding object> | |||
580 | ||||
581 | But before you do so, make sure the alias is nonexistent with | |||
582 | C<resolve_alias()>, which returns the canonical name thereof. | |||
583 | i.e. | |||
584 | ||||
585 | Encode::resolve_alias("latin1") eq "iso-8859-1" # true | |||
586 | Encode::resolve_alias("iso-8859-12") # false; nonexistent | |||
587 | Encode::resolve_alias($name) eq $name # true if $name is canonical | |||
588 | ||||
589 | resolve_alias() does not need C<use Encode::Alias>; it can be | |||
590 | exported via C<use Encode qw(resolve_alias)>. | |||
591 | ||||
592 | See L<Encode::Alias> for details. | |||
593 | ||||
594 | =head2 Finding IANA Character Set Registry names | |||
595 | ||||
596 | The canonical name of a given encoding does not necessarily agree with | |||
597 | IANA IANA Character Set Registry, commonly seen as C<< Content-Type: | |||
598 | text/plain; charset=I<whatever> >>. For most cases canonical names | |||
599 | work but sometimes it does not (notably 'utf-8-strict'). | |||
600 | ||||
601 | Therefore as of Encode version 2.21, a new method C<mime_name()> is added. | |||
602 | ||||
603 | use Encode; | |||
604 | my $enc = find_encoding('UTF-8'); | |||
605 | warn $enc->name; # utf-8-strict | |||
606 | warn $enc->mime_name; # UTF-8 | |||
607 | ||||
608 | See also: L<Encode::Encoding> | |||
609 | ||||
610 | =head1 Encoding via PerlIO | |||
611 | ||||
612 | If your perl supports I<PerlIO> (which is the default), you can use a | |||
613 | PerlIO layer to decode and encode directly via a filehandle. The | |||
614 | following two examples are totally identical in their functionality. | |||
615 | ||||
616 | # via PerlIO | |||
617 | open my $in, "<:encoding(shiftjis)", $infile or die; | |||
618 | open my $out, ">:encoding(euc-jp)", $outfile or die; | |||
619 | while(<$in>){ print $out $_; } | |||
620 | ||||
621 | # via from_to | |||
622 | open my $in, "<", $infile or die; | |||
623 | open my $out, ">", $outfile or die; | |||
624 | while(<$in>){ | |||
625 | from_to($_, "shiftjis", "euc-jp", 1); | |||
626 | print $out $_; | |||
627 | } | |||
628 | ||||
629 | Unfortunately, it may be that encodings are PerlIO-savvy. You can check | |||
630 | if your encoding is supported by PerlIO by calling the C<perlio_ok> | |||
631 | method. | |||
632 | ||||
633 | Encode::perlio_ok("hz"); # False | |||
634 | find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available | |||
635 | ||||
636 | use Encode qw(perlio_ok); # exported upon request | |||
637 | perlio_ok("euc-jp") | |||
638 | ||||
639 | Fortunately, all encodings that come with Encode core are PerlIO-savvy | |||
640 | except for hz and ISO-2022-kr. For gory details, see | |||
641 | L<Encode::Encoding> and L<Encode::PerlIO>. | |||
642 | ||||
643 | =head1 Handling Malformed Data | |||
644 | ||||
645 | The optional I<CHECK> argument tells Encode what to do when it | |||
646 | encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 ) | |||
647 | is assumed. | |||
648 | ||||
649 | As of version 2.12 Encode supports coderef values for CHECK. See below. | |||
650 | ||||
651 | =over 2 | |||
652 | ||||
653 | =item B<NOTE:> Not all encoding support this feature | |||
654 | ||||
655 | Some encodings ignore I<CHECK> argument. For example, | |||
656 | L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. | |||
657 | ||||
658 | =back | |||
659 | ||||
660 | Now here is the list of I<CHECK> values available | |||
661 | ||||
662 | =over 2 | |||
663 | ||||
664 | =item I<CHECK> = Encode::FB_DEFAULT ( == 0) | |||
665 | ||||
666 | If I<CHECK> is 0, (en|de)code will put a I<substitution character> in | |||
667 | place of a malformed character. When you encode, E<lt>subcharE<gt> | |||
668 | will be used. When you decode the code point C<0xFFFD> is used. If | |||
669 | the data is supposed to be UTF-8, an optional lexical warning | |||
670 | (category utf8) is given. | |||
671 | ||||
672 | =item I<CHECK> = Encode::FB_CROAK ( == 1) | |||
673 | ||||
674 | If I<CHECK> is 1, methods will die on error immediately with an error | |||
675 | message. Therefore, when I<CHECK> is set to 1, you should trap the | |||
676 | error with eval{} unless you really want to let it die. | |||
677 | ||||
678 | =item I<CHECK> = Encode::FB_QUIET | |||
679 | ||||
680 | If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately | |||
681 | return the portion of the data that has been processed so far when an | |||
682 | error occurs. The data argument will be overwritten with everything | |||
683 | after that point (that is, the unprocessed part of data). This is | |||
684 | handy when you have to call decode repeatedly in the case where your | |||
685 | source data may contain partial multi-byte character sequences, | |||
686 | (i.e. you are reading with a fixed-width buffer). Here is a sample | |||
687 | code that does exactly this: | |||
688 | ||||
689 | my $buffer = ''; my $string = ''; | |||
690 | while(read $fh, $buffer, 256, length($buffer)){ | |||
691 | $string .= decode($encoding, $buffer, Encode::FB_QUIET); | |||
692 | # $buffer now contains the unprocessed partial character | |||
693 | } | |||
694 | ||||
695 | =item I<CHECK> = Encode::FB_WARN | |||
696 | ||||
697 | This is the same as above, except that it warns on error. Handy when | |||
698 | you are debugging the mode above. | |||
699 | ||||
700 | =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) | |||
701 | ||||
702 | =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) | |||
703 | ||||
704 | =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) | |||
705 | ||||
706 | For encodings that are implemented by Encode::XS, CHECK == | |||
707 | Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode. | |||
708 | ||||
709 | When you decode, C<\xI<HH>> will be inserted for a malformed character, | |||
710 | where I<HH> is the hex representation of the octet that could not be | |||
711 | decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted, | |||
712 | where I<HHHH> is the Unicode ID of the character that cannot be found | |||
713 | in the character repertoire of the encoding. | |||
714 | ||||
715 | HTML/XML character reference modes are about the same, in place of | |||
716 | C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and | |||
717 | XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. | |||
718 | ||||
719 | In Encode 2.10 or later, C<LEAVE_SRC> is also implied. | |||
720 | ||||
721 | =item The bitmask | |||
722 | ||||
723 | These modes are actually set via a bitmask. Here is how the FB_XX | |||
724 | constants are laid out. You can import the FB_XX constants via | |||
725 | C<use Encode qw(:fallbacks)>; you can import the generic bitmask | |||
726 | constants via C<use Encode qw(:fallback_all)>. | |||
727 | ||||
728 | FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ | |||
729 | DIE_ON_ERR 0x0001 X | |||
730 | WARN_ON_ERR 0x0002 X | |||
731 | RETURN_ON_ERR 0x0004 X X | |||
732 | LEAVE_SRC 0x0008 X | |||
733 | PERLQQ 0x0100 X | |||
734 | HTMLCREF 0x0200 | |||
735 | XMLCREF 0x0400 | |||
736 | ||||
737 | =back | |||
738 | ||||
739 | =over 2 | |||
740 | ||||
741 | =item Encode::LEAVE_SRC | |||
742 | ||||
743 | If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second | |||
744 | argument to C<encode()> or C<decode()> may be assigned to by the functions. If | |||
745 | you're not interested in this, then bitwise-or the bitmask with it. | |||
746 | ||||
747 | =back | |||
748 | ||||
749 | =head2 coderef for CHECK | |||
750 | ||||
751 | As of Encode 2.12 CHECK can also be a code reference which takes the | |||
752 | ord value of unmapped caharacter as an argument and returns a string | |||
753 | that represents the fallback character. For instance, | |||
754 | ||||
755 | $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); | |||
756 | ||||
757 | Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of | |||
758 | \x{I<XXXX>}. | |||
759 | ||||
760 | =head1 Defining Encodings | |||
761 | ||||
762 | To define a new encoding, use: | |||
763 | ||||
764 | use Encode qw(define_encoding); | |||
765 | define_encoding($object, 'canonicalName' [, alias...]); | |||
766 | ||||
767 | I<canonicalName> will be associated with I<$object>. The object | |||
768 | should provide the interface described in L<Encode::Encoding>. | |||
769 | If more than two arguments are provided then additional | |||
770 | arguments are taken as aliases for I<$object>. | |||
771 | ||||
772 | See L<Encode::Encoding> for more details. | |||
773 | ||||
774 | =head1 The UTF8 flag | |||
775 | ||||
776 | Before the introduction of Unicode support in perl, The C<eq> operator | |||
777 | just compared the strings represented by two scalars. Beginning with | |||
778 | perl 5.8, C<eq> compares two strings with simultaneous consideration of | |||
779 | I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of | |||
780 | C<Programming Perl, 3rd ed.> | |||
781 | ||||
782 | =over 2 | |||
783 | ||||
784 | =item Goal #1: | |||
785 | ||||
786 | Old byte-oriented programs should not spontaneously break on the old | |||
787 | byte-oriented data they used to work on. | |||
788 | ||||
789 | =item Goal #2: | |||
790 | ||||
791 | Old byte-oriented programs should magically start working on the new | |||
792 | character-oriented data when appropriate. | |||
793 | ||||
794 | =item Goal #3: | |||
795 | ||||
796 | Programs should run just as fast in the new character-oriented mode | |||
797 | as in the old byte-oriented mode. | |||
798 | ||||
799 | =item Goal #4: | |||
800 | ||||
801 | Perl should remain one language, rather than forking into a | |||
802 | byte-oriented Perl and a character-oriented Perl. | |||
803 | ||||
804 | =back | |||
805 | ||||
806 | Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 | |||
807 | was born and many features documented in the book remained | |||
808 | unimplemented for a long time. Perl 5.8 corrected this and the introduction | |||
809 | of the UTF8 flag is one of them. You can think of this perl notion as of a | |||
810 | byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8 | |||
811 | flag on). | |||
812 | ||||
813 | Here is how Encode takes care of the UTF8 flag. | |||
814 | ||||
815 | =over 2 | |||
816 | ||||
817 | =item * | |||
818 | ||||
819 | When you encode, the resulting UTF8 flag is always off. | |||
820 | ||||
821 | =item * | |||
822 | ||||
823 | When you decode, the resulting UTF8 flag is on unless you can | |||
824 | unambiguously represent data. Here is the definition of | |||
825 | dis-ambiguity. | |||
826 | ||||
827 | After C<$utf8 = decode('foo', $octet);>, | |||
828 | ||||
829 | When $octet is... The UTF8 flag in $utf8 is | |||
830 | --------------------------------------------- | |||
831 | In ASCII only (or EBCDIC only) OFF | |||
832 | In ISO-8859-1 ON | |||
833 | In any other Encoding ON | |||
834 | --------------------------------------------- | |||
835 | ||||
836 | As you see, there is one exception, In ASCII. That way you can assume | |||
837 | Goal #1. And with Encode Goal #2 is assumed but you still have to be | |||
838 | careful in such cases mentioned in B<CAVEAT> paragraphs. | |||
839 | ||||
840 | This UTF8 flag is not visible in perl scripts, exactly for the same | |||
841 | reason you cannot (or you I<don't have to>) see if a scalar contains a | |||
842 | string, integer, or floating point number. But you can still peek | |||
843 | and poke these if you will. See the section below. | |||
844 | ||||
845 | =back | |||
846 | ||||
847 | =head2 Messing with Perl's Internals | |||
848 | ||||
849 | The following API uses parts of Perl's internals in the current | |||
850 | implementation. As such, they are efficient but may change. | |||
851 | ||||
852 | =over 2 | |||
853 | ||||
854 | =item is_utf8(STRING [, CHECK]) | |||
855 | ||||
856 | [INTERNAL] Tests whether the UTF8 flag is turned on in the STRING. | |||
857 | If CHECK is true, also checks the data in STRING for being well-formed | |||
858 | UTF-8. Returns true if successful, false otherwise. | |||
859 | ||||
860 | As of perl 5.8.1, L<utf8> also has utf8::is_utf8(). | |||
861 | ||||
862 | =item _utf8_on(STRING) | |||
863 | ||||
864 | [INTERNAL] Turns on the UTF8 flag in STRING. The data in STRING is | |||
865 | B<not> checked for being well-formed UTF-8. Do not use unless you | |||
866 | B<know> that the STRING is well-formed UTF-8. Returns the previous | |||
867 | state of the UTF8 flag (so please don't treat the return value as | |||
868 | indicating success or failure), or C<undef> if STRING is not a string. | |||
869 | ||||
870 | This function does not work on tainted values. | |||
871 | ||||
872 | =item _utf8_off(STRING) | |||
873 | ||||
874 | [INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously. | |||
875 | Returns the previous state of the UTF8 flag (so please don't treat the | |||
876 | return value as indicating success or failure), or C<undef> if STRING is | |||
877 | not a string. | |||
878 | ||||
879 | This function does not work on tainted values. | |||
880 | ||||
881 | =back | |||
882 | ||||
883 | =head1 UTF-8 vs. utf8 vs. UTF8 | |||
884 | ||||
885 | ....We now view strings not as sequences of bytes, but as sequences | |||
886 | of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit | |||
887 | computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. | |||
888 | ||||
889 | That has been the perl's notion of UTF-8 but official UTF-8 is more | |||
890 | strict; Its ranges is much narrower (0 .. 10FFFF), some sequences are | |||
891 | not allowed (i.e. Those used in the surrogate pair, 0xFFFE, et al). | |||
892 | ||||
893 | Now that is overruled by Larry Wall himself. | |||
894 | ||||
895 | From: Larry Wall <larry@wall.org> | |||
896 | Date: December 04, 2004 11:51:58 JST | |||
897 | To: perl-unicode@perl.org | |||
898 | Subject: Re: Make Encode.pm support the real UTF-8 | |||
899 | Message-Id: <20041204025158.GA28754@wall.org> | |||
900 | ||||
901 | On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: | |||
902 | : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, | |||
903 | : but "UTF-8" is the name of the standard and should give the | |||
904 | : corresponding behaviour. | |||
905 | ||||
906 | For what it's worth, that's how I've always kept them straight in my | |||
907 | head. | |||
908 | ||||
909 | Also for what it's worth, Perl 6 will mostly default to strict but | |||
910 | make it easy to switch back to lax. | |||
911 | ||||
912 | Larry | |||
913 | ||||
914 | Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8 | |||
915 | while B<utf8> means liberal, lax, version thereof. And Encode version | |||
916 | 2.10 or later thus groks the difference between C<UTF-8> and C"utf8". | |||
917 | ||||
918 | encode("utf8", "\x{FFFF_FFFF}", 1); # okay | |||
919 | encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks | |||
920 | ||||
921 | C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>. | |||
922 | Yes, the hyphen between "UTF" and "8" is important. Without it Encode | |||
923 | goes "liberal" | |||
924 | ||||
925 | find_encoding("UTF-8")->name # is 'utf-8-strict' | |||
926 | find_encoding("utf-8")->name # ditto. names are case insensitive | |||
927 | find_encoding("utf_8")->name # ditto. "_" are treated as "-" | |||
928 | find_encoding("UTF8")->name # is 'utf8'. | |||
929 | ||||
930 | The UTF8 flag is internally called UTF8, without a hyphen. It indicates | |||
931 | whether a string is internally encoded as utf8, also without a hypen. | |||
932 | ||||
933 | =head1 SEE ALSO | |||
934 | ||||
935 | L<Encode::Encoding>, | |||
936 | L<Encode::Supported>, | |||
937 | L<Encode::PerlIO>, | |||
938 | L<encoding>, | |||
939 | L<perlebcdic>, | |||
940 | L<perlfunc/open>, | |||
941 | L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut> | |||
942 | L<utf8>, | |||
943 | the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt> | |||
944 | ||||
945 | =head1 MAINTAINER | |||
946 | ||||
947 | This project was originated by Nick Ing-Simmons and later maintained | |||
948 | by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full | |||
949 | list of people involved. For any questions, use | |||
950 | E<lt>perl-unicode@perl.orgE<gt> so we can all share. | |||
951 | ||||
952 | While Dan Kogai retains the copyright as a maintainer, the credit | |||
953 | should go to all those involoved. See AUTHORS for those submitted | |||
954 | codes. | |||
955 | ||||
956 | =head1 COPYRIGHT | |||
957 | ||||
958 | Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt> | |||
959 | ||||
960 | This library is free software; you can redistribute it and/or modify | |||
961 | it under the same terms as Perl itself. | |||
962 | ||||
963 | =cut |