# Current File : //usr/share/perl5/vendor_perl/utf8/all.pm |
package utf8::all;
use strict;
use warnings;
use 5.010; # state
# ABSTRACT: turn on Unicode - all of it
our $VERSION = '0.024'; # VERSION
#pod =head1 SYNOPSIS
#pod
#pod     use utf8::all;                      # Turn on UTF-8, all of it.
#pod
#pod     open my $in, '<', 'contains-utf8';  # UTF-8 already turned on here
#pod     print length 'føø bār';             # 7 UTF-8 characters
#pod     my $utf8_arg = shift @ARGV;         # @ARGV is UTF-8 too (only for main)
#pod
#pod =head1 DESCRIPTION
#pod
#pod The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
#pod program text in the current lexical scope. This also means that you
#pod can now use literal Unicode characters as part of strings, variable
#pod names, and regular expressions.
#pod
#pod C<utf8::all> goes further:
#pod
#pod =over 4
#pod
#pod =item *
#pod
#pod L<C<charnames>|charnames> are imported so C<\N{...}> sequences can be
#pod used to compile Unicode characters based on names.
#pod
#pod =item *
#pod
#pod On Perl C<v5.11.0> or higher, the C<use feature 'unicode_strings'> is
#pod enabled.
#pod
#pod =item *
#pod
#pod C<use feature fc> and C<use feature unicode_eval> are enabled on Perl
#pod C<5.16.0> and higher.
#pod
#pod =item *
#pod
#pod Filehandles are opened with UTF-8 encoding turned on by default
#pod (including C<STDIN>, C<STDOUT>, and C<STDERR> when C<utf8::all> is
#pod used from the C<main> package). Meaning that they automatically
#pod convert UTF-8 octets to characters and vice versa. If you I<don't>
#pod want UTF-8 for a particular filehandle, you'll have to set C<binmode
#pod $filehandle>.
#pod
#pod =item *
#pod
#pod C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when
#pod C<utf8::all> is used from the C<main> package). This is similar to the
#pod behaviour of the C<-CA> perl command-line switch (see L<perlrun>).
#pod
#pod =item *
#pod
#pod C<readdir>, C<readlink>, C<readpipe> (including the C<qx//> and
#pod backtick operators), and L<C<glob>|perlfunc/glob> (including the C<<
#pod <> >> operator) now all work with and return Unicode characters
#pod instead of (UTF-8) octets (again only when C<utf8::all> is used from
#pod the C<main> package).
#pod
#pod =back
#pod
#pod =head2 Lexical Scope
#pod
#pod The pragma is lexically-scoped, so you can do the following if you had
#pod some reason to:
#pod
#pod     {
#pod         use utf8::all;
#pod         open my $out, '>', 'outfile';
#pod         my $utf8_str = 'føø bār';
#pod         print length $utf8_str, "\n"; # 7
#pod         print $out $utf8_str;         # out as utf8
#pod     }
#pod     open my $in, '<', 'outfile';      # in as raw
#pod     my $text = do { local $/; <$in>};
#pod     print length $text, "\n";         # 10, not 7!
#pod
#pod Instead of lexical scoping, you can also use C<no utf8::all> to turn
#pod off the effects.
#pod
#pod Note that the effect on C<@ARGV> and the C<STDIN>, C<STDOUT>, and
#pod C<STDERR> file handles is always global and can not be undone!
#pod
#pod =head2 Enabling/Disabling Global Features
#pod
#pod As described above, the default behaviour of C<utf8::all> is to
#pod convert C<@ARGV> and to open the C<STDIN>, C<STDOUT>, and C<STDERR>
#pod file handles with UTF-8 encoding, and override the C<readlink> and
#pod C<readdir> functions and C<glob> operators when C<utf8::all> is used
#pod from the C<main> package.
#pod
#pod If you want to disable these features even when C<utf8::all> is used
#pod from the C<main> package, add the option C<NO-GLOBAL> (or
#pod C<LEXICAL-ONLY>) to the use line. E.g.:
#pod
#pod     use utf8::all 'NO-GLOBAL';
#pod
#pod If on the other hand you want to enable these global effects even when
#pod C<utf8::all> was used from another package than C<main>, use the
#pod option C<GLOBAL> on the use line:
#pod
#pod     use utf8::all 'GLOBAL';
#pod
#pod =head2 UTF-8 Errors
#pod
#pod C<utf8::all> treats invalid code points (i.e., UTF-8 that does not
#pod map to a valid Unicode "character") as a fatal error.
#pod
#pod For C<glob>, C<readdir>, and C<readlink>, one can change this
#pod behaviour by setting the attribute L</"$utf8::all::UTF8_CHECK">.
#pod
#pod =head1 COMPATIBILITY
#pod
#pod The filesystems of Dos, Windows, and OS/2 do not (fully) support
#pod UTF-8. The C<readlink> and C<readdir> functions and C<glob> operators
#pod will therefore not be replaced on these systems.
#pod
#pod =head1 SEE ALSO
#pod
#pod =over 4
#pod
#pod =item *
#pod
#pod L<File::Find::utf8> for fully utf-8 aware File::Find functions.
#pod
#pod =item *
#pod
#pod L<Cwd::utf8> for fully utf-8 aware Cwd functions.
#pod
#pod =back
#pod
#pod =cut
use Import::Into;
use parent qw(Encode charnames utf8 open warnings feature);
use Symbol qw(qualify_to_ref);
use Config;
# Holds the pointers to the original version of redefined functions.
# NOTE(review): nothing in the visible code reads or writes this hash --
# confirm whether it is still needed.
state %_orig_functions;
# Current (i.e., this) package; captured once so it can be used as the
# key for the Carp registration that follows.
my $current_package = __PACKAGE__;
require Carp;
# Register this package in %Carp::Internal.
$Carp::Internal{$current_package}++; # To get warnings reported at correct caller level
#pod =attr $utf8::all::UTF8_CHECK
#pod
#pod By default C<utf8::all> marks decoding errors as fatal (default value
#pod for this setting is C<Encode::FB_CROAK>). If you want, you can change this by
#pod setting C<$utf8::all::UTF8_CHECK>. The value C<Encode::FB_WARN> reports
#pod the encoding errors as warnings, and C<Encode::FB_DEFAULT> will completely
#pod ignore them. Please see L<Encode> for details. Note: C<Encode::LEAVE_SRC> is
#pod I<always> enforced.
#pod
#pod Important: Only controls the handling of decoding errors in C<glob>,
#pod C<readdir>, and C<readlink>.
#pod
#pod =cut
use Encode ();
use PerlIO::utf8_strict;
# Encode CHECK value handed to every encode/decode call in this file
# (@ARGV decoding and the readdir, readlink and glob replacements).
# It is a package variable, so it can be changed from outside as
# $utf8::all::UTF8_CHECK.
our $UTF8_CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC; # Die on encoding errors
# UTF-8 Encoding object, looked up once and shared by all conversions
my $_UTF8 = Encode::find_encoding('UTF-8');
sub import {
    # The package that said "use utf8::all"; every pragma below is
    # injected into it.
    my $importer = caller;

    # Process-wide effects (STD* layers, builtin overrides, @ARGV decoding)
    # are applied by default only when the importer is package main.
    my $lexical_only = $importer ne 'main';

    # A GLOBAL, NO-GLOBAL or LEXICAL-ONLY option (matched case-insensitively)
    # as first import argument overrides that default; it is then dropped
    # from the argument list.
    if (defined $_[1] && $_[1] =~ /^(?:(NO-)?GLOBAL|LEXICAL-ONLY)$/i) {
        $lexical_only = $_[1] !~ /^GLOBAL$/i;
        splice(@_, 1, 1);
    }

    # UTF-8 source text and strict UTF-8 as default layer for file handles.
    'utf8'->import::into($importer);
    'open'->import::into($importer, 'IO' => ':utf8_strict');

    # The standard handles are switched explicitly with binmode (use open
    # ':std' only works with some encodings), and only on the first import
    # that has global effects enabled.
    state $std_handles_done = 0;
    if (!$lexical_only && !$std_handles_done++) {
        for my $std_handle (\*STDERR, \*STDOUT, \*STDIN) {
            binmode $std_handle, ':utf8_strict';
        }
    }

    # Character names, fatal utf8 warnings and the Unicode features the
    # running perl supports.
    'charnames'->import::into($importer, qw{:full :short});
    'warnings'->import::into($importer, qw{FATAL utf8});
    if ($^V >= v5.11.0) {
        'feature'->import::into($importer, qw{unicode_strings});
    }
    if ($^V >= v5.16.0) {
        'feature'->import::into($importer, qw{unicode_eval fc});
    }

    # Install the UTF-8 aware replacements into the importing package,
    # except on the platforms matched below.
    if (!$lexical_only && $^O !~ /MSWin32|cygwin|dos|os2/) {
        no strict qw(refs); ## no critic (TestingAndDebugging::ProhibitNoStrict)
        no warnings qw(redefine);

        *{"${importer}::readdir"}  = \&_utf8_readdir;
        *{"${importer}::readlink"} = \&_utf8_readlink;
        *{"${importer}::glob"}     = \&_utf8_glob;

        # Compiler hint the replacements consult to decide whether they
        # should encode/decode for the calling scope.
        $^H{'utf8::all'} = 1;
    }

    # Decode @ARGV once, unless perl was started with -CA, in which case
    # it is decoded already. -CA shows up as bit value 32 in ${^UNICODE}
    # (see the -C switch in perlrun).
    if (!$lexical_only && !(${^UNICODE} & 32)) {
        state $argv_done = 0;
        if (!$argv_done++) {
            $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
            for my $arg (@ARGV) {
                next if !$arg;    # false values are left untouched
                $arg = $_UTF8->decode($arg, $UTF8_CHECK);
            }
        }
    }

    return;
}
sub unimport { ## no critic (Subroutines::ProhibitBuiltinHomonyms)
    # Switches the lexical effects off again for the calling package.
    # The changes made to @ARGV and to the STDIN, STDOUT and STDERR
    # handles are NOT reverted: those are always "global".
    my $importer = caller;

    'utf8'->unimport::out_of($importer);
    'open'->import::into($importer, qw{IO :bytes});

    # Clear the compiler hint consulted by the replaced functions (on the
    # platforms matched below the hint is left alone).
    $^H{'utf8::all'} = 0 if $^O !~ /MSWin32|cygwin|dos|os2/;

    return;
}
sub _utf8_readdir(*) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
    # readdir replacement: takes a directory handle (reference or name)
    # and returns the entries decoded from UTF-8 when the caller's scope
    # carries the 'utf8::all' hint; otherwise defers to CORE::readdir.
    my $dir_arg   = shift;
    my $hint_hash = (caller 0)[10];    # caller's compile-time hints (%^H)

    # A non-reference handle is resolved to a glob reference.
    my $dirh = ref($dir_arg) ? $dir_arg : qualify_to_ref($dir_arg, caller);

    # Hint not set for the calling scope: plain builtin behaviour.
    return CORE::readdir($dirh) unless $hint_hash->{'utf8::all'};

    $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC

    if (!wantarray) {
        # One entry at a time; a false result is passed through as is.
        my $entry = CORE::readdir($dirh);
        return $entry if !$entry;
        return $_UTF8->decode($entry, $UTF8_CHECK);
    }

    # List context: decode every remaining entry.
    return map {
        $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_
    } CORE::readdir($dirh);
}
sub _utf8_readlink(_) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
    # readlink replacement: when the caller's scope carries the
    # 'utf8::all' hint, the path is UTF-8 encoded before the lookup and
    # the result decoded afterwards; otherwise defers to CORE::readlink.
    my $path      = shift;
    my $hint_hash = (caller 0)[10];    # caller's compile-time hints (%^H)

    # Hint not set for the calling scope: plain builtin behaviour.
    return CORE::readlink($path) unless $hint_hash->{'utf8::all'};

    $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC

    # False values are passed to the builtin unchanged.
    $path = $_UTF8->encode($path, $UTF8_CHECK) if $path;

    my $link_target = CORE::readlink($path);
    return $link_target if !$link_target;
    return $_UTF8->decode($link_target, $UTF8_CHECK);
}
sub _utf8_glob {
    # glob replacement: when the caller's scope carries the 'utf8::all'
    # hint, the pattern is UTF-8 encoded before globbing and the matches
    # decoded afterwards; otherwise defers to CORE::glob.

    # Work on a lexical copy of the pattern -- making this a lexical
    # somehow is important!
    my $pattern   = $_[0];
    my $hint_hash = (caller 0)[10];    # caller's compile-time hints (%^H)

    # Hint not set for the calling scope: plain builtin behaviour.
    return CORE::glob($pattern) unless $hint_hash->{'utf8::all'};

    $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC

    # False values are passed to the builtin unchanged.
    $pattern = $_UTF8->encode($pattern, $UTF8_CHECK) if $pattern;

    if (!wantarray) {
        # One match at a time; a false result is passed through as is.
        my $match = CORE::glob($pattern);
        return $match if !$match;
        return $_UTF8->decode($match, $UTF8_CHECK);
    }

    # List context: decode every match.
    return map {
        $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_
    } CORE::glob($pattern);
}
#pod =head1 INTERACTION WITH AUTODIE
#pod
#pod If you use L<autodie>, which is a great idea, you need to use at least
#pod version B<2.12>, released on L<June 26,
#pod 2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>.
#pod Otherwise, autodie obliterates the IO layers set by the L<open>
#pod pragma. See L<RT
#pod #54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L<GH
#pod #7|https://github.com/doherty/utf8-all/issues/7>.
#pod
#pod =cut
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
utf8::all - turn on Unicode - all of it
=head1 VERSION
version 0.024
=head1 SYNOPSIS
use utf8::all; # Turn on UTF-8, all of it.
open my $in, '<', 'contains-utf8'; # UTF-8 already turned on here
print length 'føø bār'; # 7 UTF-8 characters
my $utf8_arg = shift @ARGV; # @ARGV is UTF-8 too (only for main)
=head1 DESCRIPTION
The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
program text in the current lexical scope. This also means that you
can now use literal Unicode characters as part of strings, variable
names, and regular expressions.
C<utf8::all> goes further:
=over 4
=item *
L<C<charnames>|charnames> are imported so C<\N{...}> sequences can be
used to compile Unicode characters based on names.
=item *
On Perl C<v5.11.0> or higher, the C<use feature 'unicode_strings'> is
enabled.
=item *
C<use feature fc> and C<use feature unicode_eval> are enabled on Perl
C<5.16.0> and higher.
=item *
Filehandles are opened with UTF-8 encoding turned on by default
(including C<STDIN>, C<STDOUT>, and C<STDERR> when C<utf8::all> is
used from the C<main> package). Meaning that they automatically
convert UTF-8 octets to characters and vice versa. If you I<don't>
want UTF-8 for a particular filehandle, you'll have to set C<binmode
$filehandle>.
=item *
C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when
C<utf8::all> is used from the C<main> package). This is similar to the
behaviour of the C<-CA> perl command-line switch (see L<perlrun>).
=item *
C<readdir>, C<readlink>, C<readpipe> (including the C<qx//> and
backtick operators), and L<C<glob>|perlfunc/glob> (including the C<<
<> >> operator) now all work with and return Unicode characters
instead of (UTF-8) octets (again only when C<utf8::all> is used from
the C<main> package).
=back
=head2 Lexical Scope
The pragma is lexically-scoped, so you can do the following if you had
some reason to:
{
use utf8::all;
open my $out, '>', 'outfile';
my $utf8_str = 'føø bār';
print length $utf8_str, "\n"; # 7
print $out $utf8_str; # out as utf8
}
open my $in, '<', 'outfile'; # in as raw
my $text = do { local $/; <$in>};
print length $text, "\n"; # 10, not 7!
Instead of lexical scoping, you can also use C<no utf8::all> to turn
off the effects.
Note that the effect on C<@ARGV> and the C<STDIN>, C<STDOUT>, and
C<STDERR> file handles is always global and can not be undone!
=head2 Enabling/Disabling Global Features
As described above, the default behaviour of C<utf8::all> is to
convert C<@ARGV> and to open the C<STDIN>, C<STDOUT>, and C<STDERR>
file handles with UTF-8 encoding, and override the C<readlink> and
C<readdir> functions and C<glob> operators when C<utf8::all> is used
from the C<main> package.
If you want to disable these features even when C<utf8::all> is used
from the C<main> package, add the option C<NO-GLOBAL> (or
C<LEXICAL-ONLY>) to the use line. E.g.:
use utf8::all 'NO-GLOBAL';
If on the other hand you want to enable these global effects even when
C<utf8::all> was used from another package than C<main>, use the
option C<GLOBAL> on the use line:
use utf8::all 'GLOBAL';
=head2 UTF-8 Errors
C<utf8::all> treats invalid code points (i.e., UTF-8 that does not
map to a valid Unicode "character") as a fatal error.
For C<glob>, C<readdir>, and C<readlink>, one can change this
behaviour by setting the attribute L</"$utf8::all::UTF8_CHECK">.
=head1 ATTRIBUTES
=head2 $utf8::all::UTF8_CHECK
By default C<utf8::all> marks decoding errors as fatal (default value
for this setting is C<Encode::FB_CROAK>). If you want, you can change this by
setting C<$utf8::all::UTF8_CHECK>. The value C<Encode::FB_WARN> reports
the encoding errors as warnings, and C<Encode::FB_DEFAULT> will completely
ignore them. Please see L<Encode> for details. Note: C<Encode::LEAVE_SRC> is
I<always> enforced.
Important: Only controls the handling of decoding errors in C<glob>,
C<readdir>, and C<readlink>.
=head1 INTERACTION WITH AUTODIE
If you use L<autodie>, which is a great idea, you need to use at least
version B<2.12>, released on L<June 26,
2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>.
Otherwise, autodie obliterates the IO layers set by the L<open>
pragma. See L<RT
#54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L<GH
#7|https://github.com/doherty/utf8-all/issues/7>.
=head1 BUGS
Please report any bugs or feature requests on the bugtracker
L<website|https://github.com/doherty/utf8-all/issues>.
When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.
=head1 COMPATIBILITY
The filesystems of Dos, Windows, and OS/2 do not (fully) support
UTF-8. The C<readlink> and C<readdir> functions and C<glob> operators
will therefore not be replaced on these systems.
=head1 SEE ALSO
=over 4
=item *
L<File::Find::utf8> for fully utf-8 aware File::Find functions.
=item *
L<Cwd::utf8> for fully utf-8 aware Cwd functions.
=back
=head1 AUTHORS
=over 4
=item *
Michael Schwern <mschwern@cpan.org>
=item *
Mike Doherty <doherty@cpan.org>
=item *
Hayo Baan <info@hayobaan.com>
=back
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2009 by Michael Schwern <mschwern@cpan.org>; he originated it.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut