Skip to content

unicode2utf8.pl と utf82unicode.pl

by wetcradle on 6月 2nd, 2011

Unicode から UTF-8 への変換と、UTF-8 から Unicode への変換を行うスクリプトです。
一時的に必要になったので、やっつけ仕事。

unicode2utf8.pl

実行例

$ ./unicode2utf8.pl 65e5 672c 8a9e
string: 日本語
bytes (decimal): 230 151 165 230 156 172 232 170 158
bytes (hex): e6 97 a5 e6 9c ac e8 aa 9e
bytes (binary): 11100110 10010111 10100101 11100110 10011100 10101100 11101000 10101010 10011110
chars (decimal): 15112101 15113388 15248030
chars (hex): e697a5 e69cac e8aa9e
chars (binary): 111001101001011110100101 111001101001110010101100 111010001010101010011110

ソースコード

#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

########## usage ##########
# Write the command-line synopsis to STDERR.
# Returns whatever print returns (true on success), so callers may chain the
# call inside a boolean expression.
sub usage() {
	my $name = basename($0);
	my @synopsis = (
		"Usage: $name hex_unicode ...",
		"       $name -d decimal_unicode ...",
		"       $name -h",
	);
	return print STDERR join('', map { $_ . "\n" } @synopsis);
}

########## option ##########
# Parse the command-line switches out of @ARGV.
#
# Recognised switches: --decimal/-d and --help/-h.
# Exits with status 1 (after printing the usage text) when option parsing
# fails or no positional arguments remain; exits with status 0 after printing
# the usage text when help is requested.
# Returns the decimal-mode flag (0 unless -d was given).
sub parse_option() {
	my ($want_decimal, $want_help) = (0, 0);
	my $parsed_ok = GetOptions(
		'decimal|d' => \$want_decimal,
		'help|h'    => \$want_help,
	);

	# Only exit when the usage text was actually printed.
	if (!$parsed_ok) {
		exit 1 if usage();
	}

	if ($want_help) {
		usage();
		exit 0;
	}

	# At least one value to convert is required.
	unless (@ARGV) {
		usage();
		exit 1;
	}

	return $want_decimal;
}

########## main ##########
# Entry point: convert every command-line argument (a code point, hex by
# default or decimal with -d) to UTF-8 and print the result as a string, as
# individual bytes, and as one packed integer per character, each in decimal,
# hex and binary.
sub main() {
	my $decimal_mode = parse_option();

	my (@packed_chars, @octets);
	for my $arg (@ARGV) {
		my $code_point = $decimal_mode ? $arg : hex($arg);
		# Called with & because unicode2utf8 is defined further down the file
		# with a prototype.
		my ($packed, $octets_ref) = &unicode2utf8($code_point);
		push @packed_chars, $packed;
		push @octets, @{$octets_ref};
	}

	print "string: " . pack("C*", @octets) . "\n";

	# Print one labelled line, formatting every value with the same directive.
	my $report = sub {
		my ($label, $format, $values_ref) = @_;
		print $label . join('', map { sprintf($format, $_) } @{$values_ref}) . "\n";
	};

	$report->("bytes (decimal):", " %d",   \@octets);
	$report->("bytes (hex):",     " %02x", \@octets);
	$report->("bytes (binary):",  " %08b", \@octets);

	$report->("chars (decimal):", " %d", \@packed_chars);
	$report->("chars (hex):",     " %x", \@packed_chars);
	$report->("chars (binary):",  " %b", \@packed_chars);
}
main();

########## unicode2utf8 ##########
# Encode one code point as a UTF-8 byte sequence, using the 1-to-6 byte
# scheme that covers 0 .. 0x7fffffff.
#
# Parameter:
#   $unicode - the code point as a number.
# Returns a two-element list:
#   - the encoded bytes packed into a single integer, first byte most
#     significant (e.g. 0xe697a5 for 0x65e5);
#   - a reference to an array holding the bytes in output order.
# Dies with "invalid unicode" when the value is negative or above 0x7fffffff.
sub unicode2utf8($) {
	my ($unicode) = @_;

	# Written as a positive range test so that a value which fails to compare
	# is rejected together with the out-of-range ones.
	unless ($unicode > -1 && $unicode <= 0x7fffffff) {
		die "invalid unicode";
	}

	# One row per sequence length, indexed by the number of continuation bytes:
	# [largest encodable code point, lead-byte marker bits, lead-byte payload mask].
	my @layouts = (
		[0x7f,       0b00000000, 0b01111111],
		[0x7ff,      0b11000000, 0b00011111],
		[0xffff,     0b11100000, 0b00001111],
		[0x1fffff,   0b11110000, 0b00000111],
		[0x3ffffff,  0b11111000, 0b00000011],
		[0x7fffffff, 0b11111100, 0b00000001],
	);

	# The range check above guarantees the last row always matches.
	my $tail_count = 0;
	$tail_count++ while $unicode > $layouts[$tail_count][0];
	my (undef, $lead_marker, $lead_mask) = @{ $layouts[$tail_count] };

	# The lead byte carries the topmost payload bits; every continuation byte
	# is 10xxxxxx and carries the next six bits, most significant group first.
	my @bytes = ($lead_marker | ($lead_mask & ($unicode >> (6 * $tail_count))));
	for my $shift (map { 6 * $_ } reverse(0 .. $tail_count - 1)) {
		push(@bytes, 0b10000000 | (0b111111 & ($unicode >> $shift)));
	}

	# Pack the byte sequence into one integer, first byte most significant.
	my $utf8 = 0;
	$utf8 = ($utf8 << 8) | $_ foreach @bytes;

	return ($utf8, \@bytes);
}

utf82unicode.pl

実行例

$ ./utf82unicode.pl e6 97 a5 e6 9c ac e8 aa 9e
string: 日本語
unicodes (decimal): 26085 26412 35486
unicodes (hex): 65e5 672c 8a9e
unicodes (binary): 110010111100101 110011100101100 1000101010011110

ソースコード


#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

########## const ##########
# Payload-bit masks for the first byte of a sequence, indexed by the total
# sequence length in bytes. Index 0 is unused.
my @HEAD_DATA_MASKS = (
	undef,          # 0: no such sequence length
	0b0111_1111,    # 1-byte sequence
	0b0001_1111,    # 2-byte sequence
	0b0000_1111,    # 3-byte sequence
	0b0000_0111,    # 4-byte sequence
	0b0000_0011,    # 5-byte sequence
	0b0000_0001,    # 6-byte sequence
);

# Payload-bit mask applied to every byte after the first.
my $TAIL_DATA_MASK = 0b0011_1111;

########## usage ##########
# Write the command-line synopsis to STDERR.
# Returns whatever print returns (true on success); parse_option below chains
# the call as `&usage && exit 1`.
# NOTE(review): the heredoc text was lost from the published listing; the
# wording below is reconstructed by analogy with unicode2utf8.pl - confirm
# against the original script.
sub usage() {
	my $script = basename($0);
	print STDERR <<EOF;
Usage: $script hex_byte ...
       $script -d decimal_byte ...
       $script -h
EOF
}

########## option ##########
# Parse the command-line switches out of @ARGV.
#
# Recognised switches: --decimal/-d and --help/-h.
# Exits with status 1 (after printing the usage text) when option parsing
# fails or no positional arguments remain; exits with status 0 after printing
# the usage text when help is requested.
# Returns the decimal-mode flag (0 unless -d was given).
sub parse_option() {
	my $decimal_mode = 0;
	my $help = 0;
	GetOptions (
		'decimal|d' => \$decimal_mode,
		'help|h' => \$help,
	) || (&usage && exit 1);

	if ($help) {
		&usage();
		exit 0;
	}

	if ($#ARGV == -1) {
		&usage();
		exit 1;
	}
	return $decimal_mode;
}

########## main ##########
# Entry point: read the UTF-8 bytes given on the command line (hex by default,
# decimal with -d), decode them into code points, and print the decoded string
# followed by the code points in decimal, hex and binary.
# Dies on a value outside 0..255, on a byte that is not valid at its position,
# and on a sequence that ends before all of its bytes were seen.
sub main() {
	my $decimal_mode = &parse_option();

	# Convert every argument to a byte value and reject anything outside 0..255.
	my @bytes = map {
		my $byte = $decimal_mode ? $_ : hex($_);
		if ($byte < 0 || $byte > 0xff) {
			die "invalid byte: $_";
		}
		$byte;
	} @ARGV;

	my $counter = 0;    # bytes still expected for the character being decoded
	my $unicode;
	my @unicodes = ();
	# Iterate over the list itself rather than testing the truth of each
	# shifted value, so that a 0x00 byte is decoded instead of ending the loop.
	foreach my $byte (@bytes) {
		if ($counter == 0) {
			# First byte of a character: it determines the sequence length.
			$counter = &head_byte2byte_length($byte);
			die "invalid byte: $byte" if !$counter;
			$unicode = $HEAD_DATA_MASKS[$counter] & $byte;
		}
		elsif (&is_tail_byte($byte)) {
			# Continuation byte: append its six payload bits.
			$unicode = ($unicode << 6) | ($TAIL_DATA_MASK & $byte);
		}
		else {
			die "invalid byte: $byte";
		}
		# The character is complete once no more bytes are expected.
		push(@unicodes, $unicode) if (!(--$counter));
	}
	if ($counter) {
		die "incomplete bytes";
	}

	print "string: " . pack("C*", @bytes) . "\n";

	my $unicodes_length = $#unicodes + 1;
	printf("unicodes (decimal):".(" %d" x $unicodes_length)."\n", @unicodes);
	printf("unicodes (hex):".(" %02x" x $unicodes_length)."\n", @unicodes);
	printf("unicodes (binary):".(" %08b" x $unicodes_length)."\n", @unicodes);
}
&main;

########## is_tail_byte ##########
# Return true when $byte has the form 10xxxxxx, i.e. is a continuation byte.
sub is_tail_byte($) {
	my ($byte) = @_;
	return (0b11000000 & $byte) == 0b10000000;
}

########## head_byte2byte_length ##########
# Return the total sequence length (1..6) announced by the first byte of a
# character, or 0 when $byte cannot start a sequence.
sub head_byte2byte_length($) {
	my ($byte) = @_;
	if ((0b10000000 & $byte) == 0b00000000) {
		return 1;
	}
	elsif ((0b11100000 & $byte) == 0b11000000) {
		return 2;
	}
	elsif ((0b11110000 & $byte) == 0b11100000) {
		return 3;
	}
	elsif ((0b11111000 & $byte) == 0b11110000) {
		return 4;
	}
	elsif ((0b11111100 & $byte) == 0b11111000) {
		return 5;
	}
	elsif ((0b11111110 & $byte) == 0b11111100) {
		return 6;
	}
	return 0;
}

From → perl

No comments yet

Leave a Reply

Note: XHTML is allowed. Your email address will never be published.

Subscribe to this comment feed via RSS