#!/usr/bin/perl
#
# Perl script to convert the Unicode Character Database source files
# into data for libucd.
#

#
# Internally this file uses a hash with the UCS value as key, and
# as data another hash from property name to value.
#

%ucs_props = ();

sub parse_unicode_string($) {
    my ($str) = @_;
    my @str = split(/\s+/, $str, 0);
    my @xv = ();
    my $x;

    foreach $x ( @str ) {
	push(@xv, hex $x);
    }

    return pack("U*", @xv);
}

#
# File to read a UCD file with a list of properties (no names)
#
sub read_separated_file($$$) {
    my ($filename, $proplist, $default) = @_;
    my $fh;
    my $line, @fields, $c0, $c1, $c;
    my $was_first = 0;

    print STDERR "Reading $filename\n";
    open($fh, '<', $filename) or return 0;
    while ( defined($line = <$fh>) ) {
	chomp $line;
	$line =~ s/\s*(|\#.*)$//;
	@fields = split(/\s*\;\s*/, $line);

	if ( $fields[0] =~ /^([0-9a-f]+)(|..([0-9a-f]+))$/i ) {
	    if ( $was_first ) {
		$c1 = hex $1;
		$was_first = 0;
	    } else {
		$c0 = hex $1;
		$c1 = ($2 eq '') ? $c0 : hex $3;
	    }

	    for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
		my $p, $f;

		$ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );
		
		for ( $f = 1 ; $f < scalar(@fields) ; $f++ ) {
		    my $prop = ${$proplist}[$f-1];
		    if ( defined($prop) ) {
			my $type = substr($prop, 0, 1);
			my $prop = substr($prop, 1);
			my $def = ${$default}[$f-1];
			$def = sprintf("%04X", $c) if ( $def eq '=' );
			my $val = $fields[$f];
			$val = $def if ( $val eq '' );

			if ( $type eq 'b' ) {
			    # Boolean (Y/N)
			    $val = ($val eq 'N') ? 0 : 1;
			} elsif ( $type eq 'p' ) {
			    # Code point
			    $val = hex $val;
			} elsif ( $type eq 's' ) {
			    # String of code points
			    $val = parse_unicode_string($val);
			} elsif ( $type eq '!' ) {
			    # Special case
			    if ( $prop eq 'Name' ) {
				# In UnicodeData.txt, ranges aren't encoded the same way
				# as elsewhere, instead the first and last code point are
				# entered, with <..., first> or <..., last> as the name
				if ( $val =~ /^\<.*, First\>$/ ) {
				    $was_first = 1;
				}
				$val = undef if ( $val =~ /^\<.*\>$/ );
			    } elsif ( $prop eq 'Decomposition' ) {
				$prop = 'Decomposition_Mapping';
				if ( $val =~ /^(\<([a-z0-9]+)\>\s*|)([0-9a-f\s]+)$/i ) {
				    my $dct = $2 || 'canonical';
				    $val = parse_unicode_string($3);

				    ${$ucs_props{$c}}{'Decomposition_Type'} = $dct;
				}
			    } else {
				die "$0: Unknown special: $prop\n";
			    }
			}

			${$ucs_props{$c}}{$prop} = $val;
		    }
		}
	    }
	}
    }
    close($fh);

    return 1;
}

#
# File to read a UCD file with boolean properties
#
sub read_boolean_file($) {
    my ($filename) = @_;
    my $fh;
    my $line, @fields, $c0, $c1, $c;

    print STDERR "Reading $filename\n";
    open($fh, '<', $filename) or return 0;
    while ( defined($line = <$fh>) ) {
	chomp $line;
	$line =~ s/\s*(|\#.*)$//;
	@fields = split(/\s*\;\s*/, $line);

	if ( $fields[0] =~ /^([0-9A-F]+)(|..([0-9A-F]+))$/ &&
	     $fields[1] ne '' ) {
	    $c0 = hex $1;
	    $c1 = ($2 eq '') ? $c0 : hex $3;

	    for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
		my $p, $f;

		$ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );
		
		${$ucs_props{$c}}{$fields[1]} = 1;
	    }
	}
    }
    close($fh);

    return 1;
}

sub make_jamo_string($) {
    my ($s) = @_;
    my $i, $c;
    my $o = '';

    $o .= "{";
    for ( $i = 0 ; $i < 4 ; $i++ ) {
	$c = substr($s,$i,1);
	$o .= length($c) ? "\'$c\'" : '0';
	$o .= ($i == 3) ? '}' : ',';
    }

    return $o;
}

# This produces tables used to generate the systematic Hangul syllables
sub make_jamo_tables() {
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $i;
    my $fh;
    
    # None of the syllables exceed 4 bytes, so let's not waste
    # pointer space that might have to be relocated...

    print STDERR "Writing gen/jamo.c\n";
    open($fh, '>', 'gen/jamo.c') or die "$0 cannot create gen/jamo.c";
    print $fh "#include \"libucd_int.h\"\n\n";

    print $fh "const char _libucd_hangul_jamo_l[$LCount][4] = {\n";
    for ( $i = 0 ; $i < $LCount ; $i++ ) {
	printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$LBase+$i}}{'Jamo_Short_Name'});
    }
    print $fh "};\n";
    print $fh "const char _libucd_hangul_jamo_v[$VCount][4] = {\n";
    for ( $i = 0 ; $i < $VCount ; $i++ ) {
	printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$VBase+$i}}{'Jamo_Short_Name'});
    }
    print $fh "};\n";
    print $fh "const char _libucd_hangul_jamo_t[$TCount][4] = {\n";
    for ( $i = 0 ; $i < $TCount ; $i++ ) {
	printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$TBase+$i}}{'Jamo_Short_Name'});
    }
    print $fh "};\n";

    close($fh);
}

# This produces a names list sorted by UCS, and produces a reverse map.
%name_to_ucs   = ();

sub make_names_list() {
    my $k;
    my $pos = 0;
    my $fh;
    my $col;

    print STDERR "Writing gen/nameslist.tab\n";
    open($fh, '>', 'gen/nameslist.tab') or die;

    foreach $k ( sort {$a <=> $b} (keys(%ucs_props)) ) {
	print STDERR "Not a number: \"$k\"\n" if ( $k ne ($k+0) );

	my $n = ${$ucs_props{$k}}{'Name'};
	if ( defined($n) ) {
	    if ( defined($name_to_ucs{$n}) ) {
		printf STDERR "WARNING: Name \"%s\" duplicated from U+%04X to U+%04X\n",
		$n, $k, $name_to_ucs{$n};
	    } else {
		$name_to_ucs{$n} = $k;
		printf $fh "%05x %s\n", $k, $n;
	    }
	}
    }
    close($fh);
}

#
# Produce a list of names for automatic hash table generation.
# This includes the Hangul syllables, but not systematically
# named CJK.
#
sub write_hangul_names($$)
{
    my ($fh, $fht) = @_;
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $SCount = $LCount*$VCount*$TCount;
    my $l, $v, $t, $c;

    $c = $SBase;
    for ( $l = 0 ; $l < $LCount ; $l++ ) {
	for ( $v = 0 ; $v < $VCount ; $v++ ) {
	    for ( $t = 0 ; $t < $TCount ; $t++) {
		my $name = sprintf("HANGUL SYLLABLE %s%s%s",
				   ${$ucs_props{$LBase+$l}}{'Jamo_Short_Name'},
				   ${$ucs_props{$VBase+$v}}{'Jamo_Short_Name'},
				   ${$ucs_props{$TBase+$t}}{'Jamo_Short_Name'});
		printf $fh "%s\n", $name;
		printf $fht "%05x %s\n", $c, $name;

		$c++;
	    }
	}
    }
}

sub make_name_keyfile()
{
    my $fh, $fht;
    my $k;

    print STDERR "Writing gen/nametoucs.keys and gen/nametoucs.tab\n";
    open($fh, '>', 'gen/nametoucs.keys') or die;
    open($fht, '>', 'gen/nametoucs.tab') or die;

    foreach $k ( keys(%name_to_ucs) ) {
	printf $fh "%s\n", $k;
	printf $fht "%05x %s\n", $name_to_ucs{$k}, $k;
    }

    write_hangul_names($fh, $fht);

    close($fh);
    close($fht);
}

#
# Make a keyfile for all non-systematically named codepoints
#
sub make_named_ucs_keyfile()
{
    my $fh;
    my $k;

    print STDERR "Writing gen/ucstoname.keys\n";
    open($fh, '>', 'gen/ucstoname.keys')
	or die "$0: cannot write gen/ucstoname.keys\n";

    foreach $k ( values(%name_to_ucs) ) {
	printf $fh "%08x\n", $k;
    }

    close($fh);
}

#
# Produce a list of character properties, sans names; this is
# a test in order to figure out how much we could save from a
# range-oriented table for everything except names.
#
sub dump_prop_list()
{
    my $fh, $c;

    print STDERR "Writing gen/propdump.txt\n";
    open($fh, '>', 'gen/propdump.txt')
	or die "$0: cannot write gen/propdump.txt\n";
    binmode $fh, ':utf8';
    
    for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
	my %h = %{$ucs_props{$c}};

	# Handle these separately
	delete $h{'Name'};
	delete $h{'Unicode_1_Name'};
	delete $h{'ISO_Comment'};
	delete $h{'Decomposition_Mapping'};
	# delete $h{'Uppercase_Mapping'};
	# delete $h{'Lowercase_Mapping'};
	# delete $h{'Titlecase_Mapping'};
	# delete $h{'Special_Case_Condition'};
	delete $h{'Jamo_Short_Name'};

	# Store these as offsets.
	my $k;
	foreach $k ( 'Simple_Uppercase_Mapping',
		     'Simple_Lowercase_Mapping',
		     'Simple_Titlecase_Mapping' ) {
	    if ( defined($h{$k}) ) {
		$h{$k} -= $c;	# Convert to offset
	    } else {
		$h{$k} = 0;	# Default is zero offset
	    }
	}
    
	my @l = sort(keys(%h));
	my $p;
	printf $fh "%05X ", $c;
	foreach $p ( @l ) {
	    print $fh $p,':',$h{$p},';';
	}
	print $fh "\n";
    }
}

#
# Produce the properties array
#
%prop_array_position = ();

sub emit_int24($) {
    my($v) = @_;
    return sprintf("{0x%02x, 0x%02x, 0x%02x}",
		    $v & 0xff,
		    ($v >> 8) & 0xff,
		    ($v >> 16) & 0xff);
}

sub make_properties_array()
{
    my $fh, $fhi, $c, $prev, $mine, $cnt, $cp;

    # List of boolean properties that translate 1:1 into flags
    my @boolean_props = ('Composition_Exclusion', 'Alphabetic', 'Default_Ignorable_Code_Point',
			 'Lowercase', 'Grapheme_Base', 'Grapheme_Extend', 'ID_Start', 'ID_Continue',
			 'Math', 'Uppercase', 'XID_Start', 'XID_Continue', 'Hex_Digit',
			 'Bidi_Control', 'Dash', 'Deprecated', 'Diacritic', 'Extender',
			 'Grapheme_Link', 'Ideographic', 'IDS_Binary_Operator',
			 'IDS_Trinary_Operator', 'Join_Control', 'Logical_Order_Exception',
			 'Noncharacter_Code_Point', 'Pattern_Syntax', 'Pattern_White_Space',
			 'Quotation_Mark', 'Radical', 'Soft_Dotted', 'STerm',
			 'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector',
			 'White_Space', 'Bidi_Mirrored');

    print STDERR "Writing gen/proparray.c and gen/proparrayindex\n";
    open($fh, '>', 'gen/proparray.c') or die;
    open($fhi, '>', 'gen/proparrayindex') or die;
    binmode $fh, ':utf8';

    undef $prev;
    $cnt = 0;

    for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
	$cp = $ucs_props{$c};
	# Careful with the formatting: we rely on the fact that
	# the first 14 characters contain the UCS value and the rest
	# the properties.

	# Code point UCS value
	$mine = sprintf("\t{\n\t\t0x%05x,\n", $c);

	# General category
	my $gc = $$cp{'General_Category'} || 'Cn';
	$mine .= "\t\tUC_GC_$gc,\n";

	# Script
	my $sc = $$cp{'Script'} || 'Common';
	$mine .= "\t\tUC_SC_$sc,\n";

	# Numeric value
	my $nv = $$cp{'Numeric_Value'};
	if ( $nv > 255 ) {
	    my $exp = int(log($nv)/log(10))-1;
	    my $num = int($nv/(10**$exp));
	    $mine .= "\t\t$num, 128+$exp,\n";
	} else {
	    my $num = $nv + 0;
	    my $den = 1;

	    if ( $nv != 0 ) {
		while ( ($nv-($num/$den))/$nv > 1e-7 ) {
		    $den++;
		    $num = int($nv*$den+0.5);
		}
	    }
	    $mine .= "\t\t$num, $den,\n";
	}

	# Boolean properties and block index
	my $bp;
	foreach $bp ( @boolean_props ) {
	    if ( $$cp{$bp} ) {
		$mine .= "\t\tUC_FL_\U$bp\E |\n";
	    }
	}
	my $block = $$cp{'Block'} || 'No_Block';
	$block =~ tr/ .-/___/;
	$mine .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";
	
	# Simple case mappings
	my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
	$mine .= "\t\t".emit_int24($sum).",\n";
	my $slm = ($$cp{'Simple_Lowercase_Mapping'} || $c) - $c;
	$mine .= "\t\t".emit_int24($slm).",\n";
	my $stm = ($$cp{'Simple_Titlecase_Mapping'} || $c) - $c;
	$mine .= "\t\t".emit_int24($stm).",\n";
	
	# Age (assume 31.7 as maximum; Unicode has traditionally not had
	# many minor versions per major version.)
	my $age = $$cp{'Age'} || '0.0';
	my (@sage) = split(/\./, $age);
	$mine .= sprintf("\t\t(%d << 3) + %d, /* $age */\n", $sage[0], $sage[1]);

	# Canonical Combining Class
	my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
	if ( $ccc =~ /^[0-9]+$/ ) {
	    $mine .= "\t\t$ccc,\n"; # Numeric CCC
	} else {
	    $mine .= "\t\tUC_CCC_$ccc,\n";
	}

	# Sentence Break
	my $sb = $$cp{'Sentence_Break'} || 'Other';
	$mine .= "\t\tUC_SB_$sb,\n";

	# Grapheme Cluster Break
	my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
	$mine .= "\t\tUC_GCB_$gcb,\n";

	# Word Break
	my $wb = $$cp{'Word_Break'} || 'Other';
	$mine .= "\t\tUC_WB_$wb,\n";

	# Arabic Joining Type
	my $ajt = $$cp{'Joining_Type'} ||
	    ($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U';
	$mine .= "\t\tUC_JT_$ajt,\n";

	# Arabic Joining Group
	my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
	$ajg =~ tr/ /_/;
	$ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
	$mine .= "\t\tUC_JG_$ajg,\n";

	# East Asian Width
	my $ea = $$cp{'East_Asian_Width'} || 'N';
	$mine .= "\t\tUC_EA_$ea,\n";

	# Hangul Syllable Type
	my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA';
	$mine .= "\t\tUC_HST_$hst,\n";

	# Line Break
	my $lb = $$cp{'Line_Break'} || 'XX';
	$mine .= "\t\tUC_LB_$lb,\n";

	# Numeric Type
	my $nt = $$cp{'Numeric_Type'} || 'None';
	$mine .= "\t\tUC_NT_$nt,\n";

	# Bidi Class
	my $bc = $$cp{'Bidi_Class'} || 'L';
	$mine .= "\t\tUC_BC_$bc,\n";

	# Additional properties...
	$mine .= "\t},\n";

	if ( substr($prev,14) ne substr($mine,14) ) {
	    print $fh $mine;
	    $cnt++;
	    $prev = $mine;
	    printf $fhi "0x%05x $cnt\n", $c, $cnt;
	}
	$prop_array_position{$c} = $cnt;
    }
    print $fh "\t/* Total: $cnt ranges */\n";

    close($fh);
    close($fhi);
}

#
# Import files
#
read_separated_file('ucd/UnicodeData.txt',
		    ['!Name', 'eGeneral_Category', 'nCanonical_Combining_Class',
		     'eBidi_Class', '!Decomposition', undef, undef,
		     'eNumeric_Value', 'bBidi_Mirrored',
		     'mUnicode_1_Name', 'mISO_Comment', 'pSimple_Uppercase_Mapping',
		     'pSimple_Lowercase_Mapping', 'pSimple_Titlecase_Mapping'],
		    ['<reserved>', 'Cn', 0, undef, undef, undef, undef, undef,
		     'N', undef, undef, '=', '=', '=']);

read_separated_file('ucd/extracted/DerivedNumericType.txt', ['eNumeric_Type'], []);
read_separated_file('ucd/extracted/DerivedNumericValues.txt', ['eNumeric_Value'], []);
read_separated_file('ucd/extracted/DerivedBidiClass.txt', ['eBidi_Class'], ['L']);
read_separated_file('ucd/ArabicShaping.txt', [undef, 'eJoining_Type', 'eJoining_Group'], []);
read_separated_file('ucd/BidiMirroring.txt', ['pBidi_Mirroring_Glyph'], []);
read_separated_file('ucd/Blocks.txt', ['cBlock'], []);
read_separated_file('ucd/CompositionExclusions.txt', 'bComposition_Exclusion', []);
# read_separated_file('ucd/CaseFolding.txt', ['eCase_Folding_Type', 'sCase_Folding'], []);
read_separated_file('ucd/DerivedAge.txt', ['cAge'], []);
read_separated_file('ucd/EastAsianWidth.txt', ['eEast_Asian_Width'], []);
read_separated_file('ucd/HangulSyllableType.txt', ['eHangul_Syllable_Type'], []);
read_separated_file('ucd/LineBreak.txt', ['eLine_Break'], []);
read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping',
					  'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
read_separated_file('ucd/auxiliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
read_separated_file('ucd/auxiliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
read_separated_file('ucd/auxiliary/WordBreakProperty.txt', ['eWord_Break'], []);
read_boolean_file('ucd/DerivedCoreProperties.txt');
read_boolean_file('ucd/PropList.txt');

#
# Produce output
#
make_jamo_tables();
make_names_list();
make_name_keyfile();
make_named_ucs_keyfile();
make_properties_array();
# dump_prop_list();