layout/mathml/updateOperatorDictionary.pl
author Ryan VanderMeulen <ryanvm@gmail.com>
Sun, 10 Jun 2012 19:44:50 -0400
changeset 101616 b7f3236f127b6f8d455efa6ad75f82a3a3c5f3d4
parent 101615 c39d36167b99139c4e39c2180f21cebae7ea6dbd
permissions -rwxr-xr-x
Revert c39d36167b99 due to a horribly munged backout.

#!/usr/bin/perl
# -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

use XML::LibXSLT;
use XML::LibXML;
use LWP::Simple;

# output files
$FILE_UNICODE = "unicode.xml";
$FILE_DICTIONARY = "dictionary.xml";
$FILE_DIFFERENCES = "differences.txt";
$FILE_NEW_DICTIONARY = "new_dictionary.txt";
$FILE_SYNTAX_ERRORS = "syntax_errors.txt";
$FILE_JS = "tests/stretchy-and-large-operators.js";

# our dictionary (property file)
$MOZ_DICTIONARY = "mathfont.properties";

# dictionary provided by the W3C in "XML Entity Definitions for Characters"
$WG_DICTIONARY_URL = "http://www.w3.org/2003/entities/2007xml/unicode.xml";

# XSL stylesheet to extract relevant data from the dictionary
$DICTIONARY_XSL = "operatorDictionary.xsl";

# dictionary provided by the W3C transformed with operatorDictionary.xsl 
$WG_DICTIONARY = $FILE_DICTIONARY;

if (!($#ARGV >= 0 &&
      ((($ARGV[0] eq "download") && $#ARGV <= 1) ||
       (($ARGV[0] eq "compare") && $#ARGV <= 1) ||
       (($ARGV[0] eq "check") && $#ARGV <= 0) ||
       (($ARGV[0] eq "make-js") && $#ARGV <= 0) ||
       (($ARGV[0] eq "clean") && $#ARGV <= 0)))) {
    &usage;
}

if ($ARGV[0] eq "download") {
    if ($#ARGV == 1) {
        $WG_DICTIONARY_URL = $ARGV[1];
    }
    print "Downloading $WG_DICTIONARY_URL...\n";
    getstore($WG_DICTIONARY_URL, $FILE_UNICODE);

    print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
    my $xslt = XML::LibXSLT->new();
    my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
    my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
                                          no_cdata=>1);
    my $stylesheet = $xslt->parse_stylesheet($style_doc);
    my $results = $stylesheet->transform($source);
    open($file, ">$FILE_DICTIONARY") || die ("Couldn't open $FILE_DICTIONARY!");
    print $file $stylesheet->output_as_bytes($results);
    close($file);
    exit 0;
}

if ($ARGV[0] eq "clean") {
    unlink($FILE_UNICODE,
           $FILE_DICTIONARY,
           $FILE_DIFFERENCES,
           $FILE_NEW_DICTIONARY,
           $FILE_SYNTAX_ERRORS);
    exit 0;
}

if ($ARGV[0] eq "compare" && $#ARGV == 1) {
    $WG_DICTIONARY = $ARGV[1];
}

################################################################################
# structure of the dictionary used by this script:
# - key: same as in mathfont.properties
# - table:
#    index | value
#      0   | description
#      1   | lspace
#      2   | rspace
#      3   | minsize
#      4   | largeop
#      5   | movablelimits
#      6   | stretchy
#      7   | separator
#      8   | accent
#      9   | fence
#     10   | symmetric
#     11   | priority
#     12   | linebreakstyle
#     13   | direction
#     14   | integral
#     15   | mirrorable

# 1) build %moz_hash from $MOZ_DICTIONARY

print "loading $MOZ_DICTIONARY...\n";
open($file, $MOZ_DICTIONARY) || die ("Couldn't open $MOZ_DICTIONARY!");

print "building dictionary...\n";
while (<$file>) {
    next unless (m/^operator\.(.*)$/);
    (m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/);

    # 1.1) build the key
    $key = $1;

    # 1.2) build the array
    $_ = $2;
    @value = ();
    $value[0] = $3;
    if (m/^(.*)lspace:(\d)(.*)$/) { $value[1] = $2; } else { $value[1] = "5"; }
    if (m/^(.*)rspace:(\d)(.*)$/) { $value[2] = $2; } else { $value[2] = "5"; }
    if (m/^(.*)minsize:(\d)(.*)$/) { $value[3] = $2; } else { $value[3] = "1"; }
    $value[4] = (m/^(.*)largeop(.*)$/);
    $value[5] = (m/^(.*)movablelimits(.*)$/);
    $value[6] = (m/^(.*)stretchy(.*)$/);
    $value[7] = (m/^(.*)separator(.*)$/);
    $value[8] = (m/^(.*)accent(.*)$/);
    $value[9] = (m/^(.*)fence(.*)$/);
    $value[10] = (m/^(.*)symmetric(.*)$/);
    $value[11] = ""; # we don't store "priority" in our dictionary
    $value[12] = ""; # we don't store "linebreakstyle" in our dictionary
    if (m/^(.*)direction:([a-z]*)(.*)$/) { $value[13] = $2; }
    else { $value[13] = ""; }
    $value[14] = (m/^(.*)integral(.*)$/);
    $value[15] = (m/^(.*)mirrorable(.*)$/);

    # 1.3) save the key and value
    $moz_hash{$key} = [ @value ];
}

close($file);

################################################################################
# 2) If mode "make-js", generate tests/stretchy-and-large-operators.js and quit.
#    If mode "check", verify validity of our operator dictionary and quit.
#    If mode "compare", go to step 3)

if ($ARGV[0] eq "make-js") {
    print "generating file $FILE_JS...\n";
    open($file_js, ">$FILE_JS") ||
        die ("Couldn't open $FILE_JS!");
    print $file_js "// This file is automatically generated. Do not edit.\n";
    print $file_js "var stretchy_and_large_operators = [";
    @moz_keys = (keys %moz_hash);
    while ($key = pop(@moz_keys)) {
        @moz = @{ $moz_hash{$key} };

        $_ = $key;
        (m/^operator\.([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
        $opname = "\\$1.$2: ";

        if (@moz[4]) {
            print $file_js "['$opname', '$1','l','$2'],";
        }

        if (@moz[6]) {
            $_ = substr(@moz[13], 0, 1);
            print $file_js "['$opname', '$1','$_','$2'],";
        }
    }
    print $file_js "];\n";
    close($file_js);
    exit 0;
}

if ($ARGV[0] eq "check") {
    print "checking operator dictionary...\n";
    open($file_syntax_errors, ">$FILE_SYNTAX_ERRORS") ||
        die ("Couldn't open $FILE_SYNTAX_ERRORS!");

    $nb_errors = 0;
    $nb_warnings = 0;
    @moz_keys = (keys %moz_hash);
    # check the validity of our private data
    while ($key = pop(@moz_keys)) {
        @moz = @{ $moz_hash{$key} };
        $entry = &generateEntry($key, @moz);
        $valid = 1;

        if (!(@moz[13] eq "" ||
              @moz[13] eq "horizontal" ||
              @moz[13] eq "vertical")) {
            $valid = 0;
            $nb_errors++;
            print $file_syntax_errors "error: invalid direction \"$moz[13]\"\n";
        }

        if (!@moz[4] && @moz[14]) {
            $valid = 0;
            $nb_warnings++;
            print $file_syntax_errors "warning: operator is integral but not largeop\n";
        }
        
        $_ = @moz[0];
        if ((m/^(.*)[iI]ntegral(.*)$/) && !@moz[14]) {
            $valid = 0;
            $nb_warnings++;
            print $file_syntax_errors "warning: operator contains the term \"integral\" in its comment, but is not integral\n";
        }

        if (!$valid) {
            print $file_syntax_errors $entry;
            print $file_syntax_errors "\n";
        }
    }

    # check that all forms have the same direction.
    @moz_keys = (keys %moz_hash);
    while ($key = pop(@moz_keys)) {

        if (@{ $moz_hash{$key} }) {
            # the operator has not been removed from the hash table yet.

            $_ = $key;
            (m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
            $key_prefix = "$1.prefix";
            $key_infix = "$1.infix";
            $key_postfix = "$1.postfix";
            @moz_prefix = @{ $moz_hash{$key_prefix} };
            @moz_infix = @{ $moz_hash{$key_infix} };
            @moz_postfix = @{ $moz_hash{$key_postfix} };

            $same_direction = 1;

            if (@moz_prefix) {
                if (@moz_infix &&
                    !($moz_infix[13] eq $moz_prefix[13])) {
                    $same_direction = 0;
                }
                if (@moz_postfix &&
                    !($moz_postfix[13] eq $moz_prefix[13])) {
                    $same_direction = 0;
                }
            }
            if (@moz_infix) {
                if (@moz_postfix &&
                    !($moz_postfix[13] eq $moz_infix[13])) {
                    $same_direction = 0;
                }
            }

            if (!$same_direction) {
                $nb_errors++;
                print  $file_syntax_errors
                    "error: operator has a stretchy form, but all forms";
                print  $file_syntax_errors
                    " have not the same direction\n";
                if (@moz_prefix) {
                    $_ = &generateEntry($key_prefix, @moz_prefix);
                    print $file_syntax_errors $_;
                }
                if (@moz_infix) {
                    $_ = &generateEntry($key_infix, @moz_infix);
                    print $file_syntax_errors $_;
                }
                if (@moz_postfix) {
                    $_ = &generateEntry($key_postfix, @moz_postfix);
                    print $file_syntax_errors $_;
                }
                print $file_syntax_errors "\n";
            }
            
            if (@moz_prefix) {
                delete $moz_hash{$key.prefix};
            }
            if (@moz_infix) {
                delete $moz_hash{$key_infix};
            }
            if (@moz_postfix) {
                delete $moz_hash{$key_postfix};
            }
        }
    }

    close($file_syntax_errors);
    print "\n";
    if ($nb_errors > 0 || $nb_warnings > 0) {
        print "$nb_errors error(s) found\n";
        print "$nb_warnings warning(s) found\n";
        print "See output file $FILE_SYNTAX_ERRORS.\n\n";
    } else {
        print "No error found.\n\n";
    }

    exit 0;
}

################################################################################
# 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY

print "loading $WG_DICTIONARY...\n";
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($WG_DICTIONARY);

print "building dictionary...\n";
@wg_keys = ();

foreach my $entry ($doc->findnodes('/root/entry')) {
    # 3.1) build the key
    $key = "operator.";

    $_ = $entry->getAttribute("unicode");
    $_ = "$_-";
    while (m/^U?0(\w*)-(.*)$/) {
        # Concatenate .\uNNNN
        $key = "$key\\u$1";
        $_ = $2;
    }

    $_ = $entry->getAttribute("form"); # "Form"
    $key = "$key.$_";

    # 3.2) build the array
    @value = ();
    $value[0] = lc($entry->getAttribute("description"));
    $value[1] = $entry->getAttribute("lspace");
    if ($value[1] eq "") { $value[1] = "5"; }
    $value[2] = $entry->getAttribute("rspace");
    if ($value[2] eq "") { $value[2] = "5"; }
    $value[3] = $entry->getAttribute("minsize");
    if ($value[3] eq "") { $value[3] = "1"; }

    $_ = $entry->getAttribute("properties");
    $value[4] = (m/^(.*)largeop(.*)$/);
    $value[5] = (m/^(.*)movablelimits(.*)$/);
    $value[6] = (m/^(.*)stretchy(.*)$/);
    $value[7] = (m/^(.*)separator(.*)$/);
    $value[8] = (m/^(.*)accent(.*)$/);
    $value[9] = (m/^(.*)fence(.*)$/);
    $value[10] = (m/^(.*)symmetric(.*)$/);
    $value[15] = (m/^(.*)mirrorable(.*)$/);
    $value[11] = $entry->getAttribute("priority");
    $value[12] = $entry->getAttribute("linebreakstyle");

    # not stored in the WG dictionary
    $value[13] = ""; # direction
    $value[14] = ""; # integral

    # 3.3) save the key and value
    push(@wg_keys, $key);
    $wg_hash{$key} = [ @value ];
}
@wg_keys = reverse(@wg_keys);

################################################################################
# 4) Compare the two dictionaries and output the result

print "comparing dictionaries...\n";
open($file_differences, ">$FILE_DIFFERENCES") ||
    die ("Couldn't open $FILE_DIFFERENCES!");
open($file_new_dictionary, ">$FILE_NEW_DICTIONARY") ||
    die ("Couldn't open $FILE_NEW_DICTIONARY!");

$conflicting = 0; $conflicting_stretching = 0;
$new = 0; $new_stretching = 0;
$obsolete = 0; $obsolete_stretching = 0;
$unchanged = 0;

# 4.1) look to the entries of the WG dictionary
while ($key = pop(@wg_keys)) {

    @wg = @{ $wg_hash{$key} };
    delete $wg_hash{$key};
    $wg_value = &generateCommon(@wg);

    if (exists($moz_hash{$key})) {
        # entry is in both dictionary
        @moz = @{ $moz_hash{$key} };
        delete $moz_hash{$key};
        $moz_value = &generateCommon(@moz);
        if ($moz_value ne $wg_value) {
            # conflicting entry
            print $file_differences "[conflict]";
            $conflicting++;
            if ($moz[6] != $wg[6]) {
                print $file_differences "[stretching]";
                $conflicting_stretching++;
            }
            print $file_differences " - $key ($wg[0])\n";
            print $file_differences "-$moz_value\n+$wg_value\n\n";
            $_ = &completeCommon($wg_value, $key, @moz, @wg);
            print $file_new_dictionary $_;
        } else {
            # unchanged entry
            $unchanged++;
            $_ = &completeCommon($wg_value, $key, @moz, @wg);
            print $file_new_dictionary $_;
        }
    } else {
        # we don't have this entry in our dictionary yet
        print $file_differences "[new entry]";
        $new++;
        if ($wg[6]) {
            print $file_differences "[stretching]";
            $new_stretching++;
        }
        print $file_differences " - $key ($wg[0])\n";
        print $file_differences "-\n+$wg_value\n\n";
        $_ = &completeCommon($wg_value, $key, (), @wg);
        print $file_new_dictionary $_;
    }
}

print $file_new_dictionary
    "\n# Entries below are not part of the official MathML dictionary\n\n";
# 4.2) look in our dictionary the remaining entries
@moz_keys = (keys %moz_hash);
@moz_keys = reverse(sort(@moz_keys));

while ($key = pop(@moz_keys)) {
    @moz = @{ $moz_hash{$key} };
    $moz_value = &generateCommon(@moz);
    print $file_differences "[obsolete entry]";
    $obsolete++;
    if ($moz[6]) {
        print $file_differences "[stretching]";
        $obsolete_stretching++;
    }
    print $file_differences " - $key ($moz[0])\n";
    print $file_differences "-$moz_value\n+\n\n";
    $_ = &completeCommon($moz_value, $key, (), @moz);
    print $file_new_dictionary $_;
}

close($file_differences);
close($file_new_dictionary);

print "\n";
print "- $obsolete obsolete entries ";
print "($obsolete_stretching of them are related to stretching)\n";
print "- $unchanged unchanged entries\n";
print "- $conflicting conflicting entries ";
print "($conflicting_stretching of them are related to stretching)\n";
print "- $new new entries ";
print "($new_stretching of them are related to stretching)\n";
print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
print "After having modified the dictionary, please run";
print "./updateOperatorDictionary check\n\n";
exit 0;

################################################################################
sub usage {
    # display the accepted command syntax and quit
    print "usage:\n";
    print "  ./updateOperatorDictionary.pl download [unicode.xml]\n";
    print "  ./updateOperatorDictionary.pl compare [dictionary.xml]\n";
    print "  ./updateOperatorDictionary.pl check\n";
    print "  ./updateOperatorDictionary.pl make-js\n";
    print "  ./updateOperatorDictionary.pl clean\n";
    exit 0;
}

sub generateCommon {
    # helper function to generate the string of data shared by both dictionaries
    my(@v) = @_;
    $entry = "lspace:$v[1] rspace:$v[2]";
    if ($v[3] ne "1") { $entry = "$entry minsize:$v[3]"; }
    if ($v[4]) { $entry = "$entry largeop"; }
    if ($v[5]) { $entry = "$entry movablelimits"; }
    if ($v[6]) { $entry = "$entry stretchy"; }
    if ($v[7]) { $entry = "$entry separator"; }
    if ($v[8]) { $entry = "$entry accent"; }
    if ($v[9]) { $entry = "$entry fence"; }
    if ($v[10]) { $entry = "$entry symmetric"; }
    if ($v[15]) { $entry = "$entry mirrorable"; }
    return $entry;
}

sub completeCommon {
    # helper to add key and private data to generateCommon
    my($entry, $key, @v_moz, @v_wg) = @_;
    
    $entry = "$key = $entry";

    if ($v_moz[13]) { $entry = "$entry direction:$v_moz[13]"; }
    if ($v_moz[14]) { $entry = "$entry integral"; }
    if ($v_moz[15]) { $entry = "$entry mirrorable"; }

    if ($v_moz[0]) {
        # keep our previous comment
        $entry = "$entry # $v_moz[0]";
    } else {
        # otherwise use the description given by the WG
        $entry = "$entry # $v_wg[0]";
    }

    $entry = "$entry\n";
    return $entry;
}

sub generateEntry {
    # helper function to generate an entry of our operator dictionary
    my($key, @moz) = @_;
    $entry = &generateCommon(@moz);
    $entry = &completeCommon($entry, $key, @moz, @moz);
    return $entry;
}