#!/usr/bin/perl -CDS

$copyright=<<'EOF';
EIDS decomposition dictionary generator for Tsukurimashou
$Id: make-eids 8758 2021-04-08 01:22:16Z mskala $
Copyright (C) 2012, 2013, 2021  Matthew Skala

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3.

As a special exception, if you create a document which uses this font, and
embed this font or unaltered portions of this font into the document, this
font does not by itself cause the resulting document to be covered by the
GNU General Public License. This exception does not however invalidate any
other reasons why the document might be covered by the GNU General Public
License. If you modify this font, you may extend this exception to your
version of the font, but you are not obligated to do so. If you do not
wish to do so, delete this exception statement from your version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Matthew Skala
http://ansuz.sooke.bc.ca/
mskala@ansuz.sooke.bc.ca
EOF

use utf8;

# copyright notice of this script is reused with minor edits for the output
$copyright=~s/\(C\)/©/;
$copyright=~s/generator for/based on/;

print "〖$copyright〗;\n";

# input to this script is all the "proof" files from Tsukurimashou Kaku
# (vanilla options) on pages that contain kanji.  These contain a
# PERL_STRUCTURE tag which is Perl syntax (warning, interpreted with eval!)
# for a data structure tracing out the way the characters were constructed.
while (<>) {
  eval $1 if /^PERL_STRUCTURE\s+(.*)$/;
}

# massage the data structure to make for a better EIDS tree
foreach $glyph (sort keys %structure) {

  # for the moment, handle only U+2E00..U+2EFF, U+2F00..U+2FEF, and
  # U+3400..U+9FFF
  next unless $glyph=~/^uni(2E|3[4-9A-F]|[4-9][0-9A-F])[0-9A-F][0-9A-F]$/i
    || $glyph=~/^uni2F[0-9A-E][0-9A-F]$/i;

  # remove first element of structure, which is glyph name, then
  # pass through the "crunch" subroutine
  $glstruct=[@{$structure{$glyph}}[1..$#{$structure{$glyph}}]];
  $structure{$glyph}=&crunch($glstruct);

  # save the Unicode code point of this glyph's structure for use any
  # time we see the same structure later, unless it is the "mouth" glyph
  # because there are too many structures that look like that one but are
  # not etymologigcally the same component
  $glyph=~/^u(ni)?([0-9A-F]+)$/;
  $literal=chr(hex("0x$2"));
  $head{&struct_canon($structure{$glyph})}=$literal
    unless $literal eq '囗';
}

# collapse intermediate nodes in a PERL_STRUCTURE that are basically unary
sub crunch {
  my($x)=@_;
  my($flag)=1; # true if we may still be able to improve the current node
  my($i);

  # first, recursively crunch up all children
  if (ref($x) eq 'ARRAY') {
    for ($i=0;$i<=$#$x;$i++) {
      $x->[$i]=&crunch($x->[$i]);
    }
  }

  # repeat while changes may still be possible...
  while ($flag) {

    # if this is a unary node, collapse it
    if ((ref($x) eq 'ARRAY') && ($#$x==0)) {
      $x=$x->[0];

    # if this node is a named kanji that consists of a single build_kanji
    # operation, collapse it into just its contents
    } elsif ((ref($x) eq 'ARRAY') && ($#$x==1) &&
        ($x->[0]=~/^kanji\./) && (ref($x->[1]) eq 'ARRAY') &&
        ($x->[1]->[0]=~/^build_kanji\./)) {
      $x=$x->[1];

    # if this node is a build_kanji.tb or .lr and one argument is empty,
    # collapse it into just the non-empty argument
    } elsif ((($x->[0] eq 'build_kanji.tb') || ($x->[0] eq 'build_kanji.lr'))
          && ($#{$x->[2]}==-1)) {
      $x=$x->[3];
    } elsif ((($x->[0] eq 'build_kanji.tb') || ($x->[0] eq 'build_kanji.lr'))
          && ($#{$x->[3]}==-1)) {
      $x=$x->[2];

    # otherwise, there is nothing more we can crunch
    } else {
      $flag=0;
    }
  }

  return $x;
}

# recursively generate a "canonical" version of a PERL_STRUCTURE, as a
# string, with unary nodes collapsed; this can be string-compared to
# recognize the same structure elsewhere.
sub struct_canon {
  my($inp)=@_;
  
  while ((ref($inp) eq 'ARRAY') && ($#$inp==0)) {
    $inp=$inp->[0];
  }
  
  if (ref($inp) eq 'ARRAY') {
    return '['.join(',',map { &struct_canon($_) } @$inp).']';
  } else {
    return $inp;
  }
}

# generate EIDSes for all the structures
foreach $glyph (sort keys %structure) {

  # as before, handle only a limited range of code points
  next unless $glyph=~/^uni((2E|3[4-9A-F]|[4-9][0-9A-F])[0-9A-F][0-9A-F])$/i
    || $glyph=~/^uni(2F[0-9A-E][0-9A-F])$/i;

  # basically just a call to the struct_eids routine to generate the EIDS
  $globhead=chr(hex("0x$1"));
  $glstruct=$structure{$glyph};
  print "【$globhead】".&struct_eids($glstruct,1,1)."\n";
}

# recursively generate EIDS for a PERL_STRUCTURE.  arguments are $struct,
# the nested array reference itself; $istop, true if this is the top-level
# call; and $hashead ("has head," not "hashed"), true if we have already
# printed a head for this EIDS
sub struct_eids {
  my($struct,$istop,$hashead)=@_;
  my($canon)=&struct_canon($struct); # canonical text version

  # if we don't have a head for this structure and could add one, try to
  if ((defined $head{$canon}) && (!$hashead)) {

    # recursive call - see whether we can print something better than just
    # a semicolon or question mark
    my($sub)=&struct_eids($struct,0,1);

    # if we can't, just print the head and depend on its syrupy semicolon
    if (($sub eq ';') || ($sub eq '?')) {
      return $head{$canon};

    # if the recursive call found internal structure, print that
    } else {
      return "<$head{$canon}>$sub";
    }
  }

  # not adding a head; attempt to generate EIDS for structure
  my($todo,$failed,$instrs,$instrs,$done);
  if (ref($struct) eq 'ARRAY') {

    # catch unary node again; just recurse to its child
    if ($#$struct==0) {
      return &struct_eids($struct->[0],$istop,$hashead);
    }

    # now we look for a string starting "eids." in the first or second
    # element of the structure array.  If found, it goes into $instrs
    # and tells how to interpret the rest of the array.  Else, set $failed.
    $failed=0;
    if ($struct->[0]=~/^eids\.(.*)$/) {
      $todo=[@{$struct}[1..$#$struct]];
      $instrs=$1;
    } elsif ($struct->[1]=~/^eids\.(.*)$/) {
      $todo=[@{$struct}[2..$#$struct]];
      $instrs=$1;
    } else {
      $failed=1;
    }

    # if we found such a thing...
    if (!$failed) {
      $done='';
      my($lockout)=0;

      # look at each dot-separated field in the $instrs string
      foreach (split('\.',$instrs)) {

        # Adobe/Unicode glyph names specify literal chars in the EIDS
        if (/^u(ni)?([0-9a-f]+)$/i) {
          $done.=chr(hex("0x$2"));

        # two numbers with an underscore say trim off that many entries
        # on either side and then recurse
        } elsif (/^(\d*)_(\d*)$/) {
          my($left,$right)=(0+$1,0+$2);
          $done.=&struct_eids([@{$todo}[$left..($#$todo-$right)]],0,0);

        # "softhead" specifies a head that will be used for this subtree
        # unless the caller has already assigned a head
        } elsif ($_ eq 'softhead') {
          $lockout=1 if $hashead;

        # anything else is an error
        } else {
          $done.='ERROR';
        }
      }

      # replace with contents in some cases of operator with one side empty
      $done=$hashead?';':$1 if $done=~/^[\x{2FF0}-\x{2FFB}](.)\(\[\]\)$/;

      # at this point we're done except with a softhead that wasn't used
      return $done unless $lockout;
    }
  }

  # at this point we're trying to generate EIDS for a PERL_STRUCTURE when
  # we either did NOT have an instructions string starting "eids.", or our
  # attempt to generate EIDS from it failed

  # this loop searches for the common pattern of overlaying a box, or
  # some added, removed, or replaced strokes, onto some other structure
  # we know how to interpret.  The EIDS ends up looking like
  # ⿻?口 or ⿻?(+1-2*3) where ? is the other stuff we were able to
  # interpret, 1 is the number of strokes added, 2 the number removed,
  # and 3 the number replaced, all those numbers being optional.
  my($init,$content,$box,$nadd,$nsub,$nrep,$i)=('','','',0,0,0,0);
  $failed=0;
  my($build)='';
  if (ref($struct) eq 'ARRAY') {
    for ($i=0;$i<=$#$struct;$i++) {
      if (ref($struct->[$i]) eq 'ARRAY') {
        if ($struct->[$i]->[0] eq 'build_kanji.box') {
          if ($box) {
            $failed=1;
            last;
          } else {
            $box=$struct->[$i];
          }
        } else {
          if ($content) {
            $failed=1;
            last;
          } else {
            $content=$struct->[$i];
          }
        }
      } elsif ($struct->[$i] eq 'push_stroke') {
        $nadd++;
      } elsif ($struct->[$i] eq 'replace_strokep') {
        $nrep++;
      } elsif ($struct->[$i] eq 'bosize0') {
        $nsub++;
      } elsif ($i==0) {
        $init=$struct->[0];
      } else {
        $failed=1;
        last;
      }
    }
    if ($box && !$content) {
      $content=$box;
      $box='';
    }
    if ($init && !content) {
      $content=$init;
      $init='';
    }
    if (($nrep>0) && ($nadd==0) && ($nsub==0) && !$box) {
      return &struct_eids($content,0,$hashead);
    }
    if (($init eq 'build_kanji.tb') || ($init eq 'build_kanji.lr')) {
      $content='';
    }
    if (($content ne '') && (($nadd+$nrep+$nsub>0) || $box)) {
      $build=&struct_eids($content,0,0);
      $build="⿻${build}口" if $box;
      if ($nadd+$nrep+$nsub>0) {
        $build="⿻$build(";
        $build.="+$nadd" if $nadd>0;
        $build.="-$nsub" if $nsub>0;
        $build.="*$nrep" if $nrep>0;
        $build.=')';
      }
      return $build;
    }
  }

  # if we are describing a full kanji that just consists of a call to
  # something else, recurse to describing the something else
  if ((ref($struct) eq 'ARRAY') && ($#{$struct}==1) &&
      ($struct->[0]=~/^kanji\./)) {
    return &struct_eids($struct->[1],$istop,$hashead);
  }

  # if canonical string is just a single macro name with no arguments:
  # treat this as uninterpretable
  if ($canon=~/^[a-z0-9_\.]+$/) {
    return $hashead?';':'?';
  }

  # if it's just a macro name all of whose arguments are macros from a
  # limited selection (evidently a combining operation we don't understand)
  # then this is also uninterpretable
  my($proccanon)=$canon;
  $proccanon=~s/\bpush_stroke\b/X/g;
  $proccanon=~s/\breplace_strokep\b/X/g;
  $proccanon=~s/\bbuild_kanji\.box\b/X/g;
  $proccanon=~s/\bbuild_kanji\.cup\b/X/g;
  $proccanon=~s/\bkanji\.[a-z0-9_\.]+\b/X/g;
  if ($proccanon=~/^\[(build_kanji\.[a-z0-9_\.]+,)?(X,)*X\]$/) {
    return $hashead?';':'?';
  }

  # at this point we have basically failed to interpret the PERL_STRUCTURE.
  # just return its canonical string inside a nullary EIDS for the user to
  # worry about
  return "($canon)";
}

# collect the names of files that vardef each macro
while (<mp/*.mp>) {
  next unless m!/([a-z\-0-9_]+\.mp)$!i;
  $fn=$1;
  open(MP,$_);
  while (<MP>) {
    if (/^vardef ([a-z\-0-9_\.]+) =/i) {
      $mfile{$1}.="$fn ";
    }
  }
  close(MP);
}

# scan the kanji-defining files for glyphs defined just by calling single
# kanji macros and print a dictionary entry for each instance of that
while (<mp/tsuku-*.mp>) {
  open(MP,$_);
  $state=0;
  while (<MP>) {
    if ($state==0) {
      if (/^begintsuglyph\("(u(ni)?([0-9a-f]{4,6}))\"/i) {
        $state=1;
        $glname=$1;
        $glchar=chr(hex("0x$3"));
      }
    } elsif ($state==1) {
      if (/^  (kanji\.\S+\.\S+);/) {
        $state=2;
        $macroname=$1;
      } else {
        $state=0;
      }
    } elsif ($state==2) {
      if (/tsu_render;/) {
        print join(', ',"【$glchar】($glname: $macroname",
                   split(' ',$mfile{$macroname})).")\n";
      }
      $state=0;
    } else {
      $state=0;
    }
  }
  close(MP);
}
