#!/usr/bin/perl -CDS

# CHISE IDS to IDSgrep EIDS translator
# Copyright (C) 2012  Matthew Skala
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Matthew Skala
# http://ansuz.sooke.bc.ca/
# mskala@ansuz.sooke.bc.ca

use utf8;

# Revision stamp (Subversion keyword string) for this translator; it is
# copied verbatim into the banner below.
$svnid='$Id: chise2eids 1561 2012-07-21 15:06:23Z mskala $';

# Emit the opening banner as a single bracketed notice terminated by ";".
my $banner="〖EIDS kanji decomposition dictionary generated by chise2eids\n";
$banner.="$svnid\nNotices below are from the input.\n〗;\n";
print $banner;

# Work out which CHISE IDS version the input directory holds and print it
# as a bracketed notice.
#
# The first command-line argument names the CHISE IDS directory; any
# remaining arguments stay in @ARGV.
$chisedir=shift;
$chisever="(version unknown)";
$dateline='';

# Scan ChangeLog from the top, remembering the most recent date line seen,
# and stop at the first entry that mentions a chise.ids version number.
# The scan is best-effort: if ChangeLog cannot be opened, the defaults set
# above are reported unchanged.  Three-argument open is used so that
# characters in the directory name (a leading ">", a trailing "|",
# surrounding blanks) can never be interpreted as an open mode.
if (open(my $changelog,'<',"$chisedir/ChangeLog")) {
  while (my $line=<$changelog>) {
    if ($line=~/^\d{4}-\d\d-\d\d\s/) {
      $dateline=$line;
    } elsif ($line=~/\* chise.ids ([0-9.]+) /i) {
      $chisever=$1;
      last;
    }
  }
  close($changelog);
}

# When the directory contains a .git subdirectory, append that
# subdirectory's modification time (as a UTC date string) to the notice.
# -d is used rather than masking the stat mode with 040000, because that
# bit is also set for block devices and sockets.
if (-d "$chisedir/.git") {
  my $git_mtime=(stat(_))[9];   # "_" reuses the stat data gathered by -d
  $dateline.=("GIT REPOSITORY VERSION dated ".gmtime($git_mtime)."\n");
}
print "〖Based on CHISE IDS $chisever\n$dateline〗;\n";

# Copy the licence and acknowledgment sections of README.en into the
# output, one bracketed notice per section.
#
# A section starts after a line beginning "* License" or
# "* Acknowledgment" and runs until the next line that begins "* " followed
# by a non-blank character (or until end of file).  Blank lines inside a
# section are kept; blank lines before its first or after its last
# non-blank line are dropped.
$license='';
$blanks=0;      # blank lines seen since the last non-blank captured line
$seenany=0;     # true once the current section has some non-blank text
$capturing=0;   # true while inside a section being captured

# Best-effort: if README.en cannot be opened, no notice is printed.
# Three-argument open keeps special characters in the directory name from
# being read as an open mode.
if (open(my $readme,'<',"$chisedir/README.en")) {
  while (my $line=<$readme>) {
    if ($capturing) {
      if ($line=~/^\* \S/) {
        # A new heading ends the current section: flush it and reset.
        print "〖$license〗;\n" if $license ne '';
        $license='';
        $capturing=0;
        $blanks=0;
        $seenany=0;
      } elsif ($line=~/\S/) {
        # Re-insert pending blank lines only between non-blank lines.
        $license.=("\n"x$blanks) if $seenany;
        $blanks=0;
        $license.=$line;
        $seenany=1;
      } else {
        $blanks++;
      }
    }
    # Checked after the block above so that a heading which closes one
    # section can immediately open the next one.
    if ($line=~/^\* (License|Acknowledgment)/) {
      $capturing=1;
    }
  }
  close($readme);
}

# Flush a section that was still open at end of file.
print "〖$license〗;\n" if $license ne '';

# Classify one decomposition entry.
#   $char - the character (or &entity; reference) being defined
#   $ids  - the decomposition string given for it
# Returns:
#   'error'  - operators and operands do not balance to exactly one
#              tree, or the decomposition mentions $char itself
#   'atomic' - balanced, but contains no composition operator at all
#   'ok'     - usable decomposition
sub _classify_ids {
  my ($char,$ids)=@_;
  my $count=0;    # operands seen minus operands still owed to operators
  my $opped=0;    # true once any composition operator has been seen
  my $rest=$ids;
  while ($rest ne '') {
    if ($rest=~/^&[a-z0-9\-+_]+?;(.*)$/i) {
      # An &entity; reference counts as one operand.
      $count++;
      $rest=$1;
    } elsif ($rest=~/^[⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻]/) {
      # Two-operand operator: consumes two operands, yields one.
      $count-=1;
      $rest=substr($rest,1);
      $opped=1;
    } elsif ($rest=~/^[⿲⿳]/) {
      # Three-operand operator: consumes three operands, yields one.
      $count-=2;
      $rest=substr($rest,1);
      $opped=1;
    } elsif ($rest=~/^[\x{FE00}-\x{FE0F}\x{E0100}-\x{E01EF}]/) {
      # Variation selectors are skipped without affecting the count.
      $rest=substr($rest,1);
    } else {
      # Any other single character is one operand.
      $count++;
      $rest=substr($rest,1);
    }
  }
  # The self-reference test is a literal substring search.  Matching $char
  # as a regular expression would misread metacharacters: the "+" allowed
  # in entity names becomes a quantifier, and a character such as "("
  # aborts the script with a regex compilation error.
  return 'error' if $count!=1 || index($ids,$char)>=0;
  return 'atomic' if $opped==0;
  return 'ok';
}

# Read the input files (or standard input) line by line and fill
# %dictionary with character => decomposition, rewritten into bracketed
# form.  Lines whose first character is ";" or that lack three
# tab-separated fields are ignored, as are entries that decompose to
# themselves or contain no operator.  Bad entries are reported on STDERR.
while (my $line=<>) {
  chomp $line;
  next unless $line=~/^([^;]\S*)\t(\S+)\t(\S+)($|\s)/;
  my ($char,$eids)=($2,$3);
  next if $char eq $eids;

  my $verdict=_classify_ids($char,$eids);
  if ($verdict eq 'error') {
    print STDERR "ERR $ARGV: $line\n";
    next;
  }
  next if $verdict eq 'atomic';

  # Strip the &...; wrapper from an entity used as the key, and turn
  # entity references inside the decomposition into <name>; form.
  $char=~s/^&([a-z0-9\-+_]+?);$/$1/i;
  $eids=~s/&([a-z0-9\-+_]+?);/<$1>;/gi;
  # Bracket a character together with a following variation selector.
  # NOTE(review): "." can also match the ";" that closes a <name>; produced
  # by the previous substitution when a selector directly follows an
  # entity - confirm whether such input occurs.
  $eids=~s/(.[\x{FE00}-\x{FE0F}\x{E0100}-\x{E01EF}])/<$1>;/gi;
  $dictionary{$char}=$eids;
}

# Second pass: expand every entry recursively and print it.
#
# Each entry is scanned left to right.  A component that has its own
# dictionary entry is emitted as a bracketed head "<x>" and its
# decomposition is spliced in front of the remaining input so that it is
# expanded in turn.  Entries are handled in sorted key order and the
# expanded result is stored back, so later entries that use an already
# processed character splice in its finished expansion.
#
# NOTE(review): input parsing rejects only a decomposition that contains
# its own character; two entries that refer to each other would keep this
# loop splicing forever - confirm the input never contains such cycles.
for my $glyph (sort keys %dictionary) {
  my $expanded='';
  my $pending=$dictionary{$glyph};
  while ($pending ne '') {
    my $first=substr($pending,0,1);
    if ($pending=~/^<([^>]+?)>;(.*)$/ && exists $dictionary{$1}) {
      # Bracketed name with its own entry: keep the head, drop the ";",
      # and queue that entry's decomposition.
      $expanded.="<$1>";
      $pending=$dictionary{$1}.$2;
    } elsif ($pending=~/^(<[^>]+?>)(.*)$/) {
      # Any other bracketed name is copied through untouched.
      $expanded.=$1;
      $pending=$2;
    } elsif (exists $dictionary{$first}) {
      # Single character with its own entry: emit it as a head and queue
      # its decomposition.
      $expanded.="<$first>";
      $pending=$dictionary{$first}.substr($pending,1);
    } else {
      # Operators and undecomposable characters are copied as they are.
      $expanded.=$first;
      $pending=substr($pending,1);
    }
  }
  $dictionary{$glyph}=$expanded;
  print "【$glyph】$expanded\n";
}
