#! /usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      mtems2flib
##  Date: 
##      2004-04-27
##  Author:
##      Tomaz Erjavec   tomaz.erjavec@ijs.si
##  Description:
##	mtems2flib is a Perl program that converts the common tables
##	of the MULTEXT-East morphosyntactic specifications to
##	an XML encoding, expressed as TEI P4 feature-structure libraries.
##
##      The common table for each PoS is output as a feature library, e.g.
##      <fLib type="Noun"> which then contains features, 
##      starting with one for PoS, followed by a feature for
##      each line in the common table, e.g. start of Noun table:
##
##
##      <!-- 1. Noun (N)-->
##      <fLib type="Noun">
##      <f id="N0."   select="en ro ..." name="PoS"><sym value="Noun"/></f>
##      <f id="N1.c"  select="en ro ..." name="Type"><sym value="common"/></f>
##
##      The input to the program is the ASCII/LaTeX format of the 
##      MTE MSD specifications
##
##      Program is available from 
##      http://nl.ijs.si/ME/V3/msd/bin/mtems2flib
##---------------------------------------------------------------------------##
##  Copyright (C) 2004         	Tomaz Erjavec, tomaz.erjavec@ijs.si
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##---------------------------------------------------------------------------##

print "<!-- THIS FILE IS AUTOMATICALLY GENERATED FROM A LaTeX MASTER - EDIT AT YOUR OWN RISK! -->\n";

# Get POSs.
# Scan the input (files named on the command line, or STDIN) for the
# "Part-of-Speech  Code" table and collect one "Name!C" entry per row
# in @cats, where Name is the category name and C its one-letter code.
# <> returns undef at end of input, so every read is checked: without the
# check a missing heading or unterminated table would loop forever.
until (/Part-of-Speech\s+Code/) {
    defined($_ = <>)
        or die "mtems2flib: no 'Part-of-Speech Code' table found in input";
}
#print "<!--\n";
$_ = <>;                        # discard the line following the heading
$_ = <>;                        # first row of the table
while (1) {
    defined $_
        or die "mtems2flib: unexpected end of input inside the Part-of-Speech table";
    last if /^[= ]+$/;          # a line of only '=' and spaces closes the table

    # Use the match result directly; relying on $1/$2 after an unchecked
    # match would silently re-push the captures of an earlier line.
    my ($pos_name, $pos_code) = /^(\w+)\s+(\w)\s*/
        or die "mtems2flib: cannot parse Part-of-Speech row at input line "
             . $. . ": $_";
    push(@cats, "$pos_name!$pos_code");
#  print "$pos_code: $pos_name\n";
    $_ = <>;
}
#print "-->\n";

# Convert the common table of every category in @cats ("Name!C" entries)
# into one <fLib> element, reading the tables sequentially from <>.
#
# Globals set here and read by print_f: $catcode, $n, $att, $val, $code,
# $xlangs and @hlangs.  On entry $_ holds the last line read so far; on
# exit from each iteration it holds the '=' line that closed the table.
# Every read is checked for end of input, since <> returns undef there and
# an unchecked skip-ahead loop would never terminate.
foreach $cat (@cats) {
    $catno++;
    ($catnam, $catcode) = $cat =~ /(.+)!(.+)/
        or die "mtems2flib: malformed category entry '$cat'";

    # Skip ahead to the section heading, e.g. "Noun (N)" or "Nouns (N)".
    until (/\Q$catnam\E(s)?\s+\(\Q$catcode\E\)/) {
        defined($_ = <>)
            or die "mtems2flib: no section heading found for $catnam ($catcode)";
    }
    print "\n<!-- $catno. $catnam ($catcode)-->\n";
    print "<fLib type=\"$catnam\">\n";

    # Skip to the first line of the table: a ruler of '=' and spaces whose
    # tail carries the language codes heading the language columns.
    until (/^=[ =]+/) {
        defined($_ = <>)
            or die "mtems2flib: no table found for $catnam ($catcode)";
    }
    my ($lang_header) = /^[= ]+(.+)/;
    $lang_header =~ tr/A-Z/a-z/;
    @hlangs = split(/ +/, $lang_header);

    # The line after the ruler is the CAT/LANG line; it becomes the PoS feature.
    $_ = <>;
    defined $_
        or die "mtems2flib: unexpected end of input in $catnam table header";
    if (($xlangs) = /C  ([ x]*)/) {
        $n    = 0;
        $att  = "PoS";
        $val  = "$catnam";
#       $code = "$catcode";
        $code = "";
        print_f();
    }
    else {
        die "mtems2flib: cannot parse CAT/LANG line of $catnam table at input line "
          . $. . ": $_";
    }

    $_ = <>;                    # discard the line after the CAT/LANG line
    $_ = <>;                    # first body row

    # Process the table body up to the closing line, which starts with '='.
    while (1) {
        defined $_
            or die "mtems2flib: unexpected end of input inside $catnam table";
        last if /^=/;

        s/l\.s\./    /;         # get rid of l.s.
        s/l\.s/   /;            # get rid of l.s
        s/\(req.by prep.\)/              /; # Adposition

        if (/^[-*]+/) {
            # Separator line: emits nothing, but rows after it are numbered 999
            # until a row carrying its own attribute number appears.
            $n = 999;
        }
        elsif (($val, $code, $xlangs) =
#              /^ +(\w+)[ *]+(\w)  ([ x]*)/) {
               /^ +(\w+)[ *]+(\w)  (.*)/) {
            # Continuation row: value and code only; $n and $att carry over.
            print_f();
        }
        elsif (($n, $att, $val, $code, $xlangs) =
               /^(\d+) *(\w+) +(\w+)[ *]+(\w)  (.*)/) {
            # Row that opens a new attribute: number, attribute, value, code.
            print_f();
        }
        else {
            die "mtems2flib: cannot parse row of $catnam table at input line "
              . $. . ": $_";
        }
        $_ = <>;
    }
    print "</fLib>\n";
}


# print_f - print one <f> (feature) element for the current table row.
#
# Takes no arguments; the row is passed in package globals:
#   $catcode  one-letter category code, prefix of the id attribute
#   $n        attribute number (ids with $n < 10 get one padding space)
#   $code     one-letter value code ("" on the PoS row)
#   $xlangs   language columns of the row: 'x' marks and blanks, possibly
#             followed or interleaved with free-text comments
#   @hlangs   language codes from the table header, in column order
#   $att      attribute name, written to name="..."
#   $val      attribute value, written to <sym value="..."/>
#
# The select attribute lists the languages whose column holds an 'x'; it is
# omitted when no language is marked.  Side effects: prints to the currently
# selected output handle and rewrites '_' to '-' in $att and $val.
sub print_f {
  print "<f id=\"$catcode$n.$code\" ";
  print " " if $n < 10;                 # keep one- and two-digit ids aligned

  # Blank out comments *before* deciding whether anything is selected, so a
  # row holding only a comment is treated like a blank one instead of
  # producing an empty select="".
  my $marks = defined $xlangs ? $xlangs : '';
  $marks =~ s/[^x ]/ /g;

  # Walk the columns in header order: a column is an 'x' followed by up to
  # three blanks, or up to four blanks when the language is not marked.
  my @selected;
  foreach my $lang (@hlangs) {
    if ($marks =~ s/^x {0,3}//) { push @selected, $lang }
    else                        { $marks =~ s/^ {0,4}// }
  }

  # Single spaces only - XSLT protests if more than one space!
  if (@selected) {
    my $sel = join(' ', @selected);
    print "select=\"$sel\" ";
  }

  $att =~ s/_/-/g;      # underscore illegal in NAME
  $val =~ s/_/-/g;      # might as well be consistent
  print "name=\"$att\"><sym value=\"$val\"/></f>\n";
}
