#! /usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      mtems2flib
##  Date:
##      2004-04-27
##  Author:
##      Tomaz Erjavec  tomaz.erjavec@ijs.si
##  Description:
##      mtems2flib is a Perl program that converts the common tables
##      of the MULTEXT-East morphosyntactic specifications to
##      an XML encoding, expressed as TEI P4 feature-structure libraries.
##
##      The common table for each PoS is output as a feature library,
##      which then contains features, starting with one for PoS,
##      followed by a feature for each line in the common table.
##
##      The input to the program is the ASCII/LaTeX format of the
##      MTE MSD specifications
##
##      Program is available from
##      http://nl.ijs.si/ME/V3/msd/bin/mtems2flib
##---------------------------------------------------------------------------##
##  Copyright (C) 2004  Tomaz Erjavec, tomaz.erjavec@ijs.si
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##---------------------------------------------------------------------------##

use strict;
use warnings;

# NOTE(review): the header above describes XML output, but every print
# statement in this file emits only newlines.  The markup inside the string
# literals appears to be missing -- confirm against the published program
# before relying on the output.  The literals are reproduced unchanged here.

# Categories to process, each entry of the form "Name!Code".
# NOTE(review): nothing in this file fills @cats, so the loop below does not
# execute as written; the code that collected the part-of-speech list after
# the "Part-of-Speech Code" heading seems to be missing -- TODO restore.
# Kept as a package variable, as in the original.
our @cats;

# State shared between the main loop and print_f(), which takes no arguments.
my $line;           # current input line (the original kept it in $_)
my $catno = 0;      # running count of categories processed
my $catnam;         # name of the category being processed
my $catcode;        # code of the category being processed
my $head;           # leading run of '=' and ' ' on the table header line
my @hlangs;         # lower-cased, space-separated remainder of the header line
my $langoff;        # length of $head
my $n;              # attribute position; 0 for the PoS row, 999 after a
                    # separator line
my $att;            # attribute name of the current row
my $val;            # value name of the current row
my $code;           # one-character code of the current row
my $xlangs;         # remainder of the current row after the code

print "\n";

# Get POSs
skip_to(qr/Part-of-Speech\s+Code/, 'the "Part-of-Speech Code" heading');

for my $cat (@cats) {
    $catno++;

    # A malformed entry used to reuse whatever $1/$2 held from an earlier
    # match; fail loudly instead.
    ($catnam, $catcode) = $cat =~ /(.+)!(.+)/
        or die "mtems2flib: malformed category entry '$cat'\n";

    # Find the section heading, e.g. "Name (C)" or "Names (C)".  The name and
    # code are interpolated as regex source, exactly as before.
    my $section_re = qr/${catnam}s?\s+\(${catcode}\)/;
    skip_to($section_re, "the section heading for $catnam ($catcode)");

    print "\n\n";
    print "\n";

    # Skip to the first line of the table.
    skip_to(qr/^=[ =]+/, "the table header of $catnam");

    ($head, my $hlangs) = $line =~ /(^[= ]+)(.+)/
        or die "mtems2flib: cannot parse table header at input line $.\n";
    $hlangs =~ tr/A-Z/a-z/;
    @hlangs  = split / +/, $hlangs;
    $langoff = length $head;

    # Line with CAT/LANG.  (The original also computed
    # substr($_, $langoff) here, but the result was overwritten by the match
    # below before it could be used, so that dead statement is gone.)
    $line = next_line("the PoS row of $catnam");
    if (($xlangs) = $line =~ /C ([ x]*)/) {
        $n   = 0;
        $att = "PoS";
        $val = $catnam;
        # An earlier variant stored $catcode here; the code is left empty.
        $code = "";
        print_f();
    }
    else {
        print $line;
        die "mtems2flib: expected the PoS row of $catnam at input line $.\n";
    }

    # Discard the line after the PoS row and load the first table row.
    $line = next_line("the $catnam table");
    $line = next_line("the $catnam table");

    # Process the category table up to the closing '=' line.
    while ($line !~ /^=/) {
        $line =~ s/l\.s\./ /;                # get rid of l.s.
        $line =~ s/l\.s/ /;                  # get rid of l.s
        $line =~ s/\(req.by prep.\)/ /;      # Adposition

        if ($line =~ /^[-*]+/) {
            # Separator line.
            $n = 999;
        }
        elsif (($val, $code, $xlangs)
                   = $line =~ /^ +(\w+)[ *]+(\w) (.*)/) {
            # Indented row: a further value; $n and $att carry over from the
            # last numbered row.
            print_f();
        }
        elsif (($n, $att, $val, $code, $xlangs)
                   = $line =~ /^(\d+) *(\w+) +(\w+)[ *]+(\w) (.*)/) {
            # Numbered row: position, attribute name and its first value.
            print_f();
        }
        else {
            print $line;
            die "mtems2flib: cannot parse line $. of the $catnam table\n";
        }

        $line = next_line("the end of the $catnam table");
    }

    print "\n";
}

# next_line($what)
#   Reads the next line from the files named on the command line (or STDIN)
#   and returns it.  Dies, mentioning $what, when the input is exhausted;
#   the original kept assigning undef and could loop forever.
sub next_line {
    my ($what) = @_;

    my $next = <>;
    defined $next
        or die "mtems2flib: unexpected end of input while looking for $what\n";
    return $next;
}

# skip_to($pattern, $what)
#   Advances $line until it matches $pattern.  The current line is tested
#   first, so a line that already matches is not skipped.  $what is passed
#   to next_line() for its error message.
sub skip_to {
    my ($pattern, $what) = @_;

    until (defined $line && $line =~ $pattern) {
        $line = next_line($what);
    }
    return;
}

# print_f()
#   Called once per table row, after the shared row variables have been set.
#   As visible here it writes a single newline to STDOUT.
#   NOTE(review): presumably this emitted one feature element built from the
#   shared row variables -- the markup is absent from the literal; confirm.
sub print_f {
    print "\n";
    return;
}