Multext-East - Deliverable D1.2. Language-specific resources/Appendix 5 - May 96.




Appendix 5

Perl script for various computations on the lexicons.





#! /usr/local/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      mtewfl-count
##  Author:
##      Tomaz Erjavec             tomaz.erjavec@ijs.si
##  Description:
##      Count as many things as possible in a Multext-East Word Form Lexicon
##  To Do:
##	o lots
##
##---------------------------------------------------------------------------##
##  Copyright (C) 1996  Tomaz Erjavec             tomaz.erjavec@ijs.si
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##  
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##  
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##
###############################################################################

# Display names of the lexical categories this script reports on.  Each
# name ends with its one-letter category code in parentheses; the report
# loop later extracts that code to find the matching mte-<code>.tmp file.
@legalcats = (
    "Nouns (N)",        "Verbs (V)",
    "Adjectives (A)",   "Pronouns (P)",
    "Determiners (D)",  "Articles (T)",
    "Adverbs (R)",      "Adpositions (S)",
    "Conjunctions (C)", "Numerals (M)",
    "Interjections (I)", "Residuals (X)",
    "Abbreviations (Y)", "Particles (Q)",
);

# The bare one-letter codes, in sorted order.  Each display name is copied
# before the substitution so that @legalcats itself is left untouched.
@catcodes = sort(
    map {
        my $code = $_;
        $code =~ s/.+ \(([A-Z])\)/$1/;
        $code;
    } @legalcats
);

# Read the whole lexicon into memory, one entry per line.
# NOTE(review): the published listing reads `@wfl = ;` -- the input
# operator is missing, apparently stripped as if it were an HTML tag.
# Standard input is assumed here; confirm against the original script.
@wfl = <STDIN>;
foreach $wfl (@wfl) {
    # The lines are later printed verbatim after sorting, so a final line
    # that lacks its terminator would be glued to whatever follows it.
    $wfl .= "\n" if substr($wfl, -1) ne "\n";
    # Rotate the three tab-separated fields so the third one leads the
    # line; the plain string sort below then orders entries by that field.
    $wfl =~ s/(.+)\t(.+)\t(.+)/$3\t$1\t$2/;
}
@wfl = sort(@wfl);

# $tmpfiles = "mte-\*.tmp";
# if (-r $tmpfiles) {print "juhej"; die; exec "rm -f $tmpfiles"};

#exec "rm -f mte-\*.tmp";

# Write every entry to mte-0.tmp; this is the file the report reads for
# its "TOTAL (0)" row.  Fail loudly if the file cannot be created instead
# of silently printing to an unopened handle.
open(TMPOUT, ">mte-0\.tmp" ) || die "Can't open file mte-0.tmp: $!";
foreach $wfl (@wfl) {
    # Undo the field rotation that was applied for sorting, restoring the
    # original field order.  The change is made in place on purpose: the
    # code that follows reads @wfl in this restored form.
    $wfl =~ s/(.+)\t(.+)\t(.+)/$2\t$3\t$1/;
    print TMPOUT "$wfl";
}
# Buffered write errors only surface when the handle is closed.
close(TMPOUT) || die "Can't close file mte-0.tmp: $!";

# Split the lexicon into one file per category: an entry whose third field
# starts with the character C is written to mte-C.tmp.  @wfl was sorted
# with the third field leading, so entries of one category are adjacent
# and each file is opened only once.

# Remove per-category files left over from an earlier run; otherwise a
# category absent from this lexicon would be reported from stale data.
# mte-0.tmp is spared because it holds the complete lexicon.
foreach $stale (glob("mte-?.tmp")) {
    unlink($stale) unless $stale eq "mte-0.tmp";
}

$curcat = "";
foreach $wfl (@wfl) {
    # A line without three tab-separated fields has no category.  Skip it
    # rather than reuse the capture left over from a previous line.
    unless ($wfl =~ /^.+\t.+\t(.)/) {
        warn "Skipping malformed lexicon line: $wfl";
        next;
    }
    $fcat = $1;
    # Compare as strings: using the category as a regex breaks on
    # characters such as "." or "(" and on the initial empty value.
    if ($fcat ne $curcat) {
        close(TMPOUT) if $curcat ne "";
        $curcat = $fcat;
        open(TMPOUT,">mte-$curcat\.tmp" )
            || die "Can't open file mte-$curcat.tmp: $!";
    }
    print TMPOUT "$wfl";
}
# Nothing was opened if the lexicon held no well-formed line.
close(TMPOUT) if $curcat ne "";

# Return the number of distinct strings among the arguments.
# A hash is used instead of comparing neighbours in a sorted list, so no
# real value (such as the string "0") can collide with a "nothing seen
# yet" marker and go uncounted.
sub count_distinct {
    my %seen;
    foreach my $item (@_) {
        $seen{$item} = 1;
    }
    return scalar(keys(%seen));
}

# Print one table row per category, plus a final row for the whole
# lexicon: the pseudo-category "TOTAL (0)" maps to mte-0.tmp.
push(@legalcats,"TOTAL (0)");

printf "%18s %-6s%-6s%-6s%-6s%-6s\n", 
    "Category ", "Entrs", " WFSs", "  Lms", "    =",  " MSDs";

foreach $cat (@legalcats) {
    printf "%18s", "$cat:";
    # The category code is the single character inside the parentheses.
    $cat =~/.+ \(([A-Z0])\)/; 
    $curcat = $1;
    $tmpin = "mte-$curcat\.tmp";

    # A category with no readable file simply yields a row of zeros.
    @cwfl = ();
    if (-r $tmpin) {
        open(TMPIN, $tmpin) || die "Can't open file $tmpin: $!";
        # NOTE(review): the published listing reads `chop(@cwfl = );` --
        # the input operator was lost; TMPIN is the only handle open here.
        @cwfl = <TMPIN>;
        # chomp, not chop: only strip a line terminator that is present.
        chomp(@cwfl);
        close(TMPIN);
    }

    # Separate the three tab-separated fields of every entry into columns.
    @wf=(); @lm=(); @ms=(); 
    foreach $cwfl (@cwfl) {
        ($wf,$lm,$ms) = split(/\t/,$cwfl,3);
        push(@wf,$wf);
        push(@lm,$lm);
        push(@ms,$ms);
    }

    printf "%6d", scalar(@wf);                   # Entrs: number of entries
    printf "%6d", count_distinct(@wf);           # WFSs: distinct first fields
    printf "%6d", count_distinct(@lm);           # Lms: distinct second fields
    printf "%6d", scalar(grep($_ eq "=", @lm));  # =: second field is "="
    printf "%6d", count_distinct(@ms);           # MSDs: distinct third fields

    # A sixth column was sketched in the original listing (commented out)
    # but never implemented.

    print "\n";
}
##yo LSD

Netscape-HTML Checked! | Top | Table of contents | Multext-East | LPL/CNRS

Copyright © Centre National de la Recherche Scientifique, 1996.