next up previous contents
Next: Romanian Up: Morphosyntactic Tagging Previous: Estonian

Hungarian

 COP project 106 MULTEXT-East Deliverable D2.3 F ``1984'', Hungarian

<cesHeader
  version="4.1"
  type="text"
  lang=en
  creator=OCS
  status="update"
  date.created="1997-11-24"
  date.updated="1997-12-21"
>
    <filedesc>
      <titlestmt>
        <h.title>Multext-East cesAna: Nineteen Eighty-Four, Hungarian</h.title>
        <respstmt>
          <respname>Csaba Oravecz</respname>
          <resptype>Overall Responsibility</resptype>
          <respname>Vladim&iacute;r Petkevi&ccaron;</respname>
          <resptype>Conversion to cesAna DTD </resptype>
        </respstmt>
      </titlestmt>
      <editionstmt version="1.0">MTE Final Release</editionstmt>
      <extent>
        <wordCount>80705</wordCount>
        <byteCount units="MB">18.4</byteCount>
        <extnote>wordCount represents the number of TOK TYPE=WORD
          elements in the text. byteCount is in megaBytes</extnote>
      </extent>
      <publicationstmt>
        <distributor>
           Research Institute for Linguistics, Hungarian Academy of Sciences
        </distributor>
        <pubaddress> Budapest, Sz&iacute;nh&aacute;z u. 5-9.</pubaddress>
        <eaddress type="email">oravecz@nytud.hu</eaddress>
        <eaddress type="www">http://www.nytud.hu</eaddress>
        <availability status="restricted">
          Available for research purposes upon receipt of signed agreement
        </availability>
        <pubDate value="1998-01-01">January 1st, 1998</pubDate>
      </publicationstmt>
      <sourcedesc>
         <biblStruct>
         <monogr>
           <h.title>1984</h.title>
           <h.author>George Orwell</h.author>
           <imprint>
           <pubdate>1989</pubdate>
           <publisher>Eur&oacute;pa K&ouml;nyvkiad&oacute;</publisher>
           <pubplace>Budapest</pubplace>
           </imprint>
         </monogr>
         </biblStruct>
      </sourcedesc>
    </filedesc>
    <encodingdesc>
      <projectdesc>
        MULTEXT-East:
        Multilingual Text Tools and Corpora for Central and Eastern
        European Languages.
        EU Copernicus Project COP106
      </projectdesc>
      <editorialdecl>
        <transduction>
          In the cesDoc to cesAna conversion, DIV, QUOTE, Q tags and
          HEAD, POEM, LIST elements have been omitted. cesDoc P
          elements are encoded as PAR, and S as S.
          cesDoc sub-S level tags are omitted: DATE, NAME, ABBR, etc.
        </transduction>
        <quotation>
          Q and QUOTE tags from the cesDoc source not retained.
        </quotation>
        <segmentation>
          S segmentation same as in cesDoc source (hand-validated).
          TOK segmentation performed with mtseg and manually corrected.
        </segmentation>
      </editorialdecl>
      <tagsdecl>
        <tagusage gi=chunkList occurs=1>
          Element corresponds to TEXT of the cesDoc source
        </tagusage>
        <tagusage gi=chunk occurs=1>
          Element corresponds to BODY of the cesDoc source
        </tagusage>
        <tagusage gi=par occurs=1303>
          Elements correspond to P elements of the cesDoc source.
          The FROM attribute gives the reference to the ID of the
          corresponding cesDoc P element.
        </tagusage>
        <tagusage gi=s occurs=6768>
          Elements correspond to S elements of the cesDoc source.
          The FROM attribute gives the reference to the ID of the
          corresponding cesDoc S element.
        </tagusage>
        <tagusage gi=tok occurs=98426>
          Tokens are of TYPE=WORD or PUNCT, with the CLASS attribute
          giving the mtseg class of the token.
        </tagusage>
        <tagusage gi=orth   occurs=98426>
          Contains the orthography of the token, as found in the
          cesDoc source.
        </tagusage>
        <tagusage gi=disamb occurs=80705>
          Contains disambiguated lexical information.
        </tagusage>
        <tagusage gi=lex    occurs=111945>
          Contains undisambiguated lexical information.
        </tagusage>
        <tagusage gi=base   occurs=192650>
          Base or lemma of a token.
        </tagusage>
        <tagusage gi=msd    occurs=192650>
          Morphosyntactic description of a token.
        </tagusage>
        <tagusage gi=ctag   occurs=98426>
          Corpus tag.
        </tagusage>
      </tagsdecl>
    </encodingdesc>
    <profiledesc>
      <creation date="1997-11-04">
      </creation>
      <langusage>
        <![ %ONECOMPONENT [ &ISOlang; ]]>
        <language id=ns-hu iso639=hu>Newspeak Hungarian</language>
      </langusage>
    </profiledesc>
    <revisiondesc>
      <change>
        <changedate>1997-11-24</changedate>
        <respname>Csaba Oravecz, RIL</respname>
        <h.item>Initial header</h.item>
      </change>
       <change>
       <changedate>1997-12-21</changedate>
        <respname>Tomaz Erjavec, IJS</respname>
         <h.item>Converted from ISO Latin-2 to SGML entities</h.item>
         <h.item>Changed ... to &hellip;</h.item>
         <h.item>Modified EDITIONSTMT, BYTECOUNT</h.item>
       </change>
    </revisiondesc>
  </cesHeader>


Multext-East