next up previous contents
Next: Slovene Up: Morphosyntactic Tagging Previous: Hungarian

Romanian

 COP project 106 MULTEXT-East Deliverable D2.3 F ``1984'', Romanian

<cesHeader
  version="4.1"
  type="text"
  lang=en
  creator=DT
  status="update"
  date.created="1997-11-04"
  date.updated="1997-12-21"
>
    <filedesc>
      <titlestmt>
        <h.title>Multext-East cesAna: Nineteen Eighty-Four, Romanian</h.title>
        <respstmt>
          <respname>Dan Tufi&scedil;</respname>
          <resptype>Overall Responsibility</resptype>
          <respname>Ana-Maria Barbu</respname>
          <resptype>Hand-tagging the whole book</resptype>
          <respname>Vasile P&abreve;tra&scedil;cu </respname>
          <resptype>Conversion to cesAna DTD </resptype>
        </respstmt>
      </titlestmt>
      <editionstmt version="1.0">MTE Final Release</editionstmt>
       <extent>
        <wordCount>101508</wordCount>
        <byteCount units="MB">27.1</byteCount>
        <extnote>wordCount represents the number of TOK TYPE=WORD
          elements in the text. byteCount is in megaBytes</extnote>
      </extent>
       <publicationstmt>
        <distributor>
          Center for Advanced Research in Machine Learning, Natural  Language
                  Processing and Conceptual Modelling
        </distributor>
        <pubaddress>Casa Academiei, 13, "13 Septembrie", Bucharest 5, 74311, Romania
                </pubaddress>
        <eaddress type="email">tufis@valhalla.racai.ro</eaddress>
        <eaddress type="www">http://nl.ijs.si/ME</eaddress>
        <availability status="restricted">
          Available for research purposes upon receipt of signed agreement
        </availability>
        <pubDate value="1998-01-01">January 1st, 1998</pubDate>
       </publicationstmt>
       <sourcedesc>
         <biblstruct>
          <monogr>
           <h.title>O mie nou&abreve; sute optzeci &scedil;i patru</h.title>
           <h.author>George Orwell</h.author>
           <h.author>Translator: Mihnea Gafi&tcedil;a</h.author>
           <imprint>
            <pubdate>1991</pubdate>
            <publisher>Editura Univers</publisher>
            <pubplace>Bucharest</pubplace>
           </imprint>
          </monogr>
         </biblstruct>
       </sourcedesc>
    </filedesc>
    <encodingdesc>
     <projectdesc>
        MULTEXT-East:
        Multilingual Text Tools and Corpora for Central and Eastern
        European Languages.
        EU Copernicus Project COP106
     </projectdesc>
     <editorialdecl>
        <transduction>
          The electronic form was obtained by keyboarding at the
          Center for Advanced Research in Machine Learning, Natural  Language
          Processing and Conceptual Modelling, spell-checked and hand tagged.
          In the cesDoc to cesAna conversion, DIV, QUOTE, Q tags and
          HEAD, POEM, LIST elements have been omitted. cesDoc P
          elements are encoded as PAR, and S as S.
          cesDoc sub-S level tags are omitted: DATE, NAME, ABBR, etc.
        </transduction>
        <quotation>
          Q and QUOTE tags from the cesDoc source not retained.
        </quotation>
       <segmentation>
          S segmentation same as in cesDoc source (hand-validated).
          TOK segmentation performed with mtseg and manually corrected.
         </segmentation>
      </editorialdecl>
      <tagsdecl>
        <tagusage gi=chunkList occurs=1>
          Element corresponds to TEXT of the cesDoc source
        </tagusage>
        <tagusage gi=chunk occurs=1>
          Element corresponds to BODY of the cesDoc source
        </tagusage>
        <tagusage gi=par occurs=1343>
          Elements correspond to P, POEM, LIST, HEAD elements of the cesDoc source.
          The FROM attribute gives the reference to the ID of the
          corresponding cesDoc P element.
        </tagusage>
        <tagusage gi=s occurs=6521>
          Elements correspond to S, L, ITEM elements of the cesDoc source.
          The FROM attribute gives the reference to the ID of the
        corresponding cesDoc S element.
        </tagusage>
        <tagusage gi=tok occurs=118063>
          Tokens are of TYPE=WORD or PUNCT, with the CLASS attribute
          giving the mtseg class of the token.
        </tagusage>
        <tagusage gi=orth   occurs=118063>
          Contains the orthography of the token, as found in the
          cesDoc source.
        </tagusage>
        <tagusage gi=disamb occurs=101508>
          Contains disambiguated lexical information.
        </tagusage>
        <tagusage gi=lex    occurs=189695>
          Contains undisambiguated lexical information.
        </tagusage>
        <tagusage gi=base   occurs=291203>
          Base or lemma of a token.
        </tagusage>
        <tagusage gi=msd    occurs=291203>
          Morphosyntactic description of a token.
        </tagusage>
        <tagusage gi=ctag   occurs=307758></tagusage>
      </tagsdecl>
       </encodingdesc>
    <profiledesc>
      <creation date="1997-11-04">
      </creation>
      <langusage>
        <![ %ONECOMPONENT [ &ISOlang; ]]>
        <language id=ns-ro iso639=ro>Newspeak Romanian</language>
      </langusage>
    </profiledesc>
    <revisiondesc>
      <change>
        <changedate>1997-11-04</changedate>
        <respname>Toma&zcaron; Erjavec, IJS</respname>
        <h.item>Initial header</h.item>
      </change>
      <change>
        <changedate>1997-11-06</changedate>
        <respname>Vasile P&abreve;tra&scedil;cu</respname>
         <h.item>
          The Tagusage, Bytecount and Wordcount were updated. Entities that
          were counted as words are those that were identified by the segmenter,
          that is, words, clitics, compounds (counted as one unit, irrespective
          of the number of constituents), punctuation, numbers.
        </h.item>
      </change>
       <change>
       <changedate>1997-12-21</changedate>
        <respname>Toma&zcaron; Erjavec, IJS</respname>
         <h.item>Modified EDITIONSTMT and changed ... to &hellip;</h.item>
       </change>
    </revisiondesc>
  </cesheader>


Multext-East