next up previous contents
Next: Estonian Up: Morphosyntactic Tagging Previous: Bulgarian

Czech

 COP project 106 MULTEXT-East Deliverable D2.3 F ``1984'', Czech

<cesHeader
  version="4.1"
  type="text"
  lang="en"
  creator="VP"
  status="update"
  date.created="1997-11-28"
  date.updated="1997-12-21"
>
    <filedesc>
      <titlestmt>
        <h.title>Multext-East cesAna: Nineteen Eighty-Four, Czech</h.title>
        <respstmt>
          <respname>Vladim&iacute;r Petkevi&ccaron;</respname>
          <resptype>Overall Responsibility</resptype>
          <respname>Milena Hn&aacute;tkov&aacute;</respname>
          <resptype>Hand-tagging of the first 3 chapters</resptype>
          <resptype>Revision of the tagger results</resptype>
          <respname>Vladim&iacute;r Petkevi&ccaron;</respname>
          <resptype>Conversion to cesAna DTD</resptype>
        </respstmt>
      </titlestmt>
      <editionstmt version="1.0">MTE Final Release</editionstmt>
      <extent>
        <wordCount>79862</wordCount>
        <byteCount units="MB">24.4</byteCount>
      <extnote>
         wordCount represents the number of TOK TYPE=WORD
         elements in the text.
      </extnote>
      </extent>
      <publicationstmt>
        <distributor>
          Institute of Theoretical and Computational Linguistics,
          Faculty of Philosophy, Charles University, Prague
        </distributor>
        <pubaddress>Celetn&aacute; 13, 110 00 Praha 1, Czech Republic</pubaddress>
        <eaddress type="email">Vladimir.Petkevic@ff.cuni.cz</eaddress>
        <availability status="restricted">
          Available for research purposes upon receipt of signed agreement
        </availability>
        <pubDate value="1998-01-01">January 1st, 1998</pubDate>
      </publicationstmt>
      <sourcedesc>
        <biblfull>
          <titlestmt>
            <h.title>Multext-East CES1: Nineteen Eighty-Four, Czech</h.title>
          </titlestmt>
          <publicationstmt>
            <distributor>
              Institute of Theoretical and Computational Linguistics,
              Faculty of Philosophy, Charles University, Prague
            </distributor>
          <pubaddress>Celetn&aacute; 13, 110 00 Praha 1, Czech Republic</pubaddress>
          <eaddress type="email">Vladimir.Petkevic@ff.cuni.cz</eaddress>
          <availability status="restricted">
            Available for research purposes upon receipt of signed agreement
          </availability>
          <pubDate value="1997-10-01">November 1, 1997</pubDate>
        </publicationstmt>
      <sourcedesc>
        <biblfull>
          <titlestmt>
            <h.title> 
              Electronic form of 1984 by George Orwell in Czech, 
              obtained via OCR
            </h.title>
            <respstmt>
              <respname>
                Vladim&iacute;r Petkevi&ccaron;
                Institute of Theoretical and Computational Linguistics,
                Faculty of Philosophy, Charles University, Prague, Czech Republic
                (&Uacute;TKL FFUK)
              </respname>
              <resptype>
                OCR'ed the novel
              </resptype>
            </respstmt>
          </titlestmt>
          <publicationstmt>
            <distributor>
              Institute of Theoretical and Computational Linguistics,
              Faculty  of Philosophy, Charles University, Prague, Czech Republic
              (&Uacute;TKL FFUK)
            </distributor>
            <pubaddress>
              Celetn&aacute; 13, Praha 1
              Czech Republic
            </pubaddress>
            <availability status="restricted">
              Available for research purposes upon receipt of signed
              agreement
            </availability>
            <pubdate>1998</pubdate>
          </publicationstmt>
      <sourcedesc>
        <biblstruct>
          <monogr>
            <h.title>1984</h.title>
            <h.author>George Orwell</h.author>
            <h.author>Translator: Eva &Scaron;ime&ccaron;kov&aacute;</h.author>
            <imprint>
              <pubdate>1991</pubdate>
              <publisher>Na&scaron;e vojsko</publisher>
              <pubplace>Prague, Czech Republic</pubplace>
            </imprint>
          </monogr>
        </biblstruct>
      </sourcedesc>
        </biblfull>
      </sourcedesc>
        </biblfull>
      </sourcedesc>
    </filedesc>
    <encodingdesc>
      <projectdesc>
        MULTEXT-East:
        Multilingual Text Tools and Corpora for Central and Eastern
        European Languages.
        EU Copernicus Project COP106
      </projectdesc>
      <editorialdecl>
        <transduction>
          In the cesDoc to cesAna conversion, DIV, QUOTE, Q tags and
          HEAD, POEM, LIST elements have been omitted. cesDoc P
          elements are encoded as PAR, and S as S.
          cesDoc sub-S level tags are omitted: DATE, NAME, ABBR, etc.
         </transduction>
         <quotation>
            Q and QUOTE tags from the cesDoc source not retained.
         </quotation>
         <segmentation>
            S segmentation same as in cesDoc source (hand-validated).
            TOK segmentation performed with mtseg and manually corrected.
         </segmentation>
       </editorialdecl>
       <tagsdecl>
         <tagusage gi=chunklist occurs=1>
           Element corresponds to TEXT of the cesDoc source
         </tagusage>
         <tagusage gi=chunk occurs=1>
           Element corresponds to BODY of the cesDoc source
         </tagusage>
         <tagusage gi=par occurs=1297>
           Elements correspond to P elements of the cesDoc source.
           The FROM attribute gives the reference to the ID of the
           corresponding cesDoc P element.
         </tagusage>
         <tagusage gi=s occurs=6751>
           Elements correspond to S elements of the cesDoc source.
           The FROM attribute gives the reference to the ID of the
           corresponding cesDoc S element.
         </tagusage>
         <tagusage gi=tok occurs=100358>
           Tokens are of TYPE=WORD or PUNCT, with the CLASS attribute
           giving the mtseg class of the token.
         </tagusage>
         <tagusage gi=orth   occurs=100358>
           Contains the orthography of the token, as found in the
           cesDoc source.
         </tagusage>
         <tagusage gi=disamb occurs=79862>
           Contains disambiguated lexical information.
         </tagusage>
         <tagusage gi=lex    occurs=214368>
           Contains undisambiguated lexical information.
         </tagusage>
         <tagusage gi=base   occurs=294230>
           Base or lemma of a token.
         </tagusage>
         <tagusage gi=msd    occurs=294230>
           Morphosyntactic description of a token.
         </tagusage>
         <tagusage gi=ctag   occurs=20496>
           Corpus tag (only for punctuation).
         </tagusage>
       </tagsdecl>
     </encodingdesc>
     <profiledesc>
       <creation date="1997-11-04">
       </creation>
       <langusage>
         <![ %ONECOMPONENT [ &ISOlang; ]]>
       <language id=ns-cs iso639=cs>Newspeak Czech</language>
       </langusage>
     </profiledesc>
     <revisiondesc>
       <change>
         <changedate>1997-11-04</changedate>
         <respname>Toma&zcaron; Erjavec, IJS</respname>
         <h.item>Created initial header template and part of the content</h.item>
       </change>
       <change>
       <changedate>1997-11-28</changedate>
         <respname>Vladim&iacute;r Petkevi&ccaron;, &Uacute;TKL</respname>
         <h.item>Created the specific part of the header content</h.item>
       </change>
       <change>
       <changedate>1997-12-21</changedate>
        <respname>Toma&zcaron; Erjavec, IJS</respname>
         <h.item>Converted from ISO Latin-2 to SGML entities</h.item>
         <h.item>Modified EDITIONSTMT, BYTECOUNT</h.item>
       </change>
     </revisiondesc>
</cesHeader>


Multext-East