%%%%%%%%%% Bibliography related to the MULTEXT-East project
%% 2003-10-27
%% Toma\v{z} Erjavec
%%
%% Conventions used in this file:
%%  - Citation keys are part of the public interface; do not rename them.
%%  - Accented letters in names are written as BibTeX special characters,
%%    e.g. Toma{\v{z}}, so that sorting and label generation work.
%%  - Only proper nouns and acronyms are brace-protected in titles.
%%  - Page ranges use a double hyphen; months use the jan..dec macros.
%%  - Recurring venues, series and institutions are string macros (below).

@string{lnai     = {Lecture Notes in Artificial Intelligence}}
@string{springer = {Springer-Verlag}}
@string{ijs      = {Jo{\v{z}}ef Stefan Institute}}
@string{lrec98   = {First International Conference on Language Resources and Evaluation, {LREC}'98}}
@string{lrec00   = {Second International Conference on Language Resources and Evaluation, {LREC}'00}}
@string{nlprs01  = {Proceedings of the 6th Natural Language Processing Pacific Rim Symposium, {NLPRS}'01}}
@string{ilp99    = {Inductive Logic Programming: Proc.\ of the 9th International Workshop (ILP-99)}}

%%%%%%%%%%%%%%%%% Overview papers:

@inproceedings{mte:slav,
  author    = {Toma{\v{z}} Erjavec and Cvetana Krstev and Vladim{\'\i}r Petkevi{\v{c}} and Kiril Simov and Marko Tadi{\'c} and Du{\v{s}}ko Vitas},
  title     = {The {MULTEXT-East} Morphosyntactic Specifications for {Slavic} Languages},
  booktitle = {Proceedings of the {EACL} 2003 Workshop on Morphological Processing of {Slavic} Languages},
  pages     = {25--32},
  address   = {Budapest},
  year      = {2003},
  note      = {http://nl.ijs.si/ME/V2/}
}

@inproceedings{mte:nlprs,
  author    = {Toma{\v{z}} Erjavec},
  title     = {Harmonised Morphosyntactic Tagging for Seven Languages and {Orwell's} 1984},
  booktitle = nlprs01,
  pages     = {487--492},
  address   = {Tokyo},
  year      = {2001},
  note      = {http://nl.ijs.si/ME/V2/}
}

%% NOTE(review): the page range 3--2 below runs backwards and is kept
%% exactly as found; confirm the real pages against the printed issue.
@article{elsnews01:v2,
  author  = {Toma{\v{z}} Erjavec},
  title   = {The {MULTEXT-East} Resources Revisited},
  journal = {ElsNews},
  year    = {2001},
  volume  = {10},
  number  = {1},
  pages   = {3--2}
}

@inproceedings{coling98:mte,
  author    = {Ludmila Dimitrova and Toma{\v{z}} Erjavec and Nancy Ide and Heiki-Jaan Kaalep and Vladim{\'\i}r Petkevi{\v{c}} and Dan Tufi{\c{s}}},
  title     = {{Multext-East}: Parallel and Comparable Corpora and Lexicons for Six {Central and Eastern European} Languages},
  booktitle = {{COLING-ACL} '98},
  pages     = {315--319},
  address   = {Montr{\'e}al, Qu{\'e}bec, Canada},
  year      = {1998}
}

@inproceedings{lrec98:mtecorp,
  author       = {Toma{\v{z}} Erjavec and Nancy Ide},
  title        = {The {MULTEXT-East} Corpus},
  pages        = {971--974},
  booktitle    = lrec98,
  address      = {Granada},
  year         = {1998},
  organization = {ELRA}
}

%% NOTE(review): lrec98:mtelex and lrec98:mtecd both claim pages 233--240
%% of the same proceedings; one of the two ranges is presumably wrong.
@inproceedings{lrec98:mtelex,
  author       = {Nancy Ide and Dan Tufi{\c{s}} and Toma{\v{z}} Erjavec},
  title        = {Development and Assessment of Common Lexical Specifications for Six {Central and Eastern European} Languages},
  pages        = {233--240},
  booktitle    = lrec98,
  address      = {Granada},
  year         = {1998},
  organization = {ELRA}
}

@misc{telri:CD,
  author       = {Toma{\v{z}} Erjavec and Ann Lawson and Laurent Romary},
  title        = {{East} meets {West}: A Compendium of Multilingual Resources},
  howpublished = {CD-ROM},
  year         = {1998},
  organization = {TELRI Association e.V},
  note         = {ISBN: 3-922641-46-6}
}

@inproceedings{lrec98:mtecd,
  author       = {Toma{\v{z}} Erjavec and Ann Lawson and Laurent Romary},
  title        = {{East} meets {West}: Producing Multilingual Resources in a {European} Context},
  pages        = {233--240},
  booktitle    = lrec98,
  address      = {Granada},
  year         = {1998},
  organization = {ELRA},
  note         = {http://nl.ijs.si/ME/}
}

@inproceedings{lrec98:ces,
  author       = {Nancy Ide},
  title        = {{Corpus Encoding Standard}: {SGML} Guidelines for Encoding Linguistic Corpora},
  pages        = {463--470},
  booktitle    = lrec98,
  address      = {Granada},
  year         = {1998},
  organization = {ELRA},
  note         = {http://www.cs.vassar.edu/CES/}
}

@inproceedings{Erj:TihMTE,
  author    = {Toma{\v{z}} Erjavec and Nancy Ide and Vladim{\'\i}r Petkevi{\v{c}} and Jean V{\'e}ronis},
  title     = {{MULTEXT-East}: Multilingual Text Tools and Corpora for {Central and Eastern European} Languages},
  booktitle = {Proceedings of the First {TELRI} European Seminar: Language Resources for Language Technology},
  year      = {1996},
  pages     = {87--98},
  note      = {15--16 September 1995, Tihany, Hungary}
}

@inproceedings{coling94:mt,
  author    = {Nancy Ide and Jean V{\'e}ronis},
  title     = {{MULTEXT} (Multilingual Tools and Corpora)},
  booktitle = {Proceedings of the 15th {CoLing}},
  address   = {Kyoto},
  year      = {1994},
  pages     = {90--96}
}

%%%%%%%%%%%%%%%%%% Deliverable reports on MULTEXT-East and related projects:

%% MTE:D11F is an edited report with no author field; the key field
%% supplies the text that styles fall back on for sorting and labels.
@techreport{MTE:D11F,
  key         = {Toma{\v{z}} Erjavec and Monica Monachini},
  title       = {Specifications and Notation for Lexicon Encoding},
  type        = {{MULTEXT-East} Final Report},
  number      = {D1.1F},
  year        = {1997},
  month       = dec,
  editor      = {Toma{\v{z}} Erjavec and Monica Monachini},
  institution = ijs,
  address     = {Ljubljana, Slovenia},
  note        = {http://nl.ijs.si/ME/CD/docs/mte-d11f/}
}

@techreport{MTE:D21F,
  title       = {Sample Corpus Collection and Preparation},
  type        = {{MULTEXT-East} Final Report},
  number      = {D2.1F},
  year        = {1997},
  month       = dec,
  editor      = {Toma{\v{z}} Erjavec},
  author      = {Ludmila Dimitrova and Lydia Sinapova and Vladim{\'\i}r Petkevi{\v{c}} and Jana Kl{\'\i}mov{\'a} and Vera Schmiedtov{\'a} and Heiki-Jaan Kaalep and Viire Villandi and Heili Orav and Leho Paldre and Urve Talvik and Kadri Muischnek and Csaba Oravecz and Laszlo Tihanyi and {\c{S}}tefan Bruda and C{\u{a}}lin Diaconu and Lidia Diaconu and Dan Tufi{\c{s}} and Toma{\v{z}} Erjavec and Miro Romih and Olga Vukovi{\v{c}}},
  institution = ijs,
  address     = {Ljubljana, Slovenia},
  note        = {70pp}
}

@techreport{MTE:D23F,
  title       = {Corpus Markup},
  type        = {{MULTEXT-East} Final Report},
  number      = {D2.3F},
  year        = {1997},
  month       = dec,
  author      = {Greg Priest-Dorman and Toma{\v{z}} Erjavec and Nancy Ide and Vladim{\'\i}r Petkevi{\v{c}}},
  institution = ijs,
  address     = {Ljubljana, Slovenia},
  note        = {34pp}
}

%% The surname is the two-word "Di Cristo", hence the comma form.
@techreport{mt:mtseg,
  title       = {{MtSeg}: The {Multext} multilingual segmenter tools},
  type        = {{MULTEXT} Deliverable},
  number      = {MSG 1, Version 1.3.1},
  year        = {1996},
  author      = {Di Cristo, Philippe},
  institution = {CNRS},
  address     = {Aix-en-Provence},
  note        = {http://www.lpl.univ-aix.fr/projects/multext/MtSeg/}
}

%% In mt:D161B and eagles:morphsyn the editors stand in as authors. The
%% "(eds.)" marker is kept so the printed role is unchanged, but it is
%% written in comma form so that the parsed surname stays "Monachini".
@techreport{mt:D161B,
  title       = {Common Specifications and Notation for Lexicon Encoding and Preliminary Proposal for the Tagsets},
  type        = {{MULTEXT} Deliverable},
  number      = {D1.6.1B},
  year        = {1995},
  editor      = {Nuria Bel and Nicoletta Calzolari and Monica Monachini},
  author      = {Nuria Bel and Nicoletta Calzolari and Monachini (eds.), Monica},
  institution = {ILC},
  address     = {Pisa}
}

@techreport{eagles:morphana,
  title       = {Recommendations for the Morphosyntactic Annotation of Corpora},
  author      = {Geoffrey Leech and Andrew Wilson},
  type        = {{EAGLES} Report},
  number      = {EAG--TCWG--MAC/R},
  year        = {1996},
  institution = {ILC},
  address     = {Pisa},
  note        = {http://www.ilc.cnr.it/EAGLES96/annotate/}
}

@techreport{eagles:morphsyn,
  title       = {Synopsis and Comparison of Morphosyntactic Phenomena Encoded in Lexicons and Corpora: A Common Proposal and Applications to {European} Languages},
  author      = {Nicoletta Calzolari and Monachini (eds.), Monica},
  type        = {{EAGLES} Report},
  number      = {EAG--CLWG--MORPHSYN/R},
  year        = {1996},
  institution = {ILC},
  address     = {Pisa},
  note        = {http://www.ilc.cnr.it/EAGLES96/morphsyn/}
}

%% NOTE(review): the institution field, required for technical reports, is
%% missing here. Presumably IMS, Universitaet Stuttgart -- confirm and add.
@techreport{eagles:morphsyn-de,
  title  = {{EAGLES} specification for {German} morphosyntax},
  author = {Simone Teufel and Christine St{\"o}ckert},
  year   = {1996}
}

%%%%%%%%%% Sense disambiguation:

@article{ide00:wsd,
  author  = {Nancy Ide},
  title   = {Cross-lingual sense determination: Can it work?},
  journal = {Computers and the Humanities},
  pages   = {223--234},
  volume  = {34},
  number  = {1--2},
  year    = {2000}
}

@inproceedings{IdeErjTuf2001,
  author    = {Nancy Ide and Toma{\v{z}} Erjavec and Dan Tufi{\c{s}}},
  title     = {Automatic Sense Tagging Using Parallel Corpora},
  pages     = {83--89},
  booktitle = nlprs01,
  year      = {2001},
  address   = {Tokyo}
}

@inproceedings{IdeErjTuf2002,
  author       = {Nancy Ide and Toma{\v{z}} Erjavec and Dan Tufi{\c{s}}},
  title        = {Sense Discrimination with Parallel Corpora},
  pages        = {54--60},
  booktitle    = {Workshop on Word Sense Disambiguation: Recent Successes and Future Directions},
  year         = {2002},
  address      = {Philadelphia},
  organization = {ACL},
  month        = jul
}

%%%%%%%%%%%%%%%% Tagging experiments:

@inproceedings{anlp00:hajic,
  author    = {Jan Haji{\v{c}}},
  title     = {Morphological Tagging: Data vs.\ Dictionaries},
  booktitle = {{ANLP/NAACL} 2000},
  address   = {Seattle},
  year      = {2000}
}

@inproceedings{lnia99:tufis,
  author    = {Dan Tufi{\c{s}}},
  title     = {Tiered Tagging and Combined Language Model Classifiers},
  editor    = {Jelinek and Noth},
  booktitle = {Text, Speech and Dialogue},
  volume    = {1692},
  series    = lnai,
  pages     = {28--33},
  year      = {1999},
  publisher = springer
}

@inproceedings{lrec00:tufis,
  author       = {Dan Tufi{\c{s}}},
  title        = {Using a Large Set of {EAGLES}-compliant Morpho-Syntactic Descriptors as Tags for Probabilistic Tagging},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}

@inproceedings{tufis:eq01,
  author    = {Dan Tufi{\c{s}} and Ana Maria Barbu},
  title     = {Accurate Automatic Extraction of Translation Equivalents from Parallel Corpora},
  booktitle = {Proceedings of the Corpus Linguistics 2001 conference},
  address   = {Lancaster},
  pages     = {581--586},
  series    = {UCREL technical paper},
  volume    = {13},
  year      = {2001}
}

@techreport{dzerza99,
  author      = {Sa{\v{s}}o D{\v{z}}eroski and Toma{\v{z}} Erjavec and Jakub Zavrel},
  title       = {Morphosyntactic Tagging of {Slovene}: Evaluating {PoS} Taggers and Tagsets},
  type        = {Research Report},
  number      = {IJS-DP 8018},
  institution = ijs,
  address     = {Ljubljana, Slovenia},
  year        = {1999},
  note        = {http://nl.ijs.si/lll/bib/dzerza-report/}
}

@inproceedings{lrec00:dzeroskietal,
  author       = {Sa{\v{s}}o D{\v{z}}eroski and Toma{\v{z}} Erjavec and Jakub Zavrel},
  title        = {Morphosyntactic Tagging of {Slovene}: Evaluating {PoS} Taggers and Tagsets},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}

@inproceedings{lrec00:tufisetal,
  author       = {Dan Tufi{\c{s}} and Peter Dienes and Csaba Oravecz and Tamas Varadi},
  title        = {Principled Hidden Tagset Design for Tiered Tagging of {Hungarian}},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}

%% NOTE(review): the author field used to read "Marian Boldea Bohus", which
%% looks like two authors fused into one name; split as below -- please
%% confirm against the LREC 2000 proceedings.
@inproceedings{lrec00:bohus,
  author       = {Dan Bohus and Marian Boldea},
  title        = {A Web-Based Text Corpora Development System},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA},
  note         = {http://www.cs.cmu.edu/People/dbohus/docs/wbtcds\_lrec2000.ps.gz}
}

%%% Papers on Machine Learning, esp. ILP:
%%% Bibliography from http://www.cs.york.ac.uk/mlg/lll/

@inproceedings{cussens99:_morph,
  author    = {James Cussens and Sa{\v{s}}o D{\v{z}}eroski and Toma{\v{z}} Erjavec},
  title     = {Morphosyntactic Tagging of {Slovene} using {Progol}},
  booktitle = ilp99,
  year      = {1999},
  editor    = {Sa{\v{s}}o D{\v{z}}eroski and Peter Flach},
  address   = {Bled, Slovenia},
  month     = jun,
  series    = lnai,
  volume    = {1634},
  pages     = {68--79},
  publisher = springer,
  abstract  = {We consider the task of tagging Slovene words with morphosyntactic descriptions (MSDs). MSDs contain not only part-of-speech information but also attributes such as gender and case. In the case of Slovene there are 2,083 possible MSDs. P-Progol was used to learn morphosyntactic disambiguation rules from annotated data (consisting of 161,314 examples) produced by the MULTEXT-East project. P-Progol produced 1,148 rules taking 36 hours. Using simple grammatical background knowledge, e.g.\ looking for case disagreement, P-Progol induced 4,094 clauses in eight parallel runs. These rules have proved effective at detecting and explaining incorrect MSD annotations in an independent test set, but have not so far produced a tagger comparable to other existing taggers in terms of accuracy.}
}

@inproceedings{kazakov99:_learn,
  author    = {Dimitar Kazakov and Suresh Manandhar and Toma{\v{z}} Erjavec},
  title     = {Learning word segmentation rules for tag prediction},
  booktitle = ilp99,
  year      = {1999},
  editor    = {Sa{\v{s}}o D{\v{z}}eroski and Peter Flach},
  address   = {Bled, Slovenia},
  month     = jun,
  series    = lnai,
  volume    = {1634},
  pages     = {152--161},
  publisher = springer
}

@inproceedings{ManDzeErj:Clog,
  author    = {Suresh Manandhar and Sa{\v{s}}o D{\v{z}}eroski and Toma{\v{z}} Erjavec},
  title     = {Learning Multilingual Morphology with {CLOG}},
  editor    = {David Page},
  booktitle = {Inductive Logic Programming; 8th International Workshop ILP-98, Proceedings},
  series    = lnai,
  volume    = {1446},
  pages     = {135--144},
  publisher = springer,
  year      = {1998}
}

@article{er98,
  author  = {Sa{\v{s}}o D{\v{z}}eroski and Toma{\v{z}} Erjavec},
  title   = {Inductive Learning of Multilingual Morphology},
  journal = {Electrotechnical Review},
  address = {Ljubljana, Slovenia},
  volume  = {65},
  number  = {6},
  pages   = {296--302},
  year    = {1998}
}

@inproceedings{Alexin98-WAGA99:proc,
  author     = {Zolt{\'a}n Alexin and Szilvia Zvada and Tibor Gyim{\'o}thy},
  title      = {Application of {AGLEARN} on {Hungarian} {Part-of-speech} {Tagging}},
  booktitle  = {Second Workshop on Attribute Grammars and their Applications, {WAGA}'99},
  year       = {1999},
  address    = {Amsterdam, The Netherlands},
  editor     = {D. Parigot and M. Mernik},
  publisher  = {INRIA Rocquencourt},
  pages      = {133--152},
  month      = mar,
  mail       = {alexin@inf.u-szeged.hu,gyimi@inf.u-szeged.hu,zvada@inf.u-szeged.hu},
  abstract   = {In this paper we present an application of the AGLEARN method to the part-of-speech (POS) tagging of Hungarian sentences. The task of AGLEARN is to infer the semantic functions associated with production. In the learning process the grammar, the background semantic functions and the examples can be used. We applied the AGLEARN method to infer context rules to choose the correct tags. A corpus with about 100 000 pre-tagged words has been used for training and testing. By using AGLEARN method learning data are generated to the C 4.5 attribute value learner. These generated data contain information about the phrase structure of the sentences. A background attribute grammar has been used to determine these sructural information. Our experinces showed that using this structural background information C4.5 learner was able to infer more precise context rules.},
  postscript = {WAGA99/proceedings/alexin/alexin.ps},
  ppdf       = {WAGA99/proceedings/alexin/alexin.pdf},
  note       = {http://www-rocq.inria.fr/oscar/www/fnc2/WAGA99/accept.html}
}

@inproceedings{Horvath99-ILP99:proc,
  author    = {T. Horv{\'a}th and Zolt{\'a}n Alexin and Tibor Gyim{\'o}thy and S. Wrobel},
  title     = {Application of {Different} {Learning} {Methods} to {Hungarian} {Part-of-speech} {Tagging}},
  booktitle = ilp99,
  pages     = {128--139},
  year      = {1999},
  editor    = {Sa{\v{s}}o D{\v{z}}eroski and Peter Flach},
  address   = {Bled, Slovenia},
  month     = jun,
  volume    = {1634},
  series    = lnai,
  publisher = springer
}

%%%%%%%%%% Papers on the MULTEXT-East related Concede project:

@inproceedings{complex99:cnc,
  author    = {Toma{\v{z}} Erjavec and Dan Tufi{\c{s}} and Tamas Varadi},
  title     = {Developing {TEI}-conformant Lexical Databases for {CEE} Languages},
  booktitle = {Proceedings of the 4th International Conference on Computational Lexicography, {COMPLEX}'99},
  year      = {1999},
  pages     = {205--209},
  address   = {P{\'e}cs, Hungary}
}

@inproceedings{lrec00:cnc,
  author       = {Toma{\v{z}} Erjavec and Roger Evans and Nancy Ide and Adam Kilgarriff},
  title        = {The {Concede} Model for Lexical Databases},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}

@inproceedings{cnc-complex03,
  author    = {Toma{\v{z}} Erjavec and Roger Evans and Nancy Ide and Adam Kilgarriff},
  title     = {From Machine Readable Dictionaries to Lexical Databases: the {Concede} Experience},
  booktitle = {Proceedings of the 7th International Conference on Computational Lexicography, {COMPLEX}'03},
  year      = {2003},
  address   = {Budapest, Hungary}
}

%%%%%%%%%% Language Specific papers on MULTEXT-East resources:

%% The booktitle below is taken from lnia99:tufis, which cites the same
%% series volume (Lecture Notes in Artificial Intelligence 1692).
@inproceedings{petkevic:orwl,
  author    = {Vladim{\'\i}r Petkevi{\v{c}}},
  title     = {{Czech} translation of {G. Orwell's} '1984': Morphology and syntactic patterns in the corpus},
  booktitle = {Text, Speech and Dialogue},
  series    = lnai,
  volume    = {1692},
  pages     = {77--82},
  year      = {1999},
  publisher = springer
}

@inproceedings{erk:mtelex,
  author    = {Toma{\v{z}} Erjavec},
  title     = {The {Multext-East Slovene Lexicon}},
  booktitle = {Proceedings of the 7th Slovene Electrotechnical Conference, {ERK} '98},
  pages     = {189--192},
  address   = {Portoro{\v{z}}, Slovenia},
  year      = {1998},
  note      = {http://nl.ijs.si/et/Bib/ERK98/}
}

@inproceedings{sdjt:fida,
  author       = {Toma{\v{z}} Erjavec and Vojko Gorjanc and Marko Stabej},
  title        = {Korpus {FIDA}},
  booktitle    = {Proceedings of the Conference 'Language Technologies for the Slovene Language'},
  organization = ijs,
  pages        = {124--127},
  address      = {Ljubljana, Slovenia},
  year         = {1998}
}

@inproceedings{lrec00:rojckacic,
  author       = {Matej Rojc and Zdravko Ka{\v{c}}i{\v{c}}},
  title        = {A Computational Platform for Development of Morphologic and Phonetic Lexica},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}

%% The first author's surname had a mis-encoded character ("Lev?ne");
%% restored as a-macron -- confirm against the proceedings.
@inproceedings{lrec00:lat,
  author       = {Kristine Lev{\=a}ne and Andrejs Spektors},
  title        = {Morphemic Analysis and Morphological Tagging of {Latvian} Corpus},
  booktitle    = lrec00,
  address      = {Athens},
  year         = {2000},
  organization = {ELRA}
}