mybib.bib

% LaTeX helpers made available to the bibliography, emitted once ahead of it:
%   \nic{...}   swallows its argument and prints nothing
%   \urls{...}  typesets its argument in \small
% \frenchspacing turns off the extra space after sentence-ending punctuation.
% \providecommand (rather than \newcommand) keeps a document that already
% defines these macros, or emits this preamble twice, from failing.
@preamble{{\providecommand{\nic}[1]{}\frenchspacing\providecommand{\urls}[1]{{\small #1}}}}
@article{cermak:rosen:10,
  abstract = {This paper introduces InterCorp, a parallel corpus including texts in Czech and 27 other languages, available for online searches via a web interface. After discussing some issues and merits of a multilingual resource we argue that it has an important role especially for languages with fewer native speakers, supporting both comparative research and studies of the language from the perspective of other languages. We proceed with an overview of the corpus --- the strategy and criteria for including new texts, the representation of available languages and text types, linguistic annotation, and a sketch of pre-processing issues. Finally, we present the search interface and suggest some research opportunities.},
  author = {František Čermák and Alexandr Rosen},
  date-modified = {2018-07-21 14:52:22 +0000},
  issn = {1384-6655},
  journal = {International Journal of Corpus Linguistics},
  keywords = {parallel corpora; comparative corpus linguistics; European languages; Czech; multilingualism; intercorp},
  number = {3},
  pages = {411--427},
  title = {The Case of {InterCorp}, a multilingual parallel corpus},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2012_intercorp_ijcl.pdf},
  volume = {17},
  year = {2012},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2012_intercorp_lrej.pdf}
}
@article{Haj:Hajova:Ros:92,
  author = {Jan Hajič and Eva Hajičová and Alexandr Rosen},
  date-modified = {2018-07-21 14:42:46 +0000},
  journal = {META},
  keywords = {machine translation},
  note = {A special issue, eds. M. C. Cormier and D. Estival},
  number = {4},
  pages = {802--816},
  title = {Machine Translation Research in {Czechoslovakia}},
  volume = {37},
  year = {1992}
}
@article{hajic:etal:1996,
  author = {Jan Hajič and Eva Hajičová and Alexandr Rosen},
  date-added = {2013-05-31 15:30:36 +0000},
  date-modified = {2018-07-21 14:42:13 +0000},
  journal = {TELRI Newsletter},
  keywords = {treebank},
  number = {3},
  pages = {12--19},
  title = {Formal Representation of Language Structure},
  year = {1996}
}
@techreport{Hajicova:etal:1994,
  author        = {Hajičová, Eva and Sgall, Petr and Uszkoreit, Hans and Oliva, Karel and Plátek, Martin and Kuboň, Vladislav and Hric, Jano and Bémová, Alevtina and Petkevič, Vladimír and Skoumalová, Hana and Rosen, Alexandr and Korbayová, Ivana},
  title         = {Adaptation and transfer of parsing techniques},
  type          = {Deliverable of the joint project {PECO} 2824},
  institution   = {Charles University},
  year          = {1994},
  keywords      = {grammar checker},
  date-added    = {2012-10-21 18:47:53 +0000},
  date-modified = {2018-07-21 14:44:07 +0000}
}
@inproceedings{hana:etal:10,
  address = {Uppsala, Sweden},
  author = {Jirka Hana and Alexandr Rosen and Svatava Škodová and Barbora Štindlová},
  booktitle = {Proceedings of the Fourth Linguistic Annotation Workshop},
  date-modified = {2018-07-19 15:06:42 +0000},
  isbn = {978-1-932432-72-5},
  keywords = {akces; CzeSL},
  pages = {11--19},
  publisher = {Association for Computational Linguistics},
  title = {Error-tagged learner corpus of {Czech}},
  url = {http://www.aclweb.org/anthology/W10-1802},
  year = {2010},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/hanaetal_law2010.pdf}
}
@inproceedings{Hana:etal:2012,
  address = {Istanbul, Turkey},
  author = {Jirka Hana and Alexandr Rosen and Barbora Štindlová and Petr Jäger},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)},
  date-added = {2012-05-28 22:46:05 +0000},
  date-modified = {2018-07-19 15:18:12 +0000},
  editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  eventdate = {2012-05-23/2012-05-25},
  isbn = {978-2-9517408-7-7},
  keywords = {learner corpus; akces; CzeSL},
  language = {english},
  month = may,
  pages = {3228--3232},
  publisher = {European Language Resources Association (ELRA)},
  title = {Building a learner corpus},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/992_Paper.pdf},
  year = {2012},
  bdsk-url-1 = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/992_Paper.pdf}
}
@article{Hana:etal:2014,
  abstract = {The need for data about the acquisition of Czech by non-native learners prompted the compilation of the first learner corpus of Czech. After introducing its basic design and parameters, including a multi-tier manual annotation scheme and error taxonomy, we focus on the more technical aspects: transcription of hand-written source texts, process of annotation, and options for exploiting the result, together with tools used for these tasks and decisions behind the choices. To support or even substitute manual annotation we assign some error tags automatically and use automatic annotation tools (tagger, spell checker).},
  author = {Jirka Hana and Alexandr Rosen and Barbora Štindlová and Jan Štěpánek},
  date-added = {2013-04-14 15:44:30 +0000},
  date-modified = {2018-07-19 15:17:49 +0000},
  doi = {10.1007/s10579-014-9278-z},
  issn = {1574-020X},
  journal = {Language Resources and Evaluation},
  keywords = {learner corpus; error annotation; Czech; akces; CzeSL},
  number = {4},
  pages = {741--752},
  publisher = {Springer Netherlands},
  title = {Building a learner corpus},
  url = {https://www.researchgate.net/publication/265164699_Building_a_learner_corpus},
  volume = {48},
  year = {2014},
  bdsk-url-1 = {https://www.researchgate.net/publication/265164699_Building_a_learner_corpus},
  bdsk-url-2 = {http://dx.doi.org/10.1007/s10579-014-9278-z}
}
@incollection{Hebal:etal:2016,
  abstract = {The aim of the text is to confront expectations of parallel corpora users with possibilities of their builders. The idea arose from discussions of the first two co-authors as representatives of the corpus users with some of the builders of InterCorp, especially with the third co-author, mainly as a response to the fact that the efforts of the corpus builders (aimed, i.a., at a steady growth of the text volumes and improvements of corpus search tools) do not quite meet some research needs of the users. Our comments are presented from two points of view: the builder's perspective and the users' perspective, based on comparative analyses and translatological studies.

Między młotem a kowadłem, czyli czego potrzebuje użytkownik korpusu równoległego, a jakie są możliwości twórców korpusów (na przykładzie czesko-polskiej części korpusu równoległego InterCorp)

Celem artykułu jest próba porównania oczekiwań użytkownika korpusu równoległego co do możliwości prowadzenia różnego typu badań, zwłaszcza analiz konfrontatywnych oraz translatologicznych z technicznymi możliwościami twórców korpusu.
Autorzy rozpoczynają rozważania od szczegółowego opisu problemów twórców InterCorp. Wskazują na największe bolączki polegające na braku proporcji pomiędzy liczbą tekstów w poszczególnych językach umieszczonych w korpusie, a także na tym, że teksty reprezentują różne poziomy anotacji i tagowania. Szczegółowo opisana została polska część korpusu InterCorp. Autorzy podają dane statystyczne dotyczące poszczególnych wersji korpusu. Wiele miejsca poświęcono również problemowi anotacji i tokenizacji (znakowania). Zauważono, że dużym utrudnieniem jest brak jednolitego systemu znakowania dla wszystkich obecnych w InterCorpie języków.
Na przedstawione w skrócie problemy twórców korpusu nakładają się trudności, jakie napotykają jego użytkownicy oraz ich oczekiwania względem jego zasobów. Osoby korzystające np. z zasobów polsko-czeskiej części InterCorpu narzekać mogą na zestawienie tekstów. O ile literatura piękna jest opracowywana ręcznie, o tyle tzw. kolekcje tekstów (Acquis, PressEurope, Europarl, Open Subtitles) są opracowywane tylko automatycznie. Paradoksalnie więc teksty, które nie sprawiają kłopotów twórcom korpusu, są dla niektórych użytkowników mniej przydatne. Nie można na przykład przeprowadzić szeregu badań opartych na materiale korpusowym, jeżeli nie da się ustalić kierunku przekładu albo języka źródłowego. Dotyczy to wszystkich analiz translatologicznych. Również niedostateczna wielkość korpusu stanowi dla użytkowników dużą przeszkodę. Zbyt mała liczba poświadczeń może uniemożliwić całkowicie przeprowadzenie badań nad konkretnym zjawiskiem leksykalnym czy gramatycznym (przykłady podane zostały w artykule).
Użytkownicy sięgają jednak do korpusów paralelnych, ponieważ, mimo wszelkich niedociągnięć, stanowią one niezwykłe narzędzie służące do poszukiwania ekwiwalentów, a także porównywania znaczeń jednostek językowych. Dopasowanie odpowiedniego tematu badania do możliwości korpusu jest w tym przypadku podstawową czynnością poprzedzającą samo badanie, a jednocześnie gwarantem wiarygodności wyników.
Sposób rozbudowywania InterCorpu jest sprawą powodującą prawdopodobnie największe kontrowersje pomiędzy twórcami a użytkownikami korpusu. Korzystającym z części polsko-czeskiej czy czesko-angielskiej zależy na tym, aby twórcy poświęcili jak najwięcej uwagi tej konkretnej parze języków, tę część rozbudowywali i doskonalili. Twórcy natomiast chcą uwzględnić w korpusie jak najwięcej języków. Z punktu widzenia użytkowników to zabieg mniej ważny, z punktu widzenia twórców to działanie przyszłościowe. Zarówno użytkownik korpusu, jak i jego twórca, znajdują się w sytuacji pomiędzy tym, co mogą i tym, co by chcieli -- między swoistym młotem i kowadłem.

korpus równoległy, język polski, język czeski, badania komparatywne, ekwiwalenty leksykalne},
  address = {Warszawa},
  author = {Milena Hebal-Jezierska and Elżbieta Kaczmarska and Alexandr Rosen},
  booktitle = {Polskojęzyczne korpusy równoległe. Polish-language Parallel Corpora},
  date-added = {2016-03-23 20:00:16 +0000},
  date-modified = {2018-07-21 14:50:16 +0000},
  editor = {Ewa Gruszczyńska and Agnieszka Leńko-Szymańska},
  keywords = {parallel corpus; Polish; Czech; comparative studies; lexical equivalents; contrastive lexical analysis; intercorp},
  pages = {41--56},
  publisher = {Instytut Lingwistyki Stosowanej},
  series = {Multilingual Applied Linguistics -- Wielojęzyczna Lingwistyka Stosowana},
  title = {Between the devil and the deep blue sea or between users' needs and the compilers' powers: An analysis of the {Czech-Polish} part of the parallel corpus {InterCorp}},
  url = {http://rownolegle.blog.ils.uw.edu.pl/files/2016/03/03_Hebal-Jezierska_Kaczmarska_Rosen.pdf},
  volume = {1},
  year = {2016},
  bdsk-url-1 = {http://rownolegle.blog.ils.uw.edu.pl/files/2016/03/03_Hebal-Jezierska_Kaczmarska_Rosen.pdf}
}
@inproceedings{Hnatkova:etal:2011,
  address = {Praha},
  author = {Milena Hnátková and Petr Jäger and Tomáš Jelínek and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová},
  booktitle = {Korpusová lingvistika Praha 2011: 3 -- Gramatika a značkování korpusů},
  date-added = {2015-11-25 21:45:39 +0000},
  date-modified = {2018-07-21 14:28:12 +0000},
  editor = {Vladimír Petkevič and Alexandr Rosen},
  keywords = {treebank},
  pages = {143--153},
  publisher = {Nakladatelství Lidové noviny},
  series = {Studie z korpusové lingvistiky},
  title = {Syntakticky anotovaný korpus českých textů},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011_synt_korpling.pdf},
  volume = {16},
  year = {2011},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2011_synt_korpling.pdf}
}
@inproceedings{Hnatkova:etal:2017,
  abstract = {We propose a multidimensional taxonomy of multiword expressions (MWEs) as a pattern applicable to entries in a representative lexicon of Czech MWEs. The taxonomy and the lexicon are useful for many reasons concerning lexicography, teaching Czech as a foreign language, and theoretical issues of MWEs as entities standing between lexicon and grammar, as well as for NLP tasks such as tagging and parsing, identification and search of MWEs, or word sense and semantic disambiguation. In addition to the description of various types of idiomaticity, the taxonomy and the lexicon are designed to account for flexibility in morphology and word order, syntactic and lexical variants and even creatively used fragments.},
  address = {Cham},
  author = {Milena Hnátková and Tomáš Jelínek and Marie Kopřivová and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Pavel Vondřička},
  booktitle = {Computational and Corpus-Based Phraseology. EUROPHRAS 2017},
  date-added = {2018-01-31 14:18:37 +0000},
  date-modified = {2018-07-21 14:26:38 +0000},
  doi = {10.1007/978-3-319-69805-2_12},
  editor = {Ruslan Mitkov},
  isbn = {978-3-319-69804-5},
  keywords = {MWE},
  pages = {160--175},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Eye of a Needle in a Haystack},
  url = {https://link.springer.com/chapter/10.1007/978-3-319-69805-2_12#citeas},
  volume = {10596},
  year = {2017},
  bdsk-url-1 = {https://link.springer.com/chapter/10.1007/978-3-319-69805-2_12#citeas},
  bdsk-url-2 = {https://dx.doi.org/10.1007/978-3-319-69805-2_12}
}
@article{Hnatkova:etal:2018,
  author = {Milena Hnátková and Tomáš Jelínek and Marie Kopřivová and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Pavel Vondřička},
  date-added = {2020-01-24 13:39:37 +0100},
  date-modified = {2020-11-14 22:30:52 +0100},
  issn = {1804-137X},
  journal = {Korpus – gramatika – axiologie},
  keywords = {MWE},
  number = {17},
  pages = {3--22},
  title = {Lepší vrabec v hrsti nežli holub na střeše. {Víceslovné} lexikální jednotky v češtině: typologie a slovník},
  volume = {9},
  year = {2018}
}
@inproceedings{Hnatkova:etal:2019,
  address = {St. Petersburg},
  author = {Milena Hnátková and Tomáš Jelínek and Marie Kopřivová and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Pavel Vondřička},
  booktitle = {Trudy meždunarodnoj konferencii „Korpusnaja lingvistika – 2019“ (Proceedings of the International Conference „Corpus Linguistics – 2019“)},
  date-added = {2019-11-28 16:28:31 +0100},
  date-modified = {2019-11-29 15:50:27 +0100},
  editor = {Viktor Pavlovich Zakharov},
  issn = {2412-9623},
  pages = {9--16},
  publisher = {Saint Petersburg University Press},
  title = {Lexical Database of Multiword Expressions in {Czech}},
  year = {2019}
}
@inproceedings{jaeger:etal:10,
  address = {Frankfurt am Main},
  author = {Petr Jäger and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová},
  booktitle = {Slavic Languages in Formal Grammar. Proceedings of FDSL 8.5, Brno 2010},
  date-modified = {2018-07-21 14:28:29 +0000},
  editor = {Markéta Ziková and Mojmír Dočekal},
  isbn = {978-3-631-63609-1},
  keywords = {treebank},
  pages = {49--63},
  publisher = {Peter Lang},
  title = {Towards a treebank for all tastes},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2010-gacr-fdsl.pdf},
  year = {2012},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2010-gacr-fdsl.pdf}
}
@unpublished{Janssen:Rosen:2018,
  abstract = {Different ways of tagging errors in learner corpora

The most distinctive feature of learner corpora is the fact that texts written by language learners contain errors, or deviant forms, or non-native variants of the target language if you wish. To provide a systematic analysis of those errors, learner corpora typically include error annotation, indicating errors in the text. This is traditionally done by an error code assigned to the incorrect part of the text, optionally accompanied by a target hypothesis, i.e. a reformulation of the error in the native standard of the target language. While standards are emerging for linguistic annotation of corpora including standard native language, choosing appropriate categories for annotating errors is not easy. The codes usually reflect their interpretation in terms of a standard grammar (spelling, morphological paradigms, morphosyntactic categories, agreement, government or valency, etc.), and thus their design and application to various phenomena of a non-native language is far from trivial. Moreover, the interplay of categories presumably responsible for the phenomena is not easily represented by tags assigned to the linear text of the original.

We compare the traditional linear approach to error annotation with two alternative approaches: (i) the approach introduced in the COPLE2 corpus (del Río et al. 2016), in which errors are not indicated by a code, but rather the error is provided with an orthographic, a morphosyntactic, and a lexical correction, which together provide detailed information about the type of error; and (ii) the approach introduced in the CzeSL corpus, in which the erroneous sentence is aligned with two corrected versions of the sentence, with alignments between the words in the three variants, as well as error codes (Hana et al. 2014).

For the comparison, we will describe what the various options provide, using the two tools in their respective projects as examples: feat1 for the parallel scheme approach, and TEITOK (Janssen 2016) for the multi-layered correction approach. We look at how the different approaches can represent complex cases of overlapping errors (where a group of words is involved in various distinct errors), discontinuous errors (where the words involved in an error are not next to each other), word order errors, and secondary errors (where the correction of one error leads to another error). We will not only discuss how the three different paradigms can represent the complex error cases, but also how once the errors are correctly represented in the corpus, they can be used for concrete search queries to answer research questions.
Error annotation of learner corpora is often combined with linguistic annotation, as it is applied in corpora of standard (native) language. If a corrected version of the text is available, standard tools (such as tokenizers, taggers, and parsers) can be used to apply such tools with high accuracy, and the result can even be projected to or linked with the original uncorrected text. Annotating the uncorrected original text is typically a more challenging task, both in terms of accuracy of existing tools, and in terms of missing concepts. We will show how the paradigms discussed above integrate with annotations of this type.

1 https://bitbucket.org/jhana/feat-hg/wiki/Home
 
References
del Río, I., Antunes, S., Mendes, A., and Janssen, M. (2016). Towards error annotation in a learner corpus of Portuguese. In 5th NLP4CALL and 1st NLP4LA workshop in Sixth Swedish Language Technology Conference (SLTC), Umeå University, Sweden.
Hana, J., Rosen, A., Štindlová, B., and Štěpánek, J. (2014). Building a learner corpus. Language Resources and Evaluation, 48(4):741–752.
Janssen, M. (2016). TEITOK: Text Faithful Corpora. In: Proceedings of LREC 2016. ELRA. Portorož, Slovenia, pp. 4037–4043.},
  author = {Maarten Janssen and Alexandr Rosen},
  booktitle = {Grammar \& Corpora 2018},
  date-added = {2018-06-18 14:21:30 +0000},
  date-modified = {2020-08-09 14:20:57 +0200},
  keywords = {akces; CzeSL},
  local-url = {https://docs.google.com/presentation/d/1LdtMiKKif6PFLFwqi6LU3VYFR_K7-5xzLT6KhN4uAWw/edit#slide=id.g44349cf621_0_122},
  note = {A talk presented at the 7th conference Grammar and Corpora (GaC 2018) in Paris, November 2018},
  title = {Different ways of tagging errors in learner corpora},
  url = {http://drehu.linguist.univ-paris-diderot.fr/gac-2018/?fichier=programme},
  year = {2018},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxAuLi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE4LWN6ZXNsLUdhQy9HQUMyMDE4LnBkZk8RAWgAAAAAAWgAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////wtHQUMyMDE4LnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAFBERiBDQVJPAAEABQAACiBjdQAAAAAAAAAAAAAAAAAOMjAxOC1jemVzbC1HYUMAAgA3LzpVc2VyczphaXI6UFJPSkVDVFM6Q0hLOl9QUDoyMDE4LWN6ZXNsLUdhQzpHQUMyMDE4LnBkZgAADgAYAAsARwBBAEMAMgAwADEAOAAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIANVVzZXJzL2Fpci9QUk9KRUNUUy9DSEsvX1BQLzIwMTgtY3plc2wtR2FDL0dBQzIwMTgucGRmAAATAAEvAAAVAAIACv//AAAACAANABoAJABVAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAcE=}
}
@inproceedings{Jelinek:etal:2012,
  address = {Istanbul, Turkey},
  author = {Tomáš Jelínek and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová},
  booktitle = {Proceedings of the META-RESEARCH Workshop on Advanced Treebanking, LREC 2012},
  date-added = {2012-04-18 08:32:25 +0000},
  date-modified = {2013-04-23 21:39:08 +0000},
  editor = {Jan Hajič and Koenraad De Smedt and Marko Tadić and António Branco},
  isbn = {978-2-9517408-7-7},
  keywords = {treebank},
  pages = {37--44},
  publisher = {European Language Resources Association (ELRA)},
  title = {Czech Treebanking Unlimited},
  url = {http://utkl.ff.cuni.cz/synttb/Team_Czech_Treebanking_Unlimited.pdf},
  year = {2012},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/synttb/Team_Czech_Treebanking_Unlimited.pdf}
}
@incollection{Jelinek:etal:2012a,
  author        = {Jelínek, Tomáš and Štindlová, Barbora and Rosen, Alexandr and Hana, Jirka},
  title         = {Combining Manual and Automatic Annotation of a Learner Corpus},
  editor        = {Sojka, Petr and Horák, Aleš and Kopeček, Ivan and Pala, Karel},
  booktitle     = {Text, Speech and Dialogue: 15th International Conference, TSD 2012, Brno, Czech Republic, September 3-7, 2012. Proceedings},
  pages         = {127--134},
  publisher     = {Springer Berlin Heidelberg},
  address       = {Berlin, Heidelberg},
  year          = {2012},
  isbn          = {978-3-642-32790-2},
  doi           = {10.1007/978-3-642-32790-2_15},
  url           = {http://dx.doi.org/10.1007/978-3-642-32790-2_15},
  keywords      = {CzeSL},
  date-added    = {2017-04-03 19:45:12 +0000},
  date-modified = {2018-07-21 14:27:37 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1007/978-3-642-32790-2_15}
}
@inproceedings{Jelinek:etal:2014,
  abstract = {We describe main features of a treebank of Czech, licensed by an HPSG-style grammar. The grammar interacts with corpus texts, preprocessed by morphological analysis and morphological disambiguation, a (largely) stochastic dependency parser and subsequent transformation into phrase-structure trees. The resulting trees, including functional and categorial information on words and phrases, are represented as typed feature structures. The grammar cooperates with a valency lexicon: the actual data are matched with surface valency frames, derived by lexical rules from predicate-argument structures in the lexicon. If the match is successful, the resulting annotation is enriched with information derived from the parsed data and from the lexicon. The paper concludes with an evaluation of the individual processing steps.},
  address = {Tübingen},
  author = {Tomáš Jelínek and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Přemysl Vítovec and Jiří Znamenáček},
  booktitle = {Proceedings of the Thirteenth International Workshop on Treebanks and Linguistic Theories (TLT13)},
  date-added = {2014-11-11 19:40:48 +0000},
  date-modified = {2018-07-21 14:28:47 +0000},
  editor = {Verena Henrich and Erhard Hinrichs and Daniel de Kok and Petya Osenova and Adam Przepiórkowski},
  keywords = {treebank},
  pages = {218--229},
  title = {A grammar-licensed treebank of {Czech}},
  url = {http://tlt13.sfs.uni-tuebingen.de/tlt13-proceedings.pdf},
  year = {2014},
  bdsk-url-1 = {http://tlt13.sfs.uni-tuebingen.de/tlt13-proceedings.pdf}
}
@inproceedings{Jelinek:etal:2015,
  author        = {Tomáš Jelínek and Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Přemysl Vítovec},
  title         = {Taking Care of Orphans: Ellipsis in Dependency and Constituency-Based Treebanks},
  editor        = {Markus Dickinson and Erhard Hinrichs and Agnieszka Patejuk and Adam Przepiórkowski},
  booktitle     = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14)},
  pages         = {119--133},
  publisher     = {Institute of Computer Science, Polish Academy of Sciences},
  address       = {Warsaw, Poland},
  year          = {2015},
  isbn          = {978-83-63159-18-4},
  url           = {http://tlt14.ipipan.waw.pl/files/4914/4974/3227/TLT14_proceedings.pdf},
  keywords      = {treebank},
  date-added    = {2015-12-18 18:33:39 +0000},
  date-modified = {2018-07-21 14:29:41 +0000},
  bdsk-url-1    = {http://tlt14.ipipan.waw.pl/files/4914/4974/3227/TLT14_proceedings.pdf}
}
@article{Kaczmarska:etal:2014,
  abstract = {Our goal is to identify factors which determine the choice of Polish equivalents of Czech verbs expressing mental states, which Polish speakers often find difficult to translate or even understand, such as toužit `to miss', `to want' or `to desire', translated, i.a., as marzyć, tęsknić or pragnąć.  We try to predict the equivalents by linear and syntactic contexts where the Czech verbs occur, applying various methods to texts from a parallel corpus. First we manually classify parallel concordances including the verbs according to syntactico-semantic properties of their arguments. The manual analysis is complemented by an automatically extracted bilingual glossary with frequencies. Then we use stochastic classifiers to predict Polish equivalents given the Czech linear and syntactic context. The methods are evaluated on word-aligned test data from the corpus with an inconclusive result: none of the methods was able to outperform a simple baseline model choosing always the most frequent Polish translation. Valency alone may be an important, but not the only factor to decide about Polish equivalents of a selected set of Czech psych verbs.},
  author = {Elżbieta Kaczmarska and Alexandr Rosen and Jirka Hana and Barbora Hladká},
  date-added = {2014-07-15 22:29:17 +0000},
  date-modified = {2018-07-21 14:58:29 +0000},
  issn = {0138-0567},
  journal = {Prace Filologiczne},
  keywords = {Czech; Polish; psych verbs; valency; parallel corpus; word-to-word alignment; stochastic classifier; contrastive lexical analysis},
  pages = {151--174},
  title = {A syntactico-semantic analysis of arguments as a method for establishing equivalents of {Czech} and {Polish} verbs expressing mental states},
  url = {http://www.pracefilologiczne.uw.edu.pl/Wersje-online},
  volume = {LXVII},
  year = {2015},
  bdsk-url-1 = {http://www.pracefilologiczne.uw.edu.pl/Wersje-online}
}
@article{Kaczmarska:Rosen:2013,
  abstract = {Some entries in translation dictionaries (even in those for closely related languages) offer multiple target equivalents with disparate meanings even when the meaning of the source word is not considered ambiguous within the source language. The translator's task of choosing the best fitting text unit is made even more difficult by sparse exemplification and missing or sketchy valency information. Using a parallel corpus, we explore to what extent valency (seen in a broad sense both as a syntactic and semantic property) can help to cope with a few difficult cases of Czech psych verbs in relation to Polish and how the lexical gaps can be filled. Next, we compare the manually compiled lists of translation equivalents in context, annotated and sorted by their valency properties, with results of an automatic method, using word-to-word alignments in a lemmatised parallel corpus to build a list of equivalent pairs, annotated by their frequency. We conclude that valency is an important (though not omnipotent) discriminant for the choice of the target equivalent and that the list of translation pairs extracted automatically from a parallel corpus can be a useful supplement to a standard translation dictionary.},
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  date-added = {2013-07-23 14:05:35 +0000},
  date-modified = {2018-07-21 14:47:07 +0000},
  issn = {0081-7090},
  journal = {Studia z Filologii Polskiej i Słowiańskiej},
  keywords = {psych verbs; valency; Czech; Polish; parallel corpus; extraction of translation equivalents; contrastive lexical analysis},
  pages = {103--121},
  title = {Między znaczeniem leksykalnym a walencją -- próba opracowania metody ekstrakcji ekwiwalentów na podstawie korpusu równoległego ({Between} lexical meaning and valency -- towards a method for extracting equivalents based on a parallel corpus)},
  url = {http://utkl.ff.cuni.cz/~rosen/public/Kaczmarska_Rosen_artykul_SFPiS_2013.pdf},
  volume = {48},
  year = {2013},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/Kaczmarska_Rosen_artykul_SFPiS_2013.pdf}
}
@incollection{Kaczmarska:Rosen:2014,
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  booktitle = {Praktyczny przewodnik po korpusach języków słowiańskich},
  date-added = {2014-07-15 20:11:30 +0000},
  date-modified = {2018-07-21 14:30:06 +0000},
  editor = {Milena Hebal-Jezierska},
  isbn = {978-83-6411-04-4},
  keywords = {intercorp},
  pages = {207--231},
  publisher = {Uniwersytet Warszawski},
  title = {Praktyczny przewodnik po korpusie równoległym {InterCorp}},
  url = {http://www.iszip.uw.edu.pl/files/pdf/praktyczny_przewodnik.pdf},
  year = {2014},
  bdsk-url-1 = {http://www.iszip.uw.edu.pl/files/pdf/praktyczny_przewodnik.pdf}
}
@article{Kaczmarska:Rosen:2014b,
  acknowledgment = {LM2011023},
  address = {Kraków},
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  conference = {Przyszłość językoznawstwa -- językoznawstwo przyszłości, Konferencja jubileuszowa Instytutu Języka Polskiego PAN},
  date-added = {2014-07-15 20:26:49 +0000},
  date-modified = {2018-07-21 14:54:29 +0000},
  issn = {0137-9712},
  journal = {Polonica},
  keywords = {lexical equivalent; parallel corpora; word-to-word alignment; contrastive lexical analysis},
  pages = {53--66},
  publisher = {Instytut Języka Polskiego PAN},
  title = {Czego nie można wyrazić w języku polskim, czyli o leksykalnych w nim brakach [{Things} that cannot be expressed in {Polish} -- about lexical gaps]},
  url = {http://polonica.ijp-pan.krakow.pl/downloads/volumes/34/4.pdf},
  volume = {XXXIV},
  year = {2014},
  bdsk-url-1 = {http://polonica.ijp-pan.krakow.pl/downloads/volumes/34/4.pdf}
}
@article{Kaczmarska:Rosen:2015,
  abstract = {Our goal is to identify factors that influence the choice of equivalents of `psych' verbs when translating between typologically close languages such as Polish and Czech. Using the example of the Czech verb toužit `to yearn, to desire' we show that these verbs may be perceived differently by native speakers of Polish and Czech --- as ambiguous or unambiguous. Translation of such verbs is equally challenging. We start with the hypothesis that the choice of an equivalent is determined primarily by syntactico-semantic properties of the source lexeme, especially by its valency. Based on the analysis of lexemes and their arguments in parallel texts we identify regularities and preferences for the choice of an equivalent. Manual analysis is complemented by an automatically extracted bilingual glossary with frequencies. The results show that valency is an important, but not the only factor.},
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  date-added = {2015-06-15 15:17:13 +0000},
  date-modified = {2016-02-17 14:40:41 +0000},
  issn = {0008-7386},
  journal = {Časopis pro moderní filologii},
  keywords = {Czech; Polish; psych verbs; valency; parallel corpus},
  number = {2},
  pages = {157--168},
  title = {Jak najít optimální překlad polysémních sloves -- porovnání metod automatické analýzy paralelních textů},
  url = {http://cejsh.icm.edu.pl/cejsh/element/bwmeta1.element.desklight-c0c212c1-c413-4a5c-a594-bd43587342f2/c/Elzbieta_Kaczmarska__Alexandr_Rosen_157-168.pdf},
  volume = {97},
  year = {2015},
  bdsk-url-1 = {http://cejsh.icm.edu.pl/cejsh/element/bwmeta1.element.desklight-c0c212c1-c413-4a5c-a594-bd43587342f2/c/Elzbieta_Kaczmarska__Alexandr_Rosen_157-168.pdf}
}
@incollection{Kaczmarska:Rosen:2016,
  abstract = {Z paralelních korpusů lze získat pozoruhodné údaje o kombinatorických vlastnostech lexémů v kontrastivním pohledu. Podobné metody už posloužily ke kontrastivní česko-polské analýze deminutiv, identifikaci chybějících pojmů ve druhém jazyce a hledání ekvivalentů obtížně přeložitelných lexémů. Příspěvek se zaměří na metody, které pro výzkum valence srovnatelných lexémů: zarovnání po slovech s automatickou excerpcí lexikálních ekvivalentů, kolokační profily, automatickou syntaktickou analýzu s identifikací syntaktických argumentů, stochastický klasifikátor lexémů podle kontextu o velikosti několika slov bez syntaktické informace, nebo s omezením kontextu na slovní formy určité slovnědruhové třídy. Úspěšnost některých metod bude ověřena shodou s paralelními konkordancemi při úkolu vybrat nejvhodnější lexikální ekvivalent.


A Syntactico-semantic Description of Selected Groups of Verbs Expressing Emotions and Feelings - Methods of Contrastive Study of Valency Based on a Parallel Corpus

Our goal is to identify factors underlying the choice of equivalents of psych verbs when translating from Czech to Polish, using context and the syntactico-semantic properties of the source lexeme's syntactic arguments. We start with the relation of meaning to context and valency and an overview of options for studying and distinguishing meaning using a parallel corpus. Then we proceed with a manual valency-based analysis, examining parallel concordances of a Czech verb seen from the Polish perspective as highly polysemous (toužit `to yearn, to desire'). The results, complemented and verified by a bilingual glossary automatically extracted from the parallel corpus, show that valency is not the only predictor of an appropriate target equivalent for a lexeme.
In the second part of the study, we examine options for formalizing the choice of an equivalent using the source context of the lexeme. First, we focus on collocation profiles and syntactic analysis as methods for aggregating data about the object argument of the source lexeme and evaluate their reliability as cues for predicting the target equivalent. Finally, we turn to machine learning methods, using a stochastic classifier to determine equivalents in both linear and syntactically structured contexts.
None of the above methods confirmed the hypothesis that valency is the main predictor for the choice of the target equivalent in general. It is not even obvious that methods based on syntactically structured contexts outperform those based on linear contexts. However, the study still yielded intermediate conclusions: of all syntactic dependents, the object argument is the best predictor, and valency can be the primary factor in specific cases (infinitival complements), the methods for extracting the binary glossary originally used for this research have been successfully applied to build a lexical database for many language pairs in the parallel corpus InterCorp, etc.
},
  address = {Praha},
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  booktitle = {Výzkum slovesné valence ve slovanských zemích},
  date-added = {2016-12-03 13:41:00 +0000},
  date-modified = {2018-07-21 14:49:56 +0000},
  editor = {Karolina Skwarska and Elżbieta Kaczmarska},
  isbn = {978-80-86420-60-8},
  keywords = {parallel corpus; contrastive lexical analysis; valency; lexical equivalents; automatic syntactic analysis; Czech; Polish; intercorp},
  pages = {319--350},
  publisher = {Slovanský ústav AV ČR},
  series = {Práce Slovanského ústavu AV ČR -- Nová řada},
  title = {Syntakticko-sémantický popis vybraných skupin sloves vyjadřujících emoce a pocity -- metody kontrastivního zkoumání valence na základě paralelního korpusu},
  volume = {43},
  year = {2016},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhbxBPAC4ALgAvAFAAUgBPAEoARQBDAFQAUwAvAFQATwBVAFoASQBUAC8AXwBQAFAALwAyADAAMQA0AF8AVgBBAEwARQBOAEMARQAvAEsAYQBjAHoAbQBhAHIAcwBrAGEAIAAtACAAUgBvAHMAZQBuACAAMQAyADAANQAxADYAIABhAHUAdABvAHIAdQMKAG0AIABjAG8AcgByAC4AcABkAGZPEQHqAAAAAAHqAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8fS2Fjem1hcnNrYSAtIFJvc2VuI0ZGRkZGRkZGLnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAABQREYgQ0FSTwABAAUAAAogY3UAAAAAAAAAAAAAAAAADDIwMTRfVkFMRU5DRQACAFkvOlVzZXJzOmFpcjpQUk9KRUNUUzpUT1VaSVQ6X1BQOjIwMTRfVkFMRU5DRTpLYWN6bWFyc2thIC0gUm9zZW4gMTIwNTE2IGF1dG9ydcyKbSBjb3JyLnBkZgAADgBYACsASwBhAGMAegBtAGEAcgBzAGsAYQAgAC0AIABSAG8AcwBlAG4AIAAxADIAMAA1ADEANgAgAGEAdQB0AG8AcgB1AwoAbQAgAGMAbwByAHIALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAFdVc2Vycy9haXIvUFJPSkVDVFMvVE9VWklUL19QUC8yMDE0X1ZBTEVOQ0UvS2Fjem1hcnNrYSAtIFJvc2VuIDEyMDUxNiBhdXRvcnXMim0gY29yci5wZGYAABMAAS8AABUAAgAK//8AAAAIAA0AGgAkAMUAAAAAAAACAQAAAAAAAAAFAAAAAAAAAAAAAAAAAAACsw==}
}
@incollection{Kaczmarska:Rosen:2016a,
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  title = {Niedosłowności w dialogu czesko-polskim},
  booktitle = {Niedosłowność w języku},
  editor = {Marcin Odelski and Aleksandra Knapik and Piotr Chruszczewski and Władysław Chłopicki},
  series = {Język a komunikacja},
  journal = {Język a komunikacja},
  volume = {37},
  pages = {45--59},
  publisher = {Krakowskie Towarzystwo Popularyzowania Wiedzy o Komunikacji Językowej ``Tertium''},
  address = {Kraków},
  year = {2016},
  isbn = {978-83-61678-21-2},
  keywords = {contrastive lexical analysis},
  abstract = {Artykuł porusza problem niedosłowności w odkodowywaniu w języku polskim treści ujętych w języku czeskim. Analizując wieloznaczne i niejednoznaczne jednostki, spotykamy się z sytuacjami, kiedy nie możemy nazwać danego pojęcia w języku polskim, a w konsekwencji trudno znaleźć nam ekwiwalent; każdy wybór odpowiednika wiąże się ze stratą czy uproszczeniem w tłumaczeniu. Odkrywamy tym samym pole niedosłowności. Materiałem poddanym analizie będą przykłady ekscerpowane z korpusu równoległego InterCorp oraz pary leksemów wygenerowane automatycznie z tego korpusu metodą wyrównania segmentów słownych -- word-to-word alignment. W trakcie analizy, dzięki wykorzystaniu danych korpusowych, autorzy starają się ustalić, czy czynniki doboru ekwiwalentu zależą również od struktury syntaktyczno-semantycznej danego segmentu. Jednostką poddaną analizie w tym artykule jest czeski czasownik toužit.

Inliterality in the Czech-Polish dialogue

The paper deals with the problem of decoding the meaning of some Czech predicates in Polish. The analysis of vague and ambiguous lexemes shows that some concepts cannot be denoted easily in the target language. As a result, it is difficult to find an appropriate equivalent: every choice is accompanied by a loss or simplification of the translated meaning. A field of inliterality opens. We analyse examples excerpted from InterCorp, a multilingual parallel corpus, and pairs of lexemes generated from the corpus using an automatic method of word-to-word alignment. Using the corpus data, we try to discover to what extent the factors relevant for the choice of an equivalent are determined also by a syntactico-semantic structure of the given segment. In this paper we focus on the Czech verb toužit `desire' (approximate translation).},
  date-added = {2015-01-07 14:23:03 +0000},
  date-modified = {2018-07-21 14:30:54 +0000}
}
@article{Kaczmarska:Rosen:2019,
  abstract = {Problemy ze zrozumieniem i przekładem czasowników polisemicznych dotykają nie tylko języków odległych genetycznie, typologicznie czy geograficznie; trudności mogą występować również w przypadku języków blisko spokrewnionych - np. języka czeskiego i polskiego. Ekwiwalenty identyczne pod względem strukturalnym często nie istnieją lub są nieoptymalne ze względu na odmienne konotacje semantyczne w języku wyjściowym i docelowym. Niniejszy artykuł omawia problem przekładu wieloznacznych czeskich czasowników na język polski; szczegółowej analizie poddany jest czasownik postrádat. Badanie wykonane na podstawie czeskich oryginałów i ich przekładów na język polski ukazuje, iż w zgromadzonym materiale tłumacze najchętniej wybierali czasownik brakować lub leksem brak dla wyrażenia treści zawartej w analizowanym czasowniku. Druga część analizy, wykorzystująca polskie oryginały i ich przekład na język czeski, skłania ku przypuszczeniu, iż istnieje pewna reguła pojawiania się czasownika postrádat jako tłumaczenia. Czasownik ten pojawia się często w przekładzie tekstu zawierającego w języku polskim prefiks bez- lub słowo bez (jeśli jednostka w języku polskim jest wielowyrazowa), np. bezsensowna, bez sensu, bez przekonania. Korpus, obejmujący również teksty obcojęzyczne, oferuje więcej odpowiedników, jednak interpretacja wyników jest skomplikowana ze względu na przekład z trzeciego języka.

Polysemous verbs may be difficult to translate even into a closely related language. A structurally similar equivalent is often unavailable or suboptimal due to the different patterning of a given semantic field in the target language. Czech psych verbs translated into Polish represent a case in point. Following previous studies of a number of such verbs, the focus of the contribution is on postrádat 'miss'. Lexical entries in several monolingual, bilingual, and valency dictionaries are contrasted with concordances found in a parallel corpus, offering a colorful picture of various strategies, many of them structurally different from the original. Despite relatively low frequencies, some preferences in the choice of equivalents have been detected, namely for brak or brakować. Interestingly, in Polish originals the equivalents of the Czech target verb postrádat are often preceded by bez 'without' or bez-, the homograph prefix. The corpus offers more equivalents if neither Czech nor Polish text is the original. However, the results are more difficult to interpret due to the unclear status of the translation process.},
  author = {Elżbieta Kaczmarska and Alexandr Rosen},
  date-added = {2020-02-05 19:55:10 +0100},
  date-modified = {2020-02-05 20:20:21 +0100},
  journal = {Prace Filologiczne},
  keywords = {psych verbs; parallel corpus; Czech; Polish; lexical equivalent},
  pages = {231--244},
  title = {Analiza korpusowa czeskiego czasownika postrádat i jego polskich ekwiwalentów},
  url = {https://www.researchgate.net/publication/338735995_Analiza_korpusowa_czeskiego_czasownika_postradat_i_jego_polskich_ekwiwalentow},
  volume = {74},
  year = {2019},
  bdsk-url-1 = {https://www.researchgate.net/publication/338735995_Analiza_korpusowa_czeskiego_czasownika_postradat_i_jego_polskich_ekwiwalentow}
}
@article{Kirsch:Ros:89,
  author = {Zdeněk Kirschner and Alexandr Rosen},
  date-modified = {2018-07-21 14:43:17 +0000},
  journal = {Machine Translation},
  keywords = {machine translation},
  number = {3},
  pages = {177--193},
  title = {{APAČ} -- An Experiment in Machine Translation},
  volume = {4},
  year = {1989}
}
@inproceedings{Kren:etal:2011,
  address = {Praha},
  author = {Michal Křen and Alexandr Rosen and Michal Štourač and Martin Vavřín and Pavel Vondřička},
  booktitle = {Korpusová lingvistika Praha 2011: 2 - Výzkum a výstavba korpusů},
  date-added = {2011-10-09 16:11:10 +0200},
  date-modified = {2018-07-21 14:31:20 +0000},
  editor = {František Čermák},
  keywords = {intercorp},
  organization = {Ústav Českého národního korpusu},
  pages = {105--115},
  series = {Studie z korpusové lingvistiky},
  title = {Paralelní korpus {I}nter{C}orp po sedmi letech [{T}he parallel corpus {I}nter{C}orp after seven years]},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011_intercorp_korpling.pdf},
  volume = {15},
  volumetitle = {2 - Výzkum a výstavba korpusů},
  year = {2011},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2011_intercorp_korpling.pdf}
}
@incollection{panevova:rosen:11,
  author = {Jarmila Panevová and Alexandr Rosen},
  title = {Zvláštní případy shody: doplněk u infinitivu},
  booktitle = {Kapitoly z české gramatiky},
  editor = {František Štícha},
  pages = {900--909},
  publisher = {Academia},
  address = {Praha},
  year = {2011},
  isbn = {978-80-200-1845-8},
  keywords = {control},
  date-modified = {2018-07-21 14:31:46 +0000}
}
@incollection{Petkevic:etal:2012,
  author = {Vladimír Petkevič and Alexandr Rosen and Barbora Štindlová and Tomáš Jelínek and Milena Hnátková and Petr Jäger},
  title = {Anotace chybových textů v českém žákovském korpusu},
  booktitle = {Čeština -- cílový jazyk a korpusy},
  editor = {Karel Šebesta and Svatava Škodová},
  chapter = {4},
  pages = {61--88},
  publisher = {Technická univerzita v Liberci},
  address = {Liberec},
  year = {2012},
  isbn = {978-80-7372-848-9},
  url = {http://utkl.ff.cuni.cz/~rosen/public/czesl_monografie.pdf},
  keywords = {learner corpora; akces; CzeSL},
  date-added = {2013-01-10 22:43:21 +0000},
  date-modified = {2018-07-19 15:18:40 +0000},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/czesl_monografie.pdf}
}
@article{Petkevic:etal:2015,
  author = {Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová},
  date-added = {2015-12-18 19:01:11 +0000},
  date-modified = {2015-12-18 19:03:30 +0000},
  issn = {0138-0567},
  journal = {Prace Filologiczne},
  keywords = {treebank, Czech, formal grammar, valency lexicon, annotation, typed feature structures},
  pages = {239--260},
  title = {The grammarian is opening a treebank account},
  url = {http://www.pracefilologiczne.uw.edu.pl},
  volume = {LXVII},
  year = {2015},
  bdsk-url-1 = {http://www.pracefilologiczne.uw.edu.pl}
}
@inproceedings{Petkevic:etal:2015a,
  author = {Vladimír Petkevič and Alexandr Rosen and Hana Skoumalová and Přemysl Vítovec},
  title = {Analytic Morphology -- Merging the Paradigmatic and Syntagmatic Perspective in a Treebank},
  booktitle = {The 5th Workshop on Balto-Slavic Natural Language Processing (BSNLP 2015)},
  editor = {Jakub Piskorski and Lidia Pivovarova and Jan Šnajder and Hristo Tanev and Roman Yangarber},
  pages = {9--16},
  address = {Hissar, Bulgaria},
  year = {2015},
  isbn = {978-954-452-033-5},
  language = {english},
  url = {http://bsnlp-2015.cs.helsinki.fi/bsnlp2015-book.pdf},
  keywords = {treebank; morphology; syntax},
  date-added = {2015-12-18 18:24:45 +0000},
  date-modified = {2018-07-21 14:32:48 +0000},
  bdsk-url-1 = {http://bsnlp-2015.cs.helsinki.fi/bsnlp2015-book.pdf}
}
@article{Petkevic:etal:2020,
  abstract = {Typology of Multiword Expressions in Czech and Frequency
of Their Main Features in a Genre-Balanced Corpus
The paper consists of two main parts:
(a) In the first part, a typology of multiword expressions (MWE) in Czech is described in a detailed way. This typology is part of the description of MWE database entries in the lexical database LEMUR containing more than 10,500 MWE entries as of June 2020. MWE properties reflected in this typology are accounted for by categories and their values. Each MWE is identified by a unique lemma; a group of related MWEs is assigned a “superlemma”. A MWE is described by the following properties: a MWE definition, characteristic examples, lemmas and morphological features of MWE components (words), as well as the following key categories: MWE style/register, type of usage, syntactic structure (including its representation by a dependency and a phrase-structure tree), aspects of flexibility (variants and fragments, internal modifiability of individual MWE components, possibilities of syntactic transformations of the main MWE components and morphological constraints) and types of idiomaticity on the lexical, morphological, syntactic, semantic and pragmatic level.
(b) In the second part of the paper, the authors focus on the frequency of the main features of the adopted typology in the real language material represented by the genre-balanced SYN2015 corpus, containing 100 mil. word forms (excluding punctuation): a type of usage correlated with a syntactic type and frequency of various kinds of idiomaticity. Our paper seems to be the first attempt at approaching the MWE properties from the point of view of MWE frequencies as types rather than tokens (i.e. frequencies of occurrences of a given MWE).

Příspěvek má dvě hlavní části:
(a) V první části je podrobně popsána typologie (vlastnosti) víceslovných lexikálních jednotek (dále VLJ) v češtině, přičemž tato typologie je součástí popisu databázových hesel těchto jednotek v lexikální databázi LEMUR, obsahující k červnu 2020 více než 10 500 hesel. Jednotlivé vlastnosti těchto jednotek jsou zachyceny prostřednictvím kategorií a jejich hodnot. U každé jednotky uvádíme její identifikační lemma a tzv. superlemma, definici, typické příklady; dále popisujeme lemmata a morfologické vlastnosti jednotlivých komponent (slov) a poté takové charakteristiky jako styl/varieta VLJ, její typ užití, syntaktická struktura (včetně reprezentace v podobě závislostního a frázového stromu), aspekty ustálenosti/flexibility (včetně variant a fragmentů VLJ, vnitřní modifikovatelnosti jednotlivých komponent VLJ, možností syntaktických transformací hlavních komponent VLJ a též morfologických omezení) a konečně typy idiomatičnosti na rovině lexikální, morfologické, syntaktické, sémantické a pragmatické.
(b) V druhé, hlavní části příspěvku sledujeme frekvenční zastoupení hlavních aspektů této typologie u dosud zpracovaných VLJ: typ užití v korelaci se syntaktickým typem a dále zastoupení různých druhů idiomatičnosti, a to v reálném jazykovém materiálu reprezentovaném žánrově vyváženým korpusem SYN2015 (obsahuje sto milionů slovních tvarů mimo interpunkci). Jde patrně vůbec o první pokus zaměřit se na vlastnosti víceslovných lexikálních jednotek z hlediska četnosti jejich výskytů jakožto typů, nikoli tokenů (tj. četností výskytů dané jednotky).},
  author = {Vladimír Petkevič and Marie Kopřivová and Milena Hnátková and Tomáš Jelínek and Pavel Kopřiva and Alexandr Rosen and Hana Skoumalová and Pavel Vondřička},
  date-added = {2020-11-14 22:07:43 +0100},
  date-modified = {2020-11-14 22:11:50 +0100},
  issn = {2336-6702},
  journal = {Studie z aplikované lingvistiky / Studies in Applied Linguistics},
  keywords = {MWE, multiword (lexical) expressions in Czech, typology of multiword expressions, frequency of types of multiword expressions, idiomaticity, lexical database, genre-balanced corpus},
  number = {2},
  pages = {37--62},
  title = {Typologie víceslovných jednotek v češtině a frekvenční zastoupení jejich hlavních vlastností v žánrově vyváženém korpusu},
  url = {https://dspace.cuni.cz/handle/20.500.11956/123089},
  volume = {11},
  year = {2020},
  bdsk-url-1 = {https://dspace.cuni.cz/handle/20.500.11956/123089}
}
@article{prz:ros:04,
  author = {Adam Przepiórkowski and Alexandr Rosen},
  title = {Czech and {P}olish Raising/Control with or without Structure Sharing},
  journal = {Research in Language},
  volume = {3},
  pages = {33--66},
  year = {2005},
  url = {http://dach.ipipan.waw.pl/~adamp/Papers/2004-control-czech-polish/control-czpl.pdf},
  keywords = {control},
  date-modified = {2018-07-21 14:33:07 +0000},
  bdsk-url-1 = {http://dach.ipipan.waw.pl/~adamp/Papers/2004-control-czech-polish/control-czpl.pdf}
}
@inproceedings{prz:ros:04a,
  address = {Frankfurt am Main},
  author = {Adam Przepiórkowski and Alexandr Rosen},
  booktitle = {Formal Description of Slavic Languages: The Fifth Conference, Leipzig 2003},
  date-modified = {2018-07-21 14:33:19 +0000},
  editor = {Zybatow, Gerhild and others},
  keywords = {control},
  pages = {478--492},
  publisher = {Peter Lang},
  title = {On the Case of Predicative Complements in {C}zech Infinitival Clauses},
  url = {http://utkl.ff.cuni.cz/~rosen/public/fdsl_paper.pdf},
  year = 2008,
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/fdsl_paper.pdf}
}
@inproceedings{Ramasamy:etal:2015,
  author = {Loganathan Ramasamy and Alexandr Rosen and Pavel Straňák},
  title = {Improvements to {K}orektor: A case study with native and non-native {C}zech},
  booktitle = {ITAT 2015: Information technologies -- Applications and Theory / SloNLP 2015},
  editor = {Jakub Yaghob},
  pages = {73--80},
  publisher = {Charles University in Prague},
  address = {Prague},
  year = {2015},
  isbn = {978-1515120650},
  issn = {1613-0073},
  url = {http://ceur-ws.org/Vol-1422/73.pdf},
  keywords = {akces; CzeSL},
  date-added = {2015-07-13 12:35:52 +0000},
  date-modified = {2018-07-19 15:08:20 +0000},
  bdsk-url-1 = {http://itat.ics.upjs.sk/uploads/ITAT_2015.pdf}
}
@inproceedings{Richter:etal:2012,
  abstract = {We present Korektor -- a flexible and powerful purely statistical text correction tool for Czech that goes beyond a traditional spell checker. We use a combination of several language models and an error model to offer the best ordering of correction proposals and also to find errors that cannot be detected by simple spell checkers, namely spelling errors that happen to be homographs of existing word forms. Our system works also without any adaptation as a diacritics generator with the best reported results for Czech text. The design of Korektor contains no language-specific parts other than trained statistical models, which makes it highly suitable to be trained for other languages with available resources. The evaluation demonstrates that the system is a state-of-the-art tool for Czech, both as a spell checker and as a diacritics generator. We also show that these functions combine into a potential aid in the error annotation of a learner corpus of Czech.},
  address = {Mumbai, India},
  author = {Richter, Michal and Straňák, Pavel and Rosen, Alexandr},
  booktitle = {Proceedings of COLING 2012: Posters},
  date-added = {2013-03-12 18:59:07 +0000},
  date-modified = {2018-07-21 14:46:19 +0000},
  keywords = {spellchecking; diacritics completion; language model; error model; akces; CzeSL; grammar checker},
  month = dec,
  pages = {1019--1028},
  publisher = {The COLING 2012 Organizing Committee},
  title = {{K}orektor -- A System for Contextual Spell-Checking and Diacritics Completion},
  url = {http://www.aclweb.org/anthology/C12-2099},
  year = {2012},
  bdsk-url-1 = {http://www.aclweb.org/anthology/C12-2099}
}
@phdthesis{Ros:01,
  author = {Alexandr Rosen},
  date-modified = {2018-07-21 14:33:53 +0000},
  keywords = {word order},
  school = {Faculty of Philosophy, Charles University, Prague},
  title = {A constraint-based approach to dependency syntax applied to some issues of {C}zech word order},
  year = {2001}
}
@article{Ros:02,
  author = {Alexandr Rosen},
  date-modified = {2018-07-21 14:34:21 +0000},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  keywords = {word order},
  number = {78},
  pages = {5--21},
  title = {Word order factors as constraints on feature structures},
  year = {2002}
}
@misc{Ros:03,
  author = {Alexandr Rosen},
  date-modified = {2018-07-21 14:34:41 +0000},
  keywords = {control},
  note = {In Czech},
  title = {Survey on the Usage of Predicative Complements in {C}zech},
  url = {http://utkl.ff.cuni.cz/~rosen/public/dotaznik-ori-ar.html},
  year = 2003,
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/dotaznik-ori-ar.html}
}
@incollection{ros:10a,
  abstract = {Tagsets, used to annotate corpora, often classify word classes and morphological categories according to different criteria, even within a single language. Then it can be difficult to identify corresponding morphosyntactic categories in texts tagged by different schemata. In tagset A, cardinal and ordinal numerals may belong to the class of numerals, while personal and possessive pronouns to the class of pronouns. In tagset B, on the other hand, cardinal numerals and personal pronouns may belong to the class of nouns, while ordinal numerals and possessive pronouns to the class of adjectives.
Texts tagged in such disparate ways make searching and automatic processing harder. For a parallel corpus such as InterCorp (http://korpus.cz/intercorp-info.php), currently including 25 languages, a single "harmonized" tagset could be designed (similarly as in the project MULTEXT-East), or - even better - to encode the information from all tagsets into a morphosyntactic "interlingua" (see Dan Zeman's Interset). The parallel with natural languages is appropriate: problems with missing equivalents occur in the translation of words as well as tags. Thus we propose a tagset interlingua as a hierarchy (lattice) of categories, corresponding to language-specific tags. A missing tag in a language can be substituted by a more general tag or by a disjunction of more specific tags. Similarly as with multilingual lexical databases the methods of Formal Concept Analysis can be used.},
  abstract_cs = {Sady morfosyntaktických značek, které se používají při anotaci korpusu, často třídí slovní druhy a morfologické kategorie na základě odlišných kritérií, a to i v rámci jednoho jazyka. Identifikace odpovídajících morfosyntaktických kategorií v textech označkovaných podle odlišných schémat pak může být obtížná. Např. základní číslovky spolu s řadovými mohou v tagsetu A patřit do třídy číslovek, zatímco osobní zájmena spolu s přivlastňovacími do třídy zájmen. Oproti tomu v tagsetu B mohou základní číslovky a osobní zájmena patřit do stejné třídy spolu se substantivy, přičemž číslovky řadové a přivlastňovací zájmena do stejné třídy spolu s adjektivy. Takto disparátně označkované texty mohou komplikovat automatické zpracování i hledání v korpusu. Pro účely paralelního korpusu InterCorp (http://korpus.cz/intercorp-info.php), který toho času obsahuje 25 jazyků, by bylo možné navrhnout jeden "harmonizovaný" tagset (podobně jako v projektu MULTEXT-East), nebo -- ještě lépe -- zakódovat informace ze všech tagsetů do morfosyntaktické "interlingvy" (viz Interset Dana Zemana). Paralela s přirozenými jazyky je zde na místě: problémy s chybějícími ekvivalenty vznikají při překladu značek i slov. Proto navrhujeme tagsetovou interlingvu jako hierarchii (svaz) kategorií, které odpovídají značkám v jednotlivých jazycích/tagsetech. Pokud pro danou kategorii v některém jazyce chybí značka, je možné ji nahradit značkou obecnější nebo disjunkcí značek specifičtějších. Podobně jako při konstrukci vícejazyčné lexikální databáze lze i pro hierarchii morfosyntaktických kategorií využít metody formální konceptuální analýzy.},
  address = {Praha},
  author = {Alexandr Rosen},
  booktitle = {InterCorp: Exploring a Multilingual Corpus},
  date-modified = {2018-07-21 14:35:18 +0000},
  editor = {František Čermák and Aleš Klégr and Patrick Corness},
  isbn = {978-80-7422-042-5},
  keywords = {universal PoS tagset},
  pages = {205--234},
  publisher = {Nakladatelství Lidové noviny},
  series = {Studie z korpusové lingvistiky},
  title = {Morphological Tags in Parallel Corpora},
  title_cs = {Morfologické značky v paralelních korpusech},
  url = {http://utkl.ff.cuni.cz/~rosen/public/unitags_pap.pdf},
  volume = {13},
  year = {2010},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/unitags_pap.pdf}
}
@inproceedings{ros:10b,
  abstract = {Comparison of existing morphosyntactic tagsets often reveals different assumptions, obscuring similarities and distinctions across languages. To overcome the formal and conceptual mismatches, we build an abstract interlingual tagset as a hierarchy of categories, using Formal Concept Analysis.},
  abstract_cs = {Porovnání existujících morfosyntaktických sad značek často odhalí odlišná východiska, která zakrývají shody a odlišnosti mezi různými jazyky. K překonání formálních a věcných nekompatibilit vytváříme abstraktní jazykově nezávislou sadu značek v podobě hierarchie kategorií, a to pomocí formální konceptuální analýzy.},
  address = {Moscow},
  author = {Alexandr Rosen},
  booktitle = {Computational Linguistics and Intellectual Technologies, Papers from the Annual International Conference ``Dialogue'' (2010)},
  date-modified = {2018-07-21 14:35:39 +0000},
  editor = {Kubrik, A. E. and others},
  isbn = {5-7281-1148-1},
  keywords = {universal PoS tagset},
  pages = {651--658},
  publisher = {Institute of Informatics Problems of the Russian Academy of Sciences / Russian State University for the Humanities},
  title = {Harmonizing tagsets for multilingual corpora via concept lattice},
  title_cs = {Harmonizace sad značek pro vícejazyčné korpusy pomocí pojmového svazu},
  url = {http://www.dialog-21.ru/dialog2010/materials/pdf/93.pdf},
  volume = 9,
  year = 2010,
  bdsk-url-1 = {http://www.dialog-21.ru/dialog2010/materials/pdf/93.pdf}
}
@inproceedings{ros:10c,
  abstract = {The issue of incompatible morphosyntactic tagsets in multilingual corpora
could be solved by an abstract hierarchy of concepts, mapped to language-specific tagsets. The hierarchy supports the user and tools by resolving categories that do not match the relevant tagset in queries, by providing links between language-specific tagsets, and by displaying responses using a preferred tagset. The hierarchy, built using the methods of Formal Concept Analysis, can also help to refine morphosyntactic annotation in one language by using word-to-word alignments to parallel texts tagged by a different tagset.},
  abstract_cs = {Problém nekompatibilních sad morfosyntaktických značek ve vícejazyčných korpusech lze řešit abstraktní hierarchií pojmů, odpovídajících značkám pro jednotlivé jazyky. Hierarchie pomáhá uživatelům i aplikacím při dekódování kategorií, které v dotazu neodpovídají příslušné sadě, vytváří vztahy ekvivalence mezi značkami z různých sad a generuje odpovědi v sadě podle vlastní volby. Hierarchie, vytvořená metodami formální konceptuální analýzy, může také pomoci zpřesnit morfosyntaktickou anotaci v jednom jazyce pomocí zarovnání po slovech k paralelním textům označkovaných jinou sadou.},
  address = {Tartu, Estonia},
  author = {Alexandr Rosen},
  booktitle = {Proceedings of the Workshop on Annotation and Exploitation of Parallel Corpora (AEPC)},
  date-modified = {2018-07-21 14:53:44 +0000},
  editor = {Lars Ahrenberg and Joerg Tiedemann and Martin Volk},
  issn = {1736-6305},
  keywords = {parallel corpora; tagsets; morphosyntactic categories; FCA; universal PoS tagset},
  pages = {53--62},
  publisher = {Northern European Association for Language Technology},
  series = {NEALT Proceedings Series},
  title = {Mediating between incompatible tagsets},
  title_cs = {Jak smiřovat nekompatibilní sady značek},
  url = {http://hdl.handle.net/10062/15893},
  volume = 10,
  year = 2010,
  bdsk-url-1 = {http://hdl.handle.net/10062/15893}
}
@incollection{ros:10d,
  author = {Alexandr Rosen},
  title = {Liberální dendrologie aneb o stromech stejného významu v Pražském závislostním polesí},
  booktitle = {Padesát je málo -- Komorně laděný sborník u příležitosti 50. narozenin profesora Jana Hajiče},
  editor = {Jarmila Panevová and Barbora Vidová Hladká},
  pages = {49--58},
  publisher = {Ústav formální a aplikované lingvistiky MFF UK},
  address = {Praha},
  year = {2010},
  url = {http://utkl.ff.cuni.cz/~rosen/public/LiberDendro.pdf},
  keywords = {treebank},
  date-added = {2011-01-17 17:21:57 +0100},
  date-modified = {2018-07-21 14:37:07 +0000},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/LiberDendro.pdf}
}
@techreport{Ros:96a,
  address = {Praha},
  author = {Alexandr Rosen},
  date-modified = {2018-07-21 14:42:32 +0000},
  institution = {Open Society Institute, Research Support Scheme},
  keywords = {machine translation},
  number = {340/93},
  title = {Lexical Issues in Machine Translation},
  type = {Final report on project},
  url = {http://utkl.ff.cuni.cz/~rosen/public/rssfrep.ps},
  year = {1996},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/rssfrep.ps}
}
@article{Ros:98,
  author = {Alexandr Rosen},
  date-modified = {2018-07-21 14:44:23 +0000},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  keywords = {parsing},
  number = {69},
  pages = {57--63},
  title = {{Review of Fred Karlsson et al. (eds.) Constraint Grammar: A Language-Independent System for Parsing Unrestricted Text, Mouton de Gruyter 1995}},
  year = {1998}
}
% Chapter (in Czech) in an edited volume, no. 1 of the series
% "Studie z korpusové lingvistiky".
@incollection{ros:cerm:06,
  address       = {Praha},
  author        = {Rosen, Alexandr},
  booktitle     = {Korpusová lingvistika: Stav a modelové přístupy},
  date-modified = {2018-07-21 14:36:05 +0000},
  editor        = {Čermák, František and Blatná, Renata},
  keywords      = {control},
  number        = {1},
  pages         = {254--284},
  publisher     = {Nakladatelství Lidové noviny},
  series        = {Studie z korpusové lingvistiky},
  title         = {O čem vypovídá pád doplňku infinitivu},
  year          = {2006}
}
% Paper in an edited volume, vol. 1 of the series "Potsdam Linguistic Investigations".
@inproceedings{ros:fdsl:06,
  address       = {Frankfurt am Main},
  author        = {Rosen, Alexandr},
  booktitle     = {Linguistic Investigations into Formal Description of Slavic Languages},
  date-modified = {2013-10-14 16:45:57 +0000},
  editor        = {Kosta, Peter and Schürcks, Lilia},
  keywords      = {Czech, agreement},
  pages         = {309--318},
  publisher     = {Peter Lang},
  series        = {Potsdam Linguistic Investigations},
  title         = {Hybrid Agreement in {C}zech Predicates},
  url           = {http://utkl.ff.cuni.cz/~rosen/public/fdsl05_rosen.pdf},
  volume        = {1},
  year          = {2007},
  bdsk-url-1    = {http://utkl.ff.cuni.cz/~rosen/public/fdsl05_rosen.pdf}
}
% Paper in the COLING-92 proceedings, volume II.
% The page range uses the BibTeX en-dash form (double hyphen).
@inproceedings{Ros:Hajova:Haj:92,
  author = {Alexandr Rosen and Eva Hajičová and Jan Hajič},
  booktitle = {Proceedings of COLING-92 Nantes},
  date-modified = {2018-07-21 14:44:41 +0000},
  editor = {Christian Boitet},
  keywords = {valency},
  pages = {553--559},
  title = {{Derivation of Underlying Valency Frames from a Learner's Dictionary}},
  volume = {II},
  year = 1992
}
% Book review published in Slovo a slovesnost.
% NOTE(review): the entry has no pages field; add the page range if known.
@article{ros:lop:06,
  author        = {Rosen, Alexandr},
  date-modified = {2018-07-21 14:44:58 +0000},
  journal       = {{Slovo a slovesnost}},
  keywords      = {preposition},
  number        = {1},
  title         = {{Review of Markéta Lopatková: O homonymii předložkových skupin v češtině (Co umí počítač?), Karolinum 2003}},
  volume        = {66},
  year          = {2006}
}
% Paper in the proceedings of a seminar held in Bratislava, 10--12 November 2005.
% The booktitle spelling "Seminar" is corrected (was "Serminar").
@inproceedings{ros:slovko:05,
  address = {Bratislava},
  author = {Alexandr Rosen},
  booktitle = {Computer Treatment of {S}lavic and {E}ast {E}uropean Languages: Third International Seminar, Bratislava 10--12 November 2005},
  date-modified = {2018-07-21 14:37:40 +0000},
  editor = {Radovan Garabík},
  keywords = {intercorp; parallel corpora},
  pages = {174--185},
  publisher = {VEDA},
  title = {In Search of the Best Method for Sentence Alignment in Parallel Texts},
  url = {http://utkl.ff.cuni.cz/~rosen/public/slovko05.pdf},
  year = {2005},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/slovko05.pdf}
}
% Unpublished manuscript on grammar formalisms and word order; the note field
% required by the unpublished entry type is present.
@unpublished{rosen:1994,
  author        = {Rosen, Alexandr},
  date-added    = {2013-05-31 15:30:36 +0000},
  date-modified = {2018-07-21 14:38:05 +0000},
  file          = {rosen1994.ps:rosen1994.ps:PDF},
  keywords      = {word order},
  note          = {Unpublished manuscript.},
  timestamp     = {2007.03.20},
  title         = {{Grammar Formalisms and the Description of Word Order Variations}},
  url           = {http://utkl.ff.cuni.cz/~rosen/public/worep.ps},
  year          = {1994},
  bdsk-url-1    = {http://utkl.ff.cuni.cz/~rosen/public/worep.ps}
}
% Journal article in Prace Filologiczne, vol. LXIII.
% In the abstract the ampersand is escaped so the field is safe to typeset, and
% "languages-specific" is corrected to "language-specific".
@article{Rosen:2012,
  abstract = {Multilingual parallel corpora can be annotated with monolingual tools, such as morphosyntactic taggers. However, even taggers for typologically similar languages often use incompatible tagsets, which results in conceptual and formal variety of tags within a single corpus. Retraining taggers on data annotated with a common tagset is not a realistic option.

Differences between tagsets are often rooted in different linguistic perspectives rather than in real distinctions between the languages, which means good chances to find a common ground. Moreover, a different perspective may provide additional information missing in one tagset but present in another.

Our first goal is to delegate the task of dealing with multiple tagsets to an abstract interlingual representation of linguistic categories. Ideally, each tag in every language-specific tagset used in the corpus is linked to a position in a tangled hierarchy of concepts. To accommodate the different perspectives, the hierarchy takes three views of word class. The Czech tag for a relative pronoun is decoded as a category with the properties of inflectional adjective, syntactic noun, and semantic pronoun, each with its appropriate morphological characteristics.

Comparison of different tagsets reveals mismatches, where tags are seen as ambiguous wrt concepts. Such mismatches are properly represented, which allows for a principled mapping strategy between language-specific tagsets, and for intuitive and underspecified queries. The hierarchy can be built and the mismatches partially resolved using Formal Concept Analysis (Ganter \& Wille, 1999).

Our second goal is to refine existing morphosyntactic annotation by projecting distinctions in one tagset onto a conceptually different tagset. The hierarchy and automatic word-to-word alignment is used to learn from word tokens in another language. We show results of an experiment for different languages and tagsets, including untagged texts.
},
  address = {Warszawa},
  annote = {Taggers for typologically similar languages often use incompatible tagsets. We propose an abstract interlingual representation of linguistic categories. Mismatches between tagsets are properly represented, which allows for a principled mapping. We also refine existing morphosyntactic annotation by projecting distinctions in one tagset onto a conceptually different tagset using w2w alignment.
},
  author = {Alexandr Rosen},
  date-added = {2014-01-20 11:09:11 +0000},
  date-modified = {2018-07-21 14:48:32 +0000},
  editor = {Marek Łaziński},
  issn = {0138-0567},
  journal = {Prace Filologiczne},
  keywords = {parallel corpus; morphosyntactic tags; linguistic ontology; formal concept analysis; multilinguality; universal PoS tagset},
  pages = {241--256},
  publisher = {Wydział Polonistyki Uniwersytetu Warszawskiego},
  title = {On the art of taming and exploiting parallel tags in a multilingual corpus},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2010_unitags_slavicorp.pdf},
  volume = {LXIII},
  year = {2012},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBBLi4vUFJPSkVDVFMvVU5JVEFHUy9fUFAvMjAxMF9zbGF2aWNvcnAvMjAxMF91bml0YWdzX3NsYXZpY29ycC5wZGZPEQGqAAAAAAGqAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8aMjAxMF91bml0YWdzX3NsYXZpY29ycC5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAUAAAogY3UAAAAAAAAAAAAAAAAADjIwMTBfc2xhdmljb3JwAAIASi86VXNlcnM6YWlyOlBST0pFQ1RTOlVOSVRBR1M6X1BQOjIwMTBfc2xhdmljb3JwOjIwMTBfdW5pdGFnc19zbGF2aWNvcnAucGRmAA4ANgAaADIAMAAxADAAXwB1AG4AaQB0AGEAZwBzAF8AcwBsAGEAdgBpAGMAbwByAHAALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEhVc2Vycy9haXIvUFJPSkVDVFMvVU5JVEFHUy9fUFAvMjAxMF9zbGF2aWNvcnAvMjAxMF91bml0YWdzX3NsYXZpY29ycC5wZGYAEwABLwAAFQACAAr//wAAAAgADQAaACQAaAAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAIW}
}
% Contribution to the Workshop on Interlanguage Annotation at SLE 2014.
@conference{Rosen:2014,
  address       = {Poznań, Poland},
  author        = {Rosen, Alexandr and Štindlová, Barbora and Škodová, Svatava and Hana, Jirka},
  booktitle     = {SLE 2014 --- 47th Annual Meeting of the Societas Linguistica Europaea, Workshop on Interlanguage Annotation},
  date-added    = {2014-03-28 16:03:50 +0000},
  date-modified = {2018-07-19 15:08:39 +0000},
  doi           = {10.13140/2.1.1588.9600},
  keywords      = {akces; CzeSL},
  organization  = {Adam Mickiewicz University},
  title         = {Using a cross-classifying taxonomy of non-standard forms to analyze non-native {C}zech},
  url           = {https://www.researchgate.net/profile/Alexandr_Rosen/publication/268123637_Using_a_cross-classifying_taxonomy_of_non-standard_forms_to_analyze_non-native_Czech/links/5461db480cf2c1a63c001fef.pdf?ev=pub_int_doc_dl&origin=publication_list&inViewer=true},
  year          = {2014},
  bdsk-url-1    = {https://www.researchgate.net/profile/Alexandr_Rosen/publication/268123637_Using_a_cross-classifying_taxonomy_of_non-standard_forms_to_analyze_non-native_Czech/links/5461db480cf2c1a63c001fef.pdf?ev=pub_int_doc_dl&origin=publication_list&inViewer=true},
  bdsk-url-2    = {http://dx.doi.org/10.13140/2.1.1588.9600}
}
% Paper in the proceedings of the Olomouc Linguistics Colloquium 2014.
% Line-break hyphenation residue in the abstract ("mor- phological", "cor- pus")
% is rejoined.
@conference{Rosen:2014a,
  abstract = {The standard sets of 8--10 word classes (POS) are defined by a mix of morphological, syntactic, and semantic criteria. For some POS the three criteria yield the same result, but POS such as numerals and pronouns end up as heterogeneous classes. The goal of this contribution is to support the idea of a multidimensional taxonomy of word classes using arguments from the practical domains of corpus and applied linguistics. Annotating corpora with a cross-classifying POS tagset facilitates both corpus queries and their use by application tools. Disparate morphosyntactic annotation of multilingual corpora can be harmonized when the concepts behind language- or theory-specific tagsets are properly located in the 3D space of word classes. Finally, a cross-classification of POS can be applied as a powerful tool for the analysis of texts produced by non-native speakers.},
  address = {Olomouc},
  author = {Alexandr Rosen},
  booktitle = {Complex Visibles Out There. Proceedings of the Olomouc Linguistics Colloquium 2014: Language Use and Linguistic Structure},
  date-added = {2014-03-28 16:16:55 +0000},
  date-modified = {2018-07-21 14:50:42 +0000},
  editor = {Ludmila Veselovská and Markéta Janebová},
  keywords = {POS; multidimensional taxonomy; tagset; morphological categories; universal PoS tagset},
  pages = {575--590},
  publisher = {Palacký University},
  series = {Olomouc Modern Language Series},
  title = {A 3{D} taxonomy of word classes at work},
  url = {http://olinco.upol.cz/assets/olinco-2014-proceedings.pdf},
  volume = {4},
  year = {2014},
  bdsk-url-1 = {http://olinco.upol.cz/assets/olinco-2014-proceedings.pdf}
}
% Chapter in an edited volume; the non-standard review field quotes a published
% review of the volume.
@incollection{Rosen:2014b,
  abstract      = {The chapter  is devoted to the problem of clitic haplology in a comparative Slavic - non-Slavic perspective. The author presents various types of clitics in the Czech language and the rules for their clustering in the sentence (if there are several of them). The main focus is on the phenomenon of haplology; on the basis of excerpted examples the author shows the elimination of phonetically duplicated units, stating his conclusions in the framework of HPSG grammar. The author contrasts his conclusions from the analysis of the Czech material with examples from Polish, Early Modern German, and Romanian. The results of the research are compared with excerpts from the Czech National Corpus.},
  address       = {Sapporo, Japan},
  author        = {Rosen, Alexandr},
  booktitle     = {Slavic and German in Contact: Studies from Areal and Contrastive Linguistics},
  date-added    = {2014-03-28 18:37:44 +0000},
  date-modified = {2018-07-21 14:38:41 +0000},
  editor        = {Kaczmarska, Elżbieta and Nomachi, Motoki},
  isbn          = {978-4-938637-77-4},
  keywords      = {clitics},
  pages         = {97--116},
  publisher     = {Slavic Research Center, Hokkaido University},
  review        = {Journal of Slavic Linguistics, Volume 23, Number 2, Summer-Fall 2015, pp. 313-318 | 10.1353/jsl.2015.0013, Krzysztof E. Borowski, Alexandra Fisher, http://muse.jhu.edu/journals/journal_of_slavic_linguistics/v023/23.2.borowski.html; Wayles Browne's and Alexandr Rosen's articles examine clitic syntax. Focusing on West and South Slavic languages, Browne argues that the rules for clitic ordering within a group are dependent on case for pronoun clitics and person and number for verb clitics. However, taking Burgenland Croatian into account, we see that person can also influence pronominal clitic placement. While it is possible that this phenomenon [End Page 316] comes from contact with Bavarian German dialects, Browne argues that this is unlikely because Bavarian German fails to distinguish first- and second-person reflexive pronoun forms and their nonreflexive counterparts. Rosen's article continues on the topic of clitic ordering but with a narrower focus on Czech. He claims that the haplology of dative and accusative reflexive clitics in Czech is due to strict clitic-ordering rules, one of which states that there can be only one reflexive clitic in a cluster, even if the clitics represent different cases. The author argues that this constraint is not due to phonological factors, because it only occurs with reflexives. To prove this, he uses corpus data and carefully selected examples to show that there is no correlation in preference to haplologize one type of clitic over the other. Additionally, Rosen discusses the phenomenon of ``clitic climbing,'' in which clitics in an embedded clause may shift to a less embedded clitic cluster. That is, the clitics that are placed later in a sentence may climb to position themselves in Wackernagel's position, directly following the first stressed word or syntactic phrase in a clause. The clitics cannot, however, climb over one another. 
Rosen asserts that the degree of embeddedness of the clitics is not a factor in determining the extent to which they may climb in a sentence. While Browne and Rosen are able to challenge other researchers' hypotheses, neither author reaches a definitive conclusion as to why clitics behave in these ways. Therefore, while they provide insight into the subject, they demonstrate that further research is necessary in this area. Although Rosen does not discuss Germanic and Slavic language contact, his contribution is an excellent complement to Browne's article. While Browne introduces the topic from a broad perspective, Rosen follows up with a more detailed case study. As such, these articles are a useful reference for scholars looking to examine the issue of clitic syntax not only in Slavic, but in other languages as well.},
  title         = {Haplology of Reflexive Clitics in {C}zech},
  year          = {2014}
}
% Technical report (in Czech) on the CzeSL-SGT corpus; the linked PDF is the
% Czech-language version.
@techreport{Rosen:2014c,
  author        = {Rosen, Alexandr},
  date-added    = {2014-05-28 17:13:43 +0000},
  date-modified = {2018-07-19 15:02:55 +0000},
  institution   = {Univerzita Karlova v Praze},
  keywords      = {akces, CzeSL},
  title         = {Cze{SL}-{SGT} -- korpus češtiny nerodilých mluvčích s automaticky provedenou anotací},
  url           = {http://utkl.ff.cuni.cz/~rosen/public/2014-czesl-sgt-cs.pdf},
  year          = {2014},
  bdsk-url-1    = {http://utkl.ff.cuni.cz/~rosen/public/2014-czesl-sgt-cs.pdf}
}
% Technical report (English version) on the CzeSL-SGT corpus.
% bdsk-url-1 now matches url (it pointed to the Czech PDF), and the apostrophe in
% the title is the plain ASCII form, which LaTeX typesets as a closing quote.
@techreport{Rosen:2014d,
  author = {Alexandr Rosen},
  date-added = {2020-03-09 15:55:48 +0100},
  date-modified = {2020-03-09 15:56:48 +0100},
  institution = {Charles University},
  keywords = {akces, CzeSL},
  title = {Cze{SL}-{SGT} -- a corpus of non-native speakers' {C}zech with automatic annotation},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2014-czesl-sgt-en.pdf},
  year = {2014},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2014-czesl-sgt-en.pdf}
}
% Technical report on the CzeSL-MAN corpus (manual annotation).
% bdsk-url-1 now matches url; it pointed to the PDF of a different (CzeSL-SGT) report.
@techreport{Rosen:2015,
  author = {Alexandr Rosen},
  date-added = {2014-05-28 17:17:05 +0000},
  date-modified = {2020-03-08 20:17:10 +0100},
  institution = {Charles University in Prague},
  keywords = {akces, CzeSL},
  title = {{CzeSL-MAN} -- a corpus of non-native speakers' {C}zech with manual annotation},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2015-czesl-man-en.pdf},
  year = {2015},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2015-czesl-man-en.pdf}
}
% Chapter on InterCorp in an edited volume on Polish-language parallel corpora.
% The abstract holds the English summary followed by the Polish title, keywords and
% summary; lost "fi" ligatures in the Polish text ("fizycznymi",
% "leksykograficznych") and a doubled space are repaired.
@incollection{Rosen:2016,
  abstract = {The paper describes InterCorp, one of the largest multilingual parallel corpora, compiled at Charles University in Prague. This collection comprises texts in 39 languages, with Czech being its best-represented and pivot language. The chapter presents a detailed makeup of the corpus, and compares it to other resources of this kind. It also explains its organisational status and describes the compilation process. Finally, the paper briefly reviews the types of queries facilitated by the corpus interface.


InterCorp - korpus równoległy od kuchni

korpus równoległy; język czeski; wielojęzyczność; feedback od użytkowników; anotacja; równowaga

InterCorp to projekt, który powstał na Wydziale Filozoficznym Uniwersytetu Karola w Pradze. Jego celem jest zbudowanie obszernego równoległego korpusu synchronicznego, który obejmowałby jak najwięcej języków. W tworzeniu korpusu uczestniczą pracownicy naukowi i studenci Wydziału Filozoficznego Uniwersytetu Karola, osoby związane z Czeskim Korpusem Narodowym, a także współpracownicy zewnętrzni.
InterCorp to rzeczywiście obszerny i ciągle rozwijający się synchroniczny korpus równoległy, obejmujący teksty w języku czeskim i 38 innych językach -- w tym w języku polskim. Trzon korpusu jest uzupełniony automatycznie opracowanymi tekstami z zakresu publicystyki i prawa, a także zapisami debat parlamentarnych i napisami filmowymi. W sumie korpus obejmuje około 1,6 miliarda słów. Wszystkie teksty dysponują wiązaniem segmentów na poziomie zdania i w miarę możliwości są opatrzone lingwistyczną anotacją oraz danymi bibliograficznymi. Po krótkiej prezentacji koncepcji korpusu przedstawiamy jego parametry liczbowe. W części poświęconej wykorzystaniu korpusu zwracamy uwagę na możliwości i ograniczenia wyszukiwarki KonText oraz różne sposoby wykorzystania. Spojrzenie na korpus od strony użytkownika jest uzupełnione komentarzem twórców korpusu. W części przedstawiającej opracowywanie tekstów przed ich włączeniem do korpusu oczekiwania i życzenia użytkowników zostają skonfrontowane z koncepcyjnymi, technicznymi i fizycznymi możliwościami budowy korpusu paralelnego. Końcowa część zawiera wnioski, jakie się nasuwają na podstawie dotychczasowych doświadczeń, a także plany na przyszłość.
Powstały i ciągle rozwijany korpus równoległy InterCorp ma z założenia służyć między innymi jako źródło danych do badań teoretycznych, analiz gramatycznych i leksykograficznych, prac translatorskich, projektów dotyczących nauki języków obcych, a także jako materiał do badań dla studentów.},
  address = {Warszawa},
  author = {Alexandr Rosen},
  booktitle = {Polskojęzyczne korpusy równoległe. Polish-language Parallel Corpora},
  date-added = {2016-03-23 19:43:41 +0000},
  date-modified = {2018-07-21 14:48:52 +0000},
  doi = {10.13140/RG.2.1.2808.7444},
  editor = {Ewa Gruszczyńska and Agnieszka Leńko-Szymańska},
  isbn = {978-83-935320-4-9},
  keywords = {parallel corpus; Czech; multilinguality; user feedback; annotation; balance; intercorp},
  pages = {21--40},
  publisher = {Instytut Lingwistyki Stosowanej},
  series = {Multilingual Applied Linguistics -- Wielojęzyczna Lingwistyka Stosowana},
  title = {{InterCorp} -- a look behind the façade of a parallel corpus},
  url = {http://rownolegle.blog.ils.uw.edu.pl/files/2016/03/02_Rosen.pdf},
  volume = {1},
  year = {2016},
  bdsk-url-1 = {http://rownolegle.blog.ils.uw.edu.pl/files/2016/03/02_Rosen.pdf},
  bdsk-url-2 = {http://dx.doi.org/10.13140/RG.2.1.2808.7444}
}
% Paper in the ITAT 2016 / SloNLP workshop proceedings, vol. 1649 of the
% CEUR Workshop Proceedings series.
@inproceedings{Rosen:2016a,
  abstract      = {Investigating language acquisition by non-native learners helps to understand important linguistic issues and to develop teaching methods, better suited both to the specific target language and to the learner. These tasks can now be based on empirical evidence from learner corpora including Czech. They are equipped with morphological and syntactic annotation, together with the detection and categorization of non-standard linguistic phenomena. After an overview of existing resources we propose solutions to several issues inherent to the process of compiling, annotating and using such corpora, including automatic identification of errors, design and application of error taxonomy, and user-friendly search tool. Finally, we deal with the question to what extent resources and tools available for standard language can be applied to the language of non-native learners.},
  address       = {Bratislava, Slovakia},
  author        = {Rosen, Alexandr},
  booktitle     = {Proceedings of the 16th {ITAT}: Slovenskočeský {NLP} workshop (Slo{NLP} 2016)},
  date-added    = {2016-10-25 10:16:07 +0000},
  date-modified = {2018-07-19 15:05:14 +0000},
  editor        = {Brejová, Broňa},
  isbn          = {978-1537016740},
  issn          = {1613-0073},
  keywords      = {akces, learner corpus; language acquisition; error annotation; spelling checker; grammar checker; morphology; syntax; Czech; CzeSL},
  organization  = {Comenius University in Bratislava, Faculty of Mathematics, Physics and Informatics},
  pages         = {80--87},
  publisher     = {CreateSpace Independent Publishing Platform},
  series        = {{CEUR} Workshop Proceedings},
  title         = {Building and using corpora of non-native {C}zech},
  url           = {http://ceur-ws.org/Vol-1649/80.pdf},
  volume        = {1649},
  year          = {2016},
  bdsk-file-1   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBALi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXNsb25scC8yMDE2LWN6ZXNsLXNsb25scC1zaG93LnBkZk8RAa4AAAAAAa4AAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xoyMDE2LWN6ZXNsLXNsb25scC1zaG93LnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAARMjAxNi1jemVzbC1zbG9ubHAAAAIASS86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC1zbG9ubHA6MjAxNi1jemVzbC1zbG9ubHAtc2hvdy5wZGYAAA4ANgAaADIAMAAxADYALQBjAHoAZQBzAGwALQBzAGwAbwBuAGwAcAAtAHMAaABvAHcALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEdVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXNsb25scC8yMDE2LWN6ZXNsLXNsb25scC1zaG93LnBkZgAAEwABLwAAFQACAAr//wAAAAgADQAaACQAZwAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAIZ},
  bdsk-url-1    = {http://ceur-ws.org/Vol-1649/80.pdf}
}
% Paper in the GramLex workshop proceedings (publisher: COLING 2016 Organizing
% Committee). month uses the predefined BibTeX macro instead of a literal string.
% NOTE(review): second editor's surname is spelled "Boguslavsky" here (was
% "Boguslavski") -- confirm against the proceedings front matter.
@inproceedings{Rosen:2016b,
  abstract = {A specific language as used by different speakers and in different situations
	has a number of more or less distant varieties. Extending the notion of
	non-standard language to varieties that do not fit an explicitly or implicitly
	assumed norm or pattern, we look for methods and tools that could be applied to
	this domain. The needs start from the theoretical side: categories usable for
	the analysis of non-standard language are not readily available, and continue
	to methods and tools required for its detection and diagnostics. A general
	discussion of issues related to non-standard language is followed by two case
	studies. The first study presents a taxonomy of morphosyntactic categories as
	an attempt to analyse non-standard forms produced by non-native learners of
	Czech. The second study focusses on the role of a rule-based grammar and
	lexicon in the process of building and
	using a parsebank.},
  address = {Osaka, Japan},
  author = {Rosen, Alexandr},
  booktitle = {Proceedings of the Workshop on Grammar and Lexicon: interactions and interfaces (GramLex)},
  date-added = {2016-12-03 14:02:34 +0000},
  date-modified = {2018-07-21 14:39:25 +0000},
  editor = {Eva Hajičová and Igor Boguslavsky},
  isbn = {978-4-87974-706-8},
  keywords = {CzeSL; treebank},
  month = dec,
  pages = {120--131},
  publisher = {The COLING 2016 Organizing Committee},
  title = {Modeling non-standard language},
  url = {http://aclweb.org/anthology/W16-3815},
  year = {2016},
  bdsk-url-1 = {http://aclweb.org/anthology/W16-3815}
}
% Encyclopedia entry "HPSG" in Nový encyklopedický slovník češtiny.
@incollection{Rosen:2016c,
  author        = {Rosen, Alexandr},
  booktitle     = {Nový encyklopedický slovník češtiny},
  date-added    = {2020-11-14 22:37:09 +0100},
  date-modified = {2020-11-14 23:00:19 +0100},
  editor        = {Karlík, Petr and Nekula, Marek and Pleskalová, Jana},
  isbn          = {978-80-7422-480-5},
  publisher     = {Nakladatelství Lidové noviny},
  title         = {{HPSG}},
  url           = {https://www.czechency.org/slovnik/HPSG},
  year          = {2016},
  bdsk-url-1    = {https://www.czechency.org/slovnik/HPSG}
}
% Encyclopedia entry "LFG" in Nový encyklopedický slovník češtiny.
@incollection{Rosen:2016d,
  author        = {Rosen, Alexandr},
  booktitle     = {Nový encyklopedický slovník češtiny},
  date-added    = {2020-11-14 23:05:07 +0100},
  date-modified = {2020-11-14 23:07:13 +0100},
  editor        = {Karlík, Petr and Nekula, Marek and Pleskalová, Jana},
  isbn          = {978-80-7422-480-5},
  publisher     = {Nakladatelství Lidové noviny},
  title         = {{LFG}},
  url           = {https://www.czechency.org/slovnik/LFG},
  year          = {2016},
  bdsk-url-1    = {https://www.czechency.org/slovnik/LFG}
}
% Paper in the volume "Language, Corpora and Cognition" (Peter Lang).
% doi holds the bare identifier; the resolver prefix was removed so styles can
% build the link themselves.
@inproceedings{Rosen:2017,
  address = {Frankfurt am Main, Bern, Bruxelles, New York, Oxford, Warszawa, Wien},
  author = {Alexandr Rosen},
  booktitle = {Language, Corpora and Cognition},
  date-added = {2015-03-18 15:34:41 +0000},
  date-modified = {2018-07-19 15:09:00 +0000},
  doi = {10.3726/b10717},
  editor = {Piotr Pezik and Jacek Walinski and Krzysztof Kosecki},
  isbn = {9783631707098},
  keywords = {akces; CzeSL; CzeSL-SGT},
  pages = {163--180},
  publisher = {Peter Lang},
  title = {Introducing a corpus of non-native {C}zech with automatic annotation},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2016_SGT_lodz.pdf},
  year = {2017},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA1Li4vUFJPSkVDVFMvQ0hLL19QUC8yMDE0LWN6ZXNsLWxvZHovMjAxNl9TR1RfbG9kei5wZGZPEQGCAAAAAAGCAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8RMjAxNl9TR1RfbG9kei5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAUAAAogY3UAAAAAAAAAAAAAAAAADzIwMTQtY3plc2wtbG9kegAAAgA+LzpVc2VyczphaXI6UFJPSkVDVFM6Q0hLOl9QUDoyMDE0LWN6ZXNsLWxvZHo6MjAxNl9TR1RfbG9kei5wZGYADgAkABEAMgAwADEANgBfAFMARwBUAF8AbABvAGQAegAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIAPFVzZXJzL2Fpci9QUk9KRUNUUy9DSEsvX1BQLzIwMTQtY3plc2wtbG9kei8yMDE2X1NHVF9sb2R6LnBkZgATAAEvAAAVAAIACv//AAAACAANABoAJABcAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAeI=}
}
% Journal article in Prace Filologiczne, vol. LXX; three BibDesk file links attached.
@article{Rosen:2017a,
  author        = {Rosen, Alexandr},
  date-added    = {2017-02-01 10:57:29 +0000},
  date-modified = {2018-07-19 15:09:20 +0000},
  journal       = {Prace Filologiczne},
  keywords      = {akces; CzeSL},
  pages         = {377--397},
  title         = {Valency Patterns in {C}zech Learner Corpora},
  volume        = {LXX},
  year          = {2017},
  bdsk-file-1   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA9Li4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvMjAxNi1jemVzbC12YWxlbmNlLmRvY08RAZ4AAAAAAZ4AAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xYyMDE2LWN6ZXNsLXZhbGVuY2UuZG9jAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAFc4Qk5NU1dEAAEABQAACiBjdQAAAAAAAAAAAAAAAAASMjAxNi1jemVzbC12YWxlbmNlAAIARi86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC12YWxlbmNlOjIwMTYtY3plc2wtdmFsZW5jZS5kb2MADgAuABYAMgAwADEANgAtAGMAegBlAHMAbAAtAHYAYQBsAGUAbgBjAGUALgBkAG8AYwAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAERVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvMjAxNi1jemVzbC12YWxlbmNlLmRvYwATAAEvAAAVAAIACv//AAAACAANABoAJABkAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAgY=},
  bdsk-file-2   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA9Li4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvMjAxN19Sb3Nlbi1WYWxlbmNlLnBkZk8RAZ4AAAAAAZ4AAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xYyMDE3X1Jvc2VuLVZhbGVuY2UucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAASMjAxNi1jemVzbC12YWxlbmNlAAIARi86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC12YWxlbmNlOjIwMTdfUm9zZW4tVmFsZW5jZS5wZGYADgAuABYAMgAwADEANwBfAFIAbwBzAGUAbgAtAFYAYQBsAGUAbgBjAGUALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAERVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvMjAxN19Sb3Nlbi1WYWxlbmNlLnBkZgATAAEvAAAVAAIACv//AAAACAANABoAJABkAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAgY=},
  bdsk-file-3   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBILi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvUFJFU0VOVEFDRS8yMDE2LWN6ZXNsLXZhbGVuY2UucGRmTxEBrgAAAAABrgACAAAMTWFjaW50b3NoIEhEAAAAAAAAAAAAAAAAAAAAAAAAAEJEAAH/////FjIwMTYtY3plc2wtdmFsZW5jZS5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAAAAAAAAAAAAAAAQAGAAAKIGN1AAAAAAAAAAAAAAAAAApQUkVTRU5UQUNFAAIAUS86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC12YWxlbmNlOlBSRVNFTlRBQ0U6MjAxNi1jemVzbC12YWxlbmNlLnBkZgAADgAuABYAMgAwADEANgAtAGMAegBlAHMAbAAtAHYAYQBsAGUAbgBjAGUALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAE9Vc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXZhbGVuY2UvUFJFU0VOVEFDRS8yMDE2LWN6ZXNsLXZhbGVuY2UucGRmAAATAAEvAAAVAAIACv//AAAACAANABoAJABvAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAiE=}
}
% Workshop presentation (see the note field) at a SWE-CLARIN workshop on L2 resources.
@conference{Rosen:2017b,
  abstract      = {
A collection of essays, written by non-native learners of Czech, has been used to build several corpora, with or without metadata, in a dedicated multi-level format or structured according to a commonly used search tool. Its manual or automatic annotation includes correction, lemmas and standard morphosyntactic tags assigned to the source or the corrected target, and error labels based on a formally defined or grammar-based taxonomy. An overview of the annotation and the corpora themselves will be presented. The results suggest a question: to what extent they meet expectations of existing or prospective users and what can be done to better suit their needs.
},
  author        = {Rosen, Alexandr},
  booktitle     = {SWE-CLARIN: Workshop on interoperability of L2 resources and tools},
  date-added    = {2018-06-18 14:46:18 +0000},
  date-modified = {2018-08-03 13:51:52 +0000},
  keywords      = {akces; CzeSL},
  note          = {Presentation},
  title         = {Trying to make a learner corpus user happy: from annotation to search tools},
  year          = {2017},
  bdsk-file-1   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBALi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE3LWN6ZXNsLWNsYXJpbi8yMDE3LWN6ZXNsLWNsYXJpbi1zaG93LnBkZk8RAa4AAAAAAa4AAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xoyMDE3LWN6ZXNsLWNsYXJpbi1zaG93LnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAARMjAxNy1jemVzbC1jbGFyaW4AAAIASS86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNy1jemVzbC1jbGFyaW46MjAxNy1jemVzbC1jbGFyaW4tc2hvdy5wZGYAAA4ANgAaADIAMAAxADcALQBjAHoAZQBzAGwALQBjAGwAYQByAGkAbgAtAHMAaABvAHcALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEdVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE3LWN6ZXNsLWNsYXJpbi8yMDE3LWN6ZXNsLWNsYXJpbi1zaG93LnBkZgAAEwABLwAAFQACAAr//wAAAAgADQAaACQAZwAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAIZ}
}
% Paper in the volume "Grammar and Corpora 2016" (Heidelberg University Publishing).
% doi holds the bare identifier; the resolver prefix was removed so styles can
% build the link themselves.
@inproceedings{Rosen:2018,
  address = {Heidelberg},
  author = {Alexandr Rosen},
  booktitle = {Grammar and Corpora 2016},
  date-added = {2018-06-18 13:31:48 +0000},
  date-modified = {2019-01-18 11:38:41 +0100},
  doi = {10.17885/heiup.361.509},
  editor = {Eric Fuß and Marek Konopka and Beata Trawinski and Ulrich H. Waßner},
  isbn = {978-3-946054-84-9},
  keywords = {akces; treebank; CzeSL},
  pages = {271--287},
  publisher = {Heidelberg University Publishing},
  title = {Coping with unruly language: non-standard usage in a corpus},
  url = {https://heiup.uni-heidelberg.de/catalog/book/361},
  year = {2018},
  bdsk-url-1 = {https://heiup.uni-heidelberg.de/catalog/book/361},
  bdsk-url-2 = {https://dx.doi.org/10.17885/heiup.361.509}
}
% PhD thesis, Charles University, 2001.
% The capital of "Czech" in the title is braced, as in other entries of this file,
% so sentence-casing styles do not lowercase it.
@phdthesis{rosen:diss,
  address = {Prague},
  author = {Alexandr Rosen},
  date-added = {2013-05-31 15:30:36 +0000},
  date-modified = {2020-11-14 23:48:15 +0100},
  file = {rosen-diss.pdf:rosen-diss.pdf:PDF},
  keywords = {Czech; Word-order; Discontinuity; Clitics; DG; HPSG},
  school = {Charles University},
  title = {A constraint-based approach to dependency syntax applied to some issues of {C}zech word order},
  url = {http://utkl.ff.cuni.cz/~rosen/public/THESIS/},
  year = {2001},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/THESIS/}
}
% Journal article in Language Resources and Evaluation 48(1).
% doi holds the bare identifier (resolver prefix removed), and month uses the
% predefined BibTeX macro instead of a literal string.
@article{Rosen:etal:2013,
  author = {Rosen, Alexandr and Hana, Jirka and Štindlová, Barbora and Feldman, Anna},
  date-added = {2013-04-23 19:14:03 +0000},
  date-modified = {2018-07-19 15:17:30 +0000},
  doi = {10.1007/s10579-013-9226-3},
  issn = {1574-020X},
  journal = {Language Resources and Evaluation -- Special Issue: Resources for language learning},
  keywords = {Learner corpus; Error annotation; Second language acquisition; Czech; akces; CzeSL},
  language = {English},
  month = mar,
  number = {1},
  pages = {65--92},
  publisher = {Springer Netherlands},
  title = {Evaluating and automating the annotation of a learner corpus},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011-czesl-lrej_prefinal.pdf},
  volume = {48},
  year = {2014},
  bdsk-url-1 = {http://dx.doi.org/10.1007/s10579-013-9226-3}
}
@inproceedings{Rosen:etal:2014a,
  abstract = {Some kinds of diminutives tend to be misinterpreted by foreign language learners, even if their native language is typologically similar, as in the case of Polish learners of Czech. This is especially true about words with formal derivation suffixes but lacking any diminutive or emphatic meaning. However, a simple type of mismatch -- the presence and absence of diminutive suffixes or meanings in translational equivalents -- is often accompanied by polysemy and/or homonymy and the prevalent phenomenon of false friends. Following a detailed analysis of the individual types of mismatches we present a study of translational equivalents, extracted automatically from a parallel corpus (about 12 mil. word tokens per language). Diminutive lexemes are classified according to their suffix. The results complement previous results about a higher number of diminutive types in Czech by detailed statistics calculated on tokens.},
  address = {Warszawa},
  author = {Alexandr Rosen and Elżbieta Kaczmarska and Svatava Škodová},
  booktitle = {Glottodydaktyka wobec wielokulturowości},
  date-added = {2014-10-28 12:30:37 +0000},
  date-modified = {2018-07-21 14:40:42 +0000},
  doi = {10.13140/2.1.2926.3681},
  editor = {Elżbieta Kaczmarska and Andrzej Zieniewicz},
  isbn = {978-83-923037},
  keywords = {contrastive lexical analysis},
  pages = {51--66},
  publisher = {Wydział Polonistyki Uniwersytetu Warszawskiego},
  title = {Zdrobnienia jako element kultury i pułapka glottodydaktyczna. {C}zeskie i polskie deminutiva w ujęciu konfrontatywnym na podstawie badań korpusowych [{Diminutives as a cultural element and a glottodidactic trap -- Czech and Polish diminutives from a contrastive corpus-based perspective}]},
  year = {2014},
  bdsk-url-1 = {http://dx.doi.org/10.13140/2.1.2926.3681}
}
@book{Rosen:etal:2020,
  abstract = {Žákovské korpusy, čili korpusy, které dokumentují jazyk tak, jak jej používají nerodilí mluvčí, poskytují důležité informace pro výzkum osvojování jazyka i pedagogickou praxi. Tato monografie představuje CzeSL – korpus češtiny nerodilých mluvčích, a to na pozadí teoretických a praktických otázek současného výzkumu v oboru žákovských korpusů.
Jazyky s bohatou morfologií a volným slovosledem, včetně češtiny, jsou pro analýzu osvojovaného jazyka obzvláště náročné. Autoři se zabývají složitostí chybové anotace a popisují tři vzájemně se doplňující anotační schémata. Věnují se také popisu nerodilé češtiny z hlediska standardních jazykových kategorií.
Kniha podrobně rozebírá praktické aspekty tvorby korpusu: proces sběru a anotace, potřebné nástroje, výsledná data, jejich formáty a vyhledávací rozhraní. Kapitola o aplikacích korpusu ilustruje jeho užitečnost pro výuku, výzkum akvizice i počítačovou lingvistiku. Každý, kdo se zabývá tvorbou žákovských korpusů, jistě ocení závěrečnou kapitolu, shrnující úskalí, kterým je třeba se vyhnout.

Learner corpora, linguistic collections documenting a language as used by learners, provide an important empirical foundation for language acquisition research and teaching practice. This book presents CzeSL, a corpus of non-native Czech, against the background of theoretical and practical issues in the current learner corpus research. 
Languages with rich morphology and relatively free word order, including Czech, are particularly challenging for the analysis of learner language. The authors address both the complexity of learner error annotation, describing three complementary annotation schemes, and the complexity of description of non-native Czech in terms of standard linguistic categories.
The book discusses in detail practical aspects of the corpus creation: the process of collection and annotation itself, the supporting tools, the resulting data, their formats and search platforms.
The chapter on use cases exemplifies the usefulness of learner corpora for teaching, language acquisition research, and computational linguistics. Any researcher developing learner corpora will surely appreciate the concluding chapter listing lessons learned and pitfalls to avoid.},
  address = {Praha},
  author = {Alexandr Rosen and Jiří Hana and Barbora Hladká and Tomáš Jelínek and Svatava Škodová and Barbora Štindlová},
  booktitle = {Compiling and annotating a learner corpus for a morphologically rich language – CzeSL, a corpus of non-native Czech},
  date-added = {2020-11-02 13:22:14 +0100},
  date-modified = {2020-11-14 21:53:44 +0100},
  keywords = {CzeSL},
  publisher = {Karolinum, Charles University Press},
  title = {Compiling and annotating a learner corpus for a morphologically rich language – {CzeSL}, a corpus of non-native {C}zech},
  url = {https://dspace.cuni.cz/handle/20.500.11956/123103},
  year = {2020},
  bdsk-url-1 = {https://dspace.cuni.cz/handle/20.500.11956/123103}
}
@article{Rosen:etal:2020a,
  abstract = {Multi-word lexemes in syntactic context. 
We start with the assumption that (i) a corpus represents the use of language, i.e. linguistic performance, (ii) a rule-based grammar represents language as a system, i.e. linguistic competence, and (iii) corpus annotation represents the interface between the two. To detect and diagnose mismatches between the language use and the language system we use a constraint-based grammar run as a constraint solver on texts tagged and dependency-parsed by stochastic tools. The texts also have MWEs (multi-word expressions) identified and transformed into a constituency-based format before the grammar is applied. We describe the role and results of the grammar, and its use to check texts annotated with morphosyntactic categories, syntactic structure and information about the status of relevant expressions as MWEs. The grammar also employs lexical resources such as a valency lexicon and a database of MWEs to make the checking more accurate and the annotation more informative. The results are represented as typed feature structures where MWE-related information can be shared by lexical and phrasal nodes. This allows for the annotation of MWEs as lexical units, independently of their analysis in terms of syntactic structure. Focusing on the interplay of MWEs with their syntactic context we analyse a number of representative examples, pointing out the pros and cons of specific solutions and the whole approach.},
  author = {Alexandr Rosen and Hana Skoumalová and Jiří Znamenáček},
  date-added = {2020-11-14 22:12:34 +0100},
  date-modified = {2020-11-14 22:19:44 +0100},
  journal = {Studie z aplikované lingvistiky / Studies in Applied Linguistics},
  keywords = {MWE; Czech; HPSG; syntax; treebank; multi-word expressions},
  number = {2},
  pages = {63--84},
  title = {Víceslovné lexémy v syntaktickém kontextu},
  url = {https://dspace.cuni.cz/handle/20.500.11956/123090},
  volume = {11},
  year = {2020},
  bdsk-url-1 = {https://dspace.cuni.cz/handle/20.500.11956/123090}
}
@article{rosen:saloni:2006,
  author        = {Alexandr Rosen and Zygmunt Saloni},
  title         = {Kategorie honorativu v českých konjugačních paradigmatech},
  journal       = {Slovo a slovesnost},
  volume        = {66},
  number        = {1},
  year          = {2006},
  keywords      = {Czech},
  url           = {http://utkl.ff.cuni.cz/~rosen/public/paradigmata2.pdf},
  file          = {rosenSaloni2006-honorativVCesKonjugaci.pdf:rosenSaloni2006-honorativVCesKonjugaci.pdf:PDF},
  timestamp     = {2007.03.21},
  date-added    = {2013-05-31 15:30:36 +0000},
  date-modified = {2013-10-14 16:43:35 +0000},
  bdsk-url-1    = {http://utkl.ff.cuni.cz/~rosen/public/paradigmata2.pdf}
}
@article{Rosen:Skoumalova:2018,
  author        = {Alexandr Rosen and Hana Skoumalová},
  title         = {No way to have your say out of the frame: specifying valency of multi-word expressions},
  journal       = {Prace Filologiczne},
  volume        = {LXXII},
  pages         = {301--320},
  year          = {2018},
  keywords      = {idioms; phraseology; Czech; lexical database; transformations},
  abstract      = {Niektóre wielowyrazowe leksemy wykazują w uzusie znaczny stopień zmienności.
Ogólne zasady morfologii i składni przejawiają się zarówno w szyku wyrazów, formach
wariantywnych i modyfikowalności części leksemu, jak i w możliwościach uzupełnień walencyjnych
oraz transformacjach składniowych. Przenikanie się powszechnych i specyficznych 
pod względem leksykalnym właściwości leksemu w przypadku wielowyrazowych wyrażeń
przejawia się również w częściowym uzgodnieniu z wymogami walencyjnymi ich komponentów.
Dlatego zdarza się, że słowniki walencyjne przedstawiają wybrane typy wyrażeń wielowyrazowych
jako część jednowyrazowego hasła. Nasza propozycja organizacji słownika frazeologicznego
polega na opisie wszystkich typów wyrażeń wielowyrazowych bez konieczności
redundantnego określania ogólnych właściwości leksemów. W artykule ukazujemy problemy
związane z aplikacją tej zasady na przykładzie walencji i transformacji syntaktycznych.},
  date-added    = {2018-12-30 20:27:26 +0100},
  date-modified = {2018-12-30 20:31:24 +0100}
}
@inproceedings{Rosen:Vavrin:2012,
  abstract = {We present the architecture and the current state of InterCorp, a multilingual parallel corpus centered around Czech, intended primarily for human users and consisting of written texts with a focus on fiction. Following an outline of its recent development and a comparison with some other multilingual parallel corpora we give an overview of the data collection procedure that covers text selection criteria, data format, conversion, alignment, lemmatization and tagging. Finally, we discuss challenges and prospects of the project.},
  address = {Istanbul, Turkey},
  author = {Alexandr Rosen and Martin Vavřín},
  booktitle = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)},
  date-added = {2012-05-28 22:44:16 +0000},
  date-modified = {2018-07-21 14:49:42 +0000},
  editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  eventdate = {2012-05-23/2012-05-25},
  isbn = {978-2-9517408-7-7},
  keywords = {parallel corpora; multilingual; Czech; intercorp},
  language = {english},
  pages = {2447--2452},
  publisher = {European Language Resources Association (ELRA)},
  sponsor = {MSM0021620823},
  title = {Building a multilingual parallel corpus for human users},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/200.html},
  year = {2012},
  bdsk-url-1 = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/200.html}
}
@misc{Sebesta:etal:2012,
  author        = {Šebesta, Karel and Bedřichová, Zuzanna and Štindlová, Barbora and Hrdlička, Milan and Hrdličková, Tereza and Hana, Jiří and Rosen, Alexandr and Petkevič, Vladimír and Jelínek, Tomáš and Škodová, Svatava and Janeš, Petr and Lundáková, Kateřina and Skoumalová, Hana and Šťastný, Klement and Sládek, Šimon},
  title         = {{AKCES} 4},
  year          = {2012},
  note          = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({ÚFAL}), Faculty of Mathematics and Physics, Charles University},
  url           = {http://hdl.handle.net/11858/00-097C-0000-000C-2293-0},
  copyright     = {Attribution-{NonCommercial}-{NoDerivs} 3.0 Unported ({CC} {BY}-{NC}-{ND} 3.0)},
  keywords      = {CzeSL},
  date-added    = {2020-04-30 17:47:35 +0200},
  date-modified = {2020-11-09 12:19:01 +0100},
  bdsk-url-1    = {http://hdl.handle.net/11858/00-097C-0000-000C-2293-0}
}
@inproceedings{Skodova:etal:2011,
  address = {Praha},
  author = {Svatava Škodová and Barbora Štindlová and Jirka Hana and Alexandr Rosen},
  booktitle = {Korpusová lingvistika Praha 2011: 3 -- Gramatika a značkování korpusů},
  date-added = {2011-10-09 17:21:10 +0200},
  date-modified = {2018-07-19 15:11:13 +0000},
  editor = {Vladimír Petkevič and Alexandr Rosen},
  isbn = {978-80-7422-116-3},
  keywords = {akces; CzeSL},
  organization = {Ústav Českého národního korpusu},
  pages = {208--225},
  publisher = {Nakladatelství Lidové noviny},
  series = {Studie z korpusové lingvistiky},
  title = {Víceúrovňová anotace českého žákovského korpusu},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011-czesl-korpling.pdf},
  volume = {16},
  year = {2011},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2011-czesl-korpling.pdf}
}
@techreport{Skodova:etal:2016,
  author        = {Svatava Škodová and Barbora Štindlová and Jirka Hana and Alexandr Rosen},
  title         = {Building and annotating corpora of non-native {C}zech},
  type          = {presentation},
  institution   = {ELC UC},
  address       = {Louvain-la-Neuve},
  year          = {2016},
  keywords      = {akces; CzeSL},
  date-added    = {2018-06-18 15:04:56 +0000},
  date-modified = {2018-08-04 10:57:47 +0000},
  bdsk-file-1   = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA9Li4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLWxvdXZhaW4vMjAxNi1jemVzbC1sb3V2YWluLnBkZk8RAZ4AAAAAAZ4AAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xYyMDE2LWN6ZXNsLWxvdXZhaW4ucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAASMjAxNi1jemVzbC1sb3V2YWluAAIARi86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC1sb3V2YWluOjIwMTYtY3plc2wtbG91dmFpbi5wZGYADgAuABYAMgAwADEANgAtAGMAegBlAHMAbAAtAGwAbwB1AHYAYQBpAG4ALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAERVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLWxvdXZhaW4vMjAxNi1jemVzbC1sb3V2YWluLnBkZgATAAEvAAAVAAIACv//AAAACAANABoAJABkAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAgY=}
}
@techreport{Skodova:etal:2019,
  author        = {Svatava Škodová and Barbora Štindlová and Alexandr Rosen and Tomáš Jelínek and Barbora Hladká},
  title         = {Příručka k morfologické anotaci češtiny nerodilých mluvčích ({A} Guide for Morphological Annotation of Non-native {C}zech)},
  type          = {Manual},
  edition       = {1.3},
  institution   = {Univerzita Karlova},
  address       = {Praha},
  year          = {2019},
  doi           = {10.13140/RG.2.2.34952.78080},
  url           = {https://docs.google.com/document/d/1-CXo7CY8gAE3qtflqA6Orl3VfaP5Sf9N4pEDDEqfP2E/edit?usp=sharing},
  keywords      = {CzeSL},
  date-added    = {2019-01-17 22:02:53 +0100},
  date-modified = {2020-11-09 12:25:18 +0100},
  bdsk-url-1    = {https://docs.google.com/document/d/1-CXo7CY8gAE3qtflqA6Orl3VfaP5Sf9N4pEDDEqfP2E/edit?usp=sharing}
}
@inproceedings{Stemle:etal:2019,
  address = {Louvain-la-Neuve},
  author = {Egon W. Stemle and Adriane Boyd and Maarten Janssen and Therese Lindström Tiedemann and Nives Mikelić Preradović and Alexandr Rosen and Dan Rosén and Elena Volodina},
  booktitle = {Widening the Scope of Learner Corpus Research. Selected Papers from the Fourth Learner Corpus Research Conference},
  date-added = {2019-07-26 23:18:15 +0200},
  date-modified = {2020-07-02 16:10:29 +0200},
  editor = {Andrea Abel and Aivars Glaznieks and Verena Lyding and Lionel Nicolas},
  pages = {427--468},
  publisher = {Presses universitaires de Louvain},
  series = {Corpora and Language in Use -- Proceedings 5},
  title = {Working together towards an ideal infrastructure for language learner corpora},
  url = {https://www.researchgate.net/publication/337874363_Working_together_towards_an_ideal_infrastructure_for_language_learner_corpora_Towards_an_infrastructure_for_language_learner_corpora},
  year = {2019},
  bdsk-url-1 = {https://docs.google.com/document/d/19_L87PNfJxy-tE2vmi0NbXxMuk-fLQPQt100Kw_e7FA/edit#heading=h.qxwd5b3pc9qg}
}
@inproceedings{Stindlova:2011b,
  abstract = {The paper describes a corpus of texts produced by non-native speakers of
Czech. We discuss its annotation scheme, consisting of three interlinked levels, designed to handle a wide range of error types present in the input. Each level corrects different types of errors; links between the levels allow capturing errors in word order and complex discontinuous expressions. Errors are not only corrected, but also classified. The annotation scheme is tested on a doubly-annotated sample of approx. 10,000 words with fair inter-annotator agreement results. We also explore the possibility of applying automated linguistic annotation tools (taggers, spell checkers and grammar checkers) to the learner text to support or even substitute manual annotation.},
  address = {Frankfurt am Main},
  annote = {PALC 2011 -- Practical Applications in Language and Computers, Łódź 13--15 April 2011},
  author = {Barbora Štindlová and Alexandr Rosen and Jirka Hana and Svatava Škodová},
  booktitle = {Corpus Data across Languages and Disciplines},
  date-added = {2011-09-29 17:23:17 +0200},
  date-modified = {2018-07-19 14:58:57 +0000},
  editor = {Piotr Pęzik},
  isbn = {978-3-631-62547-7},
  issn = {1437-5281},
  keywords = {learner corpus; error annotation; second language acquisition; Czech; akces; CzeSL},
  pages = {21--32},
  publisher = {Peter Lang},
  series = {Łódź Studies in Language},
  title = {Cze{SL} -- an error tagged corpus of {C}zech as a second language},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011-czesl-palc.pdf},
  volume = {28},
  year = {2012},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA9Li4vUFJPSkVDVFMvQ0hLL19QUC8yMDExLWN6ZXNsLXBhbGMvMjAxMS1jemVzbC1wYWxjLWZpbmFsLnBkZk8RAaIAAAAAAaIAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xkyMDExLWN6ZXNsLXBhbGMtZmluYWwucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAAPMjAxMS1jemVzbC1wYWxjAAACAEYvOlVzZXJzOmFpcjpQUk9KRUNUUzpDSEs6X1BQOjIwMTEtY3plc2wtcGFsYzoyMDExLWN6ZXNsLXBhbGMtZmluYWwucGRmAA4ANAAZADIAMAAxADEALQBjAHoAZQBzAGwALQBwAGEAbABjAC0AZgBpAG4AYQBsAC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBEVXNlcnMvYWlyL1BST0pFQ1RTL0NISy9fUFAvMjAxMS1jemVzbC1wYWxjLzIwMTEtY3plc2wtcGFsYy1maW5hbC5wZGYAEwABLwAAFQACAAr//wAAAAgADQAaACQAZAAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAIK},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2011-czesl-palc.pdf}
}
@inproceedings{Stindlova:etal:2010,
  address = {Frankfurt am Main},
  author = {Barbora Štindlová and Svatava Škodová and Alexandr Rosen and Jirka Hana},
  booktitle = {Slavic Languages in Formal Grammar. Proceedings of FDSL 8.5, Brno 2010},
  date-added = {2011-09-29 20:06:23 +0200},
  date-modified = {2018-07-19 15:13:47 +0000},
  editor = {Markéta Ziková and Mojmír Dočekal},
  isbn = {978-3-631-63609-1},
  keywords = {akces; CzeSL},
  pages = {205--219},
  publisher = {Peter Lang},
  title = {Annotating foreign learners' {C}zech},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2010-czesl-fdsl-prefinal.pdf},
  year = {2012},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBALi4vUFJPSkVDVFMvQ0hLL19QUC8yMDEwLWN6ZXNsLWZkc2wvMjAxMC1jemVzbC1mZHNsLXByZWZpbmFsLnBkZk8RAbAAAAAAAbAAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xwyMDEwLWN6ZXNsLWZkc2wtcHJlZmluYWwucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAAPMjAxMC1jemVzbC1mZHNsAAACAEkvOlVzZXJzOmFpcjpQUk9KRUNUUzpDSEs6X1BQOjIwMTAtY3plc2wtZmRzbDoyMDEwLWN6ZXNsLWZkc2wtcHJlZmluYWwucGRmAAAOADoAHAAyADAAMQAwAC0AYwB6AGUAcwBsAC0AZgBkAHMAbAAtAHAAcgBlAGYAaQBuAGEAbAAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIAR1VzZXJzL2Fpci9QUk9KRUNUUy9DSEsvX1BQLzIwMTAtY3plc2wtZmRzbC8yMDEwLWN6ZXNsLWZkc2wtcHJlZmluYWwucGRmAAATAAEvAAAVAAIACv//AAAACAANABoAJABnAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAhs=},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2010-czesl-fdsl.pdf}
}
@inproceedings{Stindlova:etal:2013,
  address = {Louvain-la-Neuve},
  author = {Barbora Štindlová and Svatava Škodová and Jirka Hana and Alexandr Rosen},
  booktitle = {Twenty Years of Learner Corpus Research: Looking back, Moving ahead},
  date-added = {2012-06-13 13:40:03 +0000},
  date-modified = {2018-07-19 15:19:19 +0000},
  editor = {Sylviane Granger and Gaëtanelle Gilquin and Fanny Meunier},
  keywords = {learner corpora; error annotation; akces; CzeSL},
  month = sep,
  publisher = {Presses Universitaires de Louvain},
  series = {Corpora and Language in Use -- Proceedings 1},
  title = {A learner corpus of {C}zech: current state and future directions},
  url = {http://utkl.ff.cuni.cz/~rosen/public/LCR2011_proceedings_Stindlova-et-al_prefinal.pdf},
  year = {2013},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/LCR2011_proceedings_Stindlova-et-al_prefinal.pdf}
}
@unpublished{Stindlova:Rosen:2012,
  author = {Barbora Štindlová and Alexandr Rosen},
  date-added = {2012-06-13 13:08:19 +0000},
  date-modified = {2020-05-13 21:26:32 +0200},
  doi = {10.13140/RG.2.2.24106.64968},
  institution = {Technical University Liberec and Charles University Prague},
  keywords = {learner corpora; error annotation; akces; CzeSL},
  note = {[Annotation manual for the CzeSL learner corpus].},
  title = {Návod k anotaci chybového korpusu},
  url = {http://utkl.ff.cuni.cz/~rosen/public/anotace.pdf},
  urldate = {2012-06-13},
  year = {2012},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/anotace.pdf}
}
@inproceedings{Stindlova:Rosen:2016,
  address = {Praha},
  author = {Barbora Štindlová and Alexandr Rosen},
  booktitle = {VIII. mezinárodní sympozium},
  date-added = {2018-06-18 14:56:23 +0000},
  date-modified = {2018-07-19 15:14:08 +0000},
  keywords = {akces; CzeSL},
  note = {Presentation},
  organization = {Ústav bohemistických studií FF UK},
  title = {Analýza genitivu v jazyce nerodilých mluvčích na základě žákovského korpusu},
  year = {2016},
  bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBKLi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXN5bXBvc2l1bS8yMDE2LWN6ZXNsLXN5bXBvc2l1bS1nZW5pdGl2LnBwdHhPEQHSAAAAAAHSAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8fMjAxNi1jemVzbC1zeW1wb3MjRkZGRkZGRkYucHB0eAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAABQUFRYUFBUMwABAAUAAAogY3UAAAAAAAAAAAAAAAAAFDIwMTYtY3plc2wtc3ltcG9zaXVtAAIAUy86VXNlcnM6YWlyOlBST0pFQ1RTOkNISzpfUFA6MjAxNi1jemVzbC1zeW1wb3NpdW06MjAxNi1jemVzbC1zeW1wb3NpdW0tZ2VuaXRpdi5wcHR4AAAOAEQAIQAyADAAMQA2AC0AYwB6AGUAcwBsAC0AcwB5AG0AcABvAHMAaQB1AG0ALQBnAGUAbgBpAHQAaQB2AC4AcABwAHQAeAAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAFFVc2Vycy9haXIvUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXN5bXBvc2l1bS8yMDE2LWN6ZXNsLXN5bXBvc2l1bS1nZW5pdGl2LnBwdHgAABMAAS8AABUAAgAK//8AAAAIAA0AGgAkAHEAAAAAAAACAQAAAAAAAAAFAAAAAAAAAAAAAAAAAAACRw==},
  bdsk-file-2 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBJLi4vUFJPSkVDVFMvQ0hLL19QUC8yMDE2LWN6ZXNsLXN5bXBvc2l1bS8yMDE2LWN6ZXNsLXN5bXBvc2l1bS1nZW5pdGl2LnBkZk8RAcwAAAAAAcwAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////x8yMDE2LWN6ZXNsLXN5bXBvc2kjRkZGRkZGRkYucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAEABQAACiBjdQAAAAAAAAAAAAAAAAAUMjAxNi1jemVzbC1zeW1wb3NpdW0AAgBSLzpVc2VyczphaXI6UFJPSkVDVFM6Q0hLOl9QUDoyMDE2LWN6ZXNsLXN5bXBvc2l1bToyMDE2LWN6ZXNsLXN5bXBvc2l1bS1nZW5pdGl2LnBkZgAOAEIAIAAyADAAMQA2AC0AYwB6AGUAcwBsAC0AcwB5AG0AcABvAHMAaQB1AG0ALQBnAGUAbgBpAHQAaQB2AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBQVXNlcnMvYWlyL1BST0pFQ1RTL0NISy9fUFAvMjAxNi1jemVzbC1zeW1wb3NpdW0vMjAxNi1jemVzbC1zeW1wb3NpdW0tZ2VuaXRpdi5wZGYAEwABLwAAFQACAAr//wAAAAgADQAaACQAcAAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAJA}
}
@inproceedings{VavRos:08,
  author = {Martin Vavřín and Alexandr Rosen},
  booktitle = {Proceedings of the International Conference Corpus Linguistics -- 2008},
  date-modified = {2018-07-21 14:41:25 +0000},
  isbn = {978-5-288-04769-5},
  keywords = {intercorp},
  language = {English},
  location = {St. Petersburg},
  pages = {97--104},
  publisher = {St. Petersburg State University},
  title = {{InterCorp}: A Multilingual Parallel Corpus Project},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2008_intercorp_peterburg.pdf},
  year = 2008,
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2008_intercorp_peterburg.pdf}
}
@comment{{BibDesk Smart Groups




	
		conditions
		
			
				comparison
				2
				key
				Any Field
				value
				Štindlová
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				CzeSL
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				Šebesta
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				Bedřichová
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				Šormová
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				Škodová
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				non-native Czech
				version
				1
			
		
		conjunction
		1
		group name
		AKCES
	
	
		conditions
		
			
				comparison
				2
				key
				Author
				value
				Vavřín
				version
				1
			
			
				comparison
				2
				key
				Any Field
				value
				InterCorp
				version
				1
			
		
		conjunction
		1
		group name
		InterCorp
	
	
		conditions
		
			
				comparison
				2
				key
				Author
				value
				Rosen
				version
				1
			
			
				comparison
				2
				key
				Author
				value
				Alexandr
				version
				1
			
		
		conjunction
		0
		group name
		Rosen
	


}}

This file was generated by bibtex2html 1.96.