% Conference paper comparing three XML formats for sharing parallel corpora
% (XCES, TMX and TEI). Cleaned up from a machine export: canonical entry type,
% de-duplicated publisher/organization text, brace-protected acronym in the
% title, and unambiguous "Last, First" author names.
@inproceedings{Simoes2011,
  author       = {Sim{\~o}es, Alberto and Fernandes, Sara},
  title        = {{XML} schemas for parallel corpora},
  booktitle    = {Xata 2011},
  year         = {2011},
  pages        = {11},
  publisher    = {CEH - CL - Livros de Actas},
  organization = {CEH - CL - Livros de Actas},
  keywords     = {Corpora, Parallel corpora, XML, XML schemas},
  url          = {http://hdl.handle.net/1822/17078},
  abstract     = {Parallel corpora are resources used in Natural Language Processing and Computational Linguistics. They are defined as a set of texts, in different languages, that are translations of each other. Note that these translations do not need to cover the full document, as we might have sentences translated just on some of the languages. When dealing with the process of sharing resources, recent years have bet on the use of XML formats. This is no different when talking about parallel corpora sharing. When visiting different projects in the web that release parallel corpora for download, we can find at least three different formats. In fact, this abundance of formats has led some projects to adopt all the three formats. This article discusses these three main formats: XML Corpus Encoding Standard, Translation Memory Exchange format and the Text Encoding Initiative. We will compare their formal definition and their XML schema.},
}