TFDS now supports the Croissant 🥐 format! Read the documentation to learn more.

wmt20_mlqe_task3

References:

plain_text

Use the following command to load this dataset in TFDS:

ds = tfds.load('huggingface:wmt20_mlqe_task3/plain_text')

Description:

This shared task (part of WMT20) will build on its previous editions
to further examine automatic methods for estimating the quality
of neural machine translation output at run-time, without relying
on reference translations. As in previous years, we cover estimation
at various levels. Important elements introduced this year include: a new
task where sentences are annotated with Direct Assessment (DA)
scores instead of labels based on post-editing; a new multilingual
sentence-level dataset mainly from Wikipedia articles, where the
source articles can be retrieved for document-wide context; and the
availability of NMT models to explore system-internal information for the task.

The goal of Task 3 is to predict document-level quality scores as well as fine-grained annotations.

License: Unknown
Version: 1.1.0
Splits:

Split	Examples
`'test'`	180
`'train'`	1448
`'validation'`	200

Features:

{
    "document_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "source_segments": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "source_tokenized": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "mt_segments": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "mt_tokenized": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "annotations": {
        "feature": {
            "segment_id": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "annotation_start": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "annotation_length": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "severity": {
                "num_classes": 3,
                "names": [
                    "minor",
                    "major",
                    "critical"
                ],
                "names_file": null,
                "id": null,
                "_type": "ClassLabel"
            },
            "severity_weight": {
                "dtype": "float32",
                "id": null,
                "_type": "Value"
            },
            "category": {
                "num_classes": 45,
                "names": [
                    "Addition",
                    "Agreement",
                    "Ambiguous Translation",
                    "Capitalization",
                    "Character Encoding",
                    "Company Terminology",
                    "Date/Time",
                    "Diacritics",
                    "Duplication",
                    "False Friend",
                    "Grammatical Register",
                    "Hyphenation",
                    "Inconsistency",
                    "Lexical Register",
                    "Lexical Selection",
                    "Named Entity",
                    "Number",
                    "Omitted Auxiliary Verb",
                    "Omitted Conjunction",
                    "Omitted Determiner",
                    "Omitted Preposition",
                    "Omitted Pronoun",
                    "Orthography",
                    "Other POS Omitted",
                    "Over-translation",
                    "Overly Literal",
                    "POS",
                    "Punctuation",
                    "Shouldn't Have Been Translated",
                    "Shouldn't have been translated",
                    "Spelling",
                    "Tense/Mood/Aspect",
                    "Under-translation",
                    "Unidiomatic",
                    "Unintelligible",
                    "Unit Conversion",
                    "Untranslated",
                    "Whitespace",
                    "Word Order",
                    "Wrong Auxiliary Verb",
                    "Wrong Conjunction",
                    "Wrong Determiner",
                    "Wrong Language Variety",
                    "Wrong Preposition",
                    "Wrong Pronoun"
                ],
                "names_file": null,
                "id": null,
                "_type": "ClassLabel"
            }
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "token_annotations": {
        "feature": {
            "segment_id": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "first_token": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "last_token": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "token_after_gap": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "severity": {
                "num_classes": 3,
                "names": [
                    "minor",
                    "major",
                    "critical"
                ],
                "names_file": null,
                "id": null,
                "_type": "ClassLabel"
            },
            "category": {
                "num_classes": 45,
                "names": [
                    "Addition",
                    "Agreement",
                    "Ambiguous Translation",
                    "Capitalization",
                    "Character Encoding",
                    "Company Terminology",
                    "Date/Time",
                    "Diacritics",
                    "Duplication",
                    "False Friend",
                    "Grammatical Register",
                    "Hyphenation",
                    "Inconsistency",
                    "Lexical Register",
                    "Lexical Selection",
                    "Named Entity",
                    "Number",
                    "Omitted Auxiliary Verb",
                    "Omitted Conjunction",
                    "Omitted Determiner",
                    "Omitted Preposition",
                    "Omitted Pronoun",
                    "Orthography",
                    "Other POS Omitted",
                    "Over-translation",
                    "Overly Literal",
                    "POS",
                    "Punctuation",
                    "Shouldn't Have Been Translated",
                    "Shouldn't have been translated",
                    "Spelling",
                    "Tense/Mood/Aspect",
                    "Under-translation",
                    "Unidiomatic",
                    "Unintelligible",
                    "Unit Conversion",
                    "Untranslated",
                    "Whitespace",
                    "Word Order",
                    "Wrong Auxiliary Verb",
                    "Wrong Conjunction",
                    "Wrong Determiner",
                    "Wrong Language Variety",
                    "Wrong Preposition",
                    "Wrong Pronoun"
                ],
                "names_file": null,
                "id": null,
                "_type": "ClassLabel"
            }
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "token_index": {
        "feature": {
            "feature": {
                "feature": {
                    "dtype": "int32",
                    "id": null,
                    "_type": "Value"
                },
                "length": -1,
                "id": null,
                "_type": "Sequence"
            },
            "length": -1,
            "id": null,
            "_type": "Sequence"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "total_words": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    }
}