TFDS now supports the Croissant 🥐 format! Read the documentation to learn more.

wili_2018

References:

WiLI-2018 dataset

Use the following command to load this dataset in TFDS:

ds = tfds.load('huggingface:wili_2018/WiLI-2018 dataset')

Description:

WiLI-2018 is a benchmark dataset for language identification. It contains 235,000 paragraphs written in 235 languages.

License: ODC Open Database License v1.0
Version: 1.1.0
Splits:

Split	Examples
`'test'`	117500
`'train'`	117500

Features:

{
    "sentence": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "label": {
        "num_classes": 235,
        "names": [
            "cdo",
            "glk",
            "jam",
            "lug",
            "san",
            "rue",
            "wol",
            "new",
            "mwl",
            "bre",
            "ara",
            "hye",
            "xmf",
            "ext",
            "cor",
            "yor",
            "div",
            "asm",
            "lat",
            "cym",
            "hif",
            "ace",
            "kbd",
            "tgk",
            "rus",
            "nso",
            "mya",
            "msa",
            "ava",
            "cbk",
            "urd",
            "deu",
            "swa",
            "pus",
            "bxr",
            "udm",
            "csb",
            "yid",
            "vro",
            "por",
            "pdc",
            "eng",
            "tha",
            "hat",
            "lmo",
            "pag",
            "jav",
            "chv",
            "nan",
            "sco",
            "kat",
            "bho",
            "bos",
            "kok",
            "oss",
            "mri",
            "fry",
            "cat",
            "azb",
            "kin",
            "hin",
            "sna",
            "dan",
            "egl",
            "mkd",
            "ron",
            "bul",
            "hrv",
            "som",
            "pam",
            "nav",
            "ksh",
            "nci",
            "khm",
            "sgs",
            "srn",
            "bar",
            "cos",
            "ckb",
            "pfl",
            "arz",
            "roa-tara",
            "fra",
            "mai",
            "zh-yue",
            "guj",
            "fin",
            "kir",
            "vol",
            "hau",
            "afr",
            "uig",
            "lao",
            "swe",
            "slv",
            "kor",
            "szl",
            "srp",
            "dty",
            "nrm",
            "dsb",
            "ind",
            "wln",
            "pnb",
            "ukr",
            "bpy",
            "vie",
            "tur",
            "aym",
            "lit",
            "zea",
            "pol",
            "est",
            "scn",
            "vls",
            "stq",
            "gag",
            "grn",
            "kaz",
            "ben",
            "pcd",
            "bjn",
            "krc",
            "amh",
            "diq",
            "ltz",
            "ita",
            "kab",
            "bel",
            "ang",
            "mhr",
            "che",
            "koi",
            "glv",
            "ido",
            "fao",
            "bak",
            "isl",
            "bcl",
            "tet",
            "jpn",
            "kur",
            "map-bms",
            "tyv",
            "olo",
            "arg",
            "ori",
            "lim",
            "tel",
            "lin",
            "roh",
            "sqi",
            "xho",
            "mlg",
            "fas",
            "hbs",
            "tam",
            "aze",
            "lad",
            "nob",
            "sin",
            "gla",
            "nap",
            "snd",
            "ast",
            "mal",
            "mdf",
            "tsn",
            "nds",
            "tgl",
            "nno",
            "sun",
            "lzh",
            "jbo",
            "crh",
            "pap",
            "oci",
            "hak",
            "uzb",
            "zho",
            "hsb",
            "sme",
            "mlt",
            "vep",
            "lez",
            "nld",
            "nds-nl",
            "mrj",
            "spa",
            "ceb",
            "ina",
            "heb",
            "hun",
            "que",
            "kaa",
            "mar",
            "vec",
            "frp",
            "ell",
            "sah",
            "eus",
            "ces",
            "slk",
            "chr",
            "lij",
            "nep",
            "srd",
            "ilo",
            "be-tarask",
            "bod",
            "orm",
            "war",
            "glg",
            "mon",
            "gle",
            "min",
            "ibo",
            "ile",
            "epo",
            "lav",
            "lrc",
            "als",
            "mzn",
            "rup",
            "fur",
            "tat",
            "myv",
            "pan",
            "ton",
            "kom",
            "wuu",
            "tcy",
            "tuk",
            "kan",
            "ltg"
        ],
        "names_file": null,
        "id": null,
        "_type": "ClassLabel"
    }
}