counter

ข้อมูลอ้างอิง:

ใช้คำสั่งต่อไปนี้เพื่อโหลดชุดข้อมูลนี้ใน TFDS:

ds = tfds.load('huggingface:counter')
  • คำอธิบาย :
The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. It has been manually annotated at the document level with three levels of reuse: wholly derived, partially derived, and non-derived.
  • สัญญาอนุญาต : คลังข้อมูล (corpus) นี้เผยแพร่ภายใต้สัญญาอนุญาต Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License
  • เวอร์ชัน : 1.0.0
  • ชุดข้อมูลย่อย (Splits) :
ชุดข้อมูลย่อย จำนวนตัวอย่าง
'train' 600
  • ฟีเจอร์ (Features) :
{
    "source": {
        "filename": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "headline": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "body": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "total_number_of_words": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "total_number_of_sentences": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "number_of_words_with_swr": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "newspaper": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "newsdate": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "domain": {
            "num_classes": 5,
            "names": [
                "business",
                "sports",
                "national",
                "foreign",
                "showbiz"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        },
        "classification": {
            "num_classes": 3,
            "names": [
                "wholly_derived",
                "partially_derived",
                "not_derived"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        }
    },
    "derived": {
        "filename": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "headline": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "body": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "total_number_of_words": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "total_number_of_sentences": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "number_of_words_with_swr": {
            "dtype": "int64",
            "id": null,
            "_type": "Value"
        },
        "newspaper": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "newsdate": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "domain": {
            "num_classes": 5,
            "names": [
                "business",
                "sports",
                "national",
                "foreign",
                "showbiz"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        },
        "classification": {
            "num_classes": 3,
            "names": [
                "wholly_derived",
                "partially_derived",
                "not_derived"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        }
    }
}