blbooks

مراجع:

الجميع

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/all')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 14011953
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

القرن التاسع عشر

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1800s')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 13781747
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

القرن الثامن عشر

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1700s')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 178224
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1510_1699

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1510_1699')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 51982
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1500_1899

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1500_1899')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 14011953
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1800_1899

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1800_1899')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 13781747
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1700_1799

استخدم الأمر التالي لتحميل مجموعة البيانات هذه في TFDS:

ds = tfds.load('huggingface:blbooks/1700_1799')
  • وصف :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • الترخيص : لا يوجد ترخيص معروف
  • الإصدار : 1.0.2
  • الإنشقاقات :
ينقسم أمثلة
'train' 178224
  • سمات :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}