How to solve a problem on Kaggle with TF-Hub

TF-Hub is a platform for sharing machine learning expertise packaged in reusable resources, notably pre-trained modules. In this tutorial, we will use a TF-Hub text embedding module to train a simple sentiment classifier with a reasonable baseline accuracy. We will then submit the predictions to Kaggle.
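
As a quick illustration of what such a module provides, here is a minimal sketch (not part of the original notebook) that loads the same embedding module used later in this tutorial and maps two phrases to fixed-size vectors:

import tensorflow as tf
import tensorflow_hub as hub

# Load a pre-trained text embedding module and embed two example phrases.
embed = hub.load("https://tfhub.dev/google/nnlm-en-dim128/1").signatures['default']
vectors = embed(tf.constant(["a masterpiece", "utterly boring"]))['default']
print(vectors.shape)  # (2, 128): one 128-dimensional vector per phrase

Fixed-size vectors like these are what the classifier below is trained on.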

For a more detailed tutorial on text classification with TF-Hub, and further steps for improving the accuracy, take a look at Text classification with TF-Hub.

Setup

pip install -q kaggle
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile

from sklearn import model_selection
2022-12-14 21:06:07.262712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-14 21:06:07.262817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-12-14 21:06:07.262829: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.

Since this tutorial uses a dataset from Kaggle, you need to create an API token for your Kaggle account and upload it to the Colab environment.

import os
import pathlib

# Upload the API token.
def get_kaggle():
  try:
    import kaggle
    return kaggle
  except OSError:
    pass

  token_file = pathlib.Path("~/.kaggle/kaggle.json").expanduser()
  token_file.parent.mkdir(exist_ok=True, parents=True)

  try:
    from google.colab import files
  except ImportError:
    raise ValueError("Could not find kaggle token.")

  uploaded = files.upload()
  token_content = uploaded.get('kaggle.json', None)
  if token_content:
    token_file.write_bytes(token_content)
    token_file.chmod(0o600)
  else:
    raise ValueError('Need a file named "kaggle.json"')

  import kaggle
  return kaggle


kaggle = get_kaggle()

Getting started

Data

We will try to solve the Sentiment Analysis on Movie Reviews task from Kaggle. The dataset consists of syntactic subphrases of Rotten Tomatoes movie reviews. The task is to label the phrases from negative to positive on a five-point scale.
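
Each row of train.tsv carries a PhraseId, the SentenceId of the review sentence the phrase was extracted from, the Phrase text itself, and a Sentiment value from 0 to 4. As a rough illustration of that structure (the rows below are invented, not taken from the dataset):

# Illustrative only: the shape of the training data (rows are invented).
example_df = pd.DataFrame(
    {"SentenceId": [1, 1],
     "Phrase": ["a quietly moving story", "quietly moving"],
     "Sentiment": [3, 3]},
    index=pd.Index([1, 2], name="PhraseId"))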

You must accept the competition rules before you can download the data with this API.

SENTIMENT_LABELS = [
    "negative", "somewhat negative", "neutral", "somewhat positive", "positive"
]

# Add a column with readable values representing the sentiment.
def add_readable_labels_column(df, sentiment_value_column):
  df["SentimentLabel"] = df[sentiment_value_column].replace(
      range(5), SENTIMENT_LABELS)

# Download data from Kaggle and create a DataFrame.
def load_data_from_zip(path):
  with zipfile.ZipFile(path, "r") as zip_ref:
    name = zip_ref.namelist()[0]
    with zip_ref.open(name) as zf:
      return pd.read_csv(zf, sep="\t", index_col=0)


# The data does not come with a validation set so we'll create one from the
# training set.
def get_data(competition, train_file, test_file, validation_set_ratio=0.1):
  data_path = pathlib.Path("data")
  kaggle.api.competition_download_files(competition, data_path)
  competition_path = (data_path/competition)
  competition_path.mkdir(exist_ok=True, parents=True)
  competition_zip_path = competition_path.with_suffix(".zip")

  with zipfile.ZipFile(competition_zip_path, "r") as zip_ref:
    zip_ref.extractall(competition_path)

  train_df = load_data_from_zip(competition_path/train_file)
  test_df = load_data_from_zip(competition_path/test_file)

  # Add a human readable label.
  add_readable_labels_column(train_df, "Sentiment")

  # We split by sentence ids, because we don't want to have phrases belonging
  # to the same sentence in both training and validation set.
  train_indices, validation_indices = model_selection.train_test_split(
      np.unique(train_df["SentenceId"]),
      test_size=validation_set_ratio,
      random_state=0)

  validation_df = train_df[train_df["SentenceId"].isin(validation_indices)]
  train_df = train_df[train_df["SentenceId"].isin(train_indices)]
  print("Split the training data into %d training and %d validation examples." %
        (len(train_df), len(validation_df)))

  return train_df, validation_df, test_df


train_df, validation_df, test_df = get_data(
    "sentiment-analysis-on-movie-reviews",
    "train.tsv.zip", "test.tsv.zip")
Split the training data into 140315 training and 15745 validation examples.

Note: The task in this competition is not to rate entire reviews, but individual phrases within the reviews. This is a much harder task.

train_df.head(20)

Train a model

Note: We could also model this task as a regression; see Text classification with TF-Hub.
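
For reference, here is a minimal sketch of what that regression variant could look like; the head and loss below are illustrative assumptions, not code from this notebook. The idea is to end in a single linear unit trained with a mean squared error loss against the 0-4 sentiment value:

# Hypothetical regression variant (illustrative): predict the sentiment value
# as one scalar instead of 5 class logits.
class MyRegressionModel(tf.keras.Model):
  def __init__(self, hub_url):
    super().__init__()
    self.hub_url = hub_url
    self.embed = hub.load(self.hub_url).signatures['default']
    self.sequential = tf.keras.Sequential([
      tf.keras.layers.Dense(500),
      tf.keras.layers.Dense(100),
      tf.keras.layers.Dense(1),  # a single continuous output
    ])

  def call(self, inputs):
    phrases = inputs['Phrase'][:,0]
    embedding = 5*self.embed(phrases)['default']
    return self.sequential(embedding)

regression_model = MyRegressionModel("https://tfhub.dev/google/nnlm-en-dim128/1")
regression_model.compile(
    loss=tf.losses.MeanSquaredError(),
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.metrics.MeanAbsoluteError()])
# At prediction time, round and clip back to the 0-4 label range, e.g.
# np.clip(np.round(preds[:, 0]), 0, 4).astype(int)

Treating the labels as ordered values this way penalizes distant mistakes more than adjacent ones, which arguably fits an ordinal five-point scale better.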

class MyModel(tf.keras.Model):
  def __init__(self, hub_url):
    super().__init__()
    self.hub_url = hub_url
    # Load the pre-trained text embedding module from TF-Hub.
    self.embed = hub.load(self.hub_url).signatures['default']
    # A small fully connected head that maps embeddings to 5 class logits.
    self.sequential = tf.keras.Sequential([
      tf.keras.layers.Dense(500),
      tf.keras.layers.Dense(100),
      tf.keras.layers.Dense(5),
    ])

  def call(self, inputs):
    # Each example arrives as a length-1 string vector; take the scalar phrase.
    phrases = inputs['Phrase'][:,0]
    # Embed the phrases; the constant scaling factor is kept from the original
    # setup (an empirical choice).
    embedding = 5*self.embed(phrases)['default']
    return self.sequential(embedding)

  def get_config(self):
    return {"hub_url": self.hub_url}
model = MyModel("https://tfhub.dev/google/nnlm-en-dim128/1")
# The model outputs raw logits (there is no softmax layer), so the loss is
# configured with from_logits=True.
model.compile(
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
history = model.fit(x=dict(train_df), y=train_df['Sentiment'],
                    validation_data=(dict(validation_df), validation_df['Sentiment']),
                    epochs=25)
Epoch 1/25
4385/4385 [==============================] - 14s 3ms/step - loss: 1.0246 - accuracy: 0.5858 - val_loss: 0.9966 - val_accuracy: 0.5968
Epoch 2/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9997 - accuracy: 0.5953 - val_loss: 0.9854 - val_accuracy: 0.5933
Epoch 3/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9951 - accuracy: 0.5971 - val_loss: 0.9866 - val_accuracy: 0.5996
Epoch 4/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9930 - accuracy: 0.5980 - val_loss: 0.9843 - val_accuracy: 0.5943
Epoch 5/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9919 - accuracy: 0.5971 - val_loss: 0.9815 - val_accuracy: 0.5980
Epoch 6/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9903 - accuracy: 0.5977 - val_loss: 0.9844 - val_accuracy: 0.5921
Epoch 7/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9892 - accuracy: 0.5984 - val_loss: 0.9821 - val_accuracy: 0.5952
Epoch 8/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9889 - accuracy: 0.5975 - val_loss: 0.9838 - val_accuracy: 0.5881
Epoch 9/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9885 - accuracy: 0.5985 - val_loss: 0.9816 - val_accuracy: 0.5921
Epoch 10/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9879 - accuracy: 0.5986 - val_loss: 0.9821 - val_accuracy: 0.5964
Epoch 11/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9876 - accuracy: 0.5998 - val_loss: 0.9804 - val_accuracy: 0.5945
Epoch 12/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9876 - accuracy: 0.5991 - val_loss: 0.9837 - val_accuracy: 0.5915
Epoch 13/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9875 - accuracy: 0.5994 - val_loss: 0.9790 - val_accuracy: 0.5953
Epoch 14/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9871 - accuracy: 0.5990 - val_loss: 0.9855 - val_accuracy: 0.5985
Epoch 15/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9868 - accuracy: 0.5989 - val_loss: 0.9804 - val_accuracy: 0.5952
Epoch 16/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9871 - accuracy: 0.5995 - val_loss: 0.9797 - val_accuracy: 0.5940
Epoch 17/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9866 - accuracy: 0.5994 - val_loss: 0.9787 - val_accuracy: 0.5955
Epoch 18/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9866 - accuracy: 0.5996 - val_loss: 0.9794 - val_accuracy: 0.5971
Epoch 19/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9864 - accuracy: 0.5998 - val_loss: 0.9734 - val_accuracy: 0.5975
Epoch 20/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9862 - accuracy: 0.5990 - val_loss: 0.9796 - val_accuracy: 0.5954
Epoch 21/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9864 - accuracy: 0.5991 - val_loss: 0.9755 - val_accuracy: 0.6002
Epoch 22/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9862 - accuracy: 0.5997 - val_loss: 0.9811 - val_accuracy: 0.5983
Epoch 23/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9860 - accuracy: 0.5999 - val_loss: 0.9815 - val_accuracy: 0.5903
Epoch 24/25
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9862 - accuracy: 0.6004 - val_loss: 0.9816 - val_accuracy: 0.5918
Epoch 25/25
4385/4385 [==============================] - 13s 3ms/step - loss: 0.9860 - accuracy: 0.5993 - val_loss: 0.9786 - val_accuracy: 0.5959

Prediction

Run predictions for the validation set and the training set.

# Plot training and validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
[<matplotlib.lines.Line2D at 0x7f550c6f09a0>]

png

train_eval_result = model.evaluate(dict(train_df), train_df['Sentiment'])
validation_eval_result = model.evaluate(dict(validation_df), validation_df['Sentiment'])

print(f"Training set accuracy: {train_eval_result[1]}")
print(f"Validation set accuracy: {validation_eval_result[1]}")
4385/4385 [==============================] - 12s 3ms/step - loss: 0.9820 - accuracy: 0.6014
493/493 [==============================] - 1s 2ms/step - loss: 0.9786 - accuracy: 0.5959
Training set accuracy: 0.6014040112495422
Validation set accuracy: 0.5959352254867554

Confusion matrix

Another very interesting statistic, especially for multiclass problems, is the confusion matrix. The confusion matrix visualizes the proportion of correctly and incorrectly labeled examples, making it easy to see how biased the classifier is and whether the distribution of labels makes sense. Ideally, the largest fraction of predictions should lie along the diagonal.

# Take the class with the highest logit as the predicted sentiment.
predictions = model.predict(dict(validation_df))
predictions = tf.argmax(predictions, axis=-1)
predictions
493/493 [==============================] - 1s 2ms/step
<tf.Tensor: shape=(15745,), dtype=int64, numpy=array([2, 2, 2, ..., 2, 2, 2])>
cm = tf.math.confusion_matrix(validation_df['Sentiment'], predictions)
# Normalize each row so it shows the fraction of examples with a given true
# label that were assigned to each predicted class.
cm = cm/cm.numpy().sum(axis=1)[:, tf.newaxis]
sns.heatmap(
    cm, annot=True,
    xticklabels=SENTIMENT_LABELS,
    yticklabels=SENTIMENT_LABELS)
plt.xlabel("Predicted")
plt.ylabel("True")
Text(50.72222222222221, 0.5, 'True')

png

We can easily submit the predictions back to Kaggle by pasting the following code into a code cell and executing it:

# Predict a sentiment class for each test phrase.
test_predictions = model.predict(dict(test_df))
test_predictions = np.argmax(test_predictions, axis=-1)

result_df = test_df.copy()

result_df["Predictions"] = test_predictions

# Write the predictions in the format Kaggle expects (PhraseId index with a
# Sentiment column) and submit them through the API.
result_df.to_csv(
    "predictions.csv",
    columns=["Predictions"],
    header=["Sentiment"])
kaggle.api.competition_submit("predictions.csv", "Submitted from Colab",
                              "sentiment-analysis-on-movie-reviews")

After submitting, check the leaderboard to see how you did.
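
If you want to confirm that the submission was received, you can list your recent submissions through the same API client; this assumes your version of the kaggle package exposes competition_submissions (the kaggle competitions submissions CLI command shows the same information):

# Optional sanity check: list recent submissions for this competition.
for submission in kaggle.api.competition_submissions(
    "sentiment-analysis-on-movie-reviews"):
  print(submission)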