
Custom differentiation


This tutorial shows you how to define your own custom derivatives, perform derivative surgery, and implement your own gradient checkpointing API, in just 5 lines of Swift.

Declaring custom derivatives

You can define custom derivatives for any Swift function that has differentiable parameters and results. By doing that, you can even import a C function and make it differentiable.

 import Glibc

func sillyExp(_ x: Float) -> Float {
    let 𝑒 = Float(M_E)
    print("Taking 𝑒(\(𝑒)) to the power of \(x)!")
    return pow(𝑒, x)
}

@derivative(of: sillyExp)
func sillyDerivative(_ x: Float) -> (value: Float, pullback: (Float) -> Float) {
    let y = sillyExp(x)
    return (value: y, pullback: { v in v * y })
}

print("exp(3) =", sillyExp(3))
print("𝛁exp(3) =", gradient(of: sillyExp)(3))
 
Taking 𝑒(2.7182817) to the power of 3.0!
exp(3) = 20.085535
Taking 𝑒(2.7182817) to the power of 3.0!
𝛁exp(3) = 20.085535

Stop derivatives from propagating

Commonly known as "stop gradient" in machine learning use cases, the method withoutDerivative(at:) stops derivatives from propagating.

In addition, withoutDerivative(at:) can sometimes help the Swift compiler identify what not to differentiate and produce more efficient derivatives. When the compiler can detect that the derivative of a function will always be zero, it produces a warning; explicitly using withoutDerivative(at:) silences that warning.

 let x: Float = 2.0
let y: Float = 3.0
gradient(at: x, y) { x, y in
    sin(sin(sin(x))) + withoutDerivative(at: cos(cos(cos(y))))
}
 
▿ 2 elements

  - .0 : -0.18009877
  - .1 : 0.0

Derivative surgery

The withDerivative(_:) method makes arbitrary operations (including mutation) run on the gradient at a value during the enclosing function's backpropagation.

Use this to debug or make experimental tweaks to backpropagation.
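
For instance, here is a minimal sketch (not part of the original example) of one such experimental tweak: clipping the gradient that flows through an intermediate value during backpropagation.

let clippedGradient = gradient(at: Float(2)) { x -> Float in
    // Clamp the adjoint flowing into `x * x` to the range [-1, 1].
    let y = (x * x).withDerivative { (dy: inout Float) in
        dy = min(max(dy, -1), 1)
    }
    return 100 * y
}
// Without clipping, the gradient would be 100 * 2x = 400 at x = 2;
// with the adjoint clamped to 1, the result is 2x = 4.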

It works anywhere

All differentiation APIs provided by the standard library are defined generically over all types that conform to the Differentiable protocol: Float, Double, Float80, SIMD vectors, and even your own types!

Read the technical document Differentiable Types for more insights on the Differentiable protocol.

 var x: Float = 30
gradient(at: x) { x -> Float in
    // Print the partial derivative with respect to the result of `sin(x)`.
    let a = sin(x).withDerivative { print("∂+/∂sin = \($0)") } 
    // Force the partial derivative with respect to `x` to be `0.5`.
    let b = log(x.withDerivative { (dx: inout Float) in
        print("∂log/∂x = \(dx), but rewritten to 0.5");
        dx = 0.5
    })
    return a + b
}
 
∂log/∂x = 0.033333335, but rewritten to 0.5
∂+/∂sin = 1.0

0.65425146
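
As a quick illustration of "even your own types", here is a minimal sketch (the Point struct is hypothetical, not part of the original tutorial): the compiler synthesizes the Differentiable conformance for a struct whose stored properties are all differentiable, so gradient(at:) works on it directly.

struct Point: Differentiable {
    var x: Float
    var y: Float
}

// Differentiate f(p) = p.x * p.x + 2 * p.y with respect to the whole struct.
let 𝛁point = gradient(at: Point(x: 3, y: 4)) { p -> Float in
    p.x * p.x + 2 * p.y
}
// `𝛁point` is a `Point.TangentVector` with x = 6.0 and y = 2.0.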

Use it in a neural network module

Just like the way we used it in a simple Float function, we can use it in any numerical application, like the following neural network built using the Swift for TensorFlow Deep Learning Library.

 import TensorFlow

struct MLP: Layer {
    var layer1 = Dense<Float>(inputSize: 2, outputSize: 10, activation: relu)
    var layer2 = Dense<Float>(inputSize: 10, outputSize: 1, activation: relu)
    
    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        let h0 = layer1(input).withDerivative { print("∂L/∂layer1 =", $0) }
        return layer2(h0)
    }
}

var classifier = MLP()
let optimizer = SGD(for: classifier, learningRate: 0.02)

let x: Tensor<Float> = [[0, 0], [0, 1], [1, 0], [1, 1]]
let y: Tensor<Float> = [0, 1, 1, 0]

for _ in 0..<10 {
    let 𝛁model = gradient(at: classifier) { classifier -> Tensor<Float> in
        let ŷ = classifier(x).withDerivative { print("∂L/∂ŷ =", $0) }
        let loss = (ŷ - y).squared().mean()
        print("Loss: \(loss)")
        return loss
    }
    optimizer.update(&classifier, along: 𝛁model)
}
 
Loss: 0.4281609
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.19194186],
 [-0.12774679]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.09375162, -0.124435335,   0.06574867, -0.023810884,   0.08656448,  0.104850195,
  -0.040182292, -0.007851038, -0.016217625,    0.1361082],
 [-0.062396336,   0.08281786, -0.043758992,   0.01584732,  -0.05761294,  -0.06978299,
   0.026743302, 0.0052252538,  0.010793631,  -0.09058674]]
Loss: 0.4256891
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.18452805],
 [-0.12899026]]
∂L/∂layer1 = [[          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [   0.08978041,   -0.11994719,   0.062917426,  -0.022909585,    0.08303144,    0.10047636,
   -0.038630243, -0.0075702597,  -0.015591215,    0.13058722],
 [  -0.06275902,   0.083846435,  -0.043981038,    0.01601444,  -0.058041297,   -0.07023578,
    0.027003618,   0.005291823,   0.010898695,   -0.09128411]]
Loss: 0.42343885
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.17764306],
 [-0.12992996]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.08611148, -0.115765825,  0.060307615, -0.022072455,   0.07976325,  0.096430376,
  -0.037188895, -0.007308357, -0.015009486,   0.12547429],
 [ -0.06298282,   0.08467232,  -0.04410961,   0.01614402,  -0.05833966,  -0.07053017,
    0.02720034, 0.0053454074,  0.010978092,  -0.09177318]]
Loss: 0.4213786
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.17123967],
 [-0.13059705]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [   0.0827157,  -0.11186533,  0.057897244, -0.021293767,  0.076734796,   0.09268116,
   -0.03584837, -0.007063774, -0.014468448,   0.12073135],
 [ -0.06308366,   0.08531482,  -0.04415571,   0.01623983, -0.058522295,   -0.0706839,
   0.027339993,  0.005387233,  0.011034457,  -0.09207655]]
Loss: 0.419482
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.16527513],
 [-0.13101962]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.07956703, -0.108221985,  0.055666752, -0.020568334,   0.07392358,    0.0892009,
  -0.034599714, -0.006835082,  -0.01396449,  0.116324194],
 [-0.063075684,   0.08579151, -0.044129066,  0.016305268,  -0.05860192,  -0.07071281,
   0.027428458,  0.005418419,   0.01107016, -0.092214435]]
Loss: 0.41772705
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.15971094],
 [-0.13122296]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [ 0.076642305,  -0.10481434,    0.0535988, -0.019891495,   0.07130953,   0.08596473,
  -0.033434875, -0.006620981, -0.013494358,  0.112222254],
 [ -0.06297146,  0.086118385,  -0.04403827,  0.016343407, -0.058589894,  -0.07063102,
   0.027471025, 0.0054399823,  0.011087341, -0.092204936]]
Loss: 0.4160954
∂L/∂ŷ = [[     -0.25],
 [     -0.25],
 [ 0.1545125],
 [-0.1312299]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.07392088,  -0.10162293,   0.05167799,  -0.01925905,   0.06887471,   0.08295049,
    -0.0323466,  -0.00642029, -0.013055129,   0.10839817],
 [-0.062782176,   0.08630996, -0.043890934,  0.016357018,  -0.05849638,  -0.07045116,
    0.02747248, 0.0054528536,  0.011087928,  -0.09206428]]
Loss: 0.41457158
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.14964825],
 [-0.13106102]]
∂L/∂layer1 = [[         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [         0.0,          0.0,          0.0,          0.0,          0.0,          0.0,
           0.0,          0.0,          0.0,          0.0],
 [  0.07138416,   -0.0986299,  0.049890514, -0.018667182,  0.066602945,   0.08013815,
  -0.031328287,  -0.00623192, -0.012644137,   0.10482717],
 [-0.062517814,   0.08637945, -0.043693807,  0.016348602,  -0.05833045,  -0.07018449,
   0.027437123,  0.005457877,  0.011073658,  -0.09180699]]
Loss: 0.41314268
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [  0.1450899],
 [-0.13073498]]
∂L/∂layer1 = [[          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [   0.06901559,   -0.09581909,   0.048224125,  -0.018112455,    0.06447981,   0.077509835,
   -0.030374015, -0.0060548857,  -0.012258992,    0.10148715],
 [  -0.06218732,    0.08633894,  -0.043452926,   0.016320443,    -0.0581003,   -0.06984116,
    0.027368868,   0.005455827,   0.011046111,  -0.091446206]]
Loss: 0.41179782
∂L/∂ŷ = [[      -0.25],
 [      -0.25],
 [ 0.14081168],
 [-0.13026857]]
∂L/∂layer1 = [[          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [          0.0,           0.0,           0.0,           0.0,           0.0,           0.0,
            0.0,           0.0,           0.0,           0.0],
 [  0.066800244,   -0.09317576,   0.046667818,  -0.017591748,    0.06249226,    0.07504944,
   -0.029478388, -0.0058882837,  -0.011897515,    0.09835809],
 [  -0.06179865,   0.086199336,   -0.04317362,   0.016274586,  -0.057813223,  -0.069430195,
    0.027271228,  0.0054474054,   0.011006703,   -0.09099364]]

Recomputing activations during backpropagation to save memory (checkpointing)

Checkpointing is a traditional technique in reverse-mode automatic differentiation for saving memory. Rather than saving large intermediate values in the original computation for computing derivatives, the intermediate values are instead recomputed as needed during backpropagation.

This technique has been realized in modern deep learning libraries as well. In Swift, the withRecomputationInPullbacks(_:) API enables you to control what to recompute during backpropagation, and it is available on all Differentiable types.
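
As a rough illustration (the exact call pattern below is our assumption about the method form, not taken from the original tutorial), it could be used like this, with cos(x) being recomputed in the pullback rather than captured by it:

let dydx = gradient(at: Float(1)) { x -> Float in
    // Hypothetical usage sketch: `cos(x)` would be recomputed when the pullback
    // runs, instead of being stored during the forward pass.
    x.withRecomputationInPullbacks { cos($0) }
}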

But today, let us learn how to define our own gradient checkpointing APIs from scratch, in just a few lines of code.

Our gradient checkpointing API

We can define our own gradient checkpointing API, makeRecomputedInGradient(_:), in terms of the standard library function differentiableFunction(from:), which is a shorthand for creating a differentiable function directly from a derivative function (also called a "vector-Jacobian products (VJP) function").

As we have seen before, the derivative function returns a tuple of the original function's result and a pullback closure. We return original(x) in value:, and call pullback(at:in:) on original to evaluate the original function again and get a pullback.

 /// Given a differentiable function, returns the same differentiable function except when
/// derivatives of this function are being computed. In that case, values in the original function needed
/// for computing the derivatives will be recomputed, instead of being captured by the differential or pullback.
///
/// - Parameter body: The body of the differentiable function.
/// - Returns: The same differentiable function whose derivatives, when computed, will recompute
///   some values from the original function.
func makeRecomputedInGradient<T: Differentiable, U: Differentiable>(
    _ original: @escaping @differentiable (T) -> U
) -> @differentiable (T) -> U {
    return differentiableFunction { x in
        (value: original(x), pullback: { v in pullback(at: x, in: original)(v) })
    }
}
 

Verify it works

 let input: Float = 10.0
print("Running original computation...")

// Differentiable multiplication with checkpointing.
let square = makeRecomputedInGradient { (x: Float) -> Float in
    print("  Computing square...")
    return x * x
}

// Differentiate `f(x) = (cos(x))^2`.
let (output, backprop) = valueWithPullback(at: input) { input -> Float in
    return square(cos(input))
}
print("Running backpropagation...")
let grad = backprop(1)
print("Gradient = \(grad)")
 
Running original computation...
  Computing square...
Running backpropagation...
  Computing square...
Gradient = -0.9129453

Extend it to neural network modules

In this example, we define a simple convolutional neural network.

 struct Model: Layer {
    var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6))
    var maxPool = MaxPool2D<Float>(poolSize: (2, 2), strides: (2, 2))
    var flatten = Flatten<Float>()
    var dense = Dense<Float>(inputSize: 36 * 6, outputSize: 10)

    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        return input.sequenced(through: conv, maxPool, flatten, dense)
    }
}
 

We want activations in the convolution layer (conv) to be recomputed during backpropagation. However, using makeRecomputedInGradient(_:) directly could make the resulting code look cumbersome, especially when we want to apply layers sequentially using sequenced(through:_:_:_:).

 input.sequenced(through: conv, maxPool, flatten, dense)
 

So, why don't we define a special layer type that wraps a layer and makes its activations be recomputed during backpropagation? Let's do it.

First, we define a makeRecomputedInGradient(_:) overload that takes a binary function.

 // Same as the previous `makeRecomputedInGradient(_:)`, except it's for binary functions.
func makeRecomputedInGradient<T: Differentiable, U: Differentiable, V: Differentiable>(
    _ original: @escaping @differentiable (T, U) -> V
) -> @differentiable (T, U) -> V {
    return differentiableFunction { x, y in
        (value: original(x, y), pullback: { v in pullback(at: x, y, in: original)(v) })
    }
}
 

Then, we define a generic layer ActivationDiscarding<Wrapped>.

 import TensorFlow

/// A layer wrapper that makes the underlying layer's activations be discarded during application
/// and recomputed during backpropagation.
struct ActivationDiscarding<Wrapped: Layer>: Layer {
    /// The wrapped layer.
    var wrapped: Wrapped

    @differentiable
    func callAsFunction(_ input: Wrapped.Input) -> Wrapped.Output {
        let apply = makeRecomputedInGradient { (layer: Wrapped, input: Input) -> Wrapped.Output in
            print("    Applying \(Wrapped.self) layer...")
            return layer(input)
        }
        return apply(wrapped, input)
    }
}
 

Finally, we can add a method on all layers that returns the same layer, except its activations are discarded during application and recomputed during backpropagation.

 extension Layer {
    func discardingActivations() -> ActivationDiscarding<Self> {
        return ActivationDiscarding(wrapped: self)
    }
}
 

Back in our model, all we have to change is to wrap the convolution layer into the activation-recomputing layer.

 var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6)).discardingActivations()
 

Now, simply use it in the model!

 struct Model: Layer {
    var conv = Conv2D<Float>(filterShape: (5, 5, 3, 6)).discardingActivations()
    var maxPool = MaxPool2D<Float>(poolSize: (2, 2), strides: (2, 2))
    var flatten = Flatten<Float>()
    var dense = Dense<Float>(inputSize: 36 * 6, outputSize: 10)

    @differentiable
    func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        return input.sequenced(through: conv, maxPool, flatten, dense)
    }
}
 

When we run a training loop, we can see that the convolution layer's activations are computed twice: once during layer application, and once during backpropagation.

 // Use random training data.
let x = Tensor<Float>(randomNormal: [10, 16, 16, 3])
let y = Tensor<Int32>(rangeFrom: 0, to: 10, stride: 1)

var model = Model()
let opt = SGD(for: model)

for i in 1...5 {
    print("Starting training step \(i)")
    print("  Running original computation...")
    let (logits, backprop) = model.appliedForBackpropagation(to: x)
    let (loss, dL_dŷ) = valueWithGradient(at: logits) { logits in
        softmaxCrossEntropy(logits: logits, labels: y)
    }
    print("  Loss: \(loss)")
    print("  Running backpropagation...")
    let (dL_dθ, _) = backprop(dL_dŷ)
    
    opt.update(&model, along: dL_dθ)
}
 
Starting training step 1
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 3.3176332
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 2
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.7285542
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 3
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.3323467
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 4
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 2.0400991
  Running backpropagation...
    Applying Conv2D<Float> layer...
Starting training step 5
  Running original computation...
    Applying Conv2D<Float> layer...
  Loss: 1.8097637
  Running backpropagation...
    Applying Conv2D<Float> layer...

Just like that, it is super easy to define generic differentiable programming libraries for different domains.