12.4. Differentiation
Linux 5.4.0-74-generic
Python 3.9.5 @ GCC 7.3.0
Latest build date 2021.06.21
tensorflow version: 2.5.0
from toolkit import H
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = "SIMHEI"
matplotlib.rcParams['axes.unicode_minus'] = False
Automatic differentiation
TensorFlow provides the tf.GradientTape API for automatic differentiation. Every computation step executed inside a tf.GradientTape context is recorded onto the tape; TensorFlow then uses that tape, together with the gradient of each recorded operation, to compute the derivatives of the recorded computation by reverse-mode differentiation. For example, for y = x^2 reverse mode gives dy/dx = 2x, which is 6 at x = 3, matching the output of the code below.
# First create a variable to be optimized
x = tf.Variable(initial_value=3.)
print("x trainable:", x.trainable)
# Inside the tf.GradientTape() context, every computation step is recorded for differentiation
with tf.GradientTape() as tape:
    y = tf.square(x)
# Compute the derivative of y with respect to x
y_grad = tape.gradient(y, x)
print(y_grad)
x trainable: True
tf.Tensor(6.0, shape=(), dtype=float32)
If the tensor to be differentiated is not a trainable variable, calling the watch method inside the tf.GradientTape context also makes TensorFlow track it.
x = tf.constant(3.)
with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.square(x)
y_grad = tape.gradient(y, x)
print(y_grad)
tf.Tensor(6.0, shape=(), dtype=float32)
GradientTape records every operation that is executed, including Python control flow such as if branches and for/while loops.
def f(x, y):
    output = 1.0
    for i in range(y):
        if i > 1 and i < 5:
            output = tf.multiply(output, x)
    return output

def grad(x, y):
    with tf.GradientTape() as t:
        t.watch(x)
        out = f(x, y)
    return t.gradient(out, x)

x = tf.convert_to_tensor(2.0)
# y >= 5: the loop multiplies by x three times (i = 2, 3, 4), so out = x**3 and d(out)/dx = 3 * x**2 = 12 at x = 2
assert grad(x, 6).numpy() == 12.0
assert grad(x, 5).numpy() == 12.0
# y = 4: only two multiplications (i = 2, 3), so out = x**2 and d(out)/dx = 2 * x = 4 at x = 2
assert grad(x, 4).numpy() == 4.0
Persistent GradientTape objects
By default, the resources held by a GradientTape are released as soon as GradientTape.gradient() is called. Creating a persistent gradient tape instead allows several derivatives of the same computation to be taken, i.e. gradient() can be called more than once. For example:
x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as t:
    t.watch(x)
    y = x * x
    z = y * y
dz_dx = t.gradient(z, x)  # 108.0 (4*x^3 at x = 3)
dy_dx = t.gradient(y, x)  # 6.0
del t  # Drop the reference to the tape
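For contrast, here is a minimal sketch (not part of the original example) of what should happen without persistent=True: the first gradient() call succeeds, while the second is expected to raise a RuntimeError because a non-persistent tape frees its resources after one call.

x = tf.constant(3.0)
with tf.GradientTape() as t:  # non-persistent tape
    t.watch(x)
    y = x * x
    z = y * y
dz_dx = t.gradient(z, x)  # first call works: 108.0
try:
    dy_dx = t.gradient(y, x)  # second call should fail
except RuntimeError as err:
    print("RuntimeError:", err)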
Differentiating a multivariate function
X = tf.constant([[1., 2.], [3., 4.]])
y = tf.constant([[1.], [2.]])
w = tf.Variable(initial_value=[[1.], [2.]])
b = tf.Variable(initial_value=1.)
with tf.GradientTape() as tape:
    L = 0.5 * tf.reduce_sum(tf.square(tf.matmul(X, w) + b - y))
# Compute the partial derivatives of L(w, b) with respect to w and b
w_grad, b_grad = tape.gradient(L, [w, b])
print("L:", L.numpy())
print("w_grad:", w_grad.numpy())
print("b_grad:", b_grad.numpy())
L: 62.5
w_grad: [[35.]
[50.]]
b_grad: 15.0
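As a sanity check (added here as a sketch, not part of the original output), the tape gradients can be compared against the closed-form expressions dL/dw = X^T (X w + b - y) and dL/db = sum(X w + b - y):

residual = X.numpy() @ w.numpy() + b.numpy() - y.numpy()  # X w + b - y
print("analytic w_grad:", X.numpy().T @ residual)  # expected [[35.], [50.]]
print("analytic b_grad:", residual.sum())          # expected 15.0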
Higher-order derivatives
Operations performed inside a GradientTape context manager are recorded for automatic differentiation, so if a derivative is computed inside a tape's context, that gradient computation is itself recorded. The same GradientTape API can therefore be used for higher-order derivatives by nesting tapes. For example:
x = tf.Variable(1.0)
with tf.GradientTape() as t:
    with tf.GradientTape() as t2:
        y = x * x * x
    # Compute the gradient inside the 't' context manager
    # which means the gradient computation is differentiable as well.
    dy_dx = t2.gradient(y, x)
d2y_dx2 = t.gradient(dy_dx, x)
assert dy_dx.numpy() == 3.0
assert d2y_dx2.numpy() == 6.0
Gradient descent with NumPy
Generating samples
# Draw random inputs
X = np.random.uniform(low=-10., high=10., size=100)
# Normally distributed noise
eps = np.random.normal(loc=0., scale=1.5, size=100)
# Observations
y = 1.477 * X + 0.089 + eps
Define the helper functions:
- first derive the gradient formulas (written out right after this list),
- then plug the formulas in to compute the gradient of the loss function with respect to the independent variables (the model parameters).
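For reference, the formulas applied in step_gradient below follow from differentiating the MSE loss:

L(w, b) = \frac{1}{n}\sum_{i=1}^{n}\left(w x_i + b - y_i\right)^2

\frac{\partial L}{\partial w} = \frac{2}{n}\sum_{i=1}^{n} x_i \left(w x_i + b - y_i\right), \qquad
\frac{\partial L}{\partial b} = \frac{2}{n}\sum_{i=1}^{n} \left(w x_i + b - y_i\right)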
# Define the MSE loss function
def mse(w, b, X, y):
    y_predict = w * X + b
    total_error = np.sum((y - y_predict) ** 2)
    return total_error / float(len(y))
# Take one gradient descent step
def step_gradient(w_current, b_current, X, y, learn_ratio):
    num = len(y)
    # Gradients of the MSE loss with respect to w and b
    w_gradient = (2 / num) * np.sum(X * ((w_current * X + b_current) - y))
    b_gradient = (2 / num) * np.sum((w_current * X + b_current) - y)
    w_new = w_current - learn_ratio * w_gradient
    b_new = b_current - learn_ratio * b_gradient
    return w_new, b_new
def gradient_descent(X, y, w_start, b_start, learn_ratio, max_iter):
    w = w_start
    b = b_start
    loss_l = []
    for i in range(max_iter):
        w, b = step_gradient(w_current=w, b_current=b,
                             X=X, y=y, learn_ratio=learn_ratio)
        loss = mse(w, b, X, y)
        loss_l.append(loss)
        print(f"iter {i} loss: {loss}")
    return w, b, loss_l
w, b, loss = gradient_descent(X, y, w_start=0, b_start=0,
                              learn_ratio=0.02, max_iter=100)
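The returned loss list can be plotted to check convergence. A minimal sketch using the matplotlib setup imported at the top of the section (this figure is not part of the original output):

plt.plot(loss, label="NumPy gradient descent")  # per-iteration MSE loss
plt.xlabel("iteration")
plt.ylabel("MSE loss")
plt.legend()
plt.show()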
Gradient descent with TensorFlow automatic differentiation
- Use tape.gradient(ys, xs) to compute the gradients automatically
- Use optimizer.apply_gradients(grads_and_vars) to update the model parameters automatically
def gradient_descent_tf(X, y, w_start, b_start, lr, max_iter):
    w = tf.Variable(initial_value=w_start)
    b = tf.Variable(initial_value=b_start)
    variables = [w, b]
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    loss_l = []
    num = len(y)
    for i in range(max_iter):
        with tf.GradientTape() as tape:
            y_pred = w * X + b
            loss = tf.reduce_sum(tf.square(y_pred - y)) / num
        loss_l.append(loss)
        print(f"iter {i} loss: {loss}")
        # Compute the gradients
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(grads_and_vars=zip(grads, variables))
    return w.numpy(), b.numpy(), loss_l
w, b, loss = gradient_descent_tf(X, y, 0., 0., 0.02, 100)
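As a final visual check (a sketch, not part of the original), the fitted line can be drawn over the noisy samples; both the NumPy and the TensorFlow runs should land near the generating parameters 1.477 and 0.089, up to the noise:

plt.scatter(X, y, s=10, label="samples")
x_line = np.linspace(-10., 10., 100)
plt.plot(x_line, w * x_line + b, color="red", label=f"fit: y = {w:.3f} x + {b:.3f}")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()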