12.4. Differentiation
Linux 5.4.0-74-generic
Python 3.9.5 @ GCC 7.3.0
Latest build date 2021.06.21
tensorflow version: 2.5.0
from toolkit import H
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = "SIMHEI"
matplotlib.rcParams['axes.unicode_minus'] = False
Automatic differentiation
TensorFlow provides the tf.GradientTape API for automatic differentiation. Every computation step executed inside a tf.GradientTape context is recorded onto the tape; TensorFlow then uses that tape, together with the gradient of each recorded operation, to compute the derivatives of the recorded computation by reverse-mode differentiation. For example, for y = x^2 reverse mode gives dy/dx = 2x, which is 6 at x = 3, matching the output of the code below.
# First create a variable to be optimized
x = tf.Variable(initial_value=3.)
print("x trainable:", x.trainable)
# Inside the tf.GradientTape() context, every computation step is recorded for differentiation
with tf.GradientTape() as tape:
    y = tf.square(x)
# Compute the derivative of y with respect to x
y_grad = tape.gradient(y, x)
print(y_grad)
x trainable: True
tf.Tensor(6.0, shape=(), dtype=float32)
If the tensor to be differentiated is not a trainable variable, calling the watch method inside the tf.GradientTape context also makes TensorFlow track it.
x = tf.constant(3.)
with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.square(x)
y_grad = tape.gradient(y, x)
print(y_grad)
tf.Tensor(6.0, shape=(), dtype=float32)
GradientTape records every operation that is executed, including Python control flow such as if branches and for/while loops.
def f(x, y):
    output = 1.0
    for i in range(y):
        if i > 1 and i < 5:
            output = tf.multiply(output, x)
    return output

def grad(x, y):
    with tf.GradientTape() as t:
        t.watch(x)
        out = f(x, y)
    return t.gradient(out, x)

x = tf.convert_to_tensor(2.0)
# y >= 5: the loop multiplies by x three times (i = 2, 3, 4), so out = x**3 and d(out)/dx = 3 * x**2 = 12 at x = 2
assert grad(x, 6).numpy() == 12.0
assert grad(x, 5).numpy() == 12.0
# y = 4: only two multiplications (i = 2, 3), so out = x**2 and d(out)/dx = 2 * x = 4 at x = 2
assert grad(x, 4).numpy() == 4.0
Persistent GradientTape objects
By default, the resources held by a GradientTape are released as soon as GradientTape.gradient() is called. Creating a persistent gradient tape instead allows several derivatives of the same computation to be taken, i.e. gradient() can be called more than once. For example:
x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as t:
    t.watch(x)
    y = x * x
    z = y * y
dz_dx = t.gradient(z, x)  # 108.0 (4*x^3 at x = 3)
dy_dx = t.gradient(y, x)  # 6.0
del t  # Drop the reference to the tape
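For contrast, here is a minimal sketch (not part of the original example) of what should happen without persistent=True: the first gradient() call succeeds, while the second is expected to raise a RuntimeError because a non-persistent tape frees its resources after one call.

x = tf.constant(3.0)
with tf.GradientTape() as t:  # non-persistent tape
    t.watch(x)
    y = x * x
    z = y * y
dz_dx = t.gradient(z, x)  # first call works: 108.0
try:
    dy_dx = t.gradient(y, x)  # second call should fail
except RuntimeError as err:
    print("RuntimeError:", err)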
Differentiating a multivariate function
X = tf.constant([[1., 2.], [3., 4.]])
y = tf.constant([[1.], [2.]])
w = tf.Variable(initial_value=[[1.], [2.]])
b = tf.Variable(initial_value=1.)
with tf.GradientTape() as tape:
    L = 0.5 * tf.reduce_sum(tf.square(tf.matmul(X, w) + b - y))
# Compute the partial derivatives of L(w, b) with respect to w and b
w_grad, b_grad = tape.gradient(L, [w, b])
print("L:", L.numpy())
print("w_grad:", w_grad.numpy())
print("b_grad:", b_grad.numpy())
L: 62.5
w_grad: [[35.]
[50.]]
b_grad: 15.0
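As a sanity check (added here as a sketch, not part of the original output), the tape gradients can be compared against the closed-form expressions dL/dw = X^T (X w + b - y) and dL/db = sum(X w + b - y):

residual = X.numpy() @ w.numpy() + b.numpy() - y.numpy()  # X w + b - y
print("analytic w_grad:", X.numpy().T @ residual)  # expected [[35.], [50.]]
print("analytic b_grad:", residual.sum())          # expected 15.0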
Higher-order derivatives
Operations performed inside a GradientTape context manager are recorded for automatic differentiation, so if a derivative is computed inside a tape's context, that gradient computation is itself recorded. The same GradientTape API can therefore be used for higher-order derivatives by nesting tapes. For example:
x = tf.Variable(1.0)
with tf.GradientTape() as t:
    with tf.GradientTape() as t2:
        y = x * x * x
    # Compute the gradient inside the 't' context manager
    # which means the gradient computation is differentiable as well.
    dy_dx = t2.gradient(y, x)
d2y_dx2 = t.gradient(dy_dx, x)
assert dy_dx.numpy() == 3.0
assert d2y_dx2.numpy() == 6.0
Gradient descent with NumPy
Generating samples
# Draw random inputs
X = np.random.uniform(low=-10., high=10., size=100)
# Normally distributed noise
eps = np.random.normal(loc=0., scale=1.5, size=100)
# Observations
y = 1.477 * X + 0.089 + eps
Define the helper functions:
- first derive the gradient formulas (written out right after this list),
- then plug the formulas in to compute the gradient of the loss function with respect to the independent variables (the model parameters).
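For reference, the formulas applied in step_gradient below follow from differentiating the MSE loss:

L(w, b) = \frac{1}{n}\sum_{i=1}^{n}\left(w x_i + b - y_i\right)^2

\frac{\partial L}{\partial w} = \frac{2}{n}\sum_{i=1}^{n} x_i \left(w x_i + b - y_i\right), \qquad
\frac{\partial L}{\partial b} = \frac{2}{n}\sum_{i=1}^{n} \left(w x_i + b - y_i\right)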
# Define the MSE loss function
def mse(w, b, X, y):
    y_predict = w * X + b
    total_error = np.sum((y - y_predict) ** 2)
    return total_error / float(len(y))
# Take one gradient descent step
def step_gradient(w_current, b_current, X, y, learn_ratio):
    num = len(y)
    # Gradients of the MSE loss with respect to w and b
    w_gradient = (2 / num) * np.sum(X * ((w_current * X + b_current) - y))
    b_gradient = (2 / num) * np.sum((w_current * X + b_current) - y)
    w_new = w_current - learn_ratio * w_gradient
    b_new = b_current - learn_ratio * b_gradient
    return w_new, b_new
def gradient_descent(X, y, w_start, b_start, learn_ratio, max_iter):
    w = w_start
    b = b_start
    loss_l = []
    for i in range(max_iter):
        w, b = step_gradient(w_current=w, b_current=b,
                             X=X, y=y, learn_ratio=learn_ratio)
        loss = mse(w, b, X, y)
        loss_l.append(loss)
        print(f"iter {i} loss: {loss}")
    return w, b, loss_l
w, b, loss = gradient_descent(X, y, w_start=0, b_start=0,
                              learn_ratio=0.02, max_iter=100)
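The returned loss list can be plotted to check convergence. A minimal sketch using the matplotlib setup imported at the top of the section (this figure is not part of the original output):

plt.plot(loss, label="NumPy gradient descent")  # per-iteration MSE loss
plt.xlabel("iteration")
plt.ylabel("MSE loss")
plt.legend()
plt.show()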
Gradient descent with TensorFlow automatic differentiation
- Use tape.gradient(ys, xs) to compute the gradients automatically
- Use optimizer.apply_gradients(grads_and_vars) to update the model parameters automatically
def gradient_descent_tf(X, y, w_start, b_start, lr, max_iter):
    w = tf.Variable(initial_value=w_start)
    b = tf.Variable(initial_value=b_start)
    variables = [w, b]
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    loss_l = []
    num = len(y)
    for i in range(max_iter):
        with tf.GradientTape() as tape:
            y_pred = w * X + b
            loss = tf.reduce_sum(tf.square(y_pred - y)) / num
        loss_l.append(loss)
        print(f"iter {i} loss: {loss}")
        # Compute the gradients
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(grads_and_vars=zip(grads, variables))
    return w.numpy(), b.numpy(), loss_l
w, b, loss = gradient_descent_tf(X, y, 0., 0., 0.02, 100)
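As a final visual check (a sketch, not part of the original), the fitted line can be drawn over the noisy samples; both the NumPy and the TensorFlow runs should land near the generating parameters 1.477 and 0.089, up to the noise:

plt.scatter(X, y, s=10, label="samples")
x_line = np.linspace(-10., 10., 100)
plt.plot(x_line, w * x_line + b, color="red", label=f"fit: y = {w:.3f} x + {b:.3f}")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()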