import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))

import numpy as np
from layers import *
from dropout_layers import *
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    Population statistics are tracked with a momentum-style running
    average, e.g.:

        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var

    Input:
    - x: data of shape (N, D)
    - gamma: scale parameter of shape (D,)
    - beta: shift parameter of shape (D,)
    - bn_param: dictionary with the following keys:
      - mode: 'train' or 'test'
      - eps: constant for numeric stability
      - momentum: decay factor for the running averages
      - running_mean: running mean of shape (D,)
      - running_var: running variance of shape (D,)

    Returns a tuple of:
    - out: output of shape (N, D)
    - cache: values needed for the backward pass
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        # Forward pass, one elementary step at a time, so that every
        # intermediate can be cached for the backward pass.
        mu = 1 / float(N) * np.sum(x, axis=0)        # batch mean
        xmu = x - mu                                 # centered data
        carre = xmu ** 2                             # squared deviations
        var = 1 / float(N) * np.sum(carre, axis=0)   # batch variance
        sqrtvar = np.sqrt(var + eps)                 # standard deviation
        invvar = 1. / sqrtvar                        # inverse std
        va2 = xmu * invvar                           # normalized data x_hat
        va3 = gamma * va2                            # scale
        out = va3 + beta                             # shift

        # Update the running averages with exponential decay.
        running_mean = momentum * running_mean + (1.0 - momentum) * mu
        running_var = momentum * running_var + (1.0 - momentum) * var

        cache = (mu, xmu, carre, var, sqrtvar, invvar,
                 va2, va3, gamma, beta, x, bn_param)
    elif mode == 'test':
        # Normalize with the running statistics accumulated during training.
        mu = running_mean
        var = running_var
        xhat = (x - mu) / np.sqrt(var + eps)
        out = gamma * xhat + beta
        cache = (mu, var, gamma, beta, bn_param)
    else:
        raise ValueError('Unrecognized batchnorm mode: "%s"' % mode)

    # Store the updated running statistics back into bn_param.
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache
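
# A minimal sanity-check sketch (an addition, not part of the original
# assignment code): run batchnorm_forward in 'train' mode for a while and
# then once in 'test' mode. The helper name _demo_batchnorm_forward is
# hypothetical.
def _demo_batchnorm_forward():
    np.random.seed(0)
    N, D = 200, 3
    gamma, beta = np.ones(D), np.zeros(D)
    bn_param = {'mode': 'train'}
    for _ in range(50):
        x = 2.5 * np.random.randn(N, D) + 7.0
        out, _ = batchnorm_forward(x, gamma, beta, bn_param)
    # In train mode each batch is normalized with its own statistics.
    assert np.allclose(out.mean(axis=0), 0.0, atol=1e-7)
    assert np.allclose(out.std(axis=0), 1.0, atol=1e-2)
    # In test mode the accumulated running statistics are used instead,
    # so a fresh batch is only approximately standardized.
    bn_param['mode'] = 'test'
    x = 2.5 * np.random.randn(N, D) + 7.0
    out, _ = batchnorm_forward(x, gamma, beta, bn_param)
    assert np.allclose(out.mean(axis=0), 0.0, atol=0.2)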
def batchnorm_backward(dout, cache):
    """
    Backward pass for batch normalization.

    Inputs:
    - dout: upstream gradients of shape (N, D)
    - cache: intermediates from batchnorm_forward.

    Returns a tuple of:
    - dx: gradient with respect to x, of shape (N, D)
    - dgamma: gradient with respect to gamma, of shape (D,)
    - dbeta: gradient with respect to beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape

    # Walk the forward computation graph in reverse, one node at a time.
    dva3 = dout
    dbeta = np.sum(dout, axis=0)
    dva2 = gamma * dva3
    dgamma = np.sum(va2 * dva3, axis=0)
    dxmu = invvar * dva2
    dinvvar = np.sum(xmu * dva2, axis=0)
    dsqrtvar = -1. / (sqrtvar ** 2) * dinvvar
    dvar = 0.5 * (var + eps) ** (-0.5) * dsqrtvar
    dcarre = 1 / float(N) * np.ones(carre.shape) * dvar
    dxmu += 2 * xmu * dcarre
    dx = dxmu
    dmu = -np.sum(dxmu, axis=0)
    dx += 1 / float(N) * np.ones(dxmu.shape) * dmu
    return dx, dgamma, dbeta
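
# A centered-difference gradient check sketch for batchnorm_backward (an
# addition, not original code; the helper name is hypothetical). It checks
# dgamma against a numeric gradient of the surrogate loss sum(out * dout).
def _num_grad_check_batchnorm(h=1e-5):
    np.random.seed(1)
    N, D = 10, 4
    x = np.random.randn(N, D)
    gamma, beta = np.random.randn(D), np.random.randn(D)
    dout = np.random.randn(N, D)

    def loss(g):
        out, _ = batchnorm_forward(x, g, beta, {'mode': 'train'})
        return np.sum(out * dout)

    _, cache = batchnorm_forward(x, gamma, beta, {'mode': 'train'})
    _, dgamma, _ = batchnorm_backward(dout, cache)

    num = np.zeros(D)
    for i in range(D):
        gp, gm = gamma.copy(), gamma.copy()
        gp[i] += h
        gm[i] -= h
        num[i] = (loss(gp) - loss(gm)) / (2 * h)
    assert np.allclose(dgamma, num, atol=1e-6)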
def batchnorm_backward_alt(dout, cache):
    """
    Alternative backward pass for batch normalization, using the
    closed-form expression for dx instead of stepping back through
    the computation graph node by node.
    """
    dx, dgamma, dbeta = None, None, None
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape

    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum((x - mu) * (var + eps) ** (-1. / 2.) * dout, axis=0)
    dx = (1. / N) * gamma * (var + eps) ** (-1. / 2.) * (
        N * dout - np.sum(dout, axis=0)
        - (x - mu) * (var + eps) ** (-1.0) * np.sum(dout * (x - mu), axis=0))
    return dx, dgamma, dbeta
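
# A minimal consistency-check sketch (an addition, not original code): the
# step-by-step and closed-form backward passes should agree on the same
# cache. The helper name _check_backward_consistency is hypothetical.
def _check_backward_consistency():
    np.random.seed(2)
    N, D = 64, 5
    x = np.random.randn(N, D)
    gamma, beta = np.random.randn(D), np.random.randn(D)
    dout = np.random.randn(N, D)
    _, cache = batchnorm_forward(x, gamma, beta, {'mode': 'train'})
    dx1, dgamma1, dbeta1 = batchnorm_backward(dout, cache)
    dx2, dgamma2, dbeta2 = batchnorm_backward_alt(dout, cache)
    assert np.allclose(dx1, dx2)
    assert np.allclose(dgamma1, dgamma2)
    assert np.allclose(dbeta1, dbeta2)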
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    """Convenience layer: affine transform -> batch norm -> ReLU."""
    x_affine, cache_affine = affine_forward(x, w, b)
    x_bn, cache_bn = batchnorm_forward(x_affine, gamma, beta, bn_param)
    out, cache_relu = relu_forward(x_bn)
    cache = (cache_affine, cache_bn, cache_relu)
    return out, cache
def affine_bn_relu_backward(dout, cache):
    """Backward pass for the affine-batchnorm-ReLU convenience layer."""
    cache_affine, cache_bn, cache_relu = cache
    drelu = relu_backward(dout, cache_relu)
    dbn, dgamma, dbeta = batchnorm_backward_alt(drelu, cache_bn)
    dx, dw, db = affine_backward(dbn, cache_affine)
    return dx, dw, db, dgamma, dbeta
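
# A shape-check sketch for the sandwich layer (an addition, not original
# code). It assumes affine_forward/affine_backward/relu_forward/relu_backward
# follow the usual layers.py API used above; the helper name is hypothetical.
def _demo_affine_bn_relu():
    np.random.seed(3)
    N, D_in, D_out = 8, 6, 4
    x = np.random.randn(N, D_in)
    w = 0.1 * np.random.randn(D_in, D_out)
    b = np.zeros(D_out)
    gamma, beta = np.ones(D_out), np.zeros(D_out)
    out, cache = affine_bn_relu_forward(x, w, b, gamma, beta, {'mode': 'train'})
    dout = np.random.randn(*out.shape)
    dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dout, cache)
    assert out.shape == (N, D_out)
    assert dx.shape == x.shape and dw.shape == w.shape and db.shape == b.shape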