using LinearAlgebra   # diagm is used in gradient_separated below

function softmax(z)
    z = z .- maximum(z)   # shift by the max for numerical stability; the output is unchanged
    o = exp.(z)           # element-wise exponential of the logits
    return o / sum(o)     # normalise so the outputs sum to 1
end
# Gradient of the loss f = -log(o[y]) with respect to the logits z,
# computed with the softmax and the loss differentiated together.
function gradient_together(z, y)
    o = softmax(z)
    o[y] -= 1.0   # the combined gradient is softmax(z) with 1 subtracted at the true class y
    return o
end
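To see why subtracting 1 at index y gives the whole gradient, assume the loss is the cross-entropy f = -log(o[y]), which is exactly what the ∂f_∂o term in the separated version below encodes. Then the chain rule collapses:

\[
\frac{\partial f}{\partial z_i}
  = \sum_j \frac{\partial f}{\partial o_j}\,\frac{\partial o_j}{\partial z_i}
  = -\frac{1}{o_y}\, o_y\!\left(\mathbf{1}[i = y] - o_i\right)
  = o_i - \mathbf{1}[i = y],
\]

which is the vector returned above: the softmax output with 1 subtracted at the true class.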
# The same gradient computed the long way: build the full softmax Jacobian and the
# gradient of the loss with respect to o, then apply the chain rule.
function gradient_separated(z, y)
    o = softmax(z)
    ∂o_∂z = diagm(o) - o * o'   # softmax Jacobian: ∂o_i/∂z_j = o_i(1[i = j] - o_j)
    ∂f_∂o = zeros(size(o))
    ∂f_∂o[y] = -1.0 / o[y]      # gradient of f = -log(o[y]) with respect to o
    return ∂o_∂z * ∂f_∂o        # chain rule (the Jacobian is symmetric, so no transpose is needed)
end
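As a quick sanity check (a minimal sketch with arbitrary test values), the two implementations should agree up to floating-point round-off:

z = [1.0, 2.0, 3.0]
y = 2
@assert gradient_together(z, y) ≈ gradient_separated(z, y)   # both give softmax(z) - onehot(y)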
The reason for doing this is that an explicit softmax layer is only needed at inference time. During training, the loss can be computed directly from the logits, with the softmax folded into it, so no separate softmax pass is required. This reduces the number of computations.
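Concretely, the cross-entropy loss can be computed straight from the logits via the log-sum-exp identity. A minimal sketch (the name cross_entropy_from_logits is mine, not part of the snippet above):

# f = -log(softmax(z)[y]) = -z[y] + log(sum(exp(z)))
function cross_entropy_from_logits(z, y)
    m = maximum(z)   # shift by the max for numerical stability
    return -(z[y] - m) + log(sum(exp.(z .- m)))
end

Its gradient with respect to z is the same softmax(z) - onehot(y) vector that gradient_together computes.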