
Logit Standardization in Knowledge Distillation

First Author
Institution1
Institution1 address
[email protected]

Second Author
Institution2
First line of institution2 address
[email protected]
Given an input $\mathbf{x}_n$, the teacher $f_T$ and the student $f_S$ produce logit vectors

$$\mathbf{v}_n = f_T(\mathbf{x}_n), \qquad \mathbf{z}_n = f_S(\mathbf{x}_n).$$

The shift and scale are set to the per-sample mean and standard deviation of the logits:

$$a_T = \overline{\mathbf{v}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{v}_n^{(k)}, \qquad a_S = \overline{\mathbf{z}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{z}_n^{(k)},$$

$$b_T = \sigma(\mathbf{v}_n) = \left[\frac{1}{K}\sum_{k=1}^{K}\left(\mathbf{v}_n^{(k)} - \overline{\mathbf{v}}_n\right)^2\right]^{1/2}, \qquad b_S = \sigma(\mathbf{z}_n) = \left[\frac{1}{K}\sum_{k=1}^{K}\left(\mathbf{z}_n^{(k)} - \overline{\mathbf{z}}_n\right)^2\right]^{1/2}.$$

The standardized predictions used for distillation are

$$q(\mathbf{v}_n) = \mathrm{softmax}\left[(\mathbf{v}_n - a_T)/(b_T\,\tau)\right], \qquad q(\mathbf{z}_n) = \mathrm{softmax}\left[(\mathbf{z}_n - a_S)/(b_S\,\tau)\right],$$

while the cross-entropy term uses the raw student prediction

$$q^{\prime}(\mathbf{z}_n) = \mathrm{softmax}(\mathbf{z}_n).$$

The student $f_S$ is updated towards minimizing

$$\lambda_{CE}\,\mathcal{L}_{CE}\left(y_n, q^{\prime}(\mathbf{z}_n)\right) + \lambda_{KD}\,\tau^{2}\,\mathcal{L}\left(q(\mathbf{v}_n), q(\mathbf{z}_n)\right).$$
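For concreteness, the standardization and the combined objective can be sketched in PyTorch as follows. This is a minimal sketch, not the authors' released implementation; the `eps` guard against zero variance and the default values of `tau`, `lambda_ce`, and `lambda_kd` are our assumptions, not values prescribed by the paper.

```python
import torch
import torch.nn.functional as F

def zscore(logits: torch.Tensor, tau: float, eps: float = 1e-7) -> torch.Tensor:
    """Z-score standardize logits per sample, then scale by the base temperature tau."""
    mean = logits.mean(dim=-1, keepdim=True)                # a = mean over the K classes
    std = logits.std(dim=-1, keepdim=True, unbiased=False)  # b = population std (1/K denominator)
    return (logits - mean) / (std + eps) / tau              # eps: assumed numerical guard

def distillation_loss(v, z, y, tau=2.0, lambda_ce=1.0, lambda_kd=9.0):
    """CE on raw student logits plus tau^2-weighted divergence between
    standardized teacher and student distributions."""
    q_teacher = F.softmax(zscore(v, tau), dim=-1)           # q(v_n)
    log_q_student = F.log_softmax(zscore(z, tau), dim=-1)   # log q(z_n)
    kd = F.kl_div(log_q_student, q_teacher, reduction="batchmean")  # KL(q(v_n) || q(z_n))
    ce = F.cross_entropy(z, y)                              # uses q'(z_n) = softmax(z_n)
    return lambda_ce * ce + lambda_kd * tau**2 * kd
```

Note that only the distillation term sees standardized logits; the cross-entropy term keeps the unmodified student output, matching the use of $q^{\prime}(\mathbf{z}_n)$ above.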

Input: Transfer set $\mathcal{D}$ of image–label pairs $\{\mathbf{x}_n, y_n\}_{n=1}^{N}$, number of classes $K$, base temperature $\tau$, teacher $f_T$, student $f_S$, loss $\mathcal{L}$ (e.g., the KL divergence $\mathcal{L}_{\mathrm{KL}}$)
Output: Trained student model $f_S$

foreach $(\mathbf{x}_n, y_n)$ in $\mathcal{D}$ do
      $\mathbf{v}_n = f_T(\mathbf{x}_n)$,  $\overline{\mathbf{v}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{v}_n^{(k)}$
      $\mathbf{z}_n = f_S(\mathbf{x}_n)$,  $\overline{\mathbf{z}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{z}_n^{(k)}$
      $\sigma(\mathbf{v}_n) = \left[\frac{1}{K}\sum_{k=1}^{K}\left(\mathbf{v}_n^{(k)} - \overline{\mathbf{v}}_n\right)^2\right]^{1/2}$
      $\sigma(\mathbf{z}_n) = \left[\frac{1}{K}\sum_{k=1}^{K}\left(\mathbf{z}_n^{(k)} - \overline{\mathbf{z}}_n\right)^2\right]^{1/2}$
      $q(\mathbf{v}_n) = \mathrm{softmax}\left[(\mathbf{v}_n - \overline{\mathbf{v}}_n)/(\sigma(\mathbf{v}_n)\,\tau)\right]$
      $q(\mathbf{z}_n) = \mathrm{softmax}\left[(\mathbf{z}_n - \overline{\mathbf{z}}_n)/(\sigma(\mathbf{z}_n)\,\tau)\right]$
      $q^{\prime}(\mathbf{z}_n) = \mathrm{softmax}(\mathbf{z}_n)$
      Update $f_S$ towards minimizing $\lambda_{CE}\,\mathcal{L}_{CE}\left(y_n, q^{\prime}(\mathbf{z}_n)\right) + \lambda_{KD}\,\tau^{2}\,\mathcal{L}\left(q(\mathbf{v}_n), q(\mathbf{z}_n)\right)$
end foreach
Algorithm 1: $\mathcal{Z}$-score logit standardization pre-process in knowledge distillation.
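Wrapping the loss in a loop over the transfer set gives one possible realization of Algorithm 1. This is a sketch under assumptions: `teacher`, `student`, `loader`, and `optimizer` are hypothetical stand-ins for the user's models, data loader, and optimizer, and `distillation_loss` is the helper sketched above (the imports carry over from that block).

```python
def train_epoch(teacher, student, loader, optimizer, tau=2.0):
    """One pass over the transfer set D, following Algorithm 1."""
    teacher.eval()    # the teacher is frozen during distillation
    student.train()
    for x, y in loader:
        with torch.no_grad():
            v = teacher(x)                 # teacher logits v_n
        z = student(x)                     # student logits z_n
        loss = distillation_loss(v, z, y, tau=tau)
        optimizer.zero_grad()
        loss.backward()                    # update f_S towards minimizing the objective
        optimizer.step()
```

Because the standardization is a pre-process applied to the logits only, it drops into any existing distillation pipeline without changing the model architectures or the training loop itself.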