This paper was converted on www.awesomepapers.org from LaTeX by an anonymous user.
Want to know more? Visit the Converter page.

Appendix Accompanying "A Differentiable Ranking Metric Using Relaxed Sorting Operations for Top-K Recommender Systems"

Implementation Details

Math Details

Interpretation. We have the gradient update rule $\nabla\mathcal{L}_{\text{neu}}$ of the loss \eqref{eq:neuloss} with respect to a latent vector $\mathbf{x}$ such that $\mathbf{x}=\alpha$ or $\mathbf{x}=\beta_{j}$ for all $j$:

\begin{equation}
\nabla\mathcal{L}_{\text{neu}}=\frac{2}{\tau}\left(\frac{\partial\hat{y_{u}}}{\partial\mathbf{x}}\right)^{T}\sum_{i}W^{(i)}\left(\tilde{P}_{[1:K]}-y_{u}\right)
\end{equation}

where

\[
\text{sgn}_{ij}=\begin{cases}
1 & \text{if } \hat{y}_{u,i}>\hat{y}_{u,j},\\
0 & \text{if } \hat{y}_{u,i}=\hat{y}_{u,j},\\
-1 & \text{if } \hat{y}_{u,i}<\hat{y}_{u,j},
\end{cases}
\]

and

\begin{align*}
z^{(i)}&=\tau^{-1}\left((n+1-2i)\hat{y_{u}}-A_{\hat{y_{u}}}\mathbf{1}\right),\\
W^{(i)}&=\left(H^{(i)}(D^{(i)}+R)\right)^{T},\\
H^{(i)}&=\text{diag}(\text{softmax}(z^{(i)}))-\text{softmax}(z^{(i)})(\text{softmax}(z^{(i)}))^{T},\\
D^{(i)}&=\text{diag}((n+1-2i)\mathbf{1}),\\
R&=\text{diag}\left(\left(-\sum_{k;k\neq j}\text{sgn}_{jk}\right)\mathbf{1}\right)+\begin{bmatrix}\text{sgn}_{jk}\end{bmatrix}_{n\times n}
\end{align*}

for all jj.

Sketch of the derivation.
(i) Let $Q:=\tilde{P}_{[1:K]}$ with $\tilde{P_{i}}=\begin{bmatrix}\tilde{P_{i1}} & \tilde{P_{i2}} & \cdots & \tilde{P_{in}}\end{bmatrix}^{T}$. For convenience, let $s=\hat{y_{u}}$ with $s_{i}=\hat{y}_{u,i}$. By the chain rule,

\[
\frac{\partial\mathcal{L}_{\text{neu}}}{\partial\mathbf{x}}=\frac{\partial\mathcal{L}_{\text{neu}}}{\partial Q}\left(\sum_{i}\frac{\partial Q}{\partial\tilde{P_{i}}}\frac{\partial\tilde{P_{i}}}{\partial s}\right)\frac{\partial s}{\partial\mathbf{x}}.
\]

Since $\mathcal{L}_{\text{neu}}=(y_{u}-Q)^{2}$, we can deduce

\[
\frac{\partial\mathcal{L}_{\text{neu}}}{\partial Q}=2(Q-y_{u})^{T}.
\]

(ii) By the definition of $Q$, we can derive

\begin{align*}
\frac{\partial Q}{\partial\tilde{P_{i}}}
&=\begin{bmatrix}
\frac{\partial(\sum_{j}\tilde{P_{j1}})}{\partial\tilde{P_{i1}}} & \frac{\partial(\sum_{j}\tilde{P_{j1}})}{\partial\tilde{P_{i2}}} & \cdots & \frac{\partial(\sum_{j}\tilde{P_{j1}})}{\partial\tilde{P_{in}}}\\
\frac{\partial(\sum_{j}\tilde{P_{j2}})}{\partial\tilde{P_{i1}}} & \frac{\partial(\sum_{j}\tilde{P_{j2}})}{\partial\tilde{P_{i2}}} & \cdots & \frac{\partial(\sum_{j}\tilde{P_{j2}})}{\partial\tilde{P_{in}}}\\
\vdots & \vdots & \ddots & \vdots\\
\frac{\partial(\sum_{j}\tilde{P_{jn}})}{\partial\tilde{P_{i1}}} & \frac{\partial(\sum_{j}\tilde{P_{jn}})}{\partial\tilde{P_{i2}}} & \cdots & \frac{\partial(\sum_{j}\tilde{P_{jn}})}{\partial\tilde{P_{in}}}
\end{bmatrix}
=\begin{bmatrix}
1 & 0 & \cdots & 0\\
0 & 1 & \cdots & 0\\
\vdots & \vdots & \ddots & \vdots\\
0 & 0 & \cdots & 1
\end{bmatrix}_{n\times n}
\end{align*}

for all $i=1,2,3,\dots,n$.

(iii) Set $z^{(i)}:=\tau^{-1}\left((n+1-2i)s-A_{s}\mathbf{1}\right)\in\mathbb{R}^{n}$. With the property $|s_{i}-s_{j}|=\text{sgn}_{ij}(s_{i}-s_{j})$, we can write $A_{s}\mathbf{1}$ as

\[
(A_{s}\mathbf{1})_{j}=\sum_{k;k\neq j}\text{sgn}_{jk}(s_{j}-s_{k}).
\]

Hence the $j$-th component ${z^{(i)}}_{j}$ of $z^{(i)}$ is

\[
{z^{(i)}}_{j}=\tau^{-1}\left((n+1-2i)s_{j}-\sum_{k;k\neq j}\text{sgn}_{jk}(s_{j}-s_{k})\right).
\]

(iv) Note that $\frac{\partial\tilde{P_{i}}}{\partial s}=\frac{\partial\tilde{P_{i}}}{\partial z^{(i)}}\frac{\partial z^{(i)}}{\partial s}$. Now, we consider $\tilde{P_{ij}}$, the $j$-th component of $\tilde{P_{i}}$. By differentiating $\tilde{P_{ij}}$ with respect to ${z^{(i)}}_{l}$, we obtain

\[
\frac{\partial\tilde{P_{ij}}}{\partial{z^{(i)}}_{l}}=\begin{cases}
\sigma(z^{(i)})_{j}\left(1-\sigma(z^{(i)})_{j}\right) & \text{if } l=j,\\[4pt]
-\,\sigma(z^{(i)})_{j}\cdot\sigma(z^{(i)})_{l} & \text{if } l\neq j,
\end{cases}
\]

where $\sigma(\cdot)=\text{softmax}(\cdot)$. Hence

\begin{align*}
\frac{\partial\tilde{P_{i}}}{\partial z^{(i)}}
&=\begin{bmatrix}
\sigma(z^{(i)})_{1}\left(1-\sigma(z^{(i)})_{1}\right) & -\,\sigma(z^{(i)})_{1}\cdot\sigma(z^{(i)})_{2} & \cdots & -\,\sigma(z^{(i)})_{1}\cdot\sigma(z^{(i)})_{n}\\
-\,\sigma(z^{(i)})_{2}\cdot\sigma(z^{(i)})_{1} & \sigma(z^{(i)})_{2}\left(1-\sigma(z^{(i)})_{2}\right) & \cdots & -\,\sigma(z^{(i)})_{2}\cdot\sigma(z^{(i)})_{n}\\
\vdots & \vdots & \ddots & \vdots\\
-\,\sigma(z^{(i)})_{n}\cdot\sigma(z^{(i)})_{1} & -\,\sigma(z^{(i)})_{n}\cdot\sigma(z^{(i)})_{2} & \cdots & \sigma(z^{(i)})_{n}\left(1-\sigma(z^{(i)})_{n}\right)
\end{bmatrix}\\
&=\begin{bmatrix}
\sigma(z^{(i)})_{1} & 0 & \cdots & 0\\
0 & \sigma(z^{(i)})_{2} & \cdots & 0\\
\vdots & \vdots & \ddots & \vdots\\
0 & 0 & \cdots & \sigma(z^{(i)})_{n}
\end{bmatrix}
-\begin{bmatrix}
\sigma(z^{(i)})_{1}\sigma(z^{(i)})_{1} & \sigma(z^{(i)})_{1}\sigma(z^{(i)})_{2} & \cdots & \sigma(z^{(i)})_{1}\sigma(z^{(i)})_{n}\\
\sigma(z^{(i)})_{2}\sigma(z^{(i)})_{1} & \sigma(z^{(i)})_{2}\sigma(z^{(i)})_{2} & \cdots & \sigma(z^{(i)})_{2}\sigma(z^{(i)})_{n}\\
\vdots & \vdots & \ddots & \vdots\\
\sigma(z^{(i)})_{n}\sigma(z^{(i)})_{1} & \sigma(z^{(i)})_{n}\sigma(z^{(i)})_{2} & \cdots & \sigma(z^{(i)})_{n}\sigma(z^{(i)})_{n}
\end{bmatrix}\\
&=\text{diag}(\sigma(z^{(i)}))-\sigma(z^{(i)})(\sigma(z^{(i)}))^{T}\\
&=:H^{(i)}\qquad(\ast)
\end{align*}

(v) From the definition of ${z^{(i)}}_{j}$, its partial derivative with respect to $s_{l}$ is

\begin{align*}
\frac{\partial{z^{(i)}}_{j}}{\partial s_{l}}
&=\tau^{-1}\left((n+1-2i)\frac{\partial s_{j}}{\partial s_{l}}-\sum_{k;k\neq j}\text{sgn}_{jk}\frac{\partial}{\partial s_{l}}(s_{j}-s_{k})\right)\\
&=\begin{cases}
\tau^{-1}\left((n+1-2i)-\sum_{k;k\neq j}\text{sgn}_{jk}\right) & \text{if } l=j,\\[4pt]
\tau^{-1}\,\text{sgn}_{jl} & \text{if } l\neq j.
\end{cases}
\end{align*}

Hence,

\begin{align*}
\frac{\partial z^{(i)}}{\partial s}
&=\tau^{-1}\begin{bmatrix}
n+1-2i & 0 & \cdots & 0\\
0 & n+1-2i & \cdots & 0\\
\vdots & \vdots & \ddots & \vdots\\
0 & 0 & \cdots & n+1-2i
\end{bmatrix}
+\tau^{-1}\begin{bmatrix}
-\sum_{k;k\neq 1}\text{sgn}_{1k} & 0 & \cdots & 0\\
0 & -\sum_{k;k\neq 2}\text{sgn}_{2k} & \cdots & 0\\
\vdots & \vdots & \ddots & \vdots\\
0 & 0 & \cdots & -\sum_{k;k\neq n}\text{sgn}_{nk}
\end{bmatrix}\\
&\qquad+\tau^{-1}\begin{bmatrix}
0 & \text{sgn}_{12} & \cdots & \text{sgn}_{1n}\\
\text{sgn}_{21} & 0 & \cdots & \text{sgn}_{2n}\\
\vdots & \vdots & \ddots & \vdots\\
\text{sgn}_{n1} & \text{sgn}_{n2} & \cdots & 0
\end{bmatrix}\\
&=\tau^{-1}\left(\text{diag}((n+1-2i)\mathbf{1})+\text{diag}\left(\left(-\sum_{k;k\neq j}\text{sgn}_{jk}\right)\mathbf{1}\right)+\begin{bmatrix}\text{sgn}_{jk}\end{bmatrix}_{n\times n}\right)\\
&=:\tau^{-1}(D^{(i)}+R)\qquad(\ast\ast)
\end{align*}

By $(\ast)$ and $(\ast\ast)$, we obtain

\[
\frac{\partial\tilde{P_{i}}}{\partial s}=\tau^{-1}\left(H^{(i)}(D^{(i)}+R)\right).
\]

Finally, from (i)--(v), we derive

\[
\frac{\partial\mathcal{L}_{\text{neu}}}{\partial\mathbf{x}}=\frac{2}{\tau}\left(\tilde{P}_{[1:K]}-y_{u}\right)^{T}\sum_{i}\left(H^{(i)}(D^{(i)}+R)\right)\frac{\partial\hat{y_{u}}}{\partial\mathbf{x}}.
\]

Since $\nabla\mathcal{L}_{\text{neu}}=\left(\frac{\partial\mathcal{L}_{\text{neu}}}{\partial\mathbf{x}}\right)^{T}$, we take the transpose on both the left- and right-hand sides. Then we have

\begin{align*}
\nabla\mathcal{L}_{\text{neu}}
&=\frac{2}{\tau}\left(\frac{\partial\hat{y_{u}}}{\partial\mathbf{x}}\right)^{T}\sum_{i}\left(H^{(i)}(D^{(i)}+R)\right)^{T}\left(\tilde{P}_{[1:K]}-y_{u}\right)\\
&=:\frac{2}{\tau}\left(\frac{\partial\hat{y_{u}}}{\partial\mathbf{x}}\right)^{T}\sum_{i}W^{(i)}\left(\tilde{P}_{[1:K]}-y_{u}\right).
\end{align*}

Gradient.

For $\mathcal{L}_{\text{hinge}}$,

\begin{equation}
\frac{\partial\hat{y}_{u}}{\partial\alpha}=\begin{cases}
\begin{bmatrix}\beta_{1} & \beta_{2} & \cdots & \beta_{n}\end{bmatrix}^{T} & \text{if the score function is dot product,}\\[4pt]
-2\begin{bmatrix}\alpha-\beta_{1} & \alpha-\beta_{2} & \cdots & \alpha-\beta_{n}\end{bmatrix}^{T} & \text{if the score function is L2 distance,}
\end{cases}
\end{equation}

\begin{equation}
\frac{\partial\hat{y}_{u}}{\partial\beta_{i}}=\begin{cases}
\begin{bmatrix}\mathbf{0} & \cdots & \mathbf{0} & \underbrace{\alpha}_{i\text{-th}} & \mathbf{0} & \cdots & \mathbf{0}\end{bmatrix}^{T} & \text{if the score function is dot product,}\\[4pt]
\begin{bmatrix}\mathbf{0} & \cdots & \mathbf{0} & \underbrace{2(\alpha-\beta_{i})}_{i\text{-th}} & \mathbf{0} & \cdots & \mathbf{0}\end{bmatrix}^{T} & \text{if the score function is L2 distance,}
\end{cases}
\end{equation}

\begin{equation}
\frac{\partial\mathcal{L}_{\text{hinge}}}{\partial\alpha}=\begin{cases}
\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,(\beta_{j}-\beta_{i})^{T} & \text{if the score function is dot product,}\\[4pt]
2\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,(\beta_{j}-\beta_{i})^{T} & \text{if the score function is L2 distance,}
\end{cases}
\end{equation}

\begin{equation}
\frac{\partial\mathcal{L}_{\text{hinge}}}{\partial\beta_{i}}=\begin{cases}
-\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,{\alpha_{u}}^{T} & \text{if the score function is dot product,}\\[4pt]
2\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,(\beta_{i}-\alpha_{u})^{T} & \text{if the score function is L2 distance,}
\end{cases}
\end{equation}

\begin{equation}
\frac{\partial\mathcal{L}_{\text{hinge}}}{\partial\beta_{j}}=\begin{cases}
\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,{\alpha_{u}}^{T} & \text{if the score function is dot product,}\\[4pt]
2\Phi_{ui}\,\mathbb{I}[\gamma-s(u,i)+s(u,j)>0]\,(\alpha_{u}-\beta_{j})^{T} & \text{if the score function is L2 distance.}
\end{cases}
\end{equation}