从EMD、WMD到WRD：文本向量序列的相似度计算

Earth Mover’s Distance

$$\sum_j \gamma_{i,j}=p_i,\quad \sum_i \gamma_{i,j}=q_j,\quad \gamma_{i,j} \geq 0\label{eq:cond}$$

$$\min_{\gamma_{i,j} \geq 0} \sum_{i,j} \gamma_{i,j} d_{i,j}\quad \text{s.t.} \quad \sum_j \gamma_{i,j}=p_i,\sum_i \gamma_{i,j}=q_j$$

import numpy as np
from scipy.optimize import linprog

def wasserstein_distance(p, q, D):
"""通过线性规划求Wasserstein距离
p.shape=[m], q.shape=[n], D.shape=[m, n]
p.sum()=1, q.sum()=1, p∈[0,1], q∈[0,1]
"""
A_eq = []
for i in range(len(p)):
A = np.zeros_like(D)
A[i, :] = 1
A_eq.append(A.reshape(-1))
for i in range(len(q)):
A = np.zeros_like(D)
A[:, i] = 1
A_eq.append(A.reshape(-1))
A_eq = np.array(A_eq)
b_eq = np.concatenate([p, q])
D = D.reshape(-1)
result = linprog(D, A_eq=A_eq[:-1], b_eq=b_eq[:-1])
return result.fun

Word Mover’s Distance

$$\min_{\gamma_{i,j} \geq 0} \sum_{i,j} \gamma_{i,j} \left\Vert \boldsymbol{w}_i – \boldsymbol{w}’_j\right\Vert\quad \text{s.t.} \quad \sum_j \gamma_{i,j}=\frac{1}{n},\sum_i \gamma_{i,j}=\frac{1}{n’}$$

Word Mover’s Distance的示意图，来自论文《From Word Embeddings To Document Distances》

def word_mover_distance(x, y):
"""WMD（Word Mover's Distance）的参考实现
x.shape=[m,d], y.shape=[n,d]
"""
p = np.ones(x.shape[0]) / x.shape[0]
q = np.ones(y.shape[0]) / y.shape[0]
D = np.sqrt(np.square(x[:, None] - y[None, :]).mean(axis=2))
return wasserstein_distance(p, q, D)

\begin{aligned}

\sum_{i,j} \gamma_{i,j} \left\Vert \boldsymbol{w}_i – \boldsymbol{w}’_j\right\Vert =& \sum_{i,j} \left\Vert \gamma_{i,j}(\boldsymbol{w}_i – \boldsymbol{w}’_j)\right\Vert\\

\geq& \left\Vert \sum_{i,j}\gamma_{i,j}(\boldsymbol{w}_i – \boldsymbol{w}’_j)\right\Vert\\

=& \left\Vert \sum_i\left(\sum_j\gamma_{i,j}\right)\boldsymbol{w}_i – \sum_j\left(\sum_i\gamma_{i,j}\right)\boldsymbol{w}’_j\right\Vert\\

=& \left\Vert \frac{1}{n}\sum_i\boldsymbol{w}_i – \frac{1}{n’}\sum_j\boldsymbol{w}’_j\right\Vert\\

\end{aligned}

Word Rotator’s Distance

WMD其实已经挺不错了，但非要鸡蛋里挑骨头的话，还是能挑出一些缺点来：

1、它使用的是欧氏距离作为语义差距度量，但从Word2Vec的经验我们就知道要算词向量的相似度的话，用$\cos$往往比欧氏距离要好；
2、WMD理论上是一个无上界的量，这意味着我们不大好直观感知相似程度，从而不能很好调整相似与否的阈值。


\begin{aligned}&p_i = \frac{\left\Vert \boldsymbol{w}_i\right\Vert}{Z}, &Z=\sum_{i=1}^n \left\Vert\boldsymbol{w}_i\right\Vert\\

&q_j = \frac{\left\Vert \boldsymbol{w}’_j\right\Vert}{Z’}, &Z’=\sum_{j=1}^{n’}\left\Vert\boldsymbol{w}’_j\right\Vert

\end{aligned}

$$d_{i,j}=1 – \frac{\boldsymbol{w}_i\cdot \boldsymbol{w}’_j}{\left\Vert\boldsymbol{w}_i\right\Vert\times \left\Vert\boldsymbol{w}’_j\right\Vert}$$

$$\min_{\gamma_{i,j} \geq 0} \sum_{i,j} \gamma_{i,j} \left(1 – \frac{\boldsymbol{w}_i\cdot \boldsymbol{w}’_j}{\left\Vert\boldsymbol{w}_i\right\Vert\times \left\Vert\boldsymbol{w}’_j\right\Vert}\right)\quad \text{s.t.} \quad \sum_j \gamma_{i,j}=\frac{\left\Vert \boldsymbol{w}_i\right\Vert}{Z},\sum_i \gamma_{i,j}=\frac{\left\Vert \boldsymbol{w}’_j\right\Vert}{Z’}$$

def word_rotator_distance(x, y):
"""WRD（Word Rotator's Distance）的参考实现
x.shape=[m,d], y.shape=[n,d]
"""
x_norm = (x**2).sum(axis=1, keepdims=True)**0.5
y_norm = (y**2).sum(axis=1, keepdims=True)**0.5
p = x_norm[:, 0] / x_norm.sum()
q = y_norm[:, 0] / y_norm.sum()
D = 1 - np.dot(x / x_norm, (y / y_norm).T)
return wasserstein_distance(p, q, D)

def word_rotator_similarity(x, y):
"""1 - WRD
x.shape=[m,d], y.shape=[n,d]
"""
return 1 - word_rotator_distance(x, y)

\begin{aligned}

2\sum_{i,j} \gamma_{i,j} \left(1 – \frac{\boldsymbol{w}_i\cdot \boldsymbol{w}’_j}{\left\Vert\boldsymbol{w}_i\right\Vert\times \left\Vert\boldsymbol{w}’_j\right\Vert}\right)=&\sum_{i,j} \gamma_{i,j} \left\Vert \frac{\boldsymbol{w}_i}{\left\Vert \boldsymbol{w}_i\right\Vert} – \frac{\boldsymbol{w}’_j}{\left\Vert \boldsymbol{w}’_j\right\Vert}\right\Vert \\

=& \sum_{i,j} \left\Vert \gamma_{i,j}\left(\frac{\boldsymbol{w}_i}{\left\Vert \boldsymbol{w}_i\right\Vert} – \frac{\boldsymbol{w}’_j}{\left\Vert \boldsymbol{w}’_j\right\Vert}\right)\right\Vert\\

\geq& \left\Vert \sum_{i,j}\gamma_{i,j}\left(\frac{\boldsymbol{w}_i}{\left\Vert \boldsymbol{w}_i\right\Vert} – \frac{\boldsymbol{w}’_j}{\left\Vert \boldsymbol{w}’_j\right\Vert}\right)\right\Vert\\

=& \left\Vert \sum_i\left(\sum_j\gamma_{i,j}\right)\frac{\boldsymbol{w}_i}{\left\Vert \boldsymbol{w}_i\right\Vert} – \sum_j\left(\sum_i\gamma_{i,j}\right)\frac{\boldsymbol{w}’_j}{\left\Vert \boldsymbol{w}’_j\right\Vert}\right\Vert\\

=& \left\Vert \frac{1}{Z}\sum_i\boldsymbol{w}_i – \frac{1}{Z’}\sum_j\boldsymbol{w}’_j\right\Vert\\

\end{aligned}