def jaccard_metric(x, y):
    """Return the min/max (weighted) Jaccard similarity of two sparse rows.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: sum(min(x, y)) / sum(max(x, y)) as a scalar

    If both inputs are all-zero the denominator is zero and the division
    follows numpy scalar semantics (nan plus a RuntimeWarning).
    NOTE(review): the ratio is only a conventional Jaccard score for
    non-negative entries -- confirm callers never pass negative values.
    """
    return x.minimum(y).sum() / x.maximum(y).sum()


def l2_metric(x, y):
    """Return the negated Euclidean distance between two sparse rows.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: -||x - y||_2, so that larger values mean "more similar"
    """
    from scipy.sparse import linalg

    return -linalg.norm(x - y)


def cos_metric(x, y):
    """Return the cosine similarity of two sparse rows.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: <x, y> / (||x|| * ||y||) as a scalar

    If either input is all-zero the denominator is zero and the division
    follows numpy scalar semantics (nan plus a RuntimeWarning).
    """
    from scipy.sparse import linalg

    # Both operands are row vectors, so ``x.dot(y)`` would be a shape
    # mismatch for n > 1; the inner product is the sum of the element-wise
    # product instead.
    inner = x.multiply(y).sum()
    return inner / (linalg.norm(x) * linalg.norm(y))


def jaccard_pdist(X, Y):
    """Return pairwise min/max (weighted) Jaccard similarities.

    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense numpy array of shape (m1, m2) where entry (i, j) is
             sum(min(X[i], Y[j])) / sum(max(X[i], Y[j]))

    Pairs where both rows are all-zero divide 0 by 0 and come out as nan.
    """
    # Function-scope import, matching the import style of the metric helpers.
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # np.hstack rejects an empty list; return the empty result directly.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X so the element-wise
        # minimum/maximum see two operands of identical shape (m1, n).
        tiled = Y[np.full(m1, j)]
        num = np.asarray(X.minimum(tiled).sum(1)).reshape(-1, 1)
        denom = np.asarray(X.maximum(tiled).sum(1)).reshape(-1, 1)
        columns.append(num / denom)
    return np.hstack(columns)


def l2_pdist(X, Y):
    """Return pairwise negated Euclidean distances.

    help from https://stackoverflow.com/a/37903795

    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense numpy array of shape (m1, m2) where entry (i, j) is
             -||X[i] - Y[j]||_2 (negated so larger means "more similar")
    """
    # Function-scope import, matching the import style of the metric helpers.
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # np.hstack rejects an empty list; return the empty result directly.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X so the subtraction sees two
        # operands of identical shape (m1, n).
        tiled = Y[np.full(m1, j)]
        squared = np.asarray((X - tiled).power(2).sum(1)).reshape(-1, 1)
        columns.append(-np.sqrt(squared))
    return np.hstack(columns)


def cos_pdist(X, Y):
    """Return pairwise cosine similarities.

    help from https://stackoverflow.com/a/43493487

    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense numpy array of shape (m1, m2) where entry (i, j) is
             <X[i], Y[j]> / (||X[i]|| * ||Y[j]||)

    All-zero rows have zero norm, so their entries come out as nan.
    """
    # Function-scope import, matching the import style of the metric helpers.
    import numpy as np

    # Shape the norms explicitly as a column (per X row) and a row (per Y
    # row) so the divisions broadcast over the (m1, m2) dot-product table.
    x_norms = np.sqrt(np.asarray(X.power(2).sum(1)).reshape(-1, 1))
    y_norms = np.sqrt(np.asarray(Y.power(2).sum(1)).reshape(1, -1))
    dots = X.dot(Y.T).toarray()
    return (dots / x_norms) / y_norms