Created
August 27, 2014 12:46
-
-
Save dgleich/cf170a226aa848240cf4 to your computer and use it in GitHub Desktop.
Heat Kernel Based Community Detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Heat Kernel Based Community Detection | |
# by Kyle Kloster and David F. Gleich | |
# supported by NSF award CCF-1149756. | |
# | |
# This demo shows our algorithm running on the Twitter graph. | |
# Our other codes are available from David's website: | |
# | |
# https://www.cs.purdue.edu/homes/dgleich/codes/hkgrow/ | |
# | |
# To use this demo, you need pylibbvg graph by David Gleich, Wei-Yen Day, and | |
# Yongyang Yu (heavily based on webgraph by Sebastiano Vigna -- he did all the | |
# fundamental work!) as well as the symmetrized version of the twitter graph. | |
# | |
# If you are on a mac or linux, run | |
# | |
# pip install pylibbvg | |
# wget https://www.cs.purdue.edu/homes/dgleich/data/twitter-2010-symm.graph | |
# wget https://www.cs.purdue.edu/homes/dgleich/data/twitter-2010-symm.offsets | |
# wget https://www.cs.purdue.edu/homes/dgleich/data/twitter-2010-symm.properties | |
# | |
# Then, as long as these files are in your directory, our code will run! | |
# | |
# You need about 4-5GB of space and memory to run this demo. | |
import sys | |
import math | |
import bvg | |
import collections | |
import time | |
import random | |
def compute_psis(N,t): | |
psis = {} | |
psis[N] = 1. | |
for i in xrange(N-1,0,-1): | |
psis[i] = psis[i+1]*t/(float(i+1.))+1. | |
return psis | |
start = time.time() | |
print "Loading graph ..." | |
G = bvg.BVGraph('twitter-2010-symm',1) | |
print " ... done! %.1f seconds"%(time.time() - start) | |
time.sleep(0.75) | |
print "\n" | |
print "Twitter graph has" | |
print "nodes: %i"%(G.nverts) | |
print "edges: %i"%(G.nedges) | |
time.sleep(0.75) | |
print "\nt = 15" | |
print "eps = 10^-4\n" | |
time.sleep(0.75) | |
Gvol = G.nedges | |
## Setup parameters that can be computed automatically | |
N = 47 # see paper for how to set this automatically | |
t = 15. | |
eps = 0.0001 | |
psis = compute_psis(N,t) | |
x = {} | |
r = {} | |
currun = 0 | |
while 1: | |
if currun%15==0: | |
# print out the header | |
print "%10s %5s %4s %4s %7s %7s %7s"%( | |
'seed ID','degree','time','cond','edges','nnz','setsize') | |
currun += 1 | |
time.sleep(0.5) | |
randiseed = random.randint(1, len(G)) | |
seed = [randiseed] | |
start = time.time() | |
## Estimate hkpr vector | |
# | |
# This is our main algorithm. | |
# | |
# G is a networkx graph, or something that implements the same | |
# interface | |
# | |
# t, tol, N, psis are precomputed | |
# npush tracks number of operations | |
x = {} # Store x, r as dictionaries | |
r = {} # initialize residual | |
Q = collections.deque() # initialize queue | |
npush = 0. | |
for s in seed: | |
r[(s,0)] = 1./len(seed) | |
Q.append((s,0)) | |
npush += len(seed) | |
while len(Q) > 0: | |
(v,j) = Q.popleft() # v has r[(v,j)] ... | |
rvj = r[(v,j)] | |
# perform the hk-relax step | |
if v not in x: x[v] = 0. | |
x[v] += rvj | |
r[(v,j)] = 0. | |
update = rvj/G.out_degree(v) | |
mass = (t/(float(j)+1.))*update | |
for u in G[v]: # for neighbors of v | |
next = (u,j+1) # in the next block | |
if j+1 == N: | |
x[u] += update | |
continue | |
if next not in r: r[next] = 0. | |
thresh = math.exp(t)*eps*G.out_degree(u) | |
thresh = thresh/(N*psis[j+1]) | |
if r[next] < thresh and \ | |
r[next] + mass >= thresh: | |
Q.append(next) # add u to queue | |
r[next] = r[next] + mass | |
npush += G.out_degree(v) | |
## Step 2 do a sweep cut based on this vector | |
# Find cluster, first normalize by degree | |
for v in x: x[v] = x[v]/G.out_degree(v) | |
# now sort x's keys by value, decreasing | |
sv = sorted(x.iteritems(), key=lambda x: x[1], reverse=True) | |
S = set() | |
volS = 0. | |
cutS = 0. | |
bestcond = 1. | |
bestset = sv[0] | |
for p in sv: | |
s = p[0] # get the vertex | |
volS += G.out_degree(s) # add degree to volume | |
for v in G[s]: | |
if v in S: | |
cutS -= 1 | |
else: | |
cutS += 1 | |
S.add(s) | |
if cutS/min(volS,Gvol-volS) < bestcond: | |
bestcond = cutS/min(volS,Gvol-volS) | |
bestset = set(S) # make a copy | |
# print out the info on this cluster | |
print "%10i %5i %4.2f %4.2f %7i %7i %7i"%( | |
seed[0], G.out_degree(seed[0]), | |
time.time()-start, bestcond, | |
npush, len(x), len(bestset)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment