psychemedia · June 20, 2012 10:23
diff --git a/gplus_espTest.py b/gplus_espTest.py
 # ABOUT:
 # A script that grabs a list of the friends or followers of a user on Google+,
 # grabs a sample of their friends, and generates the resulting social graph

 # USAGE:
 # Requirements: networkx (see DEPENDENCIES)
 # Configuration: see USER SETTINGS
 # Output: files will be saved to the reports directory
 # To run the script:
 # 1) Download this file to a new directory somewhere as eg gplusESPnet.py
 # 2) cd to the directory
 # 3) *The first time*, create two new subdirectories (reports and cache); for example, run the following from the command line: mkdir reports; mkdir cache
 # 4) Call the script by running the following from the command line:
 # python gplusESPnet.py

 # DEPENDENCIES
 # The script makes use of the networkx library; you should only need to install it once.
 # To install networkx, from the command line type: easy_install networkx
 # If that doesn't work, follow the instructions on http://networkx.lanl.gov/install.html
 # In short: a) download and unzip http://networkx.lanl.gov/download/networkx/networkx-1.5.zip
 # b) cd to the networkx-1.5 directory, c) type: python setup.py install 
 # END DEPENDENCIES

 import networkx as nx

 #--- the following should already be available
 import urllib2,re
 import md5,urllib,os,tempfile,time
 import random
 import datetime

 import StringIO


 #USER SETTINGS
 #rootID is the Google+ ID of the person whose ESP net you want to map;
 #it is the first ID looked up and becomes the root node of the graph
 rootID='100095426689697101649'
 #You also need to provide the name of this user; it is used as the label
 #of the root node and as part of the report filenames
 name='Tony Hirst'


 #----
 # Do some checks...
 def checkDir(dirpath):
     """Create ``dirpath`` (with any missing parents) unless the path already exists."""
     if os.path.exists(dirpath):
         # Something is already there: leave it untouched
         return
     os.makedirs(dirpath)

 # Make sure the output and cache directories exist before anything is written
 checkDir('reports')
 checkDir('cache')
 #---

 # Seed users to start from, as {Google+ ID: display name}
 oidRootNamePairs={rootID:name}

 # Default maximum age, in seconds, of a cached response
 # (used as the default cachetime of getGenericCachedData)
 defCache=360000
 # Lookup type for the seed users: 'fo' selects the "incoming" (followers) URL in getoidNames
 typ='fo'
 # Lookup type for each person found: 'fr' selects the "visible" (friends) URL in getoidNames
 typ2='fr'
 # Directed graph that accumulates every edge found
 DG=nx.DiGraph()


 # Matches a line containing a 21-digit run; group 1 is that run (used as the Google+ ID)
 reobj = re.compile(r'.*([0-9]{21}).*')
 # Matches a line starting with ',["'; group 1 is the first quoted string (used as the display name)
 reobj2 = re.compile(r',\["([^"]*)".*')
 # Multi-line pattern covering a whole ID/name record; not used by the code visible in this file
 reobj3=re.compile(r'.*[0-9]{21}"\]\n,\[\]\n,\["[^"]*')
 # Intended use of reobj3 (kept for reference), followed by a sample of the text it matches:
 #oids = reobj3.findall(data)
 #for oid in oids:
 #,[[,,"112696985248193005986"]\n,[]\n,["Dawn Wicks-Sutton
 # Like reobj3, but with the ID as group 1 and the name as group 2; also unused in the code visible here
 reobj4=re.compile(r',\[+,,"([0-9]{21})"]\n,\[\]\n,\["(.*)$')
 #ascii(reobj4.match(oid).group(2)) is name, tho check not '' if so 'U N Owen", reobj4.match(oid).group(1) is ID 
 def ascii(s):
     """Return ``s`` with every character whose code point is 128 or above removed."""
     kept = [ch for ch in s if ord(ch) < 128]
     return "".join(kept)

 def _skipPast(lines, marker):
     """Consume ``lines`` up to and including the first line equal to ``marker``.

     Returns True if the marker was found, False if the lines ran out first.
     """
     for line in lines:
         if line == marker:
             return True
     return False

 def getoidName(i,currIDs,oidNames):
     """Read one person record from ``i`` and record its ID and name.

     i        -- iterator over the response lines, positioned at the first line of a record
     currIDs  -- list of the IDs already seen in this response; a new ID is appended
     oidNames -- dict of ID -> name; an entry is added only if the ID has none yet

     Returns (i, currIDs, oidNames, flag). flag is 1 when a complete record was
     consumed and -1 when there is nothing more to read: the line holds no
     21-digit ID, or the lines ran out (also part-way through a record) instead
     of letting StopIteration escape to the caller.
     """
     try:
         firstLine = next(i)
     except StopIteration:
         return i,currIDs,oidNames,-1
     found = reobj.match(firstLine)
     if found is None:
         print('at the end???')
         return i,currIDs,oidNames,-1
     oid = found.group(1)
     if oid in currIDs:
         # Already recorded from this response: skip to the end of the record
         print('bottomloop')
         complete = _skipPast(i, ']\n')
     else:
         try:
             next(i)  # the line between the ID line and the name line is not used
             nameLine = next(i)
         except StopIteration:
             return i,currIDs,oidNames,-1
         named = reobj2.match(nameLine)
         # A name line in an unexpected shape gives an empty name rather than a crash
         label = ascii(named.group(1)) if named is not None else ''
         if oid not in oidNames:
             oidNames[oid] = label
         currIDs.append(oid)
         # The record ends at the first ']' line that follows the next ',[]' line
         complete = _skipPast(i, ',[]\n') and _skipPast(i, ']\n')
     if complete:
         return i,currIDs,oidNames,1
     return i,currIDs,oidNames,-1

 def getoidNames(oidNames,oid='',typ='fr'):
     """Look up the friends or followers of ``oid`` and record their IDs and names.

     oidNames -- dict of ID -> name, updated in place with any new people found
     oid      -- Google+ ID to look up; '' returns (oidNames, []) without any lookup
     typ      -- 'fr' uses the "visible" (friends) URL, 'fo' the "incoming"
                 (followers) URL; any other value exits the program with status -1

     Returns (oidNames, currIDs) where currIDs lists the IDs found in this
     response, in order of appearance. The response is read through
     getGenericCachedData; a short or truncated response yields whatever was
     parsed before the data ran out.
     """
     if oid=='':
         return oidNames,[]
     #???I suspect this only does one page of up to 1000(?) users? Need to check?
     if typ=='fr':
         url='https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&rt=j'
     elif typ=='fo':
         url='https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&n=1000&rt=j'
     else:
         # Same effect as exit(-1), without relying on the site module's helper
         raise SystemExit(-1)
     print(url)
     data=getGenericCachedData(url)
     lines=StringIO.StringIO(data)
     currIDs=[]
     try:
         # The first three lines of the response are skipped before the records start
         for _ in range(3):
             next(lines)
         #if flag returns <0, we're done
         flag=1
         while flag>0:
             lines,currIDs,oidNames,flag=getoidName(lines,currIDs,oidNames)
     except StopIteration:
         # The data ended early; currIDs and oidNames already hold what was parsed
         pass
     return oidNames,currIDs
 #friends
 #https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&rt=j

 #followers
 #https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&n=1000&rt=j

 #----------------------------------------------------------------
 #Yield successive n-sized chunks from l
 def chunks(l, n):
     """Yield successive ``n``-sized slices of ``l``; the last one may be shorter."""
     # range() gives the same start offsets as xrange() and also exists on Python 3
     for start in range(0, len(l), n):
         yield l[start:start+n]

 def report(m, verbose=False):
     """Print ``m`` when ``verbose`` is truthy; otherwise do nothing."""
     # Truthiness rather than "is True", so verbose=1 also enables the output
     if verbose:
         print(m)


 class DiskCacheFetcherfname:
     """Fetch URLs through a simple on-disk cache and hand back the cached file's path.

     Each URL is stored in ``cache_dir`` under the MD5 hex digest of the URL.
     """
     def __init__(self, cache_dir=None):
         # If no cache directory specified, use system temp directory
         if cache_dir is None:
             cache_dir = tempfile.gettempdir()
         self.cache_dir = cache_dir

     def fetch(self, url, max_age=0):
         """Return the path of a file holding the content of ``url``.

         An existing cache file is reused when it was modified less than
         ``max_age`` seconds ago; otherwise the URL is downloaded and the cache
         file is replaced. Errors from the download or the file system propagate.
         """
         # hashlib supersedes the md5 module and yields the same digest
         import hashlib
         key = url if isinstance(url, bytes) else url.encode('utf-8')
         # Use MD5 hash of the URL as the filename
         filename = hashlib.md5(key).hexdigest()
         filepath = os.path.join(self.cache_dir, filename)
         if os.path.exists(filepath):
             if int(time.time()) - os.path.getmtime(filepath) < max_age:
                 report("using "+filename+", cached copy of fetched url: "+url)
                 return filepath
         report("fetching fresh copy of fetched url: "+url)
         response = urllib.urlopen(url)
         try:
             data = response.read()
         finally:
             response.close()
         # Write to a temp file inside the cache directory and rename it into
         # place: the rename then never crosses a filesystem boundary
         fd, temppath = tempfile.mkstemp(dir=self.cache_dir)
         try:
             fp = os.fdopen(fd, 'w')
             try:
                 fp.write(data)
             finally:
                 fp.close()
             os.rename(temppath, filepath)
         except Exception:
             # Do not leave a stray temp file in the cache directory
             if os.path.exists(temppath):
                 os.remove(temppath)
             raise
         return filepath

 def getGenericCachedData(url, cachetime=defCache):
     """Return the content of ``url``, read through the disk cache in the 'cache' directory.

     cachetime -- passed to DiskCacheFetcherfname.fetch as max_age (seconds);
                  defaults to defCache
     """
     cachedPath=DiskCacheFetcherfname('cache').fetch(url, cachetime)
     # The with-block closes the file even if read() raises
     with open(cachedPath) as cached:
         return cached.read()
  

 def addDirectedEdges(DG,fromNode,toSet,flip=False):
     """Add one edge between ``fromNode`` and each member of ``toSet``; return ``DG``.

     Edges run fromNode -> member, or member -> fromNode when ``flip`` equals True.
     """
     for otherNode in toSet:
         if flip==True:
             source,target=otherNode,fromNode
         else:
             source,target=fromNode,otherNode
         DG.add_edge(source,target)
     return DG

 def labelNodes(G,names):
     """Set the 'label' attribute of every node in ``G.node`` to ``names[nodeID]``; return ``G``.

     Raises KeyError if a node ID is missing from ``names``.
     """
     for nodeID in G.node:
         G.node[nodeID].update(label=names[nodeID])
     return G



 # Names collected so far, as {Google+ ID: display name}
 oidNamePairs={}
 for id in oidRootNamePairs:
 	# First pass: look up the seed user's contacts using lookup type typ
 	oidNamePairs,currIDs=getoidNames(oidNamePairs,id,typ)
 	print currIDs
 	# NOTE(review): the sub-level 'fr' edges below are added unflipped (person -> friend),
 	# so flipping only when typ=='fr' looks inverted for the seed user - confirm the
 	# intended edge direction
 	flip=(typ=='fr')
 	DG=addDirectedEdges(DG, id, currIDs,flip=flip)
 	n=len(currIDs)
 	print str(n)
 	c=1
 	# Second pass: for every person found, look up their contacts using lookup type typ2
 	for cid in currIDs:
 		print '\tSub-level run: getting ',typ2,str(c),'of',str(n),typ,cid
 		oidNamePairs,ccurrIDs=getoidNames(oidNamePairs,cid,typ2)
 		DG=addDirectedEdges(DG, cid, ccurrIDs)
 		c=c+1
 # Give each seed user a name if the lookups did not supply one, so that
 # labelNodes finds a name for every node
 for id in oidRootNamePairs:
 	if id not in oidNamePairs:
 		oidNamePairs[id]=oidRootNamePairs[id]
 DG=labelNodes(DG,oidNamePairs)
 print nx.info(DG)

 # Timestamp (with a leading underscore) that makes the report filenames unique per run
 now = datetime.datetime.now()
 ts = now.strftime("_%Y-%m-%d-%H-%M-%S")

 # Write the full graph to the reports directory as GraphML and as a plain edge list
 fname=name.replace(' ','_')
 nx.write_graphml(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".graphml"]))
 nx.write_edgelist(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".txt"]),data=False)

 def filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax):
     """Write a degree-filtered copy of ``DG`` as GraphML and edge-list files.

     A node is kept when its out-degree does not exceed ``outdegreemax`` (if
     given) and either
       * ``mindegree`` is given and its total degree is at least ``mindegree``, or
       * ``mindegree`` is None and its in-degree is at least ``indegree`` or its
         out-degree is at least ``outdegree`` (whichever of the two are given).
     Nodes left without any edge inside the filtered graph are then dropped too.

     typ, typ2          -- only used to build the output filename
     addUserFriendships -- when 1, DG is first replaced by addFocus(DG,user,typ)
     user               -- only passed on to addFocus
     indegreemax        -- accepted for the caller's benefit but not used

     The output path is built from the module-level names ``projname`` and
     ``name``; None limits appear as 'X' in the filename. Returns None.
     """
     if addUserFriendships==1:
         # NOTE(review): addFocus is not defined in the part of the file visible here
         DG=addFocus(DG,user,typ)
     #handle min,in,out degree
     keep=set()
     for node in DG:
         if outdegreemax is not None and DG.out_degree(node)>outdegreemax:
             continue
         if mindegree is not None:
             #indegree and outdegree values are ignored if mindegree is set
             if DG.degree(node)>=mindegree:
                 keep.add(node)
             continue
         # NOTE(review): passing either test keeps the node (a union of the two
         # degree sets), although this was originally described as an intersect
         if indegree is not None and DG.in_degree(node)>=indegree:
             keep.add(node)
         if outdegree is not None and DG.out_degree(node)>=outdegree:
             keep.add(node)
     H=DG.subgraph(keep)
     #Superstitiously, perhaps, make sure we only grab nodes that project edges...
     connected=[node for node in H if H.degree(node)>0]
     L=H.subgraph(connected)
     print('%s %s' % (L.order(),L.size()))

     tags=[]
     for limit in (mindegree,indegree,outdegree,outdegreemax):
         if limit is None:
             tags.append('X')
         else:
             tags.append(str(limit))
     # os.path.join avoids the doubled separator when projname already ends in '/'
     st=os.path.join(projname,name+'_google'+typ+typ2+'degree_'+'_'.join(tags)+"_esp")
     print(nx.info(L))
     nx.write_graphml(L, st+".graphml")
     nx.write_edgelist(L, st+".txt",data=False)


 # Settings for the filtered graph written by filterNet
 # None: no total-degree test, so the indegree/outdegree tests below apply
 mindegree=None
 # Keep nodes whose in-degree is at least this...
 indegree=20
 # ...or whose out-degree is at least this
 outdegree=25
 # None: no upper limit on out-degree
 outdegreemax=None
 # 1 would make filterNet call addFocus(DG,user,typ) first
 addUserFriendships=0
 user=''
 # Passed to filterNet, which does not use it
 indegreemax=None
 # Directory part of the output path
 projname='reports/'
 filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax)
	# ABOUT:
	# A script that grabs a list of the friends or followers of a user on Google+,
	# grabs a sample of their friends, and generates the resulting social graph

	# USAGE:
	# Requirements: networkx (see DEPENDENCIES)
	# Configuration: see USER SETTINGS
	# Output: files will be saved to the reports directory
	# To run the script:
	# 1) Download this file to a new directory somewhere as eg gplusESPnet.py
	# 2) cd to the directory
	# 3) The first time, create two new subdirectories (reports and cache); for example, run the following from the command line: mkdir reports; mkdir cache
	# 4) Call the script by running the following from the command line:
	# python gplusESPnet.py

	# DEPENDENCIES
	# The script makes use of the networkx library; you should only need to install it once.
	# To install networkx, from the command line type: easy_install networkx
	# If that doesn't work, follow the instructions on http://networkx.lanl.gov/install.html
	# In short: a) download and unzip http://networkx.lanl.gov/download/networkx/networkx-1.5.zip
	# b) cd to the networkx-1.5 directory, c) type: python setup.py install
	# END DEPENDENCIES

	import networkx as nx

	#--- the following should already be available
	import urllib2,re
	import md5,urllib,os,tempfile,time
	import random
	import datetime

	import StringIO


	#USER SETTINGS
	#rootID is the Google+ ID of the person whose ESP net you want to map;
	#it is the first ID looked up and becomes the root node of the graph
	rootID='100095426689697101649'
	#You also need to provide the name of this user; it is used as the label
	#of the root node and as part of the report filenames
	name='Tony Hirst'


	#----
	# Do some checks...
	def checkDir(dirpath):
	    """Create ``dirpath`` (with any missing parents) unless the path already exists."""
	    # Body indentation restored: it had been stripped from this copy
	    if not os.path.exists(dirpath):
	        os.makedirs(dirpath)

	# Make sure the output and cache directories exist before anything is written
	checkDir('reports')
	checkDir('cache')
	#---

	# Seed users to start from, as {Google+ ID: display name}
	oidRootNamePairs={rootID:name}

	# Default maximum age, in seconds, of a cached response
	# (used as the default cachetime of getGenericCachedData)
	defCache=360000
	# Lookup type for the seed users: 'fo' selects the "incoming" (followers) URL in getoidNames
	typ='fo'
	# Lookup type for each person found: 'fr' selects the "visible" (friends) URL in getoidNames
	typ2='fr'
	# Directed graph that accumulates every edge found
	DG=nx.DiGraph()


	# The '*' quantifiers had been lost from the first three patterns (a bare '.'
	# only allows a single character), so real ID lines no longer matched.
	# Matches a line containing a 21-digit run; group 1 is that run (used as the Google+ ID)
	reobj = re.compile(r'.*([0-9]{21}).*')
	# Matches a line starting with ',["'; group 1 is the first quoted string (used as the display name)
	reobj2 = re.compile(r',\["([^"]*)".*')
	# Multi-line pattern covering a whole ID/name record; not used by the code visible in this file
	reobj3=re.compile(r'.*[0-9]{21}"\]\n,\[\]\n,\["[^"]*')
	# Intended use of reobj3 (kept for reference), followed by a sample of the text it matches:
	#oids = reobj3.findall(data)
	#for oid in oids:
	#,[[,,"112696985248193005986"]\n,[]\n,["Dawn Wicks-Sutton
	# Like reobj3, but with the ID as group 1 and the name as group 2; also unused in the code visible here
	reobj4=re.compile(r',\[+,,"([0-9]{21})"]\n,\[\]\n,\["(.*)$')
	#ascii(reobj4.match(oid).group(2)) is name, tho check not '' if so 'U N Owen", reobj4.match(oid).group(1) is ID
	def ascii(s):
	    """Return ``s`` with every character whose code point is 128 or above removed."""
	    kept = [ch for ch in s if ord(ch) < 128]
	    return "".join(kept)

	def _skipPast(lines, marker):
	    """Consume ``lines`` up to and including the first line equal to ``marker``.

	    Returns True if the marker was found, False if the lines ran out first.
	    """
	    for line in lines:
	        if line == marker:
	            return True
	    return False

	def getoidName(i,currIDs,oidNames):
	    """Read one person record from ``i`` and record its ID and name.

	    i        -- iterator over the response lines, positioned at the first line of a record
	    currIDs  -- list of the IDs already seen in this response; a new ID is appended
	    oidNames -- dict of ID -> name; an entry is added only if the ID has none yet

	    Returns (i, currIDs, oidNames, flag). flag is 1 when a complete record was
	    consumed and -1 when there is nothing more to read: the line holds no
	    21-digit ID, or the lines ran out (also part-way through a record) instead
	    of letting StopIteration escape to the caller.
	    """
	    # Block structure restored: the indentation had been stripped from this copy
	    try:
	        firstLine = next(i)
	    except StopIteration:
	        return i,currIDs,oidNames,-1
	    found = reobj.match(firstLine)
	    if found is None:
	        print('at the end???')
	        return i,currIDs,oidNames,-1
	    oid = found.group(1)
	    if oid in currIDs:
	        # Already recorded from this response: skip to the end of the record
	        print('bottomloop')
	        complete = _skipPast(i, ']\n')
	    else:
	        try:
	            next(i)  # the line between the ID line and the name line is not used
	            nameLine = next(i)
	        except StopIteration:
	            return i,currIDs,oidNames,-1
	        named = reobj2.match(nameLine)
	        # A name line in an unexpected shape gives an empty name rather than a crash
	        label = ascii(named.group(1)) if named is not None else ''
	        if oid not in oidNames:
	            oidNames[oid] = label
	        currIDs.append(oid)
	        # The record ends at the first ']' line that follows the next ',[]' line
	        complete = _skipPast(i, ',[]\n') and _skipPast(i, ']\n')
	    if complete:
	        return i,currIDs,oidNames,1
	    return i,currIDs,oidNames,-1

	def getoidNames(oidNames,oid='',typ='fr'):
	    """Look up the friends or followers of ``oid`` and record their IDs and names.

	    oidNames -- dict of ID -> name, updated in place with any new people found
	    oid      -- Google+ ID to look up; '' returns (oidNames, []) without any lookup
	    typ      -- 'fr' uses the "visible" (friends) URL, 'fo' the "incoming"
	                (followers) URL; any other value exits the program with status -1

	    Returns (oidNames, currIDs) where currIDs lists the IDs found in this
	    response, in order of appearance. The response is read through
	    getGenericCachedData; a short or truncated response yields whatever was
	    parsed before the data ran out.
	    """
	    # Block structure restored: the indentation had been stripped from this copy
	    if oid=='':
	        return oidNames,[]
	    #???I suspect this only does one page of up to 1000(?) users? Need to check?
	    if typ=='fr':
	        url='https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&rt=j'
	    elif typ=='fo':
	        url='https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&n=1000&rt=j'
	    else:
	        # Same effect as exit(-1), without relying on the site module's helper
	        raise SystemExit(-1)
	    print(url)
	    data=getGenericCachedData(url)
	    lines=StringIO.StringIO(data)
	    currIDs=[]
	    try:
	        # The first three lines of the response are skipped before the records start
	        for _ in range(3):
	            next(lines)
	        #if flag returns <0, we're done
	        flag=1
	        while flag>0:
	            lines,currIDs,oidNames,flag=getoidName(lines,currIDs,oidNames)
	    except StopIteration:
	        # The data ended early; currIDs and oidNames already hold what was parsed
	        pass
	    return oidNames,currIDs
	#friends
	#https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&rt=j

	#followers
	#https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&n=1000&rt=j

	#----------------------------------------------------------------
	#Yield successive n-sized chunks from l
	def chunks(l, n):
	    """Yield successive ``n``-sized slices of ``l``; the last one may be shorter."""
	    # Body indentation restored; range() gives the same start offsets as
	    # xrange() and also exists on Python 3
	    for start in range(0, len(l), n):
	        yield l[start:start+n]

	def report(m, verbose=False):
	    """Print ``m`` when ``verbose`` is truthy; otherwise do nothing."""
	    # Body indentation restored; truthiness rather than "is True", so
	    # verbose=1 also enables the output
	    if verbose:
	        print(m)


	class DiskCacheFetcherfname:
	    """Fetch URLs through a simple on-disk cache and hand back the cached file's path.

	    Each URL is stored in ``cache_dir`` under the MD5 hex digest of the URL.
	    """
	    # Class and method bodies re-indented: the nesting had been stripped from this copy
	    def __init__(self, cache_dir=None):
	        # If no cache directory specified, use system temp directory
	        if cache_dir is None:
	            cache_dir = tempfile.gettempdir()
	        self.cache_dir = cache_dir

	    def fetch(self, url, max_age=0):
	        """Return the path of a file holding the content of ``url``.

	        An existing cache file is reused when it was modified less than
	        ``max_age`` seconds ago; otherwise the URL is downloaded and the cache
	        file is replaced. Errors from the download or the file system propagate.
	        """
	        # hashlib supersedes the md5 module and yields the same digest
	        import hashlib
	        key = url if isinstance(url, bytes) else url.encode('utf-8')
	        # Use MD5 hash of the URL as the filename
	        filename = hashlib.md5(key).hexdigest()
	        filepath = os.path.join(self.cache_dir, filename)
	        if os.path.exists(filepath):
	            if int(time.time()) - os.path.getmtime(filepath) < max_age:
	                report("using "+filename+", cached copy of fetched url: "+url)
	                return filepath
	        report("fetching fresh copy of fetched url: "+url)
	        response = urllib.urlopen(url)
	        try:
	            data = response.read()
	        finally:
	            response.close()
	        # Write to a temp file inside the cache directory and rename it into
	        # place: the rename then never crosses a filesystem boundary
	        fd, temppath = tempfile.mkstemp(dir=self.cache_dir)
	        try:
	            fp = os.fdopen(fd, 'w')
	            try:
	                fp.write(data)
	            finally:
	                fp.close()
	            os.rename(temppath, filepath)
	        except Exception:
	            # Do not leave a stray temp file in the cache directory
	            if os.path.exists(temppath):
	                os.remove(temppath)
	            raise
	        return filepath

	def getGenericCachedData(url, cachetime=defCache):
	    """Return the content of ``url``, read through the disk cache in the 'cache' directory.

	    cachetime -- passed to DiskCacheFetcherfname.fetch as max_age (seconds);
	                 defaults to defCache
	    """
	    # Body indentation restored: it had been stripped from this copy
	    cachedPath=DiskCacheFetcherfname('cache').fetch(url, cachetime)
	    # The with-block closes the file even if read() raises
	    with open(cachedPath) as cached:
	        return cached.read()


	def addDirectedEdges(DG,fromNode,toSet,flip=False):
	    """Add one edge between ``fromNode`` and each member of ``toSet``; return ``DG``.

	    Edges run fromNode -> member, or member -> fromNode when ``flip`` equals True.
	    """
	    # Loop and branch nesting restored: the indentation had been stripped from this copy
	    for toNode in toSet:
	        if flip==True:
	            DG.add_edge(toNode,fromNode)
	        else:
	            DG.add_edge(fromNode,toNode)
	    return DG

	def labelNodes(G,names):
	    """Set the 'label' attribute of every node in ``G.node`` to ``names[nodeID]``; return ``G``.

	    Raises KeyError if a node ID is missing from ``names``.
	    """
	    # Loop body indentation restored: it had been stripped from this copy
	    for nodeID in G.node:
	        G.node[nodeID]['label']=names[nodeID]
	    return G



	# Loop nesting restored below: the indentation had been stripped from this copy.
	# Names collected so far, as {Google+ ID: display name}
	oidNamePairs={}
	for id in oidRootNamePairs:
	    # First pass: look up the seed user's contacts using lookup type typ
	    oidNamePairs,currIDs=getoidNames(oidNamePairs,id,typ)
	    print(currIDs)
	    # NOTE(review): the sub-level 'fr' edges below are added unflipped (person -> friend),
	    # so flipping only when typ=='fr' looks inverted for the seed user - confirm the
	    # intended edge direction
	    flip=(typ=='fr')
	    DG=addDirectedEdges(DG, id, currIDs,flip=flip)
	    n=len(currIDs)
	    print(str(n))
	    c=1
	    # Second pass: for every person found, look up their contacts using lookup type typ2
	    for cid in currIDs:
	        print(' '.join(['\tSub-level run: getting ',typ2,str(c),'of',str(n),typ,cid]))
	        oidNamePairs,ccurrIDs=getoidNames(oidNamePairs,cid,typ2)
	        DG=addDirectedEdges(DG, cid, ccurrIDs)
	        c=c+1
	# Give each seed user a name if the lookups did not supply one, so that
	# labelNodes finds a name for every node
	for id in oidRootNamePairs:
	    if id not in oidNamePairs:
	        oidNamePairs[id]=oidRootNamePairs[id]
	DG=labelNodes(DG,oidNamePairs)
	print(nx.info(DG))

	# Timestamp (with a leading underscore) that makes the report filenames unique per run
	now = datetime.datetime.now()
	ts = now.strftime("_%Y-%m-%d-%H-%M-%S")

	# Write the full graph to the reports directory as GraphML and as a plain edge list
	fname=name.replace(' ','_')
	nx.write_graphml(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".graphml"]))
	nx.write_edgelist(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".txt"]),data=False)

	def filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax):
	    """Write a degree-filtered copy of ``DG`` as GraphML and edge-list files.

	    A node is kept when its out-degree does not exceed ``outdegreemax`` (if
	    given) and either
	      * ``mindegree`` is given and its total degree is at least ``mindegree``, or
	      * ``mindegree`` is None and its in-degree is at least ``indegree`` or its
	        out-degree is at least ``outdegree`` (whichever of the two are given).
	    Nodes left without any edge inside the filtered graph are then dropped too.

	    typ, typ2          -- only used to build the output filename
	    addUserFriendships -- when 1, DG is first replaced by addFocus(DG,user,typ)
	    user               -- only passed on to addFocus
	    indegreemax        -- accepted for the caller's benefit but not used

	    The output path is built from the module-level names ``projname`` and
	    ``name``; None limits appear as 'X' in the filename. Returns None.
	    """
	    # Block structure restored: the indentation had been stripped from this copy
	    if addUserFriendships==1:
	        # NOTE(review): addFocus is not defined in the part of the file visible here
	        DG=addFocus(DG,user,typ)
	    #handle min,in,out degree
	    keep=set()
	    for node in DG:
	        if outdegreemax is not None and DG.out_degree(node)>outdegreemax:
	            continue
	        if mindegree is not None:
	            #indegree and outdegree values are ignored if mindegree is set
	            if DG.degree(node)>=mindegree:
	                keep.add(node)
	            continue
	        # NOTE(review): passing either test keeps the node (a union of the two
	        # degree sets), although this was originally described as an intersect
	        if indegree is not None and DG.in_degree(node)>=indegree:
	            keep.add(node)
	        if outdegree is not None and DG.out_degree(node)>=outdegree:
	            keep.add(node)
	    H=DG.subgraph(keep)
	    #Superstitiously, perhaps, make sure we only grab nodes that project edges...
	    connected=[node for node in H if H.degree(node)>0]
	    L=H.subgraph(connected)
	    print('%s %s' % (L.order(),L.size()))

	    tags=[]
	    for limit in (mindegree,indegree,outdegree,outdegreemax):
	        if limit is None:
	            tags.append('X')
	        else:
	            tags.append(str(limit))
	    # os.path.join avoids the doubled separator when projname already ends in '/'
	    st=os.path.join(projname,name+'_google'+typ+typ2+'degree_'+'_'.join(tags)+"_esp")
	    print(nx.info(L))
	    nx.write_graphml(L, st+".graphml")
	    nx.write_edgelist(L, st+".txt",data=False)


	# Settings for the filtered graph written by filterNet
	# None: no total-degree test, so the indegree/outdegree tests below apply
	mindegree=None
	# Keep nodes whose in-degree is at least this...
	indegree=20
	# ...or whose out-degree is at least this
	outdegree=25
	# None: no upper limit on out-degree
	outdegreemax=None
	# 1 would make filterNet call addFocus(DG,user,typ) first
	addUserFriendships=0
	user=''
	# Passed to filterNet, which does not use it
	indegreemax=None
	# Directory part of the output path
	projname='reports/'
	filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax)