Skip to content

Instantly share code, notes, and snippets.

@joelkuiper
Created April 29, 2020 18:09
Show Gist options
  • Save joelkuiper/15a9d064d36ab1f81d1b3fbad904754f to your computer and use it in GitHub Desktop.
Save joelkuiper/15a9d064d36ab1f81d1b3fbad904754f to your computer and use it in GitHub Desktop.
from gensim.models.poincare import PoincareModel, PoincareRelations
import logging
import matplotlib.pyplot as plt
import numpy as np
import csv
import itertools
import networkx as nx
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pandas as pd
# Root logger: timestamped records, INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def load(p="data/2d_poincare"):
    """Load a previously saved Poincare model from disk.

    Args:
        p: Path of the saved model, handed unchanged to
            ``PoincareModel.load``.

    Returns:
        The object returned by ``PoincareModel.load`` for that path.
    """
    model = PoincareModel.load(p)
    return model
def train(file_path="data/sui_graph.tsv", epochs=50):
    """Fit a 2-dimensional Poincare embedding and save it to disk.

    Args:
        file_path: Relations file handed to ``PoincareRelations``.
        epochs: Number of training epochs; also appended to the name of
            the saved model file.

    The trained model is written to ``data/2d_poincare_e<epochs>``.
    Nothing is returned.
    """
    relations = PoincareRelations(file_path)
    embedding = PoincareModel(relations, size=2)
    embedding.train(epochs=epochs)
    destination = "data/2d_poincare_e" + str(epochs)
    embedding.save(destination)
def read_cord(path):
    """Read a comma-separated file with a header row into a list of rows.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one mapping per data row, keyed by the header fields.
    """
    # newline='' is what the csv module asks for when it is given a file
    # object: the reader then handles line endings itself, so line breaks
    # inside quoted fields come through unchanged instead of being rewritten
    # by universal-newline translation.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
        return list(reader)
def unique(a):
    """Drop repeated items from a sequence, keeping first occurrences.

    The result preserves the original order. Items are sorted internally,
    so they must be mutually orderable; they do not need to be hashable
    (lists work).

    Args:
        a: Indexable sequence with a length.

    Returns:
        A new list holding the first occurrence of every distinct item.
    """
    # Sort positions by the item they point at. The sort is stable, so
    # within a run of equal items the smallest position comes first.
    by_value = sorted(range(len(a)), key=lambda pos: a[pos])
    keep = set()
    for _, run in itertools.groupby(by_value, key=lambda pos: a[pos]):
        keep.add(next(run))
    return [item for pos, item in enumerate(a) if pos in keep]
def parse_graph(finput):
    """Parse a CSV file with SUI,SUI rows into a list of rows.

    The first row is treated as a header and dropped. Parsing is done by
    the ``csv`` module, so quoted fields (including ones that contain a
    comma) are handled properly. Rows that are empty or contain only
    whitespace are skipped, so a stray blank line does not produce a
    one-element row that breaks pairwise unpacking downstream.

    Args:
        finput: Path of the CSV file.

    Returns:
        A list of rows, each a list of field strings with surrounding
        whitespace removed. Empty if the file has no data rows.
    """
    with open(finput, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (no-op on an empty file)
        return [
            [field.strip() for field in row]
            for row in reader
            if any(field.strip() for field in row)
        ]
def construct_graph(graph_file):
    """Build a directed graph from a child,parent relations file.

    Args:
        graph_file: Path handed to ``parse_graph``; every parsed row must
            unpack into exactly ``(child, parent)``.

    Returns:
        An ``nx.DiGraph`` with one edge per distinct row, pointing from
        the parent to the child.
    """
    # remove duplicates due to mapping
    relations = unique(parse_graph(graph_file))
    graph = nx.DiGraph()
    graph.add_edges_from((parent, child) for child, parent in relations)
    return graph
def visualize(model, g, lines, fname,
              out_dir="/home/joelkuiper/Windows/Desktop/cor/"):
    """Plot the embedded tokens of ``lines`` as a 2-d scatter and save a PNG.

    Every token of ``lines`` that has a vector in ``model.kv`` is drawn as
    a point sized by its tf-idf weight. Graph edges whose two ends are both
    plotted are drawn as faint lines between the points.

    Args:
        model: Object exposing the embedding as ``model.kv``. ``kv`` must
            support ``token in kv`` and ``kv[token]``, and have a ``syn0``
            matrix with exactly two columns.
        g: Graph queried through ``g.out_edges(node)``.
        lines: List of token lists, one per document. Used to build the
            vocabulary and the tf-idf weights.
        fname: Name of the output image, without the ``.png`` extension.
        out_dir: Directory the image is written to. Defaults to the
            location that used to be hard-coded here.

    Raises:
        ValueError: If the embedding does not have exactly two dimensions.
    """
    import os  # local import: os is not imported at file level

    vectors = model.kv
    # NOTE(review): ``syn0`` is kept as in the original code; confirm the
    # installed gensim version still exposes it under this name.
    if vectors.syn0.shape[1] != 2:
        raise ValueError('can only plot 2-d vectors')

    dct = Dictionary(lines)
    bow = [dct.doc2bow(line) for line in lines]
    tfidf = TfidfModel(bow, smartirs='dtu')
    corpus_tfidf = tfidf[bow]
    # One weight per token; when a token occurs in several documents the
    # weight from the last such document wins.
    weights = {dct.get(token_id): value
               for doc in corpus_tfidf
               for token_id, value in doc}

    nodes = [token for token in dct.token2id if token in vectors]
    plotted = set(nodes)

    # Cartesian
    # reshape keeps the array 2-d even when no token has a vector, so the
    # column slicing below yields empty columns instead of raising.
    points = np.array([vectors[node] for node in nodes]).reshape(-1, 2)
    x = points[:, 0]
    y = points[:, 1]

    fig = plt.gcf()
    fig.set_size_inches(30, 30)
    ax = fig.gca()
    ax.set_aspect(1)
    plt.axis('off')
    plt.xlim(-1.0, 1.0)
    plt.ylim(-1.0, 1.0)

    # ``weights`` only holds tokens the tf-idf model emitted an entry for;
    # any other token is drawn with size 0 rather than raising KeyError.
    sizes = [weights.get(node, 0.0) for node in nodes]
    plt.scatter(x, y, s=sizes, c="#111111")

    # Add edges
    for node in nodes:
        p1 = vectors[node]
        for _, out in g.out_edges(node):
            if out in plotted:
                p2 = vectors[out]
                plt.plot([p1[0], p2[0]], [p1[1], p2[1]],
                         color="#111111",
                         alpha=0.05)

    # join(out_dir, "") guarantees exactly one trailing separator, so the
    # default directory produces the same path as plain concatenation did.
    target = os.path.join(out_dir, "") + fname + ".png"
    plt.savefig(target, dpi=96, facecolor="#e0e0e0")
    plt.close()
# Week-range helper borrowed from https://stackoverflow.com/questions/56982035/generate-list-of-list-for-start-and-end-date-of-week-from-date-range
import calendar
from datetime import datetime, timedelta
def get_all_weeks(start_date, end_date):
    """Yield the Monday-to-Sunday weeks covering a date range.

    The first week is the one containing ``start_date``. Weeks are produced
    for as long as their Monday is not later than ``end_date``.

    Args:
        start_date: First day of the range as a ``YYYY-MM-DD`` string.
        end_date: Last day of the range as a ``YYYY-MM-DD`` string.

    Yields:
        Two-element lists ``[monday, sunday]`` of ``YYYY-MM-DD`` strings.
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d').date()
    last_day = datetime.strptime(end_date, '%Y-%m-%d').date()
    # Step back to the Monday on or before the first day
    # (weekday() is 0 for Monday).
    week_start = first_day - timedelta(days=first_day.weekday())
    while week_start <= last_day:
        week_end = week_start + timedelta(days=6)
        yield [week_start.strftime('%Y-%m-%d'), week_end.strftime('%Y-%m-%d')]
        week_start += timedelta(days=7)
def animation(model, g, cord, start='2019-01-01', end='2020-05-01', key='i'):
    """Render one cumulative plot per week between ``start`` and ``end``.

    For every week returned by ``get_all_weeks(start, end)``, the rows
    published after ``start`` and up to that week's last day are selected.
    Their ``key`` column is split on ``"; "`` into token lists and passed
    to ``visualize``. Frames are named ``<key>_<index>``, counting from 0.

    Args:
        model: Embedding model forwarded to ``visualize``.
        g: Graph forwarded to ``visualize``.
        cord: Row records accepted by ``pd.DataFrame``; must provide a
            ``published`` column and the ``key`` column.
        start: Lower date bound (exclusive) as ``YYYY-MM-DD``.
        end: Date up to which weeks are generated, as ``YYYY-MM-DD``.
        key: Column holding ``"; "``-separated tokens.
    """
    frame = pd.DataFrame(cord)
    frame['date'] = pd.to_datetime(frame['published'])
    weeks = list(get_all_weeks(start, end))
    # The lower bound is the same for every frame, so compute it once.
    after_start = frame['date'] > start
    for idx, (_, week_end) in enumerate(weeks):
        selected = frame.loc[after_start & (frame['date'] <= week_end)]
        lines = [[token for token in row.split("; ") if len(token)]
                 for row in selected[key]]
        visualize(model, g, lines, key + "_" + str(idx))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment