Created
April 29, 2020 18:09
-
-
Save joelkuiper/15a9d064d36ab1f81d1b3fbad904754f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models.poincare import PoincareModel, PoincareRelations | |
import logging | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import csv | |
import itertools | |
import networkx as nx | |
from gensim.models import TfidfModel | |
from gensim.corpora import Dictionary | |
import pandas as pd | |
# Send INFO-level log records (and above) to the root handler, prefixed with
# a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def load(p="data/2d_poincare"):
    """Load a previously saved Poincaré model.

    Args:
        p: Path handed to ``PoincareModel.load``.

    Returns:
        Whatever ``PoincareModel.load`` returns for that path.
    """
    model = PoincareModel.load(p)
    return model
def train(file_path="data/sui_graph.tsv", epochs=50, size=2, out_path=None):
    """Train a Poincaré embedding on a relations file and save it to disk.

    Args:
        file_path: Relations file wrapped in ``PoincareRelations``.
        epochs: Number of training epochs.
        size: Embedding dimensionality. Defaults to 2, which is what
            ``visualize`` requires.
        out_path: Destination for the saved model. When ``None`` the
            historical location ``"data/2d_poincare_e<epochs>"`` is used.

    Returns:
        The trained ``PoincareModel``, so callers do not have to reload it.
    """
    if out_path is None:
        out_path = "data/2d_poincare_e" + str(epochs)
    model = PoincareModel(PoincareRelations(file_path), size=size)
    model.train(epochs=epochs)
    model.save(out_path)
    return model
def read_cord(path):
    """Read a comma-separated metadata file into a list of row dicts.

    Args:
        path: Path to a CSV file whose first line is the header.

    Returns:
        A list with one mapping per data row, keyed by the header names
        (as produced by ``csv.DictReader``).
    """
    # newline='' lets the csv module do its own line-ending handling, so line
    # breaks embedded in quoted fields come through unmodified. The encoding
    # is pinned so the result does not depend on the platform default.
    with open(path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='\"')
        return list(reader)
def unique(a):
    """Return the items of ``a`` without duplicates, in their original order.

    Items are compared by sorting, not hashing, so unhashable items such as
    lists are accepted as long as they can be ordered against each other.
    For each group of equal items, the one at the lowest index is kept.
    """
    # Stable sort of the positions by value: equal items become adjacent and
    # the earliest position leads each run.
    by_value = sorted(range(len(a)), key=lambda pos: a[pos])
    keep = set()
    run_head = None
    for pos in by_value:
        if run_head is None or not (a[run_head] == a[pos]):
            run_head = pos
            keep.add(pos)
    return [item for pos, item in enumerate(a) if pos in keep]
def parse_graph(finput):
    '''
    Parses a CSV file with SUI,SUI into a list of rows.

    The first line is treated as a header and dropped. Each remaining
    non-blank line is split on commas, and surrounding double quotes are
    stripped from every cell. Blank lines are skipped, so a stray empty line
    does not produce a one-cell row that callers cannot unpack as a pair.
    '''
    rows = []
    with open(finput) as f:
        next(f, None)  # Skip header (a no-op on an empty file)
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            rows.append([cell.strip('\"') for cell in raw.split(",")])
    return rows
def construct_graph(graph_file):
    """Build a directed graph from a (child, parent) edge-list file.

    Rows are read with ``parse_graph`` and de-duplicated with ``unique``.
    Every row then becomes one edge pointing from the parent to the child.
    """
    # The mapping step can emit the same pair more than once.
    pairs = unique(parse_graph(graph_file))
    graph = nx.DiGraph()
    graph.add_edges_from((parent, child) for child, parent in pairs)
    return graph
def visualize(model, g, lines, fname,
              out_dir="/home/joelkuiper/Windows/Desktop/cor/"):
    """Plot the embedded tokens of ``lines`` and their graph edges to a PNG.

    Every token in ``lines`` that has a vector in ``model.kv`` is drawn as a
    dot sized by a TF-IDF weight computed over ``lines``. For each drawn
    token, its outgoing edges in ``g`` are drawn as faint line segments when
    the target token is drawn too.

    Args:
        model: Trained model exposing 2-d keyed vectors as ``model.kv``.
        g: Directed graph providing ``out_edges(node)``.
        lines: List of token lists (one list per document).
        fname: Output file name without extension.
        out_dir: Directory the PNG is written to. Defaults to the location
            that used to be hard-coded.

    Raises:
        ValueError: If the embedding is not two-dimensional.
    """
    import os  # function-scope: the module does not import os at the top

    vectors = model.kv
    # `vectors.vectors` is the embedding matrix; `syn0` is its legacy name.
    if vectors.vectors.shape[1] != 2:
        raise ValueError('can only plot 2-d vectors')

    dct = Dictionary(lines)
    bow = [dct.doc2bow(line) for line in lines]
    tfidf = TfidfModel(bow, smartirs='dtu')
    # One weight per token; when a token occurs in several documents, the
    # weight from the last document wins.
    weights = {dct.get(token_id): value
               for doc in tfidf[bow]
               for token_id, value in doc}

    nodes = [token for token in dct.token2id if token in vectors]
    drawn = set(nodes)

    # Cartesian coordinates; reshape keeps the array 2-d when `nodes` is
    # empty so the column slices below still work.
    points = np.array([vectors[node] for node in nodes]).reshape(-1, 2)
    x = points[:, 0]
    y = points[:, 1]

    fig = plt.gcf()
    fig.set_size_inches(30, 30)
    ax = fig.gca()
    ax.set_aspect(1)
    plt.axis('off')
    plt.xlim(-1.0, 1.0)
    plt.ylim(-1.0, 1.0)

    # The TF-IDF transform can omit terms whose weight is negligible (for
    # example a token present in every document), so fall back to size 0
    # instead of raising KeyError.
    sizes = [weights.get(node, 0.0) for node in nodes]
    plt.scatter(x, y, s=sizes, c="#111111")

    # Add edges
    for node in nodes:
        p1 = vectors[node]
        for _, target in g.out_edges(node):
            if target in drawn:
                p2 = vectors[target]
                plt.plot([p1[0], p2[0]], [p1[1], p2[1]],
                         color="#111111",
                         alpha=0.05)

    plt.savefig(os.path.join(out_dir, fname + ".png"),
                dpi=96, facecolor="#e0e0e0")
    plt.close()
# Week-range helper adapted from https://stackoverflow.com/questions/56982035/generate-list-of-list-for-start-and-end-date-of-week-from-date-range
import calendar | |
from datetime import datetime, timedelta | |
def get_all_weeks(start_date, end_date):
    """Yield ``[monday, sunday]`` date-string pairs for consecutive weeks.

    Both arguments are ``'%Y-%m-%d'`` strings. The first week yielded is the
    Monday-to-Sunday week containing ``start_date``; weeks keep coming while
    their Monday is not after ``end_date``. Both entries of each pair are
    formatted as ``'%Y-%m-%d'``.
    """
    fmt = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, fmt).date()
    last_day = datetime.strptime(end_date, fmt).date()

    # Monday of the week that contains the start date.
    monday = first_day - timedelta(days=first_day.weekday())
    while monday <= last_day:
        sunday = monday + timedelta(days=6)
        yield [monday.strftime(fmt), sunday.strftime(fmt)]
        monday = monday + timedelta(days=7)
def animation(model, g, cord, start='2019-01-01', end='2020-05-01', key='i'):
    """Render one cumulative frame per week between ``start`` and ``end``.

    ``cord`` is turned into a DataFrame and its ``published`` column parsed
    as dates. For each week from ``get_all_weeks``, the rows dated after
    ``start`` and up to that week's last day are selected. Their ``key``
    column is split on ``"; "`` into token lists (empty tokens dropped) and
    passed to ``visualize``. Frames are named ``<key>_<n>``, counting from 0.
    """
    frame = pd.DataFrame(cord)
    frame['date'] = pd.to_datetime(frame['published'])

    week_ranges = list(get_all_weeks(start, end))
    for frame_no, (_, week_end) in enumerate(week_ranges):
        # The window always opens at `start`, so each frame adds to the last.
        in_window = (frame['date'] > start) & (frame['date'] <= week_end)
        cells = frame.loc[in_window][key]
        lines = [[token for token in cell.split("; ") if len(token)]
                 for cell in cells]
        visualize(model, g, lines, key + "_" + str(frame_no))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment