Last active
August 29, 2015 14:13
-
-
Save glouppe/1b90984e2d8168ca9de4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
sys.path.append("/usr/lib/python2.7/dist-packages/") | |
sys.path.append("/usr/local/lib/python2.7/dist-packages/") | |
import string | |
import re | |
from joblib import Parallel, delayed | |
from invenio.dbquery import run_sql | |
from invenio.bibauthorid_dbinterface import get_title_of_paper | |
from invenio.bibauthorid_dbinterface import get_authors_of_paper | |
from invenio.bibauthorid_dbinterface import get_keywords_for_paper | |
from invenio.bibauthorid_dbinterface import get_collaborations_for_paper | |
from invenio.bibrank_citation_searcher import get_refers_to | |
from invenio.bibrank_citation_searcher import get_cited_by | |
from invenio.search_engine_utils import get_fieldvalues | |
def get_affiliation(table, bibref_value, bibrec):
    """Return the institution name and field number of a signature.

    The first query looks, inside record `bibrec`, for a value tagged
    ``<table>__u`` that shares its field number with the row `bibref_value`
    of the ``bib<NN>x`` table. When nothing matches, a second query fetches
    only the field number linking `bibref_value` to `bibrec`.

    Args:
        table: field prefix of the signature (e.g. a value of the
            ``bibref_table`` column); its first two characters select the
            ``bib<NN>x`` / ``bibrec_bib<NN>x`` tables.
        bibref_value: id of the row in ``bib<NN>x``.
        bibrec: id of the record.

    Returns:
        A two-item row ``(institution, field_number)`` when an affiliation
        is found, ``(None, field_number)`` when only the field number is
        known, and ``(None, None)`` otherwise.
    """
    table_name = str(table)[0:2] + 'x'
    # Table names cannot be bound as SQL parameters, so they are formatted
    # into the statement; every value goes through run_sql's param tuple.
    names = {'bib': 'bib' + table_name,
             'link': 'bibrec_bib' + table_name}

    rows = run_sql("""SELECT f2.value, r.field_number
                      FROM bibrec AS b
                      INNER JOIN %(link)s AS r ON (r.id_bibrec = b.id)
                      INNER JOIN %(bib)s AS f ON (r.id_bibxxx = f.id)
                      INNER JOIN %(link)s AS r2 ON (r2.id_bibrec = b.id AND
                                                    r.field_number = r2.field_number)
                      INNER JOIN %(bib)s AS f2 ON (r2.id_bibxxx = f2.id)
                      WHERE b.id = %%s AND
                            f.id = %%s AND
                            f2.tag = %%s
                   """ % names,
                   (bibrec, bibref_value, '%s__u' % table))
    if rows:
        return rows[0]

    # No affiliation stored: still report where the signature sits.
    rows = run_sql("""SELECT field_number
                      FROM %(bib)s, %(link)s
                      WHERE %(bib)s.id = %(link)s.id_bibxxx AND
                            %(bib)s.id = %%s AND
                            %(link)s.id_bibrec = %%s
                   """ % names,
                   (bibref_value, bibrec))
    if rows:
        return None, rows[0][0]

    return None, None
def _getter_sig(i, signature):
    """Build the data dictionary describing one signature.

    `signature` is indexed positionally: items 1, 2 and 3 are handed to
    `get_affiliation` (as table, bibref value and record id), item 3 is also
    stored as the publication id, and item 4 is stored as the author name.
    `i` becomes the ``signature_id`` of the result.
    """
    bibref_table = signature[1]
    bibref_value = signature[2]
    bibrec = signature[3]
    affiliation, position = get_affiliation(bibref_table, bibref_value, bibrec)

    record = {}
    record['signature_id'] = i
    record['author_name'] = signature[4]
    record['publication_id'] = bibrec
    record['author_affiliation'] = affiliation
    record['signature_position'] = position
    return record
def extract_signature_data(signatures, n_jobs=1):
    """Run `_getter_sig` over every ``(id, signature)`` entry via joblib.

    `signatures` is an iterable of ``(id, signature)`` pairs; `n_jobs` is
    forwarded to `joblib.Parallel`. Returns whatever the `Parallel` call
    returns for the submitted `_getter_sig` tasks.
    """
    runner = Parallel(n_jobs=n_jobs, verbose=3)
    tasks = (delayed(_getter_sig)(sig_id, sig) for sig_id, sig in signatures)
    return runner(tasks)
def get_year(recid):
    """Return the first four-digit number found in the record's date fields.

    The tags are tried in a fixed order. A tag is considered only when
    `get_fieldvalues` yields exactly one value for it, and it is accepted
    only when that value contains four consecutive digits. Returns the
    digits as an int, or None when no tag qualifies.
    """
    candidate_tags = ("773__y", "260__c", "269__c", "909C4y", "925__a")
    for tag in candidate_tags:
        values = get_fieldvalues([recid], tag)
        if len(values) != 1:
            continue
        found = re.search(r"\d\d\d\d", values[0])
        if found is None:
            continue
        return int(found.group())
    return None
# Template for sampling pairs of signatures through a self-join. Each result
# row is a 12-tuple: six columns for the first signature, six for the second.
_PAIR_QUERY = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
                        a1.bibrec, a1.name, a1.flag,
                        a2.personid, a2.bibref_table, a2.bibref_value,
                        a2.bibrec, a2.name, a2.flag
                 FROM aidPERSONIDPAPERS as a1
                 INNER JOIN aidPERSONIDPAPERS as a2 ON %(join)s
                 WHERE %(where)s
                       AND a1.name LIKE '%(letter)s%%'
                 ORDER BY RAND()
                 LIMIT %(limit)d"""

# Template for sampling single signatures (six columns per row).
_SINGLE_QUERY = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
                          a1.bibrec, a1.name, a1.flag
                   FROM aidPERSONIDPAPERS as a1
                   WHERE %(where)s
                   ORDER BY RAND()
                   LIMIT %(limit)d"""


def _sample_joined_pairs(join, where, limit):
    """Sample up to `limit` joined signature pairs per initial letter.

    Runs `_PAIR_QUERY` once for each lowercase ASCII letter (matched against
    the start of the first signature's name) and returns all rows in letter
    order. Each query is printed before it is executed.
    """
    pairs = []
    for letter in string.ascii_lowercase:
        query = _PAIR_QUERY % {"join": join, "where": where,
                               "letter": letter, "limit": limit}
        print(query)
        pairs.extend(run_sql(query))
    return pairs


def _sample_unrelated_pairs(limit):
    """Sample pairs whose person ids and names both differ.

    For each lowercase ASCII letter, about sqrt(`limit`) signatures whose
    name starts with that letter are crossed with about sqrt(`limit`)
    signatures drawn from the whole table; pairs sharing a person id or a
    name are dropped.
    """
    side = int(limit ** 0.5)
    query2 = _SINGLE_QUERY % {"where": "a1.flag = 2", "limit": side}
    pairs = []
    for letter in string.ascii_lowercase:
        query1 = _SINGLE_QUERY % {
            "where": "a1.flag = 2 AND a1.name LIKE '%s%%'" % letter,
            "limit": side,
        }
        print(query1)
        print(query2)
        for p1 in run_sql(query1):
            # NOTE(review): query2 is executed again for every p1 row, so
            # each p1 is crossed with a fresh random sample. Running it once
            # per letter would be cheaper but changes the sampling; confirm
            # before changing.
            for p2 in run_sql(query2):
                if p1[0] != p2[0] and p1[4] != p2[4]:
                    pairs.append(p1 + p2)
    return pairs


def _index_signatures(pairs):
    """Assign consecutive ids to the distinct signatures found in `pairs`.

    A signature is identified by columns 1-3 of its six-column row
    (bibref_table, bibref_value, bibrec). Returns ``(mapping, X)`` where
    `mapping` maps that key to ``(id, first_seen_row)`` and `X` holds one
    ``(id_of_first, id_of_second)`` tuple per input pair, in input order.
    """
    mapping = {}
    X = []
    for pair in pairs:
        ids = []
        for signature in (pair[0:6], pair[6:12]):
            key = signature[1:4]
            if key not in mapping:
                # Ids are handed out in order of first appearance.
                mapping[key] = (len(mapping), signature)
            ids.append(mapping[key][0])
        X.append((ids[0], ids[1]))
    return mapping, X


if __name__ == "__main__":
    # Usage: <script> LIMIT OUTPUT_FILE
    # Check both arguments up front so a missing output path is reported
    # before the slow database work rather than at the final dump.
    if len(sys.argv) < 3:
        sys.exit("usage: %s LIMIT OUTPUT_FILE" % sys.argv[0])
    LIMIT = int(sys.argv[1])
    output_path = sys.argv[2]

    same_person = "a1.personid = a2.personid"
    distinct_papers = "a1.bibrec <> a2.bibrec AND a1.flag = 2 AND a2.flag = 2"

    # all_y[k] is the label of all_pairs[k]: 0.0 for pairs attached to the
    # same person, 1.0 for pairs of different persons and for rejected pairs.
    all_pairs = []
    all_y = []

    # Same author, same names
    pairs = _sample_joined_pairs(same_person,
                                 "a1.name = a2.name AND " + distinct_papers,
                                 LIMIT)
    all_pairs.extend(pairs)
    all_y.extend([0.0] * len(pairs))

    # Same author, different names
    pairs = _sample_joined_pairs(same_person,
                                 "a1.name <> a2.name AND " + distinct_papers,
                                 LIMIT)
    all_pairs.extend(pairs)
    all_y.extend([0.0] * len(pairs))

    # Different authors, same name
    pairs = _sample_joined_pairs("a1.name = a2.name",
                                 "a1.personid <> a2.personid AND " + distinct_papers,
                                 LIMIT)
    all_pairs.extend(pairs)
    all_y.extend([1.0] * len(pairs))

    # Different authors, different names
    pairs = _sample_unrelated_pairs(LIMIT)
    all_pairs.extend(pairs)
    all_y.extend([1.0] * len(pairs))

    # Rejected
    pairs = _sample_joined_pairs(same_person,
                                 "a1.flag = 2 AND a2.flag = -2",
                                 LIMIT)
    all_pairs.extend(pairs)
    all_y.extend([1.0] * len(pairs))

    # Assign IDs to signatures
    signature_id_mapping, X = _index_signatures(all_pairs)

    # Extract signature data
    signature_data = extract_signature_data(signature_id_mapping.values(), n_jobs=-1)
    signature_data = sorted(signature_data, key=lambda x: x["signature_id"])

    # Extract record data (one entry per distinct record id)
    records = {}
    for i, signature in enumerate(signature_id_mapping):
        if i % 1000 == 0:
            print(i)
        recid = signature[2]  # keys are (bibref_table, bibref_value, bibrec)
        if recid not in records:
            records[recid] = {'publication_id': recid,
                              'title': get_title_of_paper(recid),
                              'authors': get_authors_of_paper(recid),
                              'references': get_refers_to(recid),
                              'citations': get_cited_by(recid),
                              'year': get_year(recid),
                              'keywords': get_keywords_for_paper(recid),
                              'collaborations': get_collaborations_for_paper(recid)}
    record_data = sorted(records.values(), key=lambda x: x["publication_id"])

    # Dump all
    import cPickle
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; the with-block also guarantees it is flushed and closed.
    with open(output_path, "wb") as output_file:
        cPickle.dump((X, all_y, signature_data, record_data),
                     output_file,
                     protocol=cPickle.HIGHEST_PROTOCOL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment