#!/usr/bin/python
# The usual preamble
import numpy as np
import pandas as pd
import time
import argparse
import json
import itertools
def jlower(jval):
    """Lower-case every name in a JSON-encoded list of author strings.

    Returns the re-serialized JSON list, or "[]" when the input cannot be
    parsed as a JSON list of strings (e.g. a NaN float read from the CSV).
    """
    try:
        return json.dumps([name.lower() for name in json.loads(jval)])
    except (TypeError, ValueError, AttributeError):
        # TypeError/ValueError: jval is not parseable JSON (NaN, bad text);
        # AttributeError: a list element is not a string.
        return "[]"
def removeshortwords(name):
    """Drop every space-separated word of two characters or fewer from *name*."""
    kept = []
    for word in name.split(' '):
        if len(word) > 2:
            kept.append(word)
    return " ".join(kept)
def jremoveshortwords(jval):
    """Apply removeshortwords to each name in a JSON-encoded list of strings.

    Returns "[]" when the input is not a valid JSON list of strings.
    """
    try:
        return json.dumps([removeshortwords(name) for name in json.loads(jval)])
    except (TypeError, ValueError, AttributeError):
        # Non-string input (NaN), invalid JSON, or non-string list element.
        return "[]"
def sortname(name):
    """Alphabetically order the space-separated words within a single name."""
    words = name.split(' ')
    words.sort()
    return " ".join(words)
def jsortvalue(jval):
    """Sort the words inside each name of a JSON-encoded list of strings.

    Returns "[]" when the input is not a valid JSON list of strings.
    """
    try:
        return json.dumps([sortname(name) for name in json.loads(jval)])
    except (TypeError, ValueError, AttributeError):
        # Non-string input (NaN), invalid JSON, or non-string list element.
        return "[]"
def jsortallauthors(jval):
    """Sort a JSON-encoded list of author names as a whole.

    Returns "[]" when the input is not valid JSON or its elements are not
    mutually comparable.
    """
    try:
        return json.dumps(sorted(json.loads(jval)))
    except (TypeError, ValueError):
        # TypeError also covers sorting a list of mixed, unorderable types.
        return "[]"
def jfusedudfs(jval):
    """Fused author-list pipeline: lower-case, drop short words, sort words
    within each name, then sort the whole list.

    Equivalent to applying jlower, jremoveshortwords, jsortvalue and
    jsortallauthors in sequence, but with a single JSON decode/encode.
    Returns "[]" on any unparseable or non-string input.
    """
    try:
        names = [sortname(removeshortwords(name.lower()))
                 for name in json.loads(jval)]
        return json.dumps(sorted(names))
    except (TypeError, ValueError, AttributeError):
        return "[]"
def extractpid(project):
    """Return the numeric id from a 'funder::class::id' project string.

    Returns -1 when the value is not a string, the id segment is missing,
    or it is not an integer.
    """
    try:
        return int(project.split("::")[2])
    except (AttributeError, IndexError, ValueError):
        return -1
def extractpclass(project):
    """Return the class segment of a 'funder::class::id' project string.

    Malformed values (no '::' separator, or a non-string such as NaN)
    yield "" instead of raising, so a single bad row cannot abort the
    whole column .apply() — matching extractpid's sentinel behavior.
    """
    try:
        return project.split("::")[1]
    except (AttributeError, IndexError):
        return ""
def extractpfunder(project):
    """Return the funder segment of a 'funder::class::id' project string.

    When no '::' separator is present, the whole string is returned.
    """
    funder, _sep, _rest = project.partition("::")
    return funder
def cleandate(pubdate):
    """Rewrite a 'YYYY-MM-DD' date string as 'YYYY/MM/DD'.

    Only the first three dash-separated fields are used; returns "" when
    the value is not a string (e.g. NaN) or has fewer than three fields.
    """
    try:
        year, month, day = pubdate.split("-")[:3]
        return f"{year}/{month}/{day}"
    except (AttributeError, ValueError):
        # AttributeError: not a string; ValueError: fewer than three parts.
        return ""
# --- Script driver ----------------------------------------------------------
# Load the publications CSV, normalize the JSON author lists, split the
# project field, explode each publication into co-author pairs, self-join,
# and bucket pairs relative to each project's start/end window.
parser = argparse.ArgumentParser(description="DemoVldb")
parser.add_argument(
    "--path",
    type=str,
    dest="data_path",
)
args = parser.parse_args()

tstart = time.time()
publications = pd.read_csv(args.data_path)
print("Done reading input file...")
load_time = time.time() - tstart
print('Load time: ', load_time)

# Four-pass normalization of the JSON-encoded author lists.
publications['authorlist'] = publications['authorlist'].apply(jlower)
publications['authorlist'] = publications['authorlist'].apply(jremoveshortwords)
publications['authorlist'] = publications['authorlist'].apply(jsortvalue)
publications['authorlist'] = publications['authorlist'].apply(jsortallauthors)

# Split 'funder::class::id' into three columns.
publications['funder'] = publications['project'].apply(extractpfunder)
publications['class'] = publications['project'].apply(extractpclass)
publications['projectid'] = publications['project'].apply(extractpid)

# One row per unordered pair of co-authors of each publication.
pair_columns = ['pubid', 'pubdate', 'funder', 'class', 'projectid',
                'projectstart', 'projectend', 'authorpair']
data_to_append = []
for _, row in publications.iterrows():
    try:
        name_list = json.loads(row['authorlist'])
        for first, second in itertools.combinations(name_list, 2):
            data_to_append.append({
                # BUG FIX: the key was 'id', which left the declared
                # 'pubid' column empty and added a stray 'id' column.
                'pubid': row['id'],
                'pubdate': row['date'],
                'funder': row['funder'],
                'class': row['class'],
                'projectid': row['projectid'],
                'projectstart': row['startdate'],
                'projectend': row['enddate'],
                'authorpair': json.dumps([first, second]),
            })
    except (TypeError, ValueError, KeyError):
        # Best-effort: skip rows with an unparseable author list or a
        # missing field instead of aborting the whole run.
        continue

# DataFrame.append was removed in pandas 2.0; build the frame directly
# from the collected records (equivalent and faster).
pairs = pd.DataFrame(data_to_append, columns=pair_columns)
projectpairs = pairs.copy()
# Index-aligned self-join; overlapping left-hand columns get a '_' suffix,
# so the unsuffixed names below refer to the right-hand copy.
joined = pairs.join(projectpairs, how='inner', lsuffix='_')

joined['pubdate'] = joined['pubdate'].apply(cleandate)
joined['projectstart'] = joined['projectstart'].apply(cleandate)
joined['projectend'] = joined['projectend'].apply(cleandate)

joined_grouped_df = pd.DataFrame()
joined_grouped_df['projectid'] = joined['projectid'].copy()
joined_grouped_df['authors_before'] = np.where(joined['pubdate'] < joined['projectstart'], 1, 0)
# BUG FIX: "during" requires BOTH bounds to hold; the original used '|',
# which is true for virtually every row.
joined_grouped_df['authors_during'] = np.where(
    (joined['projectstart'] <= joined['pubdate']) & (joined['pubdate'] <= joined['projectend']), 1, 0)
joined_grouped_df['authors_after'] = np.where(joined['pubdate'] > joined['projectend'], 1, 0)

tend = time.time()
print(f'Total exec time: {(tend - tstart) * 1000}ms')