Untitled
unknown
python
2 months ago
4.3 kB
2
Indexable
Never
#!/usr/bin/python
"""DemoVldb: derive co-author pairs from a publications CSV and time the run.

The script reads the CSV given by ``--path``, normalises the JSON-encoded
``authorlist`` column, splits the ``project`` column
("<funder>::<class>::<id>") into three columns, expands every publication
into one row per unordered author pair, and flags each pair as published
before, during or after the project period.  Only the load time and the
total execution time are printed; the computed frames are not written out.

Columns read from the CSV: ``authorlist``, ``project``, ``id``, ``date``,
``startdate`` and ``enddate``.
"""

# The usual preamble
import argparse
import itertools
import json
import time

import numpy as np
import pandas as pd

# Failures that mean "this cell is not a JSON list of name strings":
# malformed JSON (ValueError), a non-string cell such as NaN or a non-iterable
# JSON value (TypeError), or list elements that are not strings
# (AttributeError).
_BAD_AUTHORLIST_ERRORS = (TypeError, ValueError, AttributeError)

# Column layout of the author-pair frame.
PAIR_COLUMNS = [
    'pubid',
    'pubdate',
    'funder',
    'class',
    'projectid',
    'projectstart',
    'projectend',
    'authorpair',
]


def jlower(jval):
    """Lower-case every name in a JSON-encoded list; return "[]" on bad input."""
    try:
        return json.dumps([name.lower() for name in json.loads(jval)])
    except _BAD_AUTHORLIST_ERRORS:
        return "[]"


def removeshortwords(name):
    """Drop the space-separated words of *name* that have 2 or fewer characters."""
    return " ".join([word for word in name.split(' ') if len(word) > 2])


def jremoveshortwords(jval):
    """Apply ``removeshortwords`` to each name of a JSON list; "[]" on bad input."""
    try:
        return json.dumps([removeshortwords(name) for name in json.loads(jval)])
    except _BAD_AUTHORLIST_ERRORS:
        return "[]"


def sortname(name):
    """Return *name* with its space-separated words in sorted order."""
    return " ".join(sorted(name.split(' ')))


def jsortvalue(jval):
    """Apply ``sortname`` to each name of a JSON list; "[]" on bad input."""
    try:
        return json.dumps([sortname(name) for name in json.loads(jval)])
    except _BAD_AUTHORLIST_ERRORS:
        return "[]"


def jsortallauthors(jval):
    """Sort the elements of a JSON list; return "[]" on bad input."""
    try:
        return json.dumps(sorted(json.loads(jval)))
    except _BAD_AUTHORLIST_ERRORS:
        return "[]"


def jfusedudfs(jval):
    """Run lower-casing, short-word removal and both sorts in a single pass.

    Produces the same result as chaining ``jlower``, ``jremoveshortwords``,
    ``jsortvalue`` and ``jsortallauthors``; returns "[]" on bad input.
    """
    try:
        names = [name.lower() for name in json.loads(jval)]
        names = [removeshortwords(name) for name in names]
        names = [sortname(name) for name in names]
        return json.dumps(sorted(names))
    except _BAD_AUTHORLIST_ERRORS:
        return "[]"


def extractpid(project):
    """Return the third "::"-separated field of *project* as an int, or -1.

    -1 is returned when *project* is not a string, has fewer than three
    fields, or the third field is not an integer literal.
    """
    try:
        return int(project.split("::")[2])
    except (AttributeError, IndexError, ValueError):
        return -1


def extractpclass(project):
    """Return the second "::"-separated field of *project*.

    Raises IndexError when *project* contains no "::" separator.
    """
    return project.split("::")[1]


def extractpfunder(project):
    """Return the first "::"-separated field of *project*."""
    return project.split("::")[0]


def cleandate(pubdate):
    """Convert a "Y-M-D" date string to "Y/M/D"; return "" if that fails.

    Only the first three "-"-separated fields are used.  Non-string input or
    fewer than three fields yields the empty string.
    """
    try:
        parts = pubdate.split("-")
        return parts[0] + "/" + parts[1] + "/" + parts[2]
    except (AttributeError, IndexError):
        return ""


def build_author_pairs(publications):
    """Expand each publication into one row per unordered pair of authors.

    *publications* must provide the columns ``id``, ``date``, ``funder``,
    ``class``, ``projectid``, ``startdate``, ``enddate`` and a JSON-encoded
    ``authorlist``.  Rows that cannot be processed are skipped (best effort).
    The result has the columns listed in ``PAIR_COLUMNS``; publications with
    fewer than two authors contribute no rows.
    """
    records = []
    for _, row in publications.iterrows():
        try:
            name_list = json.loads(row['authorlist'])
            # Per-publication fields are the same for every pair of the row,
            # so look them up once instead of once per pair.
            base = {
                'pubid': row['id'],
                'pubdate': row['date'],
                'funder': row['funder'],
                'class': row['class'],
                'projectid': row['projectid'],
                'projectstart': row['startdate'],
                'projectend': row['enddate'],
            }
            for first, second in itertools.combinations(name_list, 2):
                records.append({**base, 'authorpair': json.dumps([first, second])})
        except (KeyError, TypeError, ValueError):
            # Best effort: a row with an unusable author list or a missing
            # field contributes no pairs.
            continue
    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    return pd.DataFrame(records, columns=PAIR_COLUMNS)


def classify_publication_timing(pairs):
    """Flag each author pair as published before, during or after the project.

    Returns a frame with ``projectid`` plus the 0/1 columns
    ``authors_before``, ``authors_during`` and ``authors_after``.  Dates are
    normalised with ``cleandate`` and compared as strings, so the ordering is
    chronological only for zero-padded year-month-day values.
    """
    # Self-join on the row index: the left copy's columns get a "_" suffix and
    # the unsuffixed columns used below come from the right copy.
    # NOTE(review): presumably a stand-in for joining a separate projects
    # table - confirm.
    projectpairs = pairs.copy()
    joined = pairs.join(projectpairs, how='inner', lsuffix='_')

    for column in ('pubdate', 'projectstart', 'projectend'):
        joined[column] = joined[column].apply(cleandate)

    published = joined['pubdate']
    started = joined['projectstart']
    ended = joined['projectend']

    timing = pd.DataFrame()
    timing['projectid'] = joined['projectid'].copy()
    timing['authors_before'] = np.where(published < started, 1, 0)
    # Both bounds must hold; with "|" nearly every row counted as "during".
    timing['authors_during'] = np.where((started <= published) & (published <= ended), 1, 0)
    timing['authors_after'] = np.where(published > ended, 1, 0)
    return timing


def main():
    """Parse ``--path``, run the pipeline on that CSV and print the timings."""
    parser = argparse.ArgumentParser(description="DemoVldb")
    parser.add_argument(
        "--path",
        type=str,
        dest="data_path",
        required=True,
        help="path to the publications CSV file",
    )
    args = parser.parse_args()

    tstart = time.time()
    publications = pd.read_csv(args.data_path)
    print("Done reading input file...")
    load_time = time.time() - tstart
    print('Load time: ', load_time)

    # The author list is normalised in four separate passes; jfusedudfs is
    # the single-pass equivalent.
    publications['authorlist'] = publications['authorlist'].apply(jlower)
    publications['authorlist'] = publications['authorlist'].apply(jremoveshortwords)
    publications['authorlist'] = publications['authorlist'].apply(jsortvalue)
    publications['authorlist'] = publications['authorlist'].apply(jsortallauthors)

    publications['funder'] = publications['project'].apply(extractpfunder)
    publications['class'] = publications['project'].apply(extractpclass)
    publications['projectid'] = publications['project'].apply(extractpid)

    pairs = build_author_pairs(publications)
    # The result is computed for timing purposes only; it is not written out.
    classify_publication_timing(pairs)

    tend = time.time()
    print(f'Total exec time: {(tend - tstart) * 1000}ms')


if __name__ == "__main__":
    main()