# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 4.3 kB
# 5
# Indexable
#!/usr/bin/python

# The usual preamble
import numpy as np
import pandas as pd
import time
import argparse
import json
import itertools

def jlower(jval):
    """Lowercase every name in a JSON-encoded list of names.

    Args:
        jval: JSON text expected to decode to a list of strings.

    Returns:
        JSON text of the lowercased names, or ``"[]"`` when ``jval`` cannot
        be decoded, is not iterable, or contains a non-string element.
    """
    try:
        names = json.loads(jval)
        lowered = [name.lower() for name in names]
    # TypeError: jval is not str/bytes (e.g. a NaN cell) or decodes to a
    # non-iterable; ValueError: malformed JSON (JSONDecodeError subclasses
    # it); AttributeError: an element has no .lower().
    except (TypeError, ValueError, AttributeError):
        return "[]"
    return json.dumps(lowered)

def removeshortwords(name):
    """Drop every space-separated word of ``name`` shorter than 3 characters.

    The string is split on single spaces, so consecutive spaces yield empty
    words, which are dropped as well.
    """
    kept = []
    for token in name.split(' '):
        if len(token) > 2:
            kept.append(token)
    return " ".join(kept)

def jremoveshortwords(jval):
    """Apply ``removeshortwords`` to every name in a JSON-encoded list.

    Args:
        jval: JSON text expected to decode to a list of strings.

    Returns:
        JSON text of the filtered names, or ``"[]"`` when ``jval`` cannot be
        decoded, is not iterable, or contains a non-string element.
    """
    try:
        names = json.loads(jval)
        filtered = [removeshortwords(name) for name in names]
    # TypeError: jval is not str/bytes or decodes to a non-iterable;
    # ValueError: malformed JSON; AttributeError: an element has no .split().
    except (TypeError, ValueError, AttributeError):
        return "[]"
    return json.dumps(filtered)


def sortname(name):
    """Return ``name`` with its space-separated words in sorted order.

    Splitting is on single spaces, so empty words produced by consecutive
    spaces are kept and sort to the front.
    """
    words = name.split(' ')
    words.sort()
    return " ".join(words)

def jsortvalue(jval):
    """Apply ``sortname`` to every name in a JSON-encoded list.

    Args:
        jval: JSON text expected to decode to a list of strings.

    Returns:
        JSON text of the names with their words sorted, or ``"[]"`` when
        ``jval`` cannot be decoded, is not iterable, or contains a
        non-string element.
    """
    try:
        names = json.loads(jval)
        reordered = [sortname(name) for name in names]
    # TypeError: jval is not str/bytes or decodes to a non-iterable;
    # ValueError: malformed JSON; AttributeError: an element has no .split().
    except (TypeError, ValueError, AttributeError):
        return "[]"
    return json.dumps(reordered)

def jsortallauthors(jval):
    """Sort the entries of a JSON-encoded list.

    Args:
        jval: JSON text expected to decode to a list of mutually comparable
            values (strings in this script).

    Returns:
        JSON text of the sorted entries, or ``"[]"`` when ``jval`` cannot be
        decoded, is not iterable, or holds entries that cannot be compared
        with each other.
    """
    try:
        ordered = sorted(json.loads(jval))
    # TypeError: jval is not str/bytes, decodes to a non-iterable, or mixes
    # incomparable types; ValueError: malformed JSON.
    except (TypeError, ValueError):
        return "[]"
    return json.dumps(ordered)

def jfusedudfs(jval):
    try:
        jval = json.loads(jval)
        jval = [name.lower() for name in jval]
        jval = [removeshortwords(name) for name in jval]
        jval = [sortname(name) for name in jval]
        jval = sorted(jval)

        return json.dumps(jval)
    except:
        return "[]"

def extractpid(project):
    """Extract the numeric id from a ``funder::class::id`` project string.

    Args:
        project: Project descriptor whose third ``::``-separated field is an
            integer id.

    Returns:
        The id as ``int``, or ``-1`` when ``project`` is not a string, has
        fewer than three fields, or its third field is not an integer.
    """
    try:
        return int(project.split("::")[2])
    # AttributeError/TypeError: project is not a str (e.g. a NaN cell);
    # IndexError: fewer than three fields; ValueError: non-numeric id.
    except (AttributeError, TypeError, IndexError, ValueError):
        return -1
    
def extractpclass(project):
    """Return the second ``::``-separated field of ``project``.

    Raises whatever the split or index raises when ``project`` is not a
    string or has fewer than two fields.
    """
    segments = project.split("::")
    return segments[1]

def extractpfunder(project):
    """Return the part of ``project`` before its first ``::`` separator.

    When the separator is absent the whole string is returned.
    """
    funder, _, _ = project.partition("::")
    return funder

def cleandate(pubdate):
    """Rewrite a ``-``-separated date as a ``/``-separated one.

    Only the first three ``-``-separated fields are kept; anything after the
    third separator is discarded.

    Args:
        pubdate: Date text such as ``"2020-01-31"``.

    Returns:
        The first three fields joined with ``/``, or ``""`` when ``pubdate``
        is not a string or has fewer than three fields.
    """
    try:
        first, second, third = pubdate.split("-")[:3]
    # AttributeError/TypeError: pubdate is not a str (e.g. a NaN cell);
    # ValueError: fewer than three fields to unpack.
    except (AttributeError, TypeError, ValueError):
        return ""
    return f"{first}/{second}/{third}"


# ---------------------------------------------------------------------------
# Script body: load the publications CSV, normalise the author lists, expand
# every publication into its author pairs, and flag each pair as published
# before / during / after the associated project.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="DemoVldb")
parser.add_argument(
    "--path",
    type=str,
    dest="data_path",
    # Without a path pd.read_csv(None) fails with an unhelpful error; let
    # argparse report the missing option instead.
    required=True,
    help="Path to the publications CSV file.",
)
args = parser.parse_args()

tstart = time.time()
publications = pd.read_csv(args.data_path)
print("Done reading input file...")

load_time = time.time() - tstart
print('Load time: ', load_time)

# Author-list normalisation, applied as four separate passes over the column.
publications['authorlist'] = publications['authorlist'].apply(jlower)
publications['authorlist'] = publications['authorlist'].apply(jremoveshortwords)
publications['authorlist'] = publications['authorlist'].apply(jsortvalue)
publications['authorlist'] = publications['authorlist'].apply(jsortallauthors)

# Split the "funder::class::id" project descriptor into its three fields.
publications['funder'] = publications['project'].apply(extractpfunder)
publications['class'] = publications['project'].apply(extractpclass)
publications['projectid'] = publications['project'].apply(extractpid)

pair_columns = [
    'pubid', 'pubdate', 'funder', 'class', 'projectid',
    'projectstart', 'projectend', 'authorpair',
]
data_to_append = []
for _, row in publications.iterrows():
    try:
        author_pairs = itertools.combinations(json.loads(row['authorlist']), 2)
    except (TypeError, ValueError):
        # Unparseable or non-list author data: the row contributes no pairs.
        # A missing column (KeyError) is deliberately NOT swallowed, since it
        # would otherwise silently produce an empty result.
        continue
    for name_per in author_pairs:
        data_to_append.append({
            # Key matches the declared 'pubid' column; the previous 'id' key
            # left 'pubid' empty and added a stray 'id' column.
            'pubid': row['id'],
            'pubdate': row['date'],
            'funder': row['funder'],
            'class': row['class'],
            'projectid': row['projectid'],
            'projectstart': row['startdate'],
            'projectend': row['enddate'],
            'authorpair': json.dumps([name_per[0], name_per[1]]),
        })

# DataFrame.append was removed in pandas 2.0; build the frame in one step.
pairs = pd.DataFrame(data_to_append, columns=pair_columns)
projectpairs = pairs.copy()

# Index-aligned self-join: left-hand columns get a trailing '_', right-hand
# columns keep their plain names and are the ones used below.
joined = pairs.join(projectpairs, how='inner', lsuffix='_')

joined['pubdate'] = joined['pubdate'].apply(cleandate)
joined['projectstart'] = joined['projectstart'].apply(cleandate)
joined['projectend'] = joined['projectend'].apply(cleandate)

# The dates are compared as strings, which orders them chronologically only
# if they are zero-padded year-first values -- NOTE(review): confirm the
# input format. Unparseable dates become "" and sort before everything else.
joined_grouped_df = pd.DataFrame()
joined_grouped_df['projectid'] = joined['projectid'].copy()
joined_grouped_df['authors_before'] = np.where(joined['pubdate'] < joined['projectstart'], 1, 0)
# "During" requires BOTH bounds to hold; with '|' the flag was set for
# virtually every row.
joined_grouped_df['authors_during'] = np.where(
    (joined['projectstart'] <= joined['pubdate']) & (joined['pubdate'] <= joined['projectend']),
    1,
    0,
)
joined_grouped_df['authors_after'] = np.where(joined['pubdate'] > joined['projectend'], 1, 0)

tend = time.time()

print(f'Total exec time: {(tend - tstart) * 1000}ms')