Untitled

 avatar
unknown
plain_text
4 years ago
3.1 kB
4
Indexable


Without pandas (standard library only):

import sys
from collections import defaultdict
from itertools import groupby, product

def read_data(filename, columns):
    """Load a '::'-delimited text file as a list of dict records.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            no explicit encoding, so the platform default applies.
        columns: Field names, in file order. They are paired positionally
            with the values of each line via ``zip``, so surplus values on
            a line are dropped and a short line yields a record with fewer
            keys.

    Returns:
        One ``{column: value}`` dict per non-empty line; every value is
        kept as the raw string read from the file.
    """
    with open(filename, mode='r') as f:
        lines = f.read().splitlines()
    # An empty line would split to [''] and produce a bogus
    # {columns[0]: ''} record, so skip it.
    return [dict(zip(columns, line.split('::'))) for line in lines if line]

def merge(a, b, on):
    """Inner-join two lists of dict records on the field named ``on``.

    Every record of ``a`` is combined with every record of ``b`` that has
    an equal value for ``on``. Where both records carry the same field, the
    value from ``b`` wins (``{**left, **right}``).

    Records are grouped by hashing rather than by sorting, so key values
    only need to be hashable, not mutually orderable. The result order is
    deterministic: keys appear in the order they are first seen in ``a``,
    and within a key the pairs follow ``itertools.product`` order (each
    left record against all matching right records, in input order).

    Args:
        a: Left-hand records (dicts).
        b: Right-hand records (dicts).
        on: Name of the join field. A record lacking it is grouped under
            ``None``, exactly as ``dict.get`` returns.

    Returns:
        A new list of merged dicts; the input records are not modified.
    """
    def group(records):
        # Bucket records by join key, preserving first-seen key order.
        buckets = defaultdict(list)
        for record in records:
            buckets[record.get(on)].append(record)
        return buckets

    left_groups = group(a)
    right_groups = group(b)

    out = []
    for key, left_rows in left_groups.items():
        # .get() avoids creating an empty bucket in the defaultdict.
        right_rows = right_groups.get(key)
        if right_rows is None:
            continue
        out.extend(
            {**left, **right} for left, right in product(left_rows, right_rows)
        )
    return out

def write_merge(fp, merged, sortby=None, fields=None, sep=None):
    """Print records to ``fp`` as ``name=value`` lines, one field per line.

    Args:
        fp: Writable text stream passed to ``print(..., file=fp)``.
        merged: Sequence of dict records to write.
        sortby: Optional iterable of field names; when given, records are
            sorted by the tuple of those fields (each must be present).
        fields: Field names to emit, in order. When omitted, the keys of
            the first record are used. A field missing from a record is
            written with the value ``None``.
        sep: Optional line printed between consecutive records.
    """
    if sortby is not None:
        def sort_key(record):
            return tuple(record[name] for name in sortby)

        merged = sorted(merged, key=sort_key)

    if merged and fields is None:
        fields = [*merged[0].keys()]

    is_first = True
    for record in merged:
        # The separator goes between records, never before the first one.
        if not is_first and sep is not None:
            print(sep, file=fp)
        is_first = False
        for name in fields:
            print(f'{name}={record.get(name)}', file=fp)

Application:

# Column names of each '::'-delimited input file, in file order. Defined here
# so this section runs on its own instead of raising NameError.
unames = 'user_id gender age occupation zip'.split()
rnames = 'user_id movie_id rating timestamp'.split()
mnames = 'movie_id title genres'.split()

users = read_data('users.dat', unames)
ratings = read_data('ratings.dat', rnames)
movies = read_data('movies.dat', mnames)

# Inner-join users with their ratings, then join the result with the movies.
merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')

# Output column order; the same list doubles as the sort key.
# NOTE: read_data keeps every value as a string, so the sort is
# lexicographic (e.g. '10' sorts before '2').
fields = 'user_id movie_id rating timestamp gender age occupation zip title genres'.split()
sortby = fields

with open('output.dat', mode='w') as f:
    # sep='' prints an empty line between consecutive records.
    write_merge(f, merged, sortby=sortby, fields=fields, sep='')

Simple test (with the sample data modified slightly so that the inner merge is not empty):

Prep:

# Sample movie rows, '::'-delimited.
mov = """1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
"""

# Sample rating rows, '::'-delimited.
rat = """1::1::5::978300760
1::2::3::978302109
1::914::3::978301968
2::1::4::978300275
2::3::5::978824291
"""

# Sample user rows, '::'-delimited.
usr = """1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
"""

# Write each sample to the file name the later sections read from.
for filename, content in (
    ('users.dat', usr),
    ('ratings.dat', rat),
    ('movies.dat', mov),
):
    with open(filename, mode='w') as f:
        f.write(content)

Test:

# Column names of each '::'-delimited input file, in file order.
unames = 'user_id gender age occupation zip'.split()
rnames = 'user_id movie_id rating timestamp'.split()
mnames = 'movie_id title genres'.split()

users = read_data('users.dat', unames)
ratings = read_data('ratings.dat', rnames)
movies = read_data('movies.dat', mnames)

# Inner-join users with their ratings, then join the result with the movies.
merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')

# Output column order and sort key, defined here so this section does not
# depend on names set in another section.
fields = 'user_id movie_id rating timestamp gender age occupation zip title genres'.split()
sortby = fields

# optional: quick vis using Pandas, just to test.
# NOTE(review): `display` is presumably the notebook built-in; outside a
# notebook this line needs to be removed or replaced with print().
import pandas as pd
display(pd.DataFrame(merged).sort_values(by=sortby))

# save to stdout for inspection
write_merge(sys.stdout, merged, sortby=sortby, fields=fields, sep='------')

Output:

user_id=1
movie_id=1
rating=5
timestamp=978300760
gender=F
age=1
occupation=10
zip=48067
title=Toy Story (1995)
genres=Animation|Children's|Comedy
------
user_id=1
movie_id=2
rating=3
timestamp=978302109
gender=F
age=1
occupation=10
zip=48067
title=Jumanji (1995)
genres=Adventure|Children's|Fantasy
------
(...)

Editor is loading...