Untitled
unknown
plain_text
4 years ago
3.1 kB
4
Indexable
# Without Pandas:
#
# A small, dependency-free inner join of "::"-separated data files
# (users.dat, ratings.dat, movies.dat), plus a writer that dumps the merged
# records as "field=value" lines.

import os
import sys
import tempfile
from collections import defaultdict
from itertools import groupby, product  # noqa: F401  (groupby kept from the original import line)


def read_data(filename, columns, sep='::', encoding=None):
    """Read a delimited text file into a list of dicts.

    Args:
        filename: Path of the file to read.
        columns: Column names, paired positionally with the fields of each
            line.
        sep: Field separator (defaults to ``'::'``).
        encoding: Passed to ``open``; ``None`` keeps the platform default.

    Returns:
        One ``{column: value}`` dict per non-empty line.  All values are
        strings.  Pairing uses ``zip``, so surplus fields or surplus column
        names are silently dropped.
    """
    with open(filename, mode='r', encoding=encoding) as f:
        lines = f.read().splitlines()
    # Empty lines carry no fields; without the filter they would each become
    # a record holding only an empty string for the first column.
    return [dict(zip(columns, line.split(sep))) for line in lines if line]


def _group_by(records, on):
    """Group *records* by ``record.get(on)``, keeping input order in each group."""
    groups = defaultdict(list)
    for record in records:
        groups[record.get(on)].append(record)
    return groups


def merge(a, b, on):
    """Inner-join two lists of dicts on the field *on*.

    Args:
        a: Left records (iterable of dicts).
        b: Right records (iterable of dicts).
        on: Name of the field whose value must be equal on both sides.

    Returns:
        A list with one merged dict per matching (left, right) pair.  When
        both records define the same field, the value from *b* wins.
        Results are ordered by first appearance of the key in *a*; within a
        key, left order is the outer loop and right order the inner loop.

    Records that lack *on* are grouped under ``None`` (``dict.get``), so
    they only match records on the other side that lack it as well.  Key
    values must be hashable but need not be orderable.
    """
    left = _group_by(a, on)
    right = _group_by(b, on)
    return [
        {**left_rec, **right_rec}
        for key, left_recs in left.items()
        if key in right  # membership test: does not create empty groups
        for left_rec, right_rec in product(left_recs, right[key])
    ]


def write_merge(fp, merged, sortby=None, fields=None, sep=None):
    """Write records to *fp* as one ``field=value`` line per field.

    Args:
        fp: Writable text file object.
        merged: Iterable of dicts to write.
        sortby: Optional sequence of field names; records are sorted by the
            tuple of those values.  Every record must contain each of them
            (``KeyError`` otherwise).
        fields: Field names to print, in order.  Defaults to the keys of the
            first record (after sorting).  A field missing from a record is
            printed as ``None``.
        sep: Optional line printed between consecutive records.
    """
    rows = list(merged)  # accept any iterable, not only sequences
    if sortby is not None:
        rows.sort(key=lambda row: tuple(row[k] for k in sortby))
    if not rows:
        return
    if fields is None:
        fields = list(rows[0])
    for i, row in enumerate(rows):
        if i > 0 and sep is not None:
            print(sep, file=fp)
        for k in fields:
            print(f'{k}={row.get(k)}', file=fp)


# Column layouts of the three input files.  Defined before both the
# application and the test below, since both need them.
unames = 'user_id gender age occupation zip'.split()
rnames = 'user_id movie_id rating timestamp'.split()
mnames = 'movie_id title genres'.split()

# Output layout; records are sorted by the same field order.
fields = 'user_id movie_id rating timestamp gender age occupation zip title genres'.split()
sortby = fields

# Sample data for the simple test (modifying your data a little so that the
# inner merge is not empty).
mov = """1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
"""

rat = """1::1::5::978300760
1::2::3::978302109
1::914::3::978301968
2::1::4::978300275
2::3::5::978824291
"""

usr = """1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
"""


def _run_application():
    """Application: merge the three .dat files in the current directory.

    Reads users.dat, ratings.dat and movies.dat and writes the sorted merge
    to output.dat, with an empty line between records.
    """
    users = read_data('users.dat', unames)
    ratings = read_data('ratings.dat', rnames)
    movies = read_data('movies.dat', mnames)

    merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')

    with open('output.dat', mode='w') as f:
        write_merge(f, merged, sortby=sortby, fields=fields, sep='')


def _run_simple_test():
    """Simple test: merge the sample data above and print it to stdout.

    Prep: the sample files are written into a temporary directory so that
    real users.dat / ratings.dat / movies.dat in the working directory are
    never overwritten.
    """
    with tempfile.TemporaryDirectory() as tmp:
        samples = (('users.dat', usr), ('ratings.dat', rat), ('movies.dat', mov))
        for name, text in samples:
            with open(os.path.join(tmp, name), mode='w') as f:
                f.write(text)

        # Test:
        users = read_data(os.path.join(tmp, 'users.dat'), unames)
        ratings = read_data(os.path.join(tmp, 'ratings.dat'), rnames)
        movies = read_data(os.path.join(tmp, 'movies.dat'), mnames)

    merged = merge(merge(users, ratings, on='user_id'), movies, on='movie_id')

    # optional: quick vis using Pandas, just to test (needs pandas and a
    # notebook environment that provides `display`):
    #     display(pd.DataFrame(merged).sort_values(by=sortby))

    # save to stdout for inspection
    write_merge(sys.stdout, merged, sortby=sortby, fields=fields, sep='------')


# Output of the simple test:
#
#   user_id=1
#   movie_id=1
#   rating=5
#   timestamp=978300760
#   gender=F
#   age=1
#   occupation=10
#   zip=48067
#   title=Toy Story (1995)
#   genres=Animation|Children's|Comedy
#   ------
#   user_id=1
#   movie_id=2
#   rating=3
#   timestamp=978302109
#   gender=F
#   age=1
#   occupation=10
#   zip=48067
#   title=Jumanji (1995)
#   genres=Adventure|Children's|Fantasy
#   ------
#   (...)


if __name__ == '__main__':
    # `--test` runs the self-contained sample-data check; without it the
    # real data files in the current directory are merged into output.dat.
    if '--test' in sys.argv[1:]:
        _run_simple_test()
    else:
        _run_application()
Editor is loading...