Untitled
unknown
plain_text
2 years ago
3.6 kB
17
Indexable
Never
#!/usr/bin/python # -*- coding: utf-8 -*- import sys import getopt from datetime import datetime import pandas as pd from sqlalchemy import create_engine if __name__ == '__main__': # input params setup unixOptions = 'sdt:edt:' gnuOptions = ['start_dt=', 'end_dt='] fullCmdArguments = sys.argv argumentList = fullCmdArguments[1:] # excluding script name try: arguments, values = getopt.getopt( argumentList, unixOptions, gnuOptions ) except getopt.error as err: # output error, and return with an error code print(str(err)) sys.exit(2) start_dt = '1981-01-01' end_dt = '1998-01-01' for currentArgument, currentValue in arguments: if currentArgument in ('-sdt', '--start_dt'): start_dt = currentValue elif currentArgument in ('-edt', '--end_dt'): end_dt = currentValue db_config = { 'user': 'my_user', 'pwd': 'my_user_password', 'host': 'localhost', 'port': 5432, 'db': 'games', } connection_string = 'postgresql://{}:{}@{}:{}/{}'.format( db_config['user'], db_config['pwd'], db_config['host'], db_config['port'], db_config['db'], ) engine = create_engine(connection_string) query = ''' SELECT * FROM data_raw WHERE year_of_release::TIMESTAMP BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP '''.format( start_dt, end_dt ) data_raw = pd.io.sql.read_sql(query, con=engine, index_col='game_id') columns_str = ['name', 'platform', 'genre', 'rating'] columns_numeric = [ 'na_players', 'eu_players', 'jp_players', 'other_players', 'critic_score', 'user_score', ] columns_datetime = ['year_of_release'] for column in columns_str: data_raw[column] = data_raw[column].astype(str) for column in columns_numeric: data_raw[column] = pd.to_numeric(data_raw[column], errors='coerce') for column in columns_datetime: data_raw[column] = pd.to_datetime(data_raw[column]) data_raw['total_copies_sold'] = data_raw[ ['na_players', 'eu_players', 'jp_players', 'other_players'] ].sum(axis=1) agg_games_year_genre_platform = data_raw.groupby( ['year_of_release', 'genre', 'platform'] ).agg({'name': 'count', 'total_copies_sold': 'sum'}) agg_games_year_score = data_raw.groupby( ['year_of_release', 'genre', 'platform'] ).agg({'critic_score': 'mean', 'user_score': 'mean'}) agg_games_year_genre_platform = agg_games_year_genre_platform.rename( columns={'name': 'games'} ) agg_games_year_score = agg_games_year_score.rename( columns={ 'critic_score': 'avg_critic_score', 'user_score': 'avg_user_score', } ) agg_games_year_genre_platform = agg_games_year_genre_platform.fillna( 0 ).reset_index() agg_games_year_score = agg_games_year_score.fillna(0).reset_index() tables = { 'agg_games_year_genre_platform': agg_games_year_genre_platform, 'agg_games_year_score': agg_games_year_score, } for table_name, table_data in tables.items(): query = ''' DELETE FROM {} WHERE year_of_release BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP '''.format( table_name, start_dt, end_dt ) engine.execute(query) table_data.to_sql( name=table_name, con=engine, if_exists='append', index=False ) print('All done.')