Untitled

mail@pastecode.io avatar
unknown
plain_text
3 years ago
3.6 kB
17
Indexable
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

import getopt
from datetime import datetime

import pandas as pd

from sqlalchemy import create_engine

if __name__ == '__main__':

    # input params setup
    unixOptions = 'sdt:edt:'
    gnuOptions = ['start_dt=', 'end_dt=']

    fullCmdArguments = sys.argv
    argumentList = fullCmdArguments[1:]  # excluding script name

    try:
        arguments, values = getopt.getopt(
            argumentList, unixOptions, gnuOptions
        )
    except getopt.error as err:
        # output error, and return with an error code
        print(str(err))
        sys.exit(2)

    start_dt = '1981-01-01'
    end_dt = '1998-01-01'
    for currentArgument, currentValue in arguments:
        if currentArgument in ('-sdt', '--start_dt'):
            start_dt = currentValue
        elif currentArgument in ('-edt', '--end_dt'):
            end_dt = currentValue

    db_config = {
        'user': 'my_user',
        'pwd': 'my_user_password',
        'host': 'localhost',
        'port': 5432,
        'db': 'games',
    }
    connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(
        db_config['user'],
        db_config['pwd'],
        db_config['host'],
        db_config['port'],
        db_config['db'],
    )

    engine = create_engine(connection_string)

    query = ''' SELECT * 
                FROM data_raw 
                WHERE year_of_release::TIMESTAMP BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
            '''.format(
        start_dt, end_dt
    )

    data_raw = pd.io.sql.read_sql(query, con=engine, index_col='game_id')

    columns_str = ['name', 'platform', 'genre', 'rating']
    columns_numeric = [
        'na_players',
        'eu_players',
        'jp_players',
        'other_players',
        'critic_score',
        'user_score',
    ]
    columns_datetime = ['year_of_release']

    for column in columns_str:
        data_raw[column] = data_raw[column].astype(str)
    for column in columns_numeric:
        data_raw[column] = pd.to_numeric(data_raw[column], errors='coerce')
    for column in columns_datetime:
        data_raw[column] = pd.to_datetime(data_raw[column])
    data_raw['total_copies_sold'] = data_raw[
        ['na_players', 'eu_players', 'jp_players', 'other_players']
    ].sum(axis=1)

    agg_games_year_genre_platform = data_raw.groupby(
        ['year_of_release', 'genre', 'platform']
    ).agg({'name': 'count', 'total_copies_sold': 'sum'})
    agg_games_year_score = data_raw.groupby(
        ['year_of_release', 'genre', 'platform']
    ).agg({'critic_score': 'mean', 'user_score': 'mean'})

    agg_games_year_genre_platform = agg_games_year_genre_platform.rename(
        columns={'name': 'games'}
    )
    agg_games_year_score = agg_games_year_score.rename(
        columns={
            'critic_score': 'avg_critic_score',
            'user_score': 'avg_user_score',
        }
    )

    agg_games_year_genre_platform = agg_games_year_genre_platform.fillna(
        0
    ).reset_index()
    agg_games_year_score = agg_games_year_score.fillna(0).reset_index()

    tables = {
        'agg_games_year_genre_platform': agg_games_year_genre_platform,
        'agg_games_year_score': agg_games_year_score,
    }

    for table_name, table_data in tables.items():

        query = '''
                  DELETE FROM {} WHERE year_of_release BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
                '''.format(
            table_name, start_dt, end_dt
        )
        engine.execute(query)

        table_data.to_sql(
            name=table_name, con=engine, if_exists='append', index=False
        )

    print('All done.')