Untitled
unknown
python
2 months ago
4.5 kB
5
Indexable
""" 1) It's worth to add here everywhere typing 2) It's worth to cover everything with unit tests and maybe a bit refactor to make it more unit testable 3) Event Type and Event Source we operate with should be enums to easier work with them during validation and so on """ import argparse import json import pandas as pd LISTING_EVENTS_EVENT_TYPE_MAPPING = { 'SOLD': 'LISTING_SOLD', 'LISTED': 'LISTING_LISTED', 'PRICE_CHANGED': 'LISTING_PRICE_CHANGED', 'WITHDRAWN': 'LISTING_WITHDRAWN', } def _process_and_prepare_data_registered_units(registered_units): return registered_units.rename(columns={'id': 'unit_id'}) def _process_and_prepare_data_registered_sales(registered_sales): registered_sales['event_source'] = 'SALE_REGISTRY' registered_sales['event_type'] = 'REGISTERED_SALE' registered_sales['registered_sale_id'] = registered_sales.index + 1 # creating missing id return registered_sales def _process_and_prepare_data_listing_events(listing_events): listing_events['event_source'] = 'LISTINGS' listing_events['event_type'] = listing_events.apply( lambda row: LISTING_EVENTS_EVENT_TYPE_MAPPING[row['listing_event']], axis=1 ) return listing_events def _process_and_prepare_merged_data(data): data['date'] = data.apply( lambda row: row['event_date'] if pd.isna(row['sale_date']) else row['sale_date'], axis=1 ) data['event_original_ref_id'] = data.apply( lambda row: row['registered_sale_id'] if pd.isna(row['listing_id']) else row['listing_id'], axis=1 ) data['price'] = data.apply( lambda row: row['listed_price'] if pd.isna(row['sale_price']) else row['sale_price'], axis=1 ) # it's better to sort on 'real' date objects data['date'] = pd.to_datetime(data['date']) data = data.sort_values(by=['unit_id', 'date'], ascending=[True, True]) data['date'] = data['date'].dt.strftime('%Y-%m-%d') data['price'] = data['price'].astype(int) data['event_original_ref_id'] = data['event_original_ref_id'].astype(int) # it's much better to drop some of unused columns in the very beginning # to make our data lighter to manipulate on but for our example it's ok data.drop(columns=[ 'event_date', 'sale_date', 'registered_sale_id', 'listing_id', 'usable_area', 'rooms', 'listed_price', 'sale_price', 'listing_event' ], inplace=True) return data def process_and_prepare_data( registered_sales, registered_units, listing_events ): registered_sales = _process_and_prepare_data_registered_sales(registered_sales) registered_units = _process_and_prepare_data_registered_units(registered_units) listing_events = _process_and_prepare_data_listing_events(listing_events) merged_sales_and_units = pd.merge(registered_sales, registered_units, on='unit_id', how='left') merge_listings_and_units = pd.merge(listing_events, registered_units, on='unit_id', how='left') data = pd.concat([merged_sales_and_units, merge_listings_and_units], ignore_index=True, sort=False) return _process_and_prepare_merged_data(data) def format_data(data): result_total = {} for _, row in data.iterrows(): result_per_unit = result_total.get(row['unit_id']) or {} result_per_unit['unit_id'] = row['unit_id'] result_per_unit['unit_type'] = row['unit_type'] result_per_unit['market_history'] = result_per_unit.get('market_history') or [] result_per_unit['market_history'].append({ 'event_date': row['date'], 'event_type': row['event_type'], 'price': row['price'], 'event_source': row['event_source'], 'event_original_ref_id': row['event_original_ref_id'], }) result_total[row['unit_id']] = result_per_unit return [v for k, v in result_total.items()] if __name__ == '__main__': # refactor to manage reading csv's more flexible and robust registered_sales = pd.read_csv('registered_sales.csv') registered_units = pd.read_csv('registered_units.csv') listing_events = pd.read_csv('listing_events.csv') parser = argparse.ArgumentParser() parser.add_argument('--plot-id', type=int) args = parser.parse_args() data = process_and_prepare_data( registered_sales, registered_units, listing_events ) # at this point I would save our processed and prepared data # to the database but in our example we can use it directly relevant_to_given_plot_data = data[data['plot_id'] == args.plot_id] formatted_data = format_data(relevant_to_given_plot_data) # also before printing / giving out the data I would like # to introduce some validation (Marshmallow or custom one) print(json.dumps(formatted_data, indent=4))
Editor is loading...
Leave a Comment