Untitled
import pandas as pd # Load the Excel file file_path = '/mnt/data/nearu-customair-import.xlsx' data = pd.read_excel(file_path, sheet_name='Sheet1') # Add a unique identifier for each customer data['Customer Identifier'] = ( data['Customer Name'] + " | " + data['Location Street'] + " | " + data['Location Zip'].astype(str) + " | " + data['Installed On'].astype(str) ) # Extract model prefixes data['Model_Prefix'] = data['Model'].fillna('').str[:3] # Prepare variables merged_rows = [] unique_id_start = 10000 used_units = set() # Group by customer customer_groups = data.groupby('Customer Identifier') # Process each customer group for customer_id, customer_data in customer_groups: outdoor_units = customer_data[customer_data['Unit Type'] == 'Outdoor Unit'] other_units = customer_data[customer_data['Unit Type'] != 'Outdoor Unit'] for _, outdoor_unit in outdoor_units.iterrows(): matched_prefixes = set() merged_row = { 'Unique ID': unique_id_start, 'Merged Units': str(outdoor_unit['unit_number']), 'Installed On': outdoor_unit['Installed On'], 'Customer Name': outdoor_unit['Customer Name'], 'Location Street': outdoor_unit['Location Street'], 'Location Zip': outdoor_unit['Location Zip'], 'unit1-model': outdoor_unit['Model'], 'unit1-serial': outdoor_unit['Serial Number'] } for unit in other_units.itertuples(): if unit.unit_number not in used_units and unit.Model_Prefix not in matched_prefixes: index = len(matched_prefixes) + 2 merged_row[f'unit{index}-model'] = unit.Model merged_row[f'unit{index}-serial'] = unit._4 merged_row['Merged Units'] += f", {unit.unit_number}" matched_prefixes.add(unit.Model_Prefix) used_units.add(unit.unit_number) if len(matched_prefixes) >= 3: break merged_rows.append(merged_row) unique_id_start += 1 # Create DataFrame from merged rows transformed_data = pd.DataFrame(merged_rows) # Save merged data merged_output_path = '/mnt/data/final_corrected_multimatch_customer_data.csv' transformed_data.to_csv(merged_output_path, index=False) # Calculate summary all_unit_numbers = set(data['unit_number']) merged_unit_numbers = set() for row in merged_rows: merged_unit_numbers.update(map(int, row['Merged Units'].split(", "))) unmatched_unit_numbers = all_unit_numbers - merged_unit_numbers # Prepare unmatched data unmatched_df = pd.DataFrame({'Unmatched Unit Numbers': sorted(unmatched_unit_numbers)}) # Count rows merged by year transformed_data['Installed Year'] = pd.to_datetime(transformed_data['Installed On']).dt.year yearly_counts = transformed_data['Installed Year'].value_counts().reset_index() yearly_counts.columns = ['Year', 'Count'] yearly_counts = yearly_counts.sort_values('Year') # Prepare summary summary_data = { "Total Units (unit_number)": [len(all_unit_numbers)], "Total Rows Merged": [len(merged_rows)], "Total Rows Not Merged": [len(unmatched_unit_numbers)] } summary_df = pd.DataFrame(summary_data) # Save summary to Excel summary_output_path = '/mnt/data/updated_summary_report.xlsx' with pd.ExcelWriter(summary_output_path, engine='xlsxwriter') as writer: summary_df.to_excel(writer, sheet_name='Summary', index=False) unmatched_df.to_excel(writer, sheet_name='Unmatched Units', index=False) yearly_counts.to_excel(writer, sheet_name='Yearly Counts', index=False) print(f"Merged data saved at: {merged_output_path}") print(f"Summary report saved at: {summary_output_path}")
Leave a Comment