Untitled

 avatar
unknown
python
a month ago
3.8 kB
4
Indexable
import pandas as pd

# Load the Excel file
file_path = '/mnt/data/nearu-customair-import.xlsx'
data = pd.read_excel(file_path, sheet_name='Sheet1')

# Add a unique identifier for each customer
data['Customer Identifier'] = (
    data['Customer Name'] + " | " +
    data['Location Street'] + " | " +
    data['Location Zip'].astype(str) + " | " +
    data['Installed On'].astype(str)
)

# Extract model prefixes
data['Model_Prefix'] = data['Model'].fillna('').str[:3]

# Prepare variables
merged_rows = []
unique_id_start = 10000
used_units = set()

# Group by customer
customer_groups = data.groupby('Customer Identifier')

# Process each customer group
for customer_id, customer_data in customer_groups:
    outdoor_units = customer_data[customer_data['Unit Type'] == 'Outdoor Unit']
    other_units = customer_data[customer_data['Unit Type'] != 'Outdoor Unit']

    for _, outdoor_unit in outdoor_units.iterrows():
        matched_prefixes = set()
        merged_row = {
            'Unique ID': unique_id_start,
            'Merged Units': str(outdoor_unit['unit_number']),
            'Installed On': outdoor_unit['Installed On'],
            'Customer Name': outdoor_unit['Customer Name'],
            'Location Street': outdoor_unit['Location Street'],
            'Location Zip': outdoor_unit['Location Zip'],
            'unit1-model': outdoor_unit['Model'],
            'unit1-serial': outdoor_unit['Serial Number']
        }

        for unit in other_units.itertuples():
            if unit.unit_number not in used_units and unit.Model_Prefix not in matched_prefixes:
                index = len(matched_prefixes) + 2
                merged_row[f'unit{index}-model'] = unit.Model
                merged_row[f'unit{index}-serial'] = unit._4
                merged_row['Merged Units'] += f", {unit.unit_number}"
                matched_prefixes.add(unit.Model_Prefix)
                used_units.add(unit.unit_number)

                if len(matched_prefixes) >= 3:
                    break

        merged_rows.append(merged_row)
        unique_id_start += 1

# Create DataFrame from merged rows
transformed_data = pd.DataFrame(merged_rows)

# Save merged data
merged_output_path = '/mnt/data/final_corrected_multimatch_customer_data.csv'
transformed_data.to_csv(merged_output_path, index=False)

# Calculate summary
all_unit_numbers = set(data['unit_number'])
merged_unit_numbers = set()
for row in merged_rows:
    merged_unit_numbers.update(map(int, row['Merged Units'].split(", ")))
unmatched_unit_numbers = all_unit_numbers - merged_unit_numbers

# Prepare unmatched data
unmatched_df = pd.DataFrame({'Unmatched Unit Numbers': sorted(unmatched_unit_numbers)})

# Count rows merged by year
transformed_data['Installed Year'] = pd.to_datetime(transformed_data['Installed On']).dt.year
yearly_counts = transformed_data['Installed Year'].value_counts().reset_index()
yearly_counts.columns = ['Year', 'Count']
yearly_counts = yearly_counts.sort_values('Year')

# Prepare summary
summary_data = {
    "Total Units (unit_number)": [len(all_unit_numbers)],
    "Total Rows Merged": [len(merged_rows)],
    "Total Rows Not Merged": [len(unmatched_unit_numbers)]
}
summary_df = pd.DataFrame(summary_data)

# Save summary to Excel
summary_output_path = '/mnt/data/updated_summary_report.xlsx'
with pd.ExcelWriter(summary_output_path, engine='xlsxwriter') as writer:
    summary_df.to_excel(writer, sheet_name='Summary', index=False)
    unmatched_df.to_excel(writer, sheet_name='Unmatched Units', index=False)
    yearly_counts.to_excel(writer, sheet_name='Yearly Counts', index=False)

print(f"Merged data saved at: {merged_output_path}")
print(f"Summary report saved at: {summary_output_path}")
Leave a Comment