# Untitled

import pandas as pd
import joblib  # Import joblib directly
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Read the labelled training set from disk into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='C:/Users/Panos/Downloads/training_data_car_ins.csv',
)

# Define Age of Driver Classification
def classify_age_of_driver(age):
    """Return the driver-age band label for *age*.

    Bands:
        '<25'   -- age below 25
        '25-74' -- age from 25 up to, but not including, 75
        '75+'   -- everything else

    The upper edge of the middle band is tested as ``age < 75`` rather than
    ``age <= 74`` so that fractional ages such as 74.5 stay in '25-74'
    instead of falling through to '75+'. Whole-number ages are classified
    exactly as before.

    Note: a NaN age fails both comparisons and therefore lands in '75+'.
    """
    if age < 25:
        return '<25'
    if age < 75:
        return '25-74'
    return '75+'

# Define Age of Licence Classification
def classify_age_of_licence(age):
    """Return '<4' when *age* is below 4, otherwise '4+'."""
    if age < 4:
        return '<4'
    return '4+'

# Define Age of Policy Classification
def classify_age_of_policy(age):
    """Return '<1' when *age* is below 1, otherwise '1+'."""
    if age < 1:
        return '<1'
    return '1+'

# Derive one banded (categorical) column per raw age column.
# Maps: new class column -> (raw source column, classifier applied to each value).
_class_column_specs = {
    'Age of Driver Class': ('Age of Driver', classify_age_of_driver),
    'Age of Licence Class': ('Age of Licence', classify_age_of_licence),
    'Age of Policy Class': ('Age of Policy', classify_age_of_policy),
}
for _class_column, (_raw_column, _classifier) in _class_column_specs.items():
    data[_class_column] = data[_raw_column].apply(_classifier)

# Features: one-hot encoding of the three class columns; drop_first=True
# leaves out the first level of each feature, which acts as the baseline.
# Target: the 'No Claims' column.
_class_frame = data[list(_class_column_specs)]
X = pd.get_dummies(_class_frame, drop_first=True)
y = data['No Claims']

# Fit a decision tree on every available row (no train/test split);
# random_state is pinned so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Persist the fitted tree to the working directory so it can be reloaded.
joblib.dump(model, 'decision_tree_model.pkl')
print("Model saved.")

# ----------------------------
# Now for applying the model
# ----------------------------

# Load the new dataset (where 'No Claims' is missing)
new_data = pd.read_csv('C:/Users/Panos/Downloads/test_data_car_ins.csv')

# Band the raw ages with the same classifiers that were used for training
new_data['Age of Driver Class'] = new_data['Age of Driver'].apply(classify_age_of_driver)
new_data['Age of Licence Class'] = new_data['Age of Licence'].apply(classify_age_of_licence)
new_data['Age of Policy Class'] = new_data['Age of Policy'].apply(classify_age_of_policy)

# One-hot encode EVERY level present in the new data (no drop_first here).
# Which level drop_first removes depends on the categories that happen to
# occur in the frame being encoded. If this file lacks the training baseline,
# or contains a single level for some feature (e.g. a one-row file), dropping
# the first level here would remove a real indicator column and those rows
# would be scored as the training baseline category. Selecting the training
# columns below discards the baseline indicators instead, which is always
# consistent with how the model was fitted.
X_new = pd.get_dummies(
    new_data[['Age of Driver Class', 'Age of Licence Class', 'Age of Policy Class']],
    dtype=int,
)

# Load the trained model from the file.
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load('decision_tree_model.pkl')

# Feature names (and their order) recorded by the model when it was fitted
training_columns = model.feature_names_in_

# Align to the training layout in one step: keep exactly the training
# columns in training order, fill indicators for levels absent from the new
# data with 0, and drop indicators the model never saw.
X_new = X_new.reindex(columns=training_columns, fill_value=0)

# Predict the 'No Claims' column using the trained model
new_data['Predicted No Claims'] = model.predict(X_new)

# Save the original rows plus the prediction column to a new CSV file
new_data.to_csv('C:/Users/Panos/Downloads/new_car_insurance_data_with_predictions.csv', index=False)

print("Predictions saved to new CSV file.")
# Editor is loading...