# A program to process, analyse, and predict housing prices from a housing price dataset
import pandas as pd  # For loading, analysing, and manipulating the data
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For making statistical plots, built on top of Matplotlib

# Data preprocessing
## Reading the data
### loading the dataset

housing_data = pd.read_csv("Housing_Price_Data.csv")

### inspecting the first 5 rows of the dataset
print(housing_data.head())

### inspecting the last 5 rows of the dataset
print(housing_data.tail())

### generate summary statistics for the dataset
print(housing_data.describe())
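
### As an extra check alongside describe(), info() lists each column's
### dtype and non-null count, which is useful before the type handling below
housing_data.info()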



## Exploring the data
### Basic Exploration:
#### The "prefarea" column doesn't look useful for this analysis, so I'm dropping it
housing_data = housing_data.drop(columns=['prefarea'])



### Check Datatypes
#### Identify numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns
print(numerical_columns)

#### Identify categorical columns
categorical_columns = housing_data.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)


#### Convert categorical columns to the 'category' datatype
housing_data[categorical_columns] = housing_data[categorical_columns].astype('category')

### Categorical Column Analysis
#### Display the number of unique categories in each categorical column
unique_categories = housing_data[categorical_columns].nunique()
print(unique_categories)
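
#### Also inspect the category frequencies themselves to spot rare levels early
#### (these counts feed into the rare-category handling further down)
for column in categorical_columns:
    print(housing_data[column].value_counts())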



### Missing Values
#### Check for missing values in each column
missing_values = housing_data.isnull().sum()
print(missing_values)

#### Calculate the percentage of missing values in each column
missing_percentage = (housing_data.isnull().sum() / len(housing_data)) * 100
print(missing_percentage)



## Handle Missing Values
### High Null Ratios
#### Drop columns where the null-value percentage is too high (e.g., >50%).
columns_to_drop = missing_percentage[missing_percentage > 50].index
housing_data = housing_data.drop(columns=columns_to_drop)

### Categorical Columns
#### Fill missing values with the mode of each column (recompute the column list
#### in case any categorical columns were dropped above, and restrict the fill to
#### categorical columns so the numerical imputation below still has work to do)
categorical_columns = housing_data.select_dtypes(include=['category']).columns
housing_data[categorical_columns] = housing_data[categorical_columns].apply(
    lambda col: col.fillna(col.mode()[0]) if col.isnull().any() else col)

### Numerical Columns
#### Visualize the distribution of each column (e.g., using histograms or skewness statistics).

# Set up the figure size
plt.figure(figsize=(15, 10))

# Plot histograms for numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns  # Select numerical columns
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)  # Adjust subplot size as needed
    sns.histplot(housing_data[column], kde=True)  # Plot histogram with kernel density estimate (KDE)
    plt.title(f'Distribution of {column}')  # Set title for each subplot
    plt.tight_layout()  # Adjust spacing between subplots for a clean layout

# Show the plots
plt.show()

# Calculate skewness for numerical columns
skewness = housing_data[numerical_columns].skew()  # Calculate skewness for each numerical column
print(skewness)

# Impute missing values based on skewness
for column in numerical_columns:
    if skewness[column] > 1 or skewness[column] < -1:  # If the column is highly skewed
        # Fill missing values with the median for skewed distributions
        housing_data[column] = housing_data[column].fillna(housing_data[column].median())
    else:
        # Fill missing values with the mean for symmetric distributions
        housing_data[column] = housing_data[column].fillna(housing_data[column].mean())

# Verify the updated DataFrame with missing values filled
print(housing_data.isnull().sum())  # Check if any missing values remain

### Validate null handling
missing_values_after_imputation = housing_data.isnull().sum()

# Display the number of missing values for each column
print("Missing values after imputation:")
print(missing_values_after_imputation)

# Check if there are any columns with missing values
if missing_values_after_imputation.sum() == 0:
    print("\nNo missing values remain in the dataset.")
else:
    print("\nThere are still missing values in the dataset.")

## Visualize Outliers Using Box Plots

# Set up the figure size for plotting
plt.figure(figsize=(15, 10))

# Plot box plots for numerical columns to detect outliers
numerical_columns = housing_data.select_dtypes(include=['number']).columns
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)  # Adjust subplot grid size if necessary
    sns.boxplot(x=housing_data[column])  # Box plot for each numerical column
    plt.title(f'Box Plot of {column}')
    plt.tight_layout()  # Adjust spacing between subplots

# Show the plots
plt.show()

## Capping Outliers
# Cap outliers in numerical columns using the 1.5 * IQR rule
for column in numerical_columns:
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = housing_data[column].quantile(0.25)
    Q3 = housing_data[column].quantile(0.75)

    # Calculate the IQR (Interquartile Range)
    IQR = Q3 - Q1

    # Calculate upper and lower bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Capping the outliers
    housing_data[column] = housing_data[column].clip(lower=lower_bound, upper=upper_bound)

# Verify if outliers are capped
print(housing_data[numerical_columns].describe())

## Handle Categorical Outliers
# Replace rare categories with the mode of each column
for column in housing_data.select_dtypes(include=['object', 'category']).columns:
    # Calculate the frequency of each category
    category_counts = housing_data[column].value_counts()

    # Identify rare categories (less frequent than a threshold, e.g., 5%)
    threshold = 0.05 * len(housing_data)  # Categories occurring less than 5% of the time are considered rare
    rare_categories = category_counts[category_counts < threshold].index

    # Replace rare categories with the mode of the column
    housing_data[column] = housing_data[column].replace(rare_categories, housing_data[column].mode()[0])

# Verify the changes
print(housing_data.select_dtypes(include=['object', 'category']).nunique())  # Check unique categories


## Remove Duplicates
# Check for duplicates in the dataset
duplicates = housing_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove duplicate rows
housing_data = housing_data.drop_duplicates()

# Verify that duplicates have been removed
duplicates_after_removal = housing_data.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after_removal}")


## Drop Low-Variance Columns
# Calculate the standard deviation of each numerical column
# (restrict to numerical columns; std() on mixed dtypes raises a TypeError)
std_devs = housing_data[numerical_columns].std()

# Define a threshold for low variance (e.g., standard deviation close to zero)
variance_threshold = 0.01  # You can adjust this threshold as needed

# Identify columns with low variance
low_variance_columns = std_devs[std_devs < variance_threshold].index

# Drop the low-variance columns
housing_data = housing_data.drop(columns=low_variance_columns)

# Verify the updated DataFrame
print(f"Columns with low variance dropped: {low_variance_columns}")


# Split the Data using train_test_split()
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing subsets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split sizes
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


# Train the Model
from sklearn.linear_model import LinearRegression

# Instantiate the model
model = LinearRegression()

# Train the model on the training dataset
model.fit(X_train, y_train)

# Check the model's coefficients
print(f"Model coefficients: {model.coef_}")

# Evaluate the Model using Regression Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the testing set
y_pred = model.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
