# A program to process, analyse, and make predictions on a housing price dataset
import pandas as pd  # pandas library for loading and analysing the data
import matplotlib.pyplot as plt  # for creating static, animated, and interactive visualizations
import seaborn as sns  # for statistical plots, built on top of Matplotlib
# Data preprocessing
## Reading the data
### loading the data set
housing_data = pd.read_csv("Housing_Price_Data.csv")
### inspecting the first 5 rows of the dataset
print(housing_data.head())
### inspecting the last 5 rows of the dataset
print(housing_data.tail())
### generating summary statistics for the dataset
print(housing_data.describe())
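# As an optional structural check (not part of the original steps), info() summarizes
# column dtypes and non-null counts in one call, complementing describe()
print(f"Dataset shape: {housing_data.shape}")
housing_data.info()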
## Exploring the data
### Basic Exploration:
#### The "prefarea" column is unlikely to help the analysis, so drop it
housing_data = housing_data.drop(columns=['prefarea'])
### Check Datatypes
#### Identify numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns
print(numerical_columns)
#### Identify categorical columns
categorical_columns = housing_data.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)
#### Convert categorical columns to the 'category' datatype
housing_data[categorical_columns] = housing_data[categorical_columns].apply(lambda col: col.astype('category'))
### Categorical Column Analysis
#### Display the number of unique categories in each categorical column
unique_categories = housing_data[categorical_columns].nunique()
print(unique_categories)
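# A quick optional sketch: per-category frequencies show how balanced each
# categorical column is, beyond the bare count of unique values
for column in categorical_columns:
    print(f"\nValue counts for '{column}':")
    print(housing_data[column].value_counts())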
### Missing Values
#### Check for missing values in each column
missing_values = housing_data.isnull().sum()
print(missing_values)
#### Calculate the percentage of missing values in each column
missing_percentage = (housing_data.isnull().sum() / len(housing_data)) * 100
print(missing_percentage)
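# Optional visual aid (a stylistic assumption, not required by the pipeline):
# a bar chart of the missing-value percentages makes high-null columns easy to spot
missing_percentage.plot(kind='bar', figsize=(10, 4), title='Missing values per column (%)')
plt.ylabel('% missing')
plt.tight_layout()
plt.show()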
## Handle Missing Values
### High Null Ratios
#### Drop columns where the null-value percentage is too high (e.g., >50%).
columns_to_drop = missing_percentage[missing_percentage > 50].index
housing_data = housing_data.drop(columns=columns_to_drop)
### Categorical Columns
#### Fill missing values in the categorical columns with the mode of each column
housing_data[categorical_columns] = housing_data[categorical_columns].apply(
    lambda col: col.fillna(col.mode()[0]) if col.isnull().any() else col
)
### Numerical Columns
#### Visualize the distribution of each column (e.g., using histograms or skewness statistics).
# Set up the figure size
plt.figure(figsize=(15, 10))
# Plot histograms for numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns # Select numerical columns
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)  # adjust the subplot grid size as needed
    sns.histplot(housing_data[column], kde=True)  # histogram with a kernel density estimate (KDE)
    plt.title(f'Distribution of {column}')  # title for each subplot
plt.tight_layout()  # adjust spacing between subplots for a clean layout
# Show the plots
plt.show()
# Calculate skewness for numerical columns
skewness = housing_data[numerical_columns].skew() # Calculate skewness for each numerical column
print(skewness)
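# For reference, a small sketch of the rule applied below: columns with |skew| > 1
# are treated as highly skewed (median imputation); the rest as roughly symmetric (mean)
highly_skewed = skewness[skewness.abs() > 1].index
print(f"Highly skewed columns (|skew| > 1): {list(highly_skewed)}")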
# Impute missing values based on skewness
for column in numerical_columns:
    if skewness[column] > 1 or skewness[column] < -1:  # the column is highly skewed
        # Fill missing values with the median for skewed distributions
        housing_data[column] = housing_data[column].fillna(housing_data[column].median())
    else:
        # Fill missing values with the mean for roughly symmetric distributions
        housing_data[column] = housing_data[column].fillna(housing_data[column].mean())
# Verify the updated DataFrame with missing values filled
print(housing_data.isnull().sum()) # Check if any missing values remain
### Validate null handling
missing_values_after_imputation = housing_data.isnull().sum()
# Display the number of missing values for each column
print("Missing values after imputation:")
print(missing_values_after_imputation)
# Check if there are any columns with missing values
if missing_values_after_imputation.sum() == 0:
print("\nNo missing values remain in the dataset.")
else:
print("\nThere are still missing values in the dataset.")
## Visualize Outliers Using Box Plots
# Set up the figure size for plotting
plt.figure(figsize=(15, 10))
# Plot box plots for numerical columns to detect outliers
numerical_columns = housing_data.select_dtypes(include=['number']).columns
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)  # adjust the subplot grid size if necessary
    sns.boxplot(x=housing_data[column])  # box plot for each numerical column
    plt.title(f'Box Plot of {column}')
plt.tight_layout()  # adjust spacing between subplots
# Show the plots
plt.show()
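# Before capping, an optional sanity check that counts IQR-rule outliers per column;
# this mirrors the 1.5*IQR bounds used in the capping loop below
for column in numerical_columns:
    q1, q3 = housing_data[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    n_outliers = ((housing_data[column] < q1 - 1.5 * iqr) | (housing_data[column] > q3 + 1.5 * iqr)).sum()
    print(f"{column}: {n_outliers} outliers by the 1.5*IQR rule")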
## Capping Outliers
# Capping outliers in numerical columns
for column in numerical_columns:
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = housing_data[column].quantile(0.25)
    Q3 = housing_data[column].quantile(0.75)
    # Calculate the IQR (interquartile range)
    IQR = Q3 - Q1
    # Calculate the lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Cap the outliers at the bounds
    housing_data[column] = housing_data[column].clip(lower=lower_bound, upper=upper_bound)
# Verify if outliers are capped
print(housing_data[numerical_columns].describe())
## Handle Categorical Outliers
# Handle categorical outliers by replacing rare categories with the mode
for column in housing_data.select_dtypes(include=['object', 'category']).columns:
    # Calculate the frequency of each category
    category_counts = housing_data[column].value_counts()
    # Identify rare categories (less frequent than a threshold, e.g., 5% of rows)
    threshold = 0.05 * len(housing_data)
    rare_categories = category_counts[category_counts < threshold].index
    # Replace rare categories with the mode of the column
    housing_data[column] = housing_data[column].replace(rare_categories, housing_data[column].mode()[0])
# Verify the changes
print(housing_data.select_dtypes(include=['object', 'category']).nunique()) # Check unique categories
## Remove Duplicates
# Check for duplicates in the dataset
duplicates = housing_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
# Remove duplicate rows
housing_data = housing_data.drop_duplicates()
# Verify that duplicates have been removed
duplicates_after_removal = housing_data.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after_removal}")
## Drop Low-Variance Columns
# Calculate the standard deviation of each numerical column
std_devs = housing_data[numerical_columns].std()
# Define a threshold for low variance (e.g., standard deviation close to zero)
variance_threshold = 0.01 # You can adjust this threshold as needed
# Identify columns with low variance
low_variance_columns = std_devs[std_devs < variance_threshold].index
# Drop the low-variance columns
housing_data = housing_data.drop(columns=low_variance_columns)
# Verify the updated DataFrame
print(f"Columns with low variance dropped: {low_variance_columns}")
# Split the Data using train_test_split()
from sklearn.model_selection import train_test_split
# Define the features and target ('price' is assumed to be the target column here)
X = pd.get_dummies(housing_data.drop(columns=['price']), drop_first=True)  # one-hot encode categorical features
y = housing_data['price']
# Split the dataset into training and testing subsets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Verify the split sizes
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
# Train the Model
from sklearn.linear_model import LinearRegression
# Instantiate the model
model = LinearRegression()
# Train the model on the training dataset
model.fit(X_train, y_train)
# Check the model's coefficients
print(f"Model coefficients: {model.coef_}")
# Evaluate the Model using Regression Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Predict on the testing set
y_pred = model.predict(X_test)
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")