# A program to process, analyse, and predict features of a housing price dataset

import pandas as pd              # For loading, analysing, and manipulating the data
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns            # For statistical plots, built on top of Matplotlib

# Data preprocessing

## Reading the data

### Load the dataset
housing_data = pd.read_csv("Housing_Price_Data.csv")

### Inspect the first 5 rows of the dataset
print(housing_data.head())

### Inspect the last 5 rows of the dataset
print(housing_data.tail())

### Generate summary statistics for the dataset
print(housing_data.describe())

## Exploring the data

### Basic exploration
#### I don't think the "prefarea" column will help the analysis, so I'll drop it
housing_data = housing_data.drop(columns=['prefarea'])

### Check datatypes
#### Identify numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns
print(numerical_columns)

#### Identify categorical columns
categorical_columns = housing_data.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)

#### Convert categorical columns to the 'category' datatype
housing_data[categorical_columns] = housing_data[categorical_columns].apply(
    lambda col: col.astype('category'))

### Categorical column analysis
#### Display the number of unique categories in each categorical column
unique_categories = housing_data[categorical_columns].nunique()
print(unique_categories)

### Missing values
#### Count the missing values in each column
missing_values = housing_data.isnull().sum()
print(missing_values)

#### Calculate the percentage of missing values in each column
missing_percentage = (housing_data.isnull().sum() / len(housing_data)) * 100
print(missing_percentage)

## Handle missing values

### High null ratios
#### Drop columns where the null-value percentage is too high (e.g., > 50%)
columns_to_drop = missing_percentage[missing_percentage > 50].index
housing_data = housing_data.drop(columns=columns_to_drop)

### Categorical columns
#### Fill missing values in the categorical columns with the mode of each column
categorical_columns = housing_data.select_dtypes(include=['category']).columns
housing_data[categorical_columns] = housing_data[categorical_columns].apply(
    lambda col: col.fillna(col.mode()[0]) if col.isnull().any() else col)
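# Side note: the same mode-based fill can also be written with scikit-learn's
# SimpleImputer, which becomes handy if the preprocessing later moves into a
# Pipeline. This is only a sketch of an equivalent approach (it assumes
# scikit-learn is installed; fit_transform returns a new array and leaves
# housing_data exactly as produced by the steps above).
from sklearn.impute import SimpleImputer

mode_imputer = SimpleImputer(strategy='most_frequent')  # Replaces NaNs with each column's mode
imputed_categoricals = mode_imputer.fit_transform(housing_data[categorical_columns])
print(imputed_categoricals[:5])  # First five imputed rows, as a NumPy array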
### Numerical columns
#### Visualize the distribution of each column (e.g., using histograms or skewness statistics)

# Set up the figure size
plt.figure(figsize=(15, 10))

# Plot histograms for the numerical columns
numerical_columns = housing_data.select_dtypes(include=['number']).columns
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)                          # Adjust the subplot grid as needed
    sns.histplot(housing_data[column], kde=True)  # Histogram with a kernel density estimate (KDE)
    plt.title(f'Distribution of {column}')        # Set a title for each subplot
plt.tight_layout()  # Adjust spacing between subplots for a clean layout
plt.show()

# Calculate skewness for each numerical column
skewness = housing_data[numerical_columns].skew()
print(skewness)

# Impute missing values based on skewness
for column in numerical_columns:
    if abs(skewness[column]) > 1:
        # Highly skewed: fill missing values with the median
        housing_data[column] = housing_data[column].fillna(housing_data[column].median())
    else:
        # Roughly symmetric: fill missing values with the mean
        housing_data[column] = housing_data[column].fillna(housing_data[column].mean())

# Verify that the missing values have been filled
print(housing_data.isnull().sum())

### Validate null handling
missing_values_after_imputation = housing_data.isnull().sum()

# Display the number of missing values for each column
print("Missing values after imputation:")
print(missing_values_after_imputation)

# Check whether any columns still contain missing values
if missing_values_after_imputation.sum() == 0:
    print("\nNo missing values remain in the dataset.")
else:
    print("\nThere are still missing values in the dataset.")

## Visualize outliers using box plots

# Set up the figure size for plotting
plt.figure(figsize=(15, 10))

# Plot box plots for the numerical columns to detect outliers
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 3, i)                 # Adjust the subplot grid if necessary
    sns.boxplot(x=housing_data[column])  # Box plot for each numerical column
    plt.title(f'Box Plot of {column}')
plt.tight_layout()  # Adjust spacing between subplots
plt.show()

## Capping outliers

# Cap outliers in the numerical columns using the IQR rule
for column in numerical_columns:
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = housing_data[column].quantile(0.25)
    Q3 = housing_data[column].quantile(0.75)

    # Calculate the IQR (interquartile range)
    IQR = Q3 - Q1

    # Calculate the lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Cap the outliers
    housing_data[column] = housing_data[column].clip(lower=lower_bound, upper=upper_bound)

# Verify that the outliers have been capped
print(housing_data[numerical_columns].describe())
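# Side note: winsorization is a closely related way to cap tails; instead of
# IQR bounds it clips a fixed share of each end of the distribution. A minimal
# sketch with SciPy (assuming SciPy is available; 'area' is one of this
# dataset's numerical columns, and the 5% limits are an arbitrary choice):
from scipy.stats.mstats import winsorize

area_winsorized = winsorize(housing_data['area'], limits=[0.05, 0.05])
print(pd.Series(area_winsorized).describe())  # Compare against the IQR-capped column above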
## Handle categorical outliers

# Handle categorical outliers by replacing rare categories with the mode
for column in housing_data.select_dtypes(include=['object', 'category']).columns:
    # Calculate the frequency of each category
    category_counts = housing_data[column].value_counts()

    # Identify rare categories (less frequent than a threshold, e.g., 5% of rows)
    threshold = 0.05 * len(housing_data)
    rare_categories = category_counts[category_counts < threshold].index

    # Replace rare categories with the mode of the column
    housing_data[column] = housing_data[column].replace(
        rare_categories, housing_data[column].mode()[0])

# Verify the changes by checking the number of unique categories
print(housing_data.select_dtypes(include=['object', 'category']).nunique())

## Remove duplicates

# Check for duplicate rows in the dataset
duplicates = housing_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove the duplicate rows
housing_data = housing_data.drop_duplicates()

# Verify that the duplicates have been removed
duplicates_after_removal = housing_data.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after_removal}")

## Drop low-variance columns

# Calculate the standard deviation of each numerical column
std_devs = housing_data[numerical_columns].std()

# Define a threshold for low variance (a standard deviation close to zero)
variance_threshold = 0.01  # Adjust this threshold as needed

# Identify columns with low variance
low_variance_columns = std_devs[std_devs < variance_threshold].index

# Drop the low-variance columns
housing_data = housing_data.drop(columns=low_variance_columns)

# Verify the updated DataFrame
print(f"Columns with low variance dropped: {list(low_variance_columns)}")

# Split the data using train_test_split()

from sklearn.model_selection import train_test_split

# Separate the features from the target. This assumes the target column is
# named 'price'; the categorical features are one-hot encoded (one simple
# encoding choice) so the linear model can consume them.
X = pd.get_dummies(housing_data.drop(columns=['price']), drop_first=True)
y = housing_data['price']

# Split the dataset into training and testing subsets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split sizes
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Train the model

from sklearn.linear_model import LinearRegression

# Instantiate the model
model = LinearRegression()

# Train the model on the training dataset
model.fit(X_train, y_train)

# Check the model's coefficients
print(f"Model coefficients: {model.coef_}")

# Evaluate the model using regression metrics

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the testing set
y_pred = model.predict(X_test)

# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
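# In addition to MAE and MSE, R^2 and cross-validation give a fuller picture of
# how well the linear model generalises. A minimal sketch (the 5-fold setting
# is an arbitrary choice, not something fixed by the steps above):
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# R^2 on the held-out test set: the share of price variance the model explains
r2 = r2_score(y_test, y_pred)
print(f"R^2 on the test set: {r2}")

# Cross-validated R^2 on the training data, to check that the score is stable
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print(f"Mean cross-validated R^2 over 5 folds: {cv_scores.mean()}")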