Untitled

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw orders export into a DataFrame.
df = pd.read_csv("Data Analysis Ecommerce Dataset.xlsx - Orders.csv")
df

# First look at the data: leading/trailing rows, column names, size,
# dtypes and summary statistics.
df.head()

df.tail(7)

df.columns

len(df)


df.shape


df.info()


df.describe()

# Missing values: the per-cell mask, the count per column, and the
# grand total across the whole frame.
df.isna()

df.isna().sum()

df.isna().sum().sum()

df.info()

# Inspect the 'Zone' column before filling its gaps.
df['Zone'].unique()


df['Zone'].value_counts()


# Replace missing zones with the most frequent zone (first mode).
most_common_zone = df['Zone'].mode()[0]
df['Zone'] = df['Zone'].fillna(most_common_zone)

df.isna().sum()


# Remove the 'Reason' column, then re-check what is still missing.
df = df.drop(columns=['Reason'])
df.isna().sum()


# Drop every row that still contains a missing value.
df = df.dropna()

df.isna().sum()

df.shape

# Persist the cleaned data. `index` must be the boolean False: the string
# "False" is truthy, so pandas would still write the row index as an extra
# unnamed column (which then had to be dropped again after reloading).
df.to_csv("Cleaned Ecom Dataset.csv", index=False)


# Reload the cleaned file; clean_df gets a fresh 0..n-1 index.
clean_df = pd.read_csv("Cleaned Ecom Dataset.csv")
clean_df


# No 'Unnamed: 0' column to drop any more, because the index is no longer
# written to the file above.
clean_df.isnull().sum()

# Parse the dates from clean_df itself. Assigning a Series taken from `df`
# aligns on index labels, and df still carries its original labels (with
# gaps wherever dropna removed rows), so dates could land on the wrong rows
# or become NaT.
clean_df['OrderDate'] = pd.to_datetime(clean_df['OrderDate'])

# Calendar parts used by the per-year and per-month plots.
clean_df['Year'] = clean_df['OrderDate'].dt.year
clean_df['Month'] = clean_df['OrderDate'].dt.month
clean_df.info()

# Computed from clean_df's own columns for the same index-alignment reason.
# NOTE(review): the formula (Sale Price * Unit Price) is kept from the
# original; confirm whether Order Quantity was meant to be a factor.
clean_df['Revenue'] = clean_df['Sale Price'] * clean_df['Unit Price']

clean_df.columns

clean_df.info()


# Number of orders for each delivery type.
delivery_ax = sns.countplot(x='Delivery Type', data=clean_df)
delivery_ax.set_title('Orders by Delivery Type')
delivery_ax.set_xlabel('Delivery Type')
delivery_ax.set_ylabel('Count')
plt.show()

# Revenue per zone; barplot aggregates each group with the mean by default.
zone_ax = sns.barplot(x='Zone', y='Revenue', data=clean_df)
zone_ax.set_title('Average Revenue by Zone')
zone_ax.set_xlabel('Zone')
zone_ax.set_ylabel('Average Revenue')
plt.show()

# Revenue per order year (x label is left at seaborn's default).
year_ax = sns.barplot(data=clean_df, x="Year", y="Revenue")
year_ax.set_ylabel("Revenue in Dollars")
year_ax.set_title("Revenue per Year")
plt.show()


# Revenue per order month (x label is left at seaborn's default).
month_ax = sns.barplot(data=clean_df, x="Month", y="Revenue")
month_ax.set_ylabel("Revenue in Dollars")
month_ax.set_title("Revenue in Months")
plt.show()

#### Histogram

# Distribution of the sale price.
# Observation from the original run: most sales fall in the lower price
# range (below 1000), with only a few at the highest prices.
# bins=30 splits the value range into 30 equal-width intervals.
# kde=True overlays a Kernel Density Estimate curve, a smoothed view of the
# same distribution.
sale_prices = clean_df['Sale Price']
hist_ax = sns.histplot(sale_prices, bins=30, kde=True, color='skyblue')
# Title and axis labels
hist_ax.set_title('Distribution of Sale Price')
hist_ax.set_xlabel('Sale Price')
hist_ax.set_ylabel('Frequency')
plt.show()


#### Pie Chart

# This chart shows each zone's share of the orders.
# Observation from the original run: Zone 3 was the largest with 45.05% and
# Zone 4 the smallest with 12.86%.
# autopct='%0.2f%%' prints each share as a percentage with 2 decimals.
# explode pulls a slice away from the centre of the pie.

# Count once and use the same Series for both the values and the labels, so
# they can never come from different frames or get out of order.
zone_counts = clean_df['Zone'].value_counts()

# value_counts() sorts in descending order, so the last slice is the
# smallest zone; pull only that one out. Sizing the list from the data
# (instead of hard-coding four entries) keeps plt.pie from raising when the
# number of zones is not exactly four.
slice_offsets = [0] * len(zone_counts)
if slice_offsets:
    slice_offsets[-1] = 0.2

plt.pie(x=zone_counts,
        labels=zone_counts.index,
        autopct='%0.2f%%',
        # matplotlib cycles through this list, so it need not match the
        # number of slices.
        colors=['red', 'blue', 'lightblue', 'green'],
        explode=slice_offsets)
plt.title('Proportion of Orders by Zone')
plt.show()

####  Scatterplots 

A scatter plot is used to visualize the relationship between two numerical columns. It helps identify patterns, correlations, or clusters in your data.

# Scatter plot with Sale Price on the x-axis and Revenue on the y-axis.
# alpha=0.6 makes the markers partly transparent so overlapping points
# remain visible.
scatter_ax = sns.scatterplot(x='Sale Price', y='Revenue', data=clean_df, alpha=0.6)
scatter_ax.set_title('Revenue vs. Sale Price')
scatter_ax.set_xlabel('Sale Price')
scatter_ax.set_ylabel('Revenue')
plt.show()


#### Boxplots

A boxplot is used to visualize the distribution, spread, and outliers in a numerical column, grouped by a categorical column. It provides insights into the range (min, max), quartiles, and potential outliers.

Key Insights from Boxplots:
Median: The line inside the box shows the median.
Interquartile Range (IQR): The box represents the middle 50% of the data.
Whiskers: Extend to show the range of non-outlier data.
Outliers: Points outside the whiskers.

# One Revenue box per Product Category.
box_ax = sns.boxplot(x='Product Category', y='Revenue', data=clean_df)
# Title and axis labels
box_ax.set_title('Boxplot: Revenue by Product Category')
box_ax.set_xlabel('Product Category')
box_ax.set_ylabel('Revenue')
# Tilt the category names so they stay readable.
plt.xticks(rotation=45)
plt.show()

#### Heatmaps

A heatmap is used to visualize relationships or patterns in data, particularly in the form of correlations or counts. It's especially useful for showing how numerical variables relate to each other or how values vary across categories.

Correlation Heatmap: Show correlations between numerical columns.
Insights:-
Correlation Heatmap: Identify strong positive/negative relationships.
Values close to 1.0: Strong positive correlation.
Values close to -1.0: Strong negative correlation.
Values near 0: Weak or no correlation.

# Pairwise correlation matrix of the selected numerical columns.
numeric_columns = [
    'Revenue',
    'Sale Price',
    'Unit Price',
    'Shipping Fee',
    'Order Quantity',
    'Rating',
]
correaltion = clean_df[numeric_columns].corr()
# Draw the matrix as a heatmap.
# annot=True writes the correlation value inside every cell.
# fmt='.2f' formats those values with two decimal places.
heatmap_ax = sns.heatmap(correaltion, annot=True, fmt='.2f')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

This heatmap shows the correlation between the numerical columns in the dataset.
Here Order Quantity and Unit Price have a correlation of -0.28, which is the lowest value among all the numerical pairs. This is a weak negative correlation: orders with a higher unit price tend to have a slightly lower quantity, but the relationship is not strong.
Sale Price and Unit Price have a correlation of 0.70, which is the highest among all the numerical pairs and indicates a fairly strong positive relationship.

### Distribution Analysis

KDE (Kernel Density Estimation) plots are great for understanding the shape of a distribution and spotting patterns in numerical data.
KDE is a non-parametric way to estimate the probability density function of a continuous random variable.
It provides a smoothed curve that represents the distribution of a dataset, helping us visualize how data points are distributed over a range.

# sns.kdeplot() draws the kernel density estimate of a column.
# fill=True shades the area under the curve.
kde_ax = sns.kdeplot(x='Sale Price', data=clean_df, fill=True, color='green')
kde_ax.set_title('KDE Plot of Sale Price')
kde_ax.set_xlabel('Sale Price')
kde_ax.set_ylabel('Density')
plt.show()
Editor is loading...