import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build a synthetic sales table: 50 products, one row each, dated on
# consecutive days starting 2022-01-01.
# The random columns are drawn in the order Category, Price, Quantity Sold.
categories = np.random.choice(['Electronics', 'Clothing', 'Home Goods'], size=50)
prices = np.round(np.random.uniform(10, 100, size=50), 2)
quantities = np.random.randint(1, 50, size=50)  # upper bound is exclusive: 1..49

data = {
    'Product ID': np.arange(1, 51),
    'Product Name': [f'Product{i}' for i in range(1, 51)],
    'Category': categories,
    'Price': prices,
    'Quantity Sold': quantities,
    'Date Sold': pd.date_range('2022-01-01', periods=50),
}

# Assemble the DataFrame, then clean it: remove exact duplicate rows first,
# then any row that has a missing value.
# (To analyse real data instead, build the frame with
#  pd.read_csv('sales_data.csv') in place of pd.DataFrame(data).)
sales_df = pd.DataFrame(data).drop_duplicates().dropna()
# Exploratory data analysis
# Revenue of a row is its unit price times the units sold. Summing 'Price'
# alone would ignore 'Quantity Sold' and, with one row per product, simply
# repeat the average-price figures below.
revenue = (sales_df['Price'] * sales_df['Quantity Sold']).rename('Total Sales')

# Total revenue per product and per category, largest first.
total_sales_by_product = revenue.groupby(sales_df['Product Name']).sum().sort_values(ascending=False)
total_sales_by_category = revenue.groupby(sales_df['Category']).sum().sort_values(ascending=False)

# Mean unit price per product and per category, largest first.
avg_price_by_product = sales_df.groupby('Product Name')['Price'].mean().sort_values(ascending=False)
avg_price_by_category = sales_df.groupby('Category')['Price'].mean().sort_values(ascending=False)

# Up to five products / categories with the most units sold.
top_selling_products = sales_df.groupby('Product Name')['Quantity Sold'].sum().sort_values(ascending=False).head(5)
top_selling_categories = sales_df.groupby('Category')['Quantity Sold'].sum().sort_values(ascending=False).head(5)
# Visualization
# One bar chart per aggregate, shown one after another in the order listed.
# Each entry is (series to draw, chart title, x-axis label, y-axis label).
charts = [
    (total_sales_by_product, 'Total Sales by Product', 'Product Name', 'Total Sales'),
    (total_sales_by_category, 'Total Sales by Category', 'Category', 'Total Sales'),
    (avg_price_by_product, 'Average Price by Product', 'Product Name', 'Average Price'),
    (avg_price_by_category, 'Average Price by Category', 'Category', 'Average Price'),
    (top_selling_products, 'Top Selling Products', 'Product Name', 'Quantity Sold'),
    (top_selling_categories, 'Top Selling Categories', 'Category', 'Quantity Sold'),
]
for series, title, x_label, y_label in charts:
    # Open a fresh figure first so the bar plot lands on it.
    plt.figure(figsize=(10, 8))
    series.plot(kind='bar')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
# Console summary: print every aggregate under its heading, in the same
# order as the charts.
reports = [
    ('Total Sales by Product:\n', total_sales_by_product),
    ('Total Sales by Category:\n', total_sales_by_category),
    ('Average Price by Product:\n', avg_price_by_product),
    ('Average Price by Category:\n', avg_price_by_category),
    ('Top Selling Products:\n', top_selling_products),
    ('Top Selling Categories:\n', top_selling_categories),
]
for heading, values in reports:
    print(heading, values)