# 30/50 (only my part)
#
# avatar
# user_2508819
# r
# 2 years ago
# 30 kB
# 4
# Indexable
### ---------------------- Questions To Analysis -------------------------- ###

## Question 1: What Makes Employees Quit Their Jobs? (Termination)


# Analysis 1.1 - Is there correlation between termination type and length of service?

# Keep only the two columns of interest and discard rows with missing values
filtered_data <- drop_na(
  select(new_employee_attrition, termination_type, length_of_service)
)

# Count the rows for every (termination type, length of service) pair
termtype_los <- summarise(
  group_by(filtered_data, termination_type, length_of_service),
  count = n()
)

# Line chart with point markers, one coloured line per termination type
ggplot(
  termtype_los,
  aes(x = length_of_service, y = count, color = termination_type)
) +
  geom_line(size = 1) +
  geom_point(size = 3, shape = 21, fill = "lightskyblue1") +
  ggtitle("Correlation between Termination Type and Length of Service") +
  labs(x = "Length of Service", y = "Count", color = "Termination Type") +
  theme_minimal()


# Analysis 1.2 - Is voluntary termination related to length of service?

# Restrict to voluntary terminations, keeping termination type and tenure
filtered_data <- select(
  filter(new_employee_attrition, termination_type == "Voluntary"),
  termination_type, length_of_service
)

# Number of voluntary terminations at each length of service
voluntary_length_of_service <- summarise(
  group_by(filtered_data, length_of_service),
  count = n()
)

# Put the rows in ascending order of length of service
voluntary_length_of_service <- arrange(
  voluntary_length_of_service,
  length_of_service
)

# Area chart: x breaks every 5 units, y axis with 10% headroom above the peak
ggplot(voluntary_length_of_service, aes(x = length_of_service, y = count)) +
  geom_area(fill = "steelblue", alpha = 0.7) +
  scale_x_continuous(
    breaks = seq(0, max(voluntary_length_of_service$length_of_service), by = 5)
  ) +
  scale_y_continuous(
    limits = c(0, max(voluntary_length_of_service$count) * 1.1)
  ) +
  ggtitle("Relationship Between Voluntary Termination & Length Of Service") +
  labs(x = "Length of Service", y = "Count") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 16, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = "white"),
    legend.position = "none"
  )


# Analysis 1.3 - Is involuntary termination related to length of service?

# Restrict to involuntary terminations, keeping termination type and tenure
filtered_data <- select(
  filter(new_employee_attrition, termination_type == "Involuntary"),
  termination_type, length_of_service
)

# Count rows per (length of service, termination type) combination
involuntary_termination <- summarise(
  group_by(filtered_data, length_of_service, termination_type),
  count = n()
)

# Area chart of the counts, filled by termination type
ggplot(
  involuntary_termination,
  aes(x = length_of_service, y = count, fill = termination_type)
) +
  geom_area(color = "white") +
  scale_fill_manual(values = c("darkgreen", "lightgreen")) +
  ggtitle("Relationship between Involuntary Termination and Length of Service") +
  labs(x = "Length of Service", y = "Count") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = "white"),
    legend.position = "bottom"
  )


# Analysis 1.4 - What is the correlation between voluntary termination (termination type) and age of employee?

# Keep voluntary terminations only, with the termination type and age columns
filtered_data <- new_employee_attrition %>%
  filter(termination_type == "Voluntary") %>%
  select(termination_type, age)

# Group by age and count the voluntary terminations recorded at each age
voluntary_age <- filtered_data %>%
  group_by(age) %>%
  summarize(count = n())

# Connected scatter plot of count against age.
# The line and points use a fixed colour rather than a mapped aesthetic, so
# no colour scale or legend setting is attached: the former
# scale_color_gradient() and legend.position = "none" had nothing to act on.
ggplot(voluntary_age, aes(x = age, y = count)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "steelblue", size = 3) +
  labs(
    x = "Age", y = "Count",
    title = "Correlation between Voluntary Termination and Age"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = "white")
  )


# Analysis 1.5 - What is the correlation between involuntary termination (termination type) and age of employee?

# Keep involuntary terminations only and count them at each age
involuntary_age <- new_employee_attrition %>%
  filter(termination_type == "Involuntary") %>%
  group_by(age) %>%
  summarize(count = n())

# Scatter plot of count against age.
# The points use a fixed colour rather than a mapped aesthetic, so the plot
# has no legend. The former scale_color_gradient(), guides(color = ...) and
# legend.* theme entries had nothing to act on and have been removed.
ggplot(involuntary_age, aes(x = age, y = count)) +
  geom_point(color = "#FF6384", size = 4, alpha = 0.8) +
  labs(
    x = "Age", y = "Count",
    title = "Correlation between Involuntary Termination and Age"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = "white")
  )


# Analysis 1.6 - Is there correlation between termination reason and age of employee?

# Drop rows without a termination reason, then count rows per (age, reason)
termreason_age <- summarise(
  group_by(
    filter(new_employee_attrition, !is.na(termination_reason)),
    age, termination_reason
  ),
  count = n()
)

# Stacked bar chart: one bar per age, segments coloured by termination reason
ggplot(termreason_age, aes(x = age, y = count, fill = termination_reason)) +
  geom_col() +
  labs(
    x = "Age",
    y = "Total Count Of Term. Reasons",
    fill = "Termination Reason",
    title = "Correlation Between Termination Reasons & Employee Ages"
  )


# Analysis 1.7 - How does employees' genders relate to their reasons for leaving?

# Count rows per (gender, termination reason), skipping rows where either is
# missing, and list the result by gender in descending order.
# Reads new_employee_attrition like every other analysis in this script; the
# previous version read `employee_attrition`, a different object.
# NOTE(review): confirm that `employee_attrition` was not intended here.
termreason_gender <- new_employee_attrition %>%
  filter(!is.na(termination_reason) & !is.na(gender)) %>%
  group_by(gender, termination_reason) %>%
  summarize(count = n(), .groups = "drop") %>%
  arrange(desc(gender))

# Grouped (dodged) bar chart: one cluster per gender, one bar per reason
ggplot(termreason_gender, aes(x = gender, y = count, fill = termination_reason)) +
  geom_bar(stat = "identity", position = "dodge", color = "black", alpha = 0.8) +
  labs(x = "Gender", y = "Total Count of Term. Reasons", fill = "Termination Reason") +
  ggtitle("Correlation Between Termination Reasons & Gender") +
  theme_minimal() +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10),
        legend.position = "bottom") +
  scale_fill_brewer(palette = "Set2")


# Analysis 1.8 - What is the relationship between termination types and gender?

# Keep rows where both termination type and gender are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(termination_type), !is.na(gender)
)

# Count rows for each (termination type, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, termination_type, gender),
  count = n()
)

# Stacked bar chart: one bar per termination type, segments by gender
ggplot(grouped_data, aes(x = termination_type, y = count, fill = gender)) +
  geom_col() +
  labs(
    x = "Termination Type", y = "Count", fill = "Gender",
    title = "Relationship Between Termination Types & Gender"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")


# Analysis 1.9 - What is the relationship between voluntary termination and gender?

# Keep rows where both termination type and gender are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(termination_type), !is.na(gender)
)

# Narrow down to voluntary terminations
voluntary_data <- filter(filtered_data, termination_type == "Voluntary")

# Count voluntary terminations per gender
grouped_data <- summarise(group_by(voluntary_data, gender), count = n())

# Show the counts in the console
grouped_data

# Pie chart (stacked bar in polar coordinates) with the count in each slice
ggplot(grouped_data, aes(x = "", y = count, fill = gender)) +
  geom_col(width = 1) +
  geom_text(
    aes(label = count),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 4
  ) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("seagreen3", "dimgray")) +
  labs(
    x = "", y = "", fill = "Gender",
    title = "Correlation Between Voluntary Termination & Gender"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank()
  )


# Analysis 1.10 - What is the relationship between involuntary termination and gender?

# Keep involuntary terminations that have a recorded gender
filtered_data <- new_employee_attrition %>%
  filter(termination_type == "Involuntary" & !is.na(gender))

# Group by gender and calculate the count of involuntary terminations
grouped_data <- filtered_data %>%
  group_by(gender) %>%
  summarize(count = n())

# Pie chart with the count printed in each slice.
# - A single coord_polar() is used. The previous version added two, and the
#   second one (start = 0, direction = -1) replaced the first, so those are
#   the settings kept here.
# - theme_void() is a complete theme and resets anything set before it, so it
#   is applied first and the title/legend tweaks come after it. Previously it
#   came last and discarded the legend position and the bold, size-16 title.
ggplot(grouped_data, aes(x = "", y = count, fill = gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y", start = 0, direction = -1) +
  geom_text(aes(label = count), position = position_stack(vjust = 0.5), color = "white", size = 4) +
  labs(x = "", y = "", fill = "Gender") +
  ggtitle("Correlation Between Involuntary Termination & Gender") +
  scale_fill_manual(values = c("lightsalmon1", "salmon4")) +
  theme_void() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 16, face = "bold", hjust = 0.5))


# Analysis 1.11 - Do job titles relate to employees being terminated voluntarily?
# termination type - voluntary

# Keep voluntary terminations that have a job title
filtered_data <- filter(
  new_employee_attrition,
  termination_type == "Voluntary", !is.na(job_title)
)

# Count voluntary terminations per job title
grouped_data <- summarise(group_by(filtered_data, job_title), count = n())

# Horizontal bar chart (count on x, job title on y) with an enlarged title
ggplot(grouped_data, aes(x = count, y = job_title, fill = job_title)) +
  geom_col(color = "black") +
  labs(
    x = "Count", y = "Job Title", fill = "Job Title",
    title = "Relationship Between Voluntary Termination & Job Title"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(hjust = 0),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 16)
  )


# Analysis 1.12 - Do job titles relate to employees being terminated involuntarily?
# termination type - Involuntary

# Keep involuntary terminations that have a job title
filtered_data <- filter(
  new_employee_attrition,
  termination_type == "Involuntary", !is.na(job_title)
)

# Count involuntary terminations per job title
grouped_data <- summarise(group_by(filtered_data, job_title), count = n())

# Vertical bar chart with x labels rotated 45 degrees
ggplot(grouped_data, aes(x = job_title, y = count, fill = job_title)) +
  geom_col(color = "white") +
  labs(
    x = "Job Title", y = "Count", fill = "Job Title",
    title = "Connection Between Involuntary Termination & Job Title"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )


# Analysis 1.13 - Does the department in which employees work have an impact on their likelihood of termination?

# Keep rows where both department and termination type are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(department_name), !is.na(termination_type)
)

# Count rows for each (department, termination type) combination
grouped_data <- summarise(
  group_by(filtered_data, department_name, termination_type),
  count = n()
)

# Stacked bar chart: one bar per department, segments by termination type
ggplot(grouped_data, aes(x = department_name, y = count, fill = termination_type)) +
  geom_col(color = "black") +
  labs(
    x = "Department", y = "Count", fill = "Termination Type",
    title = "Connection Between Termination Type & Department"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )


# Analysis 1.14 - Does the business unit in which employees work have an influence on the occurrence of terminations?

# Keep rows where both business unit and termination type are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(business_unit), !is.na(termination_type)
)

# Count rows per business unit, largest count first
grouped_data <- summarise(group_by(filtered_data, business_unit), count = n())
grouped_data <- arrange(grouped_data, desc(count))

# Treemap: tile area is the count, tile label and fill are the business unit
ggplot(
  grouped_data,
  aes(area = count, fill = business_unit, label = business_unit)
) +
  geom_treemap() +
  geom_treemap_text(
    fontface = "bold",
    color = "white",
    place = "centre",
    min.size = 0
  ) +
  scale_fill_viridis_d(option = "D", direction = -1) +
  guides(fill = guide_legend(reverse = TRUE)) +
  coord_equal() +
  labs(
    fill = "Business Unit",
    title = "Connection Between Termination Type & Business Unit"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    legend.position = "bottom",
    legend.key.size = unit(0.7, "cm"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12, face = "bold")
  )


# Analysis 1.15 - Does city have to do with the result of terminations from employees?


# Keep terminated rows that have a city
filtered_data <- new_employee_attrition %>%
  filter(!is.na(city) & status == "TERMINATED")

# Group by city and calculate the count of terminations
grouped_data <- filtered_data %>%
  group_by(city) %>%
  summarize(count = n())

# Sort the data by count in descending order (affects the table only)
grouped_data <- grouped_data[order(grouped_data$count, decreasing = TRUE), ]

# Bar chart with the bars ordered from most to fewest terminations.
# Row order alone does not change a discrete axis, so the descending order is
# applied to the x aesthetic with reorder(); before, the bars stayed in the
# default (alphabetical / factor-level) order despite the sort above.
ggplot(grouped_data, aes(x = reorder(city, -count), y = count, fill = city)) +
  geom_bar(stat = "identity", color = "ivory1") +
  labs(x = "City", y = "Termination Count", fill = "City") +
  ggtitle("Correlation Between Terminations Count & City") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")


## Question 2: How is the gender distribution distributed among the employees in the company?


# Analysis 2.1 - What are the total male and female of employees in the company?

# Start again from the full table. The previous version reused whatever
# `filtered_data` the preceding analysis left behind (terminated rows with a
# city), so the chart did not cover the whole company.
filtered_data <- new_employee_attrition %>%
  filter(!is.na(gender))

# Convert gender to character type
filtered_data$gender <- as.character(filtered_data$gender)

# Group by gender and calculate the count of rows
# NOTE(review): this counts table rows; if the table holds several rows per
# employee, the figures are record counts rather than head counts - confirm.
grouped_data <- filtered_data %>%
  group_by(gender) %>%
  summarize(count = n())

# Donut chart (pie with a 0.6 hole), centred title, horizontal legend below
plot_ly(grouped_data, labels = ~gender, values = ~count, type = 'pie',
        text = ~paste(gender, ": ", count), textposition = 'inside',
        hole = 0.6, marker = list(colors = c("steelblue", "pink"))) %>%
  layout(title = list(text = "Gender Proportion in the Company", x = 0.5),
         showlegend = TRUE,
         legend = list(orientation = "h", x = 0.5, y = -0.15))


# Analysis 2.2 - What is the gender ratio between males and females here?

# Keep rows with a recorded gender
filtered_data <- filter(new_employee_attrition, !is.na(gender))

# Count rows per gender
grouped_data <- summarise(group_by(filtered_data, gender), count = n())

# Overall number of counted rows
total_count <- sum(grouped_data$count)

# Express each gender's count as a percentage of the total
grouped_data <- mutate(grouped_data, ratio = count / total_count * 100)

# Single stacked bar, flipped to lie horizontally
ggplot(grouped_data, aes(x = 1, y = ratio, fill = gender)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    x = "", y = "Ratio (%)", fill = "Gender",
    title = "Gender Ratio in the Company"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")


# Analysis 2.3 - How does the gender distribution vary across different departments in the company?

# Keep rows where both gender and department are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(department_name)
)

# Count rows for each (department, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, department_name, gender),
  count = n()
)

# Share of each gender within its department
proportion_data <- mutate(
  group_by(grouped_data, department_name),
  proportion = count / sum(count)
)

# Stacked bar chart of the within-department proportions
ggplot(proportion_data, aes(x = department_name, y = proportion, fill = gender)) +
  geom_col() +
  labs(
    x = "Department", y = "Proportion", fill = "Gender",
    title = "Gender Distribution across Departments"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


# Analysis 2.4 - Is there a difference in the gender distribution between different job titles within the company?

# Keep rows where both gender and job title are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(job_title)
)

# Count rows for each (job title, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, job_title, gender),
  count = n()
)

# Share of each gender within its job title
proportion_data <- mutate(
  group_by(grouped_data, job_title),
  proportion = count / sum(count)
)

# Dot plot with the genders dodged side by side for every job title
ggplot(proportion_data, aes(x = job_title, y = proportion, color = gender)) +
  geom_point(size = 3, position = position_dodge(width = 0.5)) +
  labs(
    x = "Job Title", y = "Proportion", color = "Gender",
    title = "Gender Distribution across Job Titles"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


# Analysis 2.5 - Is there a connection between length of service and gender?


# Keep rows where both gender and length of service are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(length_of_service)
)

# Violin plot of length of service per gender, tails left untrimmed
ggplot(filtered_data, aes(x = gender, y = length_of_service, fill = gender)) +
  geom_violin(trim = FALSE) +
  labs(
    x = "Gender", y = "Length of Service", fill = "Gender",
    title = "Length of Service by Gender"
  ) +
  theme_minimal() +
  theme(legend.position = "none")


# Analysis 2.6 - Is there relationship between gender and the age of employees in the company?

# Keep rows where both gender and age are present
filtered_data <- filter(new_employee_attrition, !is.na(gender), !is.na(age))

# Box plot of age per gender; outlier points are not drawn
ggplot(filtered_data, aes(x = gender, y = age, fill = gender)) +
  geom_boxplot(color = "black", outlier.shape = NA) +
  labs(
    x = "Gender", y = "Age", fill = "Gender",
    title = "Relationship between Gender and Age"
  ) +
  theme_minimal() +
  theme(
    # Titles and axis text
    plot.title = element_text(
      hjust = 0.5, size = 16, face = "bold", color = "slategrey"
    ),
    axis.title = element_text(size = 14, face = "bold", color = "steelblue"),
    axis.text = element_text(size = 12, color = "gray40"),
    # Plot and panel surfaces
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.background = element_rect(fill = "bisque1"),
    panel.background = element_rect(fill = "white"),
    panel.border = element_rect(color = "gray80", fill = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Legend (hidden) and facet strip styling
    legend.position = "none",
    legend.background = element_rect(fill = "white"),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 12),
    legend.key = element_rect(fill = "lightgray", color = NA),
    strip.background = element_rect(fill = "steelblue", color = NA),
    strip.text = element_text(size = 12, face = "bold", color = "white")
  )


# Analysis 2.7 - Is there a relationship between gender and departments in the company?

# Keep rows where both gender and department are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(department_name)
)

# Count rows for each (gender, department) combination
grouped_data <- summarise(
  group_by(filtered_data, gender, department_name),
  count = n()
)

# Heatmap: department on x, gender on y, tile colour from the count
ggplot(grouped_data, aes(x = department_name, y = gender, fill = count)) +
  geom_tile() +
  scale_fill_gradient(low = "red4", high = "red") +
  labs(
    x = "Department", y = "Gender", fill = "Count",
    title = "Correlation between Gender and Department"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )


# Analysis 2.8 - Investigate the gender distribution of Cashier (Job Title).

# Keep only rows whose job title is "Cashier"
filtered_data <- filter(new_employee_attrition, job_title == "Cashier")

# Count cashier rows per gender
grouped_data <- summarise(group_by(filtered_data, gender), count = n())

# Pie chart (stacked bar in polar coordinates) of the gender counts
ggplot(grouped_data, aes(x = "", y = count, fill = gender)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("steelblue", "pink")) +
  labs(
    x = NULL, y = NULL, fill = "Gender",
    title = "Gender Distribution of Cashiers"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 16, face = "bold")
  )


# Analysis 2.9 - Identify the gender proportion of current active employees.

# Filter the data to include only current active employees
filtered_data <- new_employee_attrition %>%
  filter(status == "ACTIVE")

# Group by gender and calculate the count of active rows
grouped_data <- filtered_data %>%
  group_by(gender) %>%
  summarize(count = n())

# Calculate the proportion of each gender
grouped_data <- grouped_data %>%
  mutate(proportion = count / sum(count))

# Bar chart of the gender proportions, each bar labelled with its percentage.
# The earlier version drew a box plot of age by gender, which shows the age
# spread rather than the gender proportion this analysis asks for. The
# calculation now mirrors the one used for past employees in Analysis 2.10.
ggplot(grouped_data, aes(x = gender, y = proportion, fill = gender)) +
  geom_col() +
  geom_text(aes(label = paste0(round(proportion * 100, 1), "%")), vjust = -0.5) +
  labs(x = "Gender", y = "Proportion", fill = "Gender") +
  ggtitle("Gender Distribution of Current Active Employees") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))


# Analysis 2.10 - Identify the gender proportion of past employees.

# Keep only rows whose status is "TERMINATED"
filtered_data <- filter(new_employee_attrition, status == "TERMINATED")

# Count terminated rows per gender
grouped_data <- summarise(group_by(filtered_data, gender), count = n())

# Convert the counts into shares of the total
grouped_data <- mutate(grouped_data, proportion = count / sum(count))

# Horizontal bars, genders ordered by their proportion
ggplot(
  grouped_data,
  aes(x = proportion, y = reorder(gender, proportion), fill = gender)
) +
  geom_col() +
  labs(
    x = "Proportion", y = "Gender", fill = "Gender",
    title = "Gender Proportion of Past Employees"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.y = element_text(hjust = 0)
  )

# Analysis 2.11 - Does the gender ratio change over time? (e.g., analyzing gender distribution by year)

# Keep rows where both gender and status year are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(status_year)
)

# Count rows for each (status year, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, status_year, gender),
  count = n()
)

# Share of each gender within its year
proportion_data <- mutate(
  group_by(grouped_data, status_year),
  proportion = count / sum(count)
)

# Stacked bar chart, one bar per year (year treated as a discrete factor)
ggplot(
  proportion_data,
  aes(x = as.factor(status_year), y = proportion, fill = gender)
) +
  geom_col(position = "stack") +
  labs(
    x = "Year", y = "Proportion", fill = "Gender",
    title = "Gender Distribution Over Time"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")


# Analysis 2.12 - Are there any variations in the gender distribution based on the business units in the company?

# Keep rows where both gender and business unit are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(gender), !is.na(business_unit)
)

# Count rows for each (business unit, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, business_unit, gender),
  count = n()
)

# Share of each gender within its business unit
proportion_data <- mutate(
  group_by(grouped_data, business_unit),
  proportion = count / sum(count)
)

# Three-colour "Set2" palette used for the fill scale below
colors <- brewer.pal(3, "Set2")

# Bars stacked to full height per business unit, coloured with the palette
ggplot(proportion_data, aes(x = business_unit, y = proportion, fill = gender)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = colors) +
  labs(
    x = "Business Unit", y = "Proportion", fill = "Gender",
    title = "Gender Distribution by Business Unit"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


# Analysis 2.13 - How many female employees are still active in this company from 2010 to 2011?

# Convert status_year to numeric. Going through character means that, if the
# column is a factor, its labels are converted rather than its level codes.
# This updates the shared table in place.
new_employee_attrition$status_year <- as.numeric(as.character(new_employee_attrition$status_year))

# Keep ACTIVE female rows for the years 2010-2011.
# The status condition was missing before, so rows of any status were counted
# even though the question and the plot title are about active employees.
# "ACTIVE" is the status value the other analyses in this script filter on.
filtered_data <- new_employee_attrition %>%
  filter(
    gender == "Female",
    status == "ACTIVE",
    status_year >= 2010,
    status_year <= 2011
  )

# Group by status_year and calculate the count of active females
grouped_data <- filtered_data %>%
  group_by(status_year) %>%
  summarize(count = n())

# Scatter plot with the points encircled.
# grouped_data already holds only 2010-2011, so geom_encircle() uses the plot
# data directly instead of re-filtering it. Integer breaks keep the
# continuous year axis from showing fractional years.
ggplot(grouped_data, aes(x = status_year, y = count)) +
  geom_point() +
  geom_encircle(color = "blue", expand = 0.1) +
  scale_x_continuous(breaks = 2010:2011) +
  labs(x = "Year", y = "Count", title = "Number of Active Females (2010-2011)") +
  theme_minimal()


# Analysis 2.14 - Which gender dominates the highest job positions in the company?

# Job titles treated as the higher positions
higher_positions <- c(
  "CEO", "Chief Information Officer", "VP Finances",
  "VP Human Resources", "VP Stores"
)

# Keep rows whose job title is one of the higher positions
filtered_data <- filter(new_employee_attrition, job_title %in% higher_positions)

# Count rows per (gender, job title), then list them by job title with the
# larger count first
grouped_data <- arrange(
  summarise(group_by(filtered_data, gender, job_title), count = n()),
  job_title, desc(count)
)

# Lollipop chart: a stem from zero up to each count, topped by a filled point
ggplot(
  grouped_data,
  aes(x = reorder(job_title, count), y = count, fill = gender)
) +
  geom_segment(
    aes(xend = reorder(job_title, count), yend = 0),
    color = "black"
  ) +
  geom_point(size = 3, color = "black", shape = 21) +
  labs(
    x = "Job Title", y = "Count", fill = "Gender",
    title = "Gender Distribution in Higher Job Positions"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

# Analysis 2.15 - What is the gender distribution across the stores?

# Keep rows where both store id and gender are present
filtered_data <- filter(
  new_employee_attrition,
  !is.na(store_id), !is.na(gender)
)

# Count rows for each (store, gender) combination
grouped_data <- summarise(
  group_by(filtered_data, store_id, gender),
  count = n()
)

# Stacked bar chart: one bar per store id, segments by gender
ggplot(grouped_data, aes(x = store_id, y = count, fill = gender)) +
  geom_col(color = "ivory1") +
  labs(
    x = "Store ID", y = "Count", fill = "Gender",
    title = "Gender Distribution from Stores"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )