# assignment
#
# mail@pastecode.io avatar
# unknown
# r
# a month ago
# 4.8 kB
# 7
# Indexable
# Never
# Attach `package`, installing it (with its dependencies) first when it
# cannot be loaded.
#
# package: name of the package as a single character string.
# Returns NULL invisibly when the package was already available; otherwise
# returns whatever library() returns after the installation.
check_and_install <- function(package) {
  # require() attaches the package and reports success as TRUE/FALSE
  already_available <- require(package, character.only = TRUE)
  if (already_available) {
    return(invisible(NULL))
  }

  install.packages(package, dependencies = TRUE)
  library(package, character.only = TRUE)
}

# Install (when missing) and attach each package the script relies on.
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(c("readxl", "dplyr", "ggplot2"), check_and_install))

# Read the Excel workbook (path is relative to the working directory)
data <- read_excel("Mortality-Rate.xlsx")

# Remove rows that are exact duplicates of another row
clean_data <- distinct(data)

# Remove rows in which any column holds the ".." placeholder
# (presumably the workbook's marker for a missing value -- confirm).
# A comparison against NA yields NA, and filter() keeps only rows where the
# condition is TRUE, so rows containing NA are dropped by this step as well.
# if_all() replaces the superseded filter_all(all_vars(...)) form.
clean_data <- clean_data %>%
  filter(if_all(everything(), ~ .x != ".."))

# Remove rows with missing values
clean_data <- na.omit(clean_data)

# Remove duplicate countries: keep the first row for each country code
clean_data <- clean_data %>%
  distinct(`Country Code`, .keep_all = TRUE)

# Convert a column to numeric, rounded to 4 decimal places, when every
# non-missing value parses as a number; otherwise return it unchanged.
#
# Formatting with sprintf() would turn the values into character strings,
# which makes summary() report Length/Class/Mode and makes arrange() sort
# lexicographically ("100.0000" before "9.5000"), so real numbers are kept.
round_if_numeric <- function(column) {
  # Unparseable values are detected below, so the coercion warning is noise
  parsed <- suppressWarnings(as.numeric(as.character(column)))
  # A value that was present but failed to parse means the column is not
  # purely numeric; leave such a column exactly as it was
  if (any(is.na(parsed) & !is.na(column))) {
    return(column)
  }
  round(parsed, 4)
}

# Convert every column except the first two to numeric where possible
data_transformed <- clean_data %>%
  mutate(across(-c(1, 2), round_if_numeric))

# View the new data frame (only when a session is attached; the viewer is
# not available when the script runs non-interactively)
if (interactive()) {
  View(data_transformed)
}

#-----------------------------------------------------TASK 1----------------------------------------------
# Work on numeric copies of the two mortality columns. If the columns are
# stored as text, summary() only reports Length/Class/Mode and sorting is
# lexicographic ("100.0000" sorts before "9.5000"), which gives wrong
# rankings. as.character() first keeps the conversion safe for any column
# type that reaches this point.
mortality_2015 <- as.numeric(as.character(
  data_transformed$`Child mortality, under-5 (per 1,000 live births) in 2015`
))
mortality_2020 <- as.numeric(as.character(
  data_transformed$`Child mortality, under-5 (per 1,000 live births) in 2020`
))

# Summary statistics (min, quartiles, median, mean, max)
summary(mortality_2015)
summary(mortality_2020)

# Find top 5 countries with the largest child mortality in 2015 and 2020.
# The numeric vectors have one entry per row of data_transformed, so they
# can be used directly as sort keys; arrange() places NA values last.
top5_2015 <- data_transformed %>%
  arrange(desc(mortality_2015)) %>%
  head(5)

top5_2020 <- data_transformed %>%
  arrange(desc(mortality_2020)) %>%
  head(5)

# Find the 5 countries with the smallest child mortality in 2015 and 2020
bottom5_2015 <- data_transformed %>%
  arrange(mortality_2015) %>%
  head(5)

bottom5_2020 <- data_transformed %>%
  arrange(mortality_2020) %>%
  head(5)

# Print the countries for inspection
print(top5_2015$`Country Name`)
print(top5_2020$`Country Name`)

print(bottom5_2015$`Country Name`)
print(bottom5_2020$`Country Name`)

#-----------------------------------------------------TASK 2----------------------------------------------
# Coerce the four plotted indicator columns to numeric in one pass
data_transformed <- data_transformed %>%
  mutate(across(
    all_of(c(
      "Child mortality, under-5 (per 1,000 live births) in 2015",
      "Child mortality, under-5 (per 1,000 live births) in 2020",
      "Domestic general government health expenditure (% of GDP) in 2015",
      "Domestic general government health expenditure (% of GDP) in 2020"
    )),
    ~ as.numeric(as.character(.x))
  ))

# Histograms of child mortality, one per year
ggplot(
  data_transformed,
  aes(x = `Child mortality, under-5 (per 1,000 live births) in 2015`)
) +
  geom_histogram() +
  ggtitle("Child Mortality in 2015")

ggplot(
  data_transformed,
  aes(x = `Child mortality, under-5 (per 1,000 live births) in 2020`)
) +
  geom_histogram() +
  ggtitle("Child Mortality in 2020")

# Histograms of domestic general government health expenditure, one per year
ggplot(
  data_transformed,
  aes(x = `Domestic general government health expenditure (% of GDP) in 2015`)
) +
  geom_histogram() +
  ggtitle("Government Health Expenditure in 2015")

ggplot(
  data_transformed,
  aes(x = `Domestic general government health expenditure (% of GDP) in 2020`)
) +
  geom_histogram() +
  ggtitle("Government Health Expenditure in 2020")

#-----------------------------------------------------TASK 3----------------------------------------------

# Coerce the GNI per capita column to numeric
data_transformed$`GNI per capita (constant 2015 US$) in 2020` <-
  data_transformed$`GNI per capita (constant 2015 US$) in 2020` %>%
  as.character() %>%
  as.numeric()

# Single box plot of GNI per capita for visual inspection
ggplot(
  data_transformed,
  aes(x = " ", y = `GNI per capita (constant 2015 US$) in 2020`)
) +
  geom_boxplot() +
  ggtitle("GNI per capita in 2020 - Boxplot")

# Values that boxplot.stats() reports as lying beyond the whiskers
boxplot_outliers <- boxplot.stats(
  data_transformed$`GNI per capita (constant 2015 US$) in 2020`
)[["out"]]
print(boxplot_outliers)

# Standardise the column; scale() centers and scales by default
z_scores <- scale(
  data_transformed$`GNI per capita (constant 2015 US$) in 2020`
)

# Row positions whose z-score is more than 2 standard deviations from the mean
z_score_outliers <- which(z_scores > 2 | z_scores < -2)
print(z_score_outliers)
# Leave a Comment