assignment
unknown
r
a year ago
4.8 kB
10
Indexable
# Child-mortality assignment script.
#
# Reads "Mortality-Rate.xlsx", cleans it, and then:
#   Task 1 - summarises under-5 child mortality and lists the five highest and
#            five lowest countries for 2015 and 2020;
#   Task 2 - draws histograms of child mortality and government health
#            expenditure for 2015 and 2020;
#   Task 3 - looks for outliers in 2020 GNI per capita (boxplot rule, z-scores).

# Attach a package, installing it first when it is not available.
#
# package: name of the package as a single character string.
#
# Called for its side effects (possible installation, then attaching).
# `requireNamespace()` is used for the availability check so that a failed
# load surfaces as an error from `library()` instead of a silent FALSE.
check_and_install <- function(package) {
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package, dependencies = TRUE)
  }
  library(package, character.only = TRUE)
}

# Convert a column to numeric (rounded to 4 decimals) when every non-missing
# value parses as a number; otherwise return the column unchanged.
#
# The values stay numeric on purpose: formatting them as "%.4f" strings makes
# summary() report Length/Class/Mode and makes arrange() sort alphabetically
# (e.g. "9.0000" after "100.0000"), which breaks the rankings in Task 1.
to_numeric_if_possible <- function(column) {
  # Coercion warnings are expected for text columns and handled just below.
  parsed <- suppressWarnings(as.numeric(as.character(column)))
  if (anyNA(parsed[!is.na(column)])) {
    return(column)
  }
  round(parsed, 4)
}

# Apply function on the packages
check_and_install("readxl")
check_and_install("dplyr")
check_and_install("ggplot2")

# Read excel data
data <- read_excel("Mortality-Rate.xlsx")

# Remove exact duplicate rows, then rows where any cell holds the ".."
# placeholder (rows with NA in the comparison are dropped by filter() too)
clean_data <- distinct(data)
clean_data <- clean_data %>%
  filter(if_all(everything(), ~ . != ".."))

# Remove empty rows
clean_data <- na.omit(clean_data)

# Remove duplicate countries
clean_data <- clean_data %>%
  distinct(`Country Code`, .keep_all = TRUE)

# Convert every column except the first two to numeric values rounded to
# 4 decimals (columns that are not fully numeric are left as they are)
data_transformed <- clean_data %>%
  mutate(across(-c(1, 2), to_numeric_if_possible))

# View the new data frame (only meaningful in an interactive session)
if (interactive()) {
  View(data_transformed)
}

#-----------------------------------------------------TASK 1----------------------------------------------

# Summary statistics (min, quartiles, median, mean, max)
print(summary(
  data_transformed$`Child mortality, under-5 (per 1,000 live births) in 2015`
))
print(summary(
  data_transformed$`Child mortality, under-5 (per 1,000 live births) in 2020`
))

# Find top 5 countries with the largest child mortality in 2015 and 2020
top5_2015 <- data_transformed %>%
  arrange(desc(`Child mortality, under-5 (per 1,000 live births) in 2015`)) %>%
  head(5)
top5_2020 <- data_transformed %>%
  arrange(desc(`Child mortality, under-5 (per 1,000 live births) in 2020`)) %>%
  head(5)

# Find top 5 countries with the smallest child mortality in 2015 and 2020
bottom5_2015 <- data_transformed %>%
  arrange(`Child mortality, under-5 (per 1,000 live births) in 2015`) %>%
  head(5)
bottom5_2020 <- data_transformed %>%
  arrange(`Child mortality, under-5 (per 1,000 live births) in 2020`) %>%
  head(5)

# Print the countries for inspection
print(top5_2015$`Country Name`)
print(top5_2020$`Country Name`)
print(bottom5_2015$`Country Name`)
print(bottom5_2020$`Country Name`)

#-----------------------------------------------------TASK 2----------------------------------------------

# The measure columns are already numeric after the cleaning step above, so
# no further conversion is needed before plotting. Plots are wrapped in
# print() so they are also drawn when the script is run through source().

# Child Mortality
print(
  ggplot(
    data_transformed,
    aes(`Child mortality, under-5 (per 1,000 live births) in 2015`)
  ) +
    geom_histogram(bins = 30) +
    ggtitle("Child Mortality in 2015")
)
print(
  ggplot(
    data_transformed,
    aes(`Child mortality, under-5 (per 1,000 live births) in 2020`)
  ) +
    geom_histogram(bins = 30) +
    ggtitle("Child Mortality in 2020")
)

# Domestic general government health expenditure
print(
  ggplot(
    data_transformed,
    aes(`Domestic general government health expenditure (% of GDP) in 2015`)
  ) +
    geom_histogram(bins = 30) +
    ggtitle("Government Health Expenditure in 2015")
)
print(
  ggplot(
    data_transformed,
    aes(`Domestic general government health expenditure (% of GDP) in 2020`)
  ) +
    geom_histogram(bins = 30) +
    ggtitle("Government Health Expenditure in 2020")
)

#-----------------------------------------------------TASK 3----------------------------------------------

# Boxplot for visual analysis
print(
  ggplot(
    data_transformed,
    aes(x = " ", y = `GNI per capita (constant 2015 US$) in 2020`)
  ) +
    geom_boxplot() +
    ggtitle("GNI per capita in 2020 - Boxplot")
)

# Identify outliers in the GNI per capita data (values beyond the boxplot
# whiskers, as computed by boxplot.stats())
boxplot_outliers <- boxplot.stats(
  data_transformed$`GNI per capita (constant 2015 US$) in 2020`
)$out
print(boxplot_outliers)

# Calculate z-scores
z_scores <- scale(
  data_transformed$`GNI per capita (constant 2015 US$) in 2020`,
  center = TRUE,
  scale = TRUE
)

# Identify outliers (threshold = 2 standard deviations); the result holds the
# row positions in data_transformed, not the GNI values themselves
z_score_outliers <- which(abs(z_scores) > 2)
print(z_score_outliers)
Editor is loading...
Leave a Comment