Untitled

Sys.setenv(LANG = "en")
options(scipen=999)
library("readxl")
library("tidyr")
library("reshape")
library("stringr")
library("MASS")
library("sandwich")
library("zoo")
library("car")
library("lmtest") 
library("Formula")
library("plm")
library("stargazer")
library("tidyverse")
library(reshape)

countries <- c('AFG', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 
               'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 
               'BES', 'BIH', 'BWA', 'BVT', 'BRA', 'IOT', 'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 
               'CAN', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'CXR', 'CCK', 'COL', 'COM', 'COD', 'COG', 'COK', 
               'CRI', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'CIV', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 
               'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FLK', 'FRO', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 
               'ATF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 
               'GGY', 'GIN', 'GNB', 'GUY', 'HTI', 'HMD', 'VAT', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 
               'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 
               'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 
               'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 
               'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 
               'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'NFK', 'MNP', 'NOR', 'OMN', 'PAK', 'PLW', 'PSE', 
               'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'PRI', 'QAT', 'MKD', 'ROU', 'RUS', 
               'RWA', 'REU', 'BLM', 'SHN', 'KNA', 'LCA', 'MAF', 'SPM', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 
               'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SGS', 'SSD', 
               'ESP', 'LKA', 'SDN', 'SUR', 'SJM', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 
               'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 
               'UMI', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'VGB', 'VIR', 'WLF', 'ESH', 'YEM', 'ZMB', 
               'ZWE')

# Population data

Population = read.csv(file="Population.csv", 
                   sep=",", skip = 0, header=TRUE)

Population['Source'] = 'Population'

# Contextual data

Contextual = read.csv(file="Contextual Indicators.csv", 
                   sep=",", skip = 0, header=TRUE)

Contextual['Source'] = 'Contextual'

# Leadership data

Leadership = read.csv(file="Leadership.csv", 
                   sep=",", skip = 0, header=TRUE)

Leadership['Source'] = 'Leadership'

# Employment and Time Use data

Employment = read.csv(file="Employment and Time Use.csv", 
                   sep=",", skip = 0, header=TRUE)

Employment['Source'] = 'Employment'

# Entrepreneurship data

Entrepreneurship = read.csv(file="Entrepreneurship.csv", 
                   sep=",", skip = 0, header=TRUE)


Entrepreneurship['Source'] = 'Entrepreneurship'

Employment = Employment %>% filter(Indicator.Code	 != 'SL.EMP.SMGT.FE.ZS') # Remove duplicate column that comes from leadership
Entrepreneurship = Entrepreneurship %>% filter(Indicator.Code	 != 'IC.FRM.FEMM.ZS') # Remove duplicate column that comes from leadership

# Union all datasets

union <- rbind(Population, Contextual, Leadership, Employment, Entrepreneurship)

## Remove binary columns as time invariant features cannot be used in panel models
union <- union %>% filter(!grepl("(1=yes; 0=no)", union$Indicator.Name, fixed = TRUE)) 

search <- c('SP.POP.TOTL.FE.ZS',
'SP.RUR.TOTL.ZS',
'SP.DYN.TFRT.IN',
'NY.GDP.PCAP.CD',
'SH.PAR.LEVE.FE',
'SH.PAR.LEVE.MA',
'SL.TLF.TOTL.FE.IN',
'SP.DYN.LE00.FE.IN',
'SL.UEM.TOTL.FE.ZS',
'SL.EMP.MPYR.FE.ZS',
'SL.EMP.WORK.FE.ZS',
'SE.XPD.TOTL.GD.ZS',
'SG.GEN.PARL.ZS',
'SP.POP.0014.FE.IN')

union_filtered <- union[union$Indicator.Code %in% search,]
selected_indicators <- data.frame(union_filtered %>% 
                        group_by(Source, Indicator.Code, Indicator.Name) %>% 
                        count() %>% arrange(-n) %>% filter(n>1000))

selected_indicators

# Casting unioned datasets, grouping by Year, Country.Name and Country.Code
union_filtered_cast <- cast(union_filtered, Year + Country.Name + Country.Code ~ Indicator.Code, value = "Value")
head(union_filtered_cast)
dim(union_filtered_cast)

# Remove aggregates for regions - selecting only countries
data <-  union_filtered_cast[union_filtered_cast$Country.Code %in% countries,]

data <- data %>% drop_na('SG.GEN.PARL.ZS') # Drop all rows that has NA in dependent variable

data <- data %>% filter(Year > 2000) # Filter dataset to exclude all observations before 2001

# Attempt to imput country means for missing values under each indicator
data <- data %>% group_by(Country.Code) %>% 
                    mutate_if(is.numeric, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))

data <- na.omit(data) # Remove rows where there is still missing data


data$SL.TLF.TOTL.FE.IN = data$SL.TLF.TOTL.FE.IN/1000000 # Total labour force female in millions
data$SP.POP.0014.FE.IN = data$SP.POP.0014.FE.IN/1000000 # 0-14 age female population, in millions
# Replacing 0's with 1 to perform log transformation without errors
data$SH.PAR.LEVE.FE <- ifelse(data$SH.PAR.LEVE.FE == 0, 1, data$SH.PAR.LEVE.FE)
# Replacing 0's with 1 to perform log transformation without errors
data$SH.PAR.LEVE.MA <- ifelse(data$SH.PAR.LEVE.MA == 0, 1, data$SH.PAR.LEVE.MA)
# Replacing 0's with 1 to perform log transformation without errors
data$SL.TLF.TOTL.FE.IN <- ifelse(data$SL.TLF.TOTL.FE.IN == 0, 1, data$SL.TLF.TOTL.FE.IN)

# Interaction between life expectancy at birth and 0-14 age female population
data$SP.DYN.LE00.x.POP.0014.FE <- data$SP.DYN.LE00.FE.IN * data$SP.POP.0014.FE.IN
data$SP.DYN.LE00.FE.IN  <- NULL # Drop column
data$SP.POP.0014.FE.IN <- NULL # Drop column
data$SP.DYN.TFRT.IN3 <- data$SP.DYN.TFRT.IN^3 # 3rd power of fertility rate per 1000 women
data$SP.DYN.TFRT.IN <- NULL # Drop column
            
#continuous columns to be converted to log
log_cols <- c(
'NY.GDP.PCAP.CD',
'SH.PAR.LEVE.FE',
'SH.PAR.LEVE.MA',
'SP.DYN.TFRT.IN3',
'SL.TLF.TOTL.FE.IN',
'SP.DYN.LE00.x.POP.0014.FE')


# apply log transformation to selected columns
data <- data %>% mutate(across(all_of(log_cols), ~ log(.))) 

# divide other columns by 100 to get right pct format
data <- data %>% mutate(across(!all_of(c(log_cols, 'Year', 'Country.Name')), ~ ./100))
            
# add 'ln.' prefix for log transformaed indicators
colnames(data) <- ifelse(colnames(data) %in% log_cols, paste0("ln.", colnames(data)), colnames(data))
            
head(data)

summary(data)

dependent <- "SG.GEN.PARL.ZS" # Indicator code of dependent variable

# Specify the names to be excluded to define independent variables
to_exclude <- c("SG.GEN.PARL.ZS", "Year", "Country.Name", "Country.Code")

# Explanatory variables
features <- colnames(data)[!colnames(data) %in% to_exclude]

# Prepare form text
explanatory <- paste0(features,collapse='+')
form <- as.formula(paste0(dependent, " ~ ", explanatory)) # General model form

form 

fixed <- plm(form, data=data, index=c("Country.Name", "Year"), model="within") # within > Fixed Effects Model
random <- plm(form, data=data, index=c("Country.Name", "Year"), model="random") # random > Random Effects Model

models <- list("FE" = fixed, "RE" = random)

stargazer(models, title = "(1) - Fixed Effects Model | (2) - Random Effects Model", type = "text",
          star.cutoffs = c(0.05, 0.01, 0.001)) # One quality table for fixed and random effects models

# Testing for individual effects in fixed effects model
# H0: all ui = 0 (all individual effects are equal to zero - use OLS)
ols <- lm(form, data=data) # Estimate OLS
pF <- pFtest(fixed, ols)  # Compare fixed effects model with OLS

print(pF)

# Testing for individual effects in random effects model
# H0: all ui = 0 (all individual effects are equal to zero - use OLS)
# POLS <-plm(form, data=data, index=c("Country.Name", "Year"), model="pooling")

plmtest(random, type=c("bp")) # For random

# Testing for model consistency both for random and fixed effects models

phtest(fixed,random)

# H0: Both models are consistent
# H1: One model is inconsistent

# H0 rejected, at least one model is inconsistent which is random effects model
# Therefore, fixed effects model should be used

# Testing for heteroskedasticity -- are the variance of residuals are constant?
# If so, we should use roboust estimator to correct the errors
bp <- bptest(form, data=data, studentize=T)
print(bp)

# Testing for serial correlation -- checking if we have auto correlation or not
bpg <- pbgtest(fixed)
print(bpg)

# Controlling for heteroskedasticity and autocorrelation
# this will not change estimates but errors, t and  p values
coef <- coeftest(fixed, vcov.=vcovHC(fixed, method="white1", type="HC0", cluster="group"))
print(coef)

coef_extract <- as.data.frame(coef[,c(1,2,3,4)]) # Extract estimates, std. error, t and p values from coef
coef_extract[,"Pr(>|t|)"] <- round(coef_extract[,"Pr(>|t|)"],8) # Round values in p value column
coef_extract$var <- rownames(coef_extract) # Extract row names and assigned them to var column
rownames(coef_extract) <- NULL # Remove row names
coef_extract[coef_extract["Pr(>|t|)"]>=0.05,] # Only insignificant variables

insig <- coef_extract[coef_extract["Pr(>|t|)"]>=0.05,"var"] # Names of insignificant variables
sig <- coef_extract[coef_extract["Pr(>|t|)"]<0.05,"var"] # Names of significant variables

print("Insignificant variables:")
print(insig)

print("Significant variables:")
print(sig)

# Prepare form text
explanatory_sig <- paste0(sig,collapse='+')
form_sig <- as.formula(paste0(dependent, " ~ ", explanatory_sig)) # Model form with only significant variables

form_sig

# Estimate new model with only significant variables
fixed_sig <- plm(form_sig, data=data, index=c("Country.Name", "Year"), model="within") # within > FE
summary(fixed_sig)

# Testing for individual effects in fixed effects model
# H0: all ui = 0 (all individual effects are equal to zero - use OLS)
ols_sig <- lm(form_sig, data=data)
pF_sig <- pFtest(fixed_sig, ols) 
print(pF_sig)

# Testing if all insignificant variables are jointly insignificant by comparing 
# the general model and the model with only significant variables

pFtest(fixed, fixed_sig)

# H0: all insignificant variables are jointly insignificant
# H1: all insignificant variables are not jointly insignificant

# Not rejected, we can remove all insignificant variables from the general model

# Another joint insignificance test if all insignificant variables are jointly insignificant

linearHypothesis(fixed, paste(insig, "=0"))

# H0: All insignificant variables are jointly insignificant
# Not rejected, we can remove all insignificant variables from the general model

# Final model diagnostics

# Testing for heteroskedasticity -- are the variance of residuals are constant?
# If so, we should use roboust estimator to correct the errors
bp_sig <- bptest(form_sig, data=data, studentize=T)
print(bp_sig)

# Testing for serial correlation -- checking if we have auto correlation or not
bpg_sig <- pbgtest(fixed_sig)
print(bpg_sig)

# Controlling for heteroskedasticity and autocorrelation
# this will not change estimates but errors, t and  p values
coef_sig <- coeftest(fixed_sig, vcov.=vcovHC(fixed, method="white1", type="HC0", cluster="group"))
print(coef_sig)

# Preparing one quality table

models <- list("General Model" = fixed, "Specific Model" = fixed_sig)
model_options <- list(coef, coef_sig)

stargazer(models, title = "(1) - General Fixed Effects Model | (2) - Specific Fixed Effects Model", 
          type = "text", ... = model_options,
          star.cutoffs = c(0.05, 0.01, 0.001))
Editor is loading...