Untitled
unknown
r
2 years ago
12 kB
2
Indexable
Sys.setenv(LANG = "en") options(scipen=999) library("readxl") library("tidyr") library("reshape") library("stringr") library("MASS") library("sandwich") library("zoo") library("car") library("lmtest") library("Formula") library("plm") library("stargazer") library("tidyverse") library(reshape) countries <- c('AFG', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BES', 'BIH', 'BWA', 'BVT', 'BRA', 'IOT', 'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'CXR', 'CCK', 'COL', 'COM', 'COD', 'COG', 'COK', 'CRI', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'CIV', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FLK', 'FRO', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'ATF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 'GGY', 'GIN', 'GNB', 'GUY', 'HTI', 'HMD', 'VAT', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'NFK', 'MNP', 'NOR', 'OMN', 'PAK', 'PLW', 'PSE', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'PRI', 'QAT', 'MKD', 'ROU', 'RUS', 'RWA', 'REU', 'BLM', 'SHN', 'KNA', 'LCA', 'MAF', 'SPM', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SGS', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SJM', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'UMI', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'VGB', 'VIR', 'WLF', 'ESH', 'YEM', 'ZMB', 'ZWE') # Population data Population = read.csv(file="Population.csv", sep=",", skip = 0, header=TRUE) Population['Source'] = 'Population' # Contextual data Contextual = read.csv(file="Contextual Indicators.csv", sep=",", skip = 0, header=TRUE) Contextual['Source'] = 'Contextual' # Leadership data Leadership = read.csv(file="Leadership.csv", sep=",", skip = 0, header=TRUE) Leadership['Source'] = 'Leadership' # Employment and Time Use data Employment = read.csv(file="Employment and Time Use.csv", sep=",", skip = 0, header=TRUE) Employment['Source'] = 'Employment' # Entrepreneurship data Entrepreneurship = read.csv(file="Entrepreneurship.csv", sep=",", skip = 0, header=TRUE) Entrepreneurship['Source'] = 'Entrepreneurship' Employment = Employment %>% filter(Indicator.Code != 'SL.EMP.SMGT.FE.ZS') # Remove duplicate column that comes from leadership Entrepreneurship = Entrepreneurship %>% filter(Indicator.Code != 'IC.FRM.FEMM.ZS') # Remove duplicate column that comes from leadership # Union all datasets union <- rbind(Population, Contextual, Leadership, Employment, Entrepreneurship) ## Remove binary columns as time invariant features cannot be used in panel models union <- union %>% filter(!grepl("(1=yes; 0=no)", union$Indicator.Name, fixed = TRUE)) search <- c('SP.POP.TOTL.FE.ZS', 'SP.RUR.TOTL.ZS', 'SP.DYN.TFRT.IN', 'NY.GDP.PCAP.CD', 'SH.PAR.LEVE.FE', 'SH.PAR.LEVE.MA', 'SL.TLF.TOTL.FE.IN', 'SP.DYN.LE00.FE.IN', 'SL.UEM.TOTL.FE.ZS', 'SL.EMP.MPYR.FE.ZS', 'SL.EMP.WORK.FE.ZS', 'SE.XPD.TOTL.GD.ZS', 'SG.GEN.PARL.ZS', 'SP.POP.0014.FE.IN') union_filtered <- union[union$Indicator.Code %in% search,] selected_indicators <- data.frame(union_filtered %>% group_by(Source, Indicator.Code, Indicator.Name) %>% count() %>% arrange(-n) %>% filter(n>1000)) selected_indicators # Casting unioned datasets, grouping by Year, Country.Name and Country.Code union_filtered_cast <- cast(union_filtered, Year + Country.Name + Country.Code ~ Indicator.Code, value = "Value") head(union_filtered_cast) dim(union_filtered_cast) # Remove aggregates for regions - selecting only countries data <- union_filtered_cast[union_filtered_cast$Country.Code %in% countries,] data <- data %>% drop_na('SG.GEN.PARL.ZS') # Drop all rows that has NA in dependent variable data <- data %>% filter(Year > 2000) # Filter dataset to exclude all observations before 2001 # Attempt to imput country means for missing values under each indicator data <- data %>% group_by(Country.Code) %>% mutate_if(is.numeric, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)) data <- na.omit(data) # Remove rows where there is still missing data data$SL.TLF.TOTL.FE.IN = data$SL.TLF.TOTL.FE.IN/1000000 # Total labour force female in millions data$SP.POP.0014.FE.IN = data$SP.POP.0014.FE.IN/1000000 # 0-14 age female population, in millions # Replacing 0's with 1 to perform log transformation without errors data$SH.PAR.LEVE.FE <- ifelse(data$SH.PAR.LEVE.FE == 0, 1, data$SH.PAR.LEVE.FE) # Replacing 0's with 1 to perform log transformation without errors data$SH.PAR.LEVE.MA <- ifelse(data$SH.PAR.LEVE.MA == 0, 1, data$SH.PAR.LEVE.MA) # Replacing 0's with 1 to perform log transformation without errors data$SL.TLF.TOTL.FE.IN <- ifelse(data$SL.TLF.TOTL.FE.IN == 0, 1, data$SL.TLF.TOTL.FE.IN) # Interaction between life expectancy at birth and 0-14 age female population data$SP.DYN.LE00.x.POP.0014.FE <- data$SP.DYN.LE00.FE.IN * data$SP.POP.0014.FE.IN data$SP.DYN.LE00.FE.IN <- NULL # Drop column data$SP.POP.0014.FE.IN <- NULL # Drop column data$SP.DYN.TFRT.IN3 <- data$SP.DYN.TFRT.IN^3 # 3rd power of fertility rate per 1000 women data$SP.DYN.TFRT.IN <- NULL # Drop column #continuous columns to be converted to log log_cols <- c( 'NY.GDP.PCAP.CD', 'SH.PAR.LEVE.FE', 'SH.PAR.LEVE.MA', 'SP.DYN.TFRT.IN3', 'SL.TLF.TOTL.FE.IN', 'SP.DYN.LE00.x.POP.0014.FE') # apply log transformation to selected columns data <- data %>% mutate(across(all_of(log_cols), ~ log(.))) # divide other columns by 100 to get right pct format data <- data %>% mutate(across(!all_of(c(log_cols, 'Year', 'Country.Name')), ~ ./100)) # add 'ln.' prefix for log transformaed indicators colnames(data) <- ifelse(colnames(data) %in% log_cols, paste0("ln.", colnames(data)), colnames(data)) head(data) summary(data) dependent <- "SG.GEN.PARL.ZS" # Indicator code of dependent variable # Specify the names to be excluded to define independent variables to_exclude <- c("SG.GEN.PARL.ZS", "Year", "Country.Name", "Country.Code") # Explanatory variables features <- colnames(data)[!colnames(data) %in% to_exclude] # Prepare form text explanatory <- paste0(features,collapse='+') form <- as.formula(paste0(dependent, " ~ ", explanatory)) # General model form form fixed <- plm(form, data=data, index=c("Country.Name", "Year"), model="within") # within > Fixed Effects Model random <- plm(form, data=data, index=c("Country.Name", "Year"), model="random") # random > Random Effects Model models <- list("FE" = fixed, "RE" = random) stargazer(models, title = "(1) - Fixed Effects Model | (2) - Random Effects Model", type = "text", star.cutoffs = c(0.05, 0.01, 0.001)) # One quality table for fixed and random effects models # Testing for individual effects in fixed effects model # H0: all ui = 0 (all individual effects are equal to zero - use OLS) ols <- lm(form, data=data) # Estimate OLS pF <- pFtest(fixed, ols) # Compare fixed effects model with OLS print(pF) # Testing for individual effects in random effects model # H0: all ui = 0 (all individual effects are equal to zero - use OLS) # POLS <-plm(form, data=data, index=c("Country.Name", "Year"), model="pooling") plmtest(random, type=c("bp")) # For random # Testing for model consistency both for random and fixed effects models phtest(fixed,random) # H0: Both models are consistent # H1: One model is inconsistent # H0 rejected, at least one model is inconsistent which is random effects model # Therefore, fixed effects model should be used # Testing for heteroskedasticity -- are the variance of residuals are constant? # If so, we should use roboust estimator to correct the errors bp <- bptest(form, data=data, studentize=T) print(bp) # Testing for serial correlation -- checking if we have auto correlation or not bpg <- pbgtest(fixed) print(bpg) # Controlling for heteroskedasticity and autocorrelation # this will not change estimates but errors, t and p values coef <- coeftest(fixed, vcov.=vcovHC(fixed, method="white1", type="HC0", cluster="group")) print(coef) coef_extract <- as.data.frame(coef[,c(1,2,3,4)]) # Extract estimates, std. error, t and p values from coef coef_extract[,"Pr(>|t|)"] <- round(coef_extract[,"Pr(>|t|)"],8) # Round values in p value column coef_extract$var <- rownames(coef_extract) # Extract row names and assigned them to var column rownames(coef_extract) <- NULL # Remove row names coef_extract[coef_extract["Pr(>|t|)"]>=0.05,] # Only insignificant variables insig <- coef_extract[coef_extract["Pr(>|t|)"]>=0.05,"var"] # Names of insignificant variables sig <- coef_extract[coef_extract["Pr(>|t|)"]<0.05,"var"] # Names of significant variables print("Insignificant variables:") print(insig) print("Significant variables:") print(sig) # Prepare form text explanatory_sig <- paste0(sig,collapse='+') form_sig <- as.formula(paste0(dependent, " ~ ", explanatory_sig)) # Model form with only significant variables form_sig # Estimate new model with only significant variables fixed_sig <- plm(form_sig, data=data, index=c("Country.Name", "Year"), model="within") # within > FE summary(fixed_sig) # Testing for individual effects in fixed effects model # H0: all ui = 0 (all individual effects are equal to zero - use OLS) ols_sig <- lm(form_sig, data=data) pF_sig <- pFtest(fixed_sig, ols) print(pF_sig) # Testing if all insignificant variables are jointly insignificant by comparing # the general model and the model with only significant variables pFtest(fixed, fixed_sig) # H0: all insignificant variables are jointly insignificant # H1: all insignificant variables are not jointly insignificant # Not rejected, we can remove all insignificant variables from the general model # Another joint insignificance test if all insignificant variables are jointly insignificant linearHypothesis(fixed, paste(insig, "=0")) # H0: All insignificant variables are jointly insignificant # Not rejected, we can remove all insignificant variables from the general model # Final model diagnostics # Testing for heteroskedasticity -- are the variance of residuals are constant? # If so, we should use roboust estimator to correct the errors bp_sig <- bptest(form_sig, data=data, studentize=T) print(bp_sig) # Testing for serial correlation -- checking if we have auto correlation or not bpg_sig <- pbgtest(fixed_sig) print(bpg_sig) # Controlling for heteroskedasticity and autocorrelation # this will not change estimates but errors, t and p values coef_sig <- coeftest(fixed_sig, vcov.=vcovHC(fixed, method="white1", type="HC0", cluster="group")) print(coef_sig) # Preparing one quality table models <- list("General Model" = fixed, "Specific Model" = fixed_sig) model_options <- list(coef, coef_sig) stargazer(models, title = "(1) - General Fixed Effects Model | (2) - Specific Fixed Effects Model", type = "text", ... = model_options, star.cutoffs = c(0.05, 0.01, 0.001))
Editor is loading...