# Random political research things
library(MASS)
# Completed education (YQ18_II) collapsed into 3 categories:
# codes 1-2 -> 1, codes 3-5 -> 2, codes 6-7 -> 3. Any value that is not
# one of 1..7 (NA included) finds no match in the lookup and stays NA.
data$edu <- c(1, 1, 2, 2, 2, 3, 3)[match(data$YQ18_II, 1:7)]
# Education to be finished (YQ18_I): non-positive values are turned into NA
data$edu.s <- ifelse(data$YQ18_I <= 0, NA, data$YQ18_I)
# Age (YQ1): non-positive values are turned into NA
data$age <- ifelse(data$YQ1 <= 0, NA, data$YQ1)
# Dummy for female respondents: 1 when YQ42 equals 2, 0 for any other
# value; a missing YQ42 stays NA
data$female <- as.numeric(data$YQ42 == 2)
# Training data: every row whose CCODE is neither 1756 nor 1792 (the two
# codes whose education is imputed further down). subset() treats an NA
# comparison as "not selected", so a missing CCODE cannot inject all-NA
# rows the way logical indexing with NA would.
data.t <- subset(data, CCODE != 1756 & CCODE != 1792)
# Model on training data: ordered logistic regression (polr default) of
# completed education on education to be finished, age and the female dummy.
# edu.s and age enter as factors, i.e. one coefficient per observed value.
# Hess = TRUE keeps the Hessian so summary() can report standard errors
# without refitting; TRUE is spelled out because T can be reassigned.
mod.t <- polr(factor(edu) ~ factor(edu.s) + factor(age) + female,
              Hess = TRUE,
              data = data.t)
summary(mod.t)
# How many cases correctly classified? This is an in-sample check: the model
# is scored on the same rows it was fitted to. Rows with a missing outcome or
# a missing prediction drop out of the cross-tabulation.
data.t$edu.pred <- predict(mod.t,
                           newdata = data.t[, c("edu.s", "age", "female")])
tab.t <- with(data.t, table(edu, edu.pred))
round(sum(diag(tab.t)) / sum(tab.t), 2) # 0.76 - NOT BAD
# Impute the data
# Rows with CCODE 1756 or 1792 get a model-based prediction of completed
# education. %in% is used for the row mask because it never returns NA, so a
# missing CCODE can neither select a row nor turn the imputation flag into NA.
pred.new.dat <- data[data$CCODE %in% c(1756, 1792), c("YRESID", "edu")]
pred.new.dat$edu.pred <- predict(
  mod.t,
  newdata = data[data$CCODE %in% c(1756, 1792), c("edu.s", "age", "female")]
)
# predict() returns a factor. Convert through the level labels so the
# imputed value is the category label ("1", "2", "3") and not the factor's
# internal integer code, which is what ifelse() would silently pick up.
pred.new.dat$edu.p <- ifelse(is.na(pred.new.dat$edu),
                             as.numeric(as.character(pred.new.dat$edu.pred)),
                             pred.new.dat$edu)
# Attach the filled-in values to the full data set.
# NOTE(review): merge() assumes YRESID identifies each row of `data`
# uniquely; duplicated ids would multiply rows - confirm. merge() also
# re-sorts the rows by YRESID and moves that column to the front.
data <- merge(data, pred.new.dat[, c("YRESID", "edu.p")],
              by = "YRESID", all.x = TRUE)
# Flag rows where an imputation was attempted (target CCODE and observed
# education missing). The flag is 1 even when no prediction was available
# because a predictor was missing; in that case edu simply stays NA.
data$edu.imp <- ifelse(data$CCODE %in% c(1756, 1792) & is.na(data$edu), 1, 0)
# Fill the flagged rows with the prediction, leave all others untouched
data$edu <- ifelse(data$edu.imp == 1, data$edu.p, data$edu)
# Remove additional variables and objects
# Helper columns are dropped with a logical mask instead of -which(...):
# when none of the names is present, -which() yields an empty index and
# would drop every column, whereas the mask keeps the data untouched.
# drop = FALSE guarantees the result is still a data frame.
data <- data[, !(names(data) %in% c("edu.s", "age", "female", "edu.p")),
             drop = FALSE]
# Discard the intermediate objects created for the imputation
rm(pred.new.dat, data.t, tab.t, mod.t)