R Coding
Industry Positions Analysis
This project investigates on certain industry positions to determine if any discrimination lies within.
#linear regression
summary(lm(income~.,rawdata))
#check missing value
library(VIM)
summary(aggr(rawdata))
##delete industry from the regression due to the large percentage of the missing values in industry
summary(lm(income~.,rawdata[,-9]))
##pay more attention to marital status
## recategory marital status
rawdata$maritalstatus1<-rawdata$maritalstatus
levels(rawdata$maritalstatus1)<-list(Nevermarried=c("Never married, cohabiting","Never married, not cohabiting"),
Married=c("Married, spouse present","Married, spouse absent"),
Seperated=c( "Separated, cohabiting","Separated, not cohabiting"),
Divorced=c("Divorced, cohabiting","Divorced, not cohabiting"),
Widowed=c("Widowed, cohabiting","Widowed, not cohabiting"))
summary(lm(income~.,rawdata[,c(-5,-9)]))
##stepwise
lm<-lm(income~.,rawdata[,c(-5,-9)])
stepwise<-step(lm)
summary(stepwise)
##couldn't remove any variable
##decisiontree
##### Building tree 1 #####
library(rpart)
library(tidyverse)
library(forcats)
selecteddata1=select(rawdata, income, race, sex)
selecteddata1 <- mutate(selecteddata1, race = fct_recode(race,
"nonewhite" = "Something else? (SPECIFY)",
"nonewhite" = "Black or African American",
"nonewhite" = "Asian or Pacific Islander",
"nonewhite" = "American Indian, Eskimo, or Aleut"))
treedata <- filter(rawdata, !is.na(income))
treedata <- filter(treedata, !is.na(race))
treedata <- filter(treedata, !is.na(region))
treedata <- filter(treedata, !is.na(degree))
treedata <- filter(treedata, !is.na(maritalstatus))
treedata <- filter(treedata, !is.na(generalhealth))
treedata <- filter(treedata, !is.na(industry))
treedata$income[treedata$income <= 32000] <- 0
treedata$income[treedata$income > 32000] <- 1
summary(treedata)
rpartContr=rpart.control(minsplit = 10,
cp=1e-09,
minbucket = 4)
tree0 = rpart(income~.,
control=rpartContr,
data=selecteddata1)
tree1 = rpart(income~.,
control=rpartContr,
data=treedata)
#Plot color tree
library(rpart.plot)
rpart.plot(tree0)
rpart.plot(tree1)
##### Split the Data #####
library(caTools)
set.seed(100)
split = sample.split(treedata$income,SplitRatio = 0.7)
train = treedata[split,]
test = treedata[!split,]
##### Model data - Decision Tree #####
##### Cross Validation #####
# k-fold Cross Validation
# Find the optimal Complexity by using 5-fold crossvalidation
library(caret)
trControl = trainControl(method = 'cv',number = 5)
tuneGrid = expand.grid(.cp=seq(0,0.1,0.0001))
set.seed(100)
trainCV = train(factor(income)~.,train,method='rpart',trControl=trControl,tuneGrid=tuneGrid)
plot(trainCV)
trainCV$bestTune
# Run model with optimal cp
tree_cv = rpart(income~.,data=train,method='class',cp=trainCV$bestTune)
rpart.plot(tree_cv)
summary(tree_cv) # note value of cp
## Assess Performance
# Accuracy
pred = predict(tree_cv,type='class')
ct = table(train$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(train); accuracy
pred = predict(tree_cv,newdata=test, type='class')
ct = table(test$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(test); accuracy
##### Random Forests #####
library(randomForest)
set.seed(100)
forest = randomForest(factor(income)~.,data=train,ntree=1000)
plot(forest)
getTree(forest,1,labelVar = T)
importance(forest) # or varImp(bag)
varImpPlot(forest)
hist(treesize(forest))
# Accuracy
pred = predict(forest)
ct = table(train$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(train); accuracy
pred = predict(forest,newdata=test)
ct = table(test$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(test); accuracy
## Construct ROC curve
library(ROCR)
pred = predict(forest,newdata=test,'prob')
ROCRpred = prediction(pred[,2],test$income)
as.numeric(performance(ROCRpred,"auc")@y.values)
ROCRperf = performance(ROCRpred,"tpr","fpr")
plot(ROCRperf,colorize=TRUE,print.cutoffs.at=seq(0,1,0.5),text.adj=c(-0.3,2))