top of page
R.png

R Coding

Industry Positions Analysis

This project investigates on certain industry positions to determine if any discrimination lies within.

#linear regression
summary(lm(income~.,rawdata))


#check missing value
library(VIM)
summary(aggr(rawdata))


##delete industry from the regression due to the large percentage of the missing values in industry
summary(lm(income~.,rawdata[,-9]))


##pay more attention to marital status
## recategory marital status

rawdata$maritalstatus1<-rawdata$maritalstatus
levels(rawdata$maritalstatus1)<-list(Nevermarried=c("Never married, cohabiting","Never married, not cohabiting"),
                                     Married=c("Married, spouse present","Married, spouse absent"),
                                     Seperated=c( "Separated, cohabiting","Separated, not cohabiting"),
                                     Divorced=c("Divorced, cohabiting","Divorced, not cohabiting"),
                                     Widowed=c("Widowed, cohabiting","Widowed, not cohabiting"))
summary(lm(income~.,rawdata[,c(-5,-9)]))


##stepwise 
lm<-lm(income~.,rawdata[,c(-5,-9)])
stepwise<-step(lm)
summary(stepwise)
##couldn't remove any variable


##decisiontree
##### Building tree 1 #####
library(rpart)
library(tidyverse)
library(forcats)
selecteddata1=select(rawdata, income, race, sex)  
selecteddata1 <- mutate(selecteddata1, race = fct_recode(race, 
                                                         "nonewhite" = "Something else? (SPECIFY)",
                                                         "nonewhite" = "Black or African American",
                                                         "nonewhite" = "Asian or Pacific Islander",
                                                         "nonewhite" = "American Indian, Eskimo, or Aleut"))
treedata <- filter(rawdata, !is.na(income))
treedata <- filter(treedata, !is.na(race))
treedata <- filter(treedata, !is.na(region))
treedata <- filter(treedata, !is.na(degree))
treedata <- filter(treedata, !is.na(maritalstatus))
treedata <- filter(treedata, !is.na(generalhealth))
treedata <- filter(treedata, !is.na(industry))
treedata$income[treedata$income <= 32000] <- 0
treedata$income[treedata$income > 32000] <- 1
summary(treedata)
rpartContr=rpart.control(minsplit = 10,
                         cp=1e-09, 
                         minbucket = 4)  
tree0 = rpart(income~., 
              control=rpartContr, 
              data=selecteddata1)
tree1 = rpart(income~., 
              control=rpartContr, 
              data=treedata)

#Plot color tree
 

library(rpart.plot)
rpart.plot(tree0)
rpart.plot(tree1) 

##### Split the Data #####
library(caTools)
set.seed(100)
split = sample.split(treedata$income,SplitRatio = 0.7)
train = treedata[split,]
test = treedata[!split,]

##### Model data - Decision Tree #####

##### Cross Validation #####
# k-fold Cross Validation
# Find the optimal Complexity by using 5-fold crossvalidation

library(caret)
trControl = trainControl(method = 'cv',number = 5)
tuneGrid = expand.grid(.cp=seq(0,0.1,0.0001))
set.seed(100)
trainCV = train(factor(income)~.,train,method='rpart',trControl=trControl,tuneGrid=tuneGrid)
plot(trainCV)
trainCV$bestTune

# Run model with optimal cp
tree_cv = rpart(income~.,data=train,method='class',cp=trainCV$bestTune)
rpart.plot(tree_cv)
summary(tree_cv)  # note value of cp

## Assess Performance 
# Accuracy

pred = predict(tree_cv,type='class')
ct = table(train$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(train); accuracy

pred = predict(tree_cv,newdata=test, type='class')
ct = table(test$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(test); accuracy

##### Random Forests #####
library(randomForest)
set.seed(100)
forest = randomForest(factor(income)~.,data=train,ntree=1000)
plot(forest)
getTree(forest,1,labelVar = T)
importance(forest) # or varImp(bag)
varImpPlot(forest)
hist(treesize(forest))

# Accuracy
pred = predict(forest)
ct = table(train$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(train); accuracy

pred = predict(forest,newdata=test)
ct = table(test$income,pred); ct
accuracy = (ct[1,1]+ct[2,2])/nrow(test); accuracy

## Construct ROC curve 
library(ROCR)
pred = predict(forest,newdata=test,'prob')
ROCRpred = prediction(pred[,2],test$income)
as.numeric(performance(ROCRpred,"auc")@y.values)
ROCRperf = performance(ROCRpred,"tpr","fpr")
plot(ROCRperf,colorize=TRUE,print.cutoffs.at=seq(0,1,0.5),text.adj=c(-0.3,2))

© 2025 Niki Tao All Rights Reserved.

bottom of page