#------------------
# Data Preparation
#------------------

# Read datasets
# Download the data from http://www.saedsayad.com/datasets/CreditData.zip
# Both files are read from the current working directory.
train <- read.csv("Credit_train.csv")
test <- read.csv("Credit_test.csv")

# Rows and cols
dim(train)
dim(test)

# Column names
colnames(train)
colnames(test)

# Show the first rows
head(train)
head(test)

#-------------------------------------------------------------------
# Data Exploration - Bivariate analysis - Categorical and Numerical
#-------------------------------------------------------------------
# Each numeric variable is plotted against DEFAULT twice: once for the
# train set (dark green) and once for the test set (brown).

# DEFAULT and MAXLINEUTIL
boxplot(
  MAXLINEUTIL ~ DEFAULT,
  data = train,
  main = "Maximum number of lines being utilized",
  sub = "train",
  col = "darkgreen",
  xlab = "DEFAULT",
  ylab = "MAXLINEUTIL"
)
boxplot(
  MAXLINEUTIL ~ DEFAULT,
  data = test,
  main = "Maximum number of lines being utilized",
  sub = "test",
  col = "brown",
  xlab = "DEFAULT",
  ylab = "MAXLINEUTIL"
)

# DEFAULT and DAYSDELQ
boxplot(
  DAYSDELQ ~ DEFAULT,
  data = train,
  main = "Number of delinquent days",
  sub = "train",
  col = "darkgreen",
  xlab = "DEFAULT",
  ylab = "DAYSDELQ"
)
boxplot(
  DAYSDELQ ~ DEFAULT,
  data = test,
  main = "Number of delinquent days",
  sub = "test",
  col = "brown",
  xlab = "DEFAULT",
  ylab = "DAYSDELQ"
)

# DEFAULT and TOTACBAL
boxplot(
  TOTACBAL ~ DEFAULT,
  data = train,
  main = "Total balance of business account",
  sub = "train",
  col = "darkgreen",
  xlab = "DEFAULT",
  ylab = "TOTACBAL"
)
boxplot(
  TOTACBAL ~ DEFAULT,
  data = test,
  main = "Total balance of business account",
  sub = "test",
  col = "brown",
  xlab = "DEFAULT",
  ylab = "TOTACBAL"
)

# Z-test for two variables - DEFAULT and DAYSDELQ
# Compares the mean of DAYSDELQ between rows with DEFAULT == "Y" and rows
# with DEFAULT == "N", using the unpooled standard error.
#
# na.omit() drops every row that has an NA in ANY column and the result
# replaces `train`, so the ANOVA further down also runs on complete rows only.
train <- na.omit(train)
a <- train$DAYSDELQ[train$DEFAULT == "Y"]
b <- train$DAYSDELQ[train$DEFAULT == "N"]
n1 <- length(a)
n2 <- length(b)
z <- (mean(a) - mean(b)) / sqrt(var(a) / n1 + var(b) / n2)
# Two-sided p-value, 2 * P(Z > |z|). Taking the upper tail directly avoids
# the cancellation in 1 - (pnorm(|z|) - 0.5) * 2, which collapses to exactly
# 0 as soon as pnorm(|z|) rounds to 1.
pz <- 2 * pnorm(abs(z), lower.tail = FALSE)

# ANOVA - BUSTYPE and TOTACBAL
# One-way ANOVA of TOTACBAL across the levels of BUSTYPE. Passing `data`
# keeps the term labelled "BUSTYPE" instead of "train$BUSTYPE".
fit <- aov(TOTACBAL ~ BUSTYPE, data = train)
summary(fit)