Newspaper Customer Churn Data Analysis in R

Newspapers

A project that uses R to explore various statistics of customer churn for a newspaper.

The code below displays highlights from the project. For more details, please view the GitHub Repository.

Link to GitHub Repository:

Click Here

Libraries and Data

library(readxl)
library(ggplot2)
library(pROC)
library(caTools)
library(caret)
library(class)
library(e1071)
library(mlogit)
library(Hmisc)
# Load the raw churn workbook, then replace every space in the column names
# with an underscore so the columns can be referenced with `$` below.
ChurnData <- read_excel(path = "NewspaperChurn.xlsx")
names(ChurnData) <- gsub(
  pattern = " ",
  replacement = "_",
  x = names(ChurnData),
  fixed = TRUE
)

Convert Subscriber values to numbers
Subscribers = 1
Non-subscribers = 2

# Encode the Subscriber flag numerically: "YES" -> 1, "NO" -> 2.
# Any other value is handed to as.numeric() unchanged.
ChurnData$Subscriber_Number <- local({
  flag <- as.character(ChurnData$Subscriber)
  flag[which(flag == "YES")] <- "1"
  flag[which(flag == "NO")] <- "2"
  as.numeric(flag)
})

Convert Home Ownership values to numbers
Renter = 1
Owner = 2

# Encode home ownership numerically: "RENTER" -> 1, "OWNER" -> 2.
# Any other value is handed to as.numeric() unchanged.
ChurnData$Home_Ownership_Number <- local({
  tenure <- as.character(ChurnData$Home_Ownership)
  tenure[which(tenure == "RENTER")] <- "1"
  tenure[which(tenure == "OWNER")] <- "2"
  as.numeric(tenure)
})

Convert Income to numeric values
I used the lower bound of each income bucket, in thousands of dollars, as the numeric value (for example, "$30,000 - $39,999" becomes 30).
For "Under $20,000", I assigned the value 1.

# Map each household-income bucket label to the lower bound of the bucket,
# in thousands of dollars ("Under $20,000" is coded as 1).
# Labels that are not in the table are handed to as.numeric() unchanged.
ChurnData$HH_Income_Number <- local({
  income_codes <- c(
    "Under $20,000"       = "1",
    "$  20,000 - $29,999" = "20",
    "$  30,000 - $39,999" = "30",
    "$  40,000 - $49,999" = "40",
    "$  50,000 - $59,999" = "50",
    "$  60,000 - $69,999" = "60",
    "$  70,000 - $79,999" = "70",
    "$  80,000 - $89,999" = "80",
    "$  90,000 - $99,999" = "90",
    "$100,000 - $124,999" = "100",
    "$125,000 - $149,999" = "125",
    "$150,000 - $174,999" = "150",
    "$175,000 - $199,999" = "175",
    "$200,000 - $249,999" = "200",
    "$250,000 - $299,999" = "250",
    "$300,000 - $399,999" = "300",
    "$400,000 - $499,999" = "400",
    "$500,000 Plus"       = "500"
  )
  income <- as.character(ChurnData$HH_Income)
  slot <- match(income, names(income_codes))
  known <- !is.na(slot)
  income[known] <- unname(income_codes[slot[known]])
  as.numeric(income)
})

Convert Dummy for Children values to numbers
Y -> 1
N -> 2

# Encode the children indicator numerically: "Y" -> 1, "N" -> 2.
# Any other value is handed to as.numeric() unchanged.
ChurnData$dummy_for_Children_Number <- local({
  has_children <- as.character(ChurnData$dummy_for_Children)
  has_children[which(has_children == "Y")] <- "1"
  has_children[which(has_children == "N")] <- "2"
  as.numeric(has_children)
})

Convert Age Range
I used the lower bound of each age bucket as the numeric value.
For "24 years or less", I assigned the value 18.

# Map each age-range label to the lower bound of the range
# ("24 years or less" is coded as 18).
# Labels that are not in the table are handed to as.numeric() unchanged.
ChurnData$Age_range_Number <- local({
  age_codes <- c(
    "24 years or less" = "18",
    "25-29"            = "25",
    "30-34"            = "30",
    "35-39"            = "35",
    "40-44"            = "40",
    "45-49"            = "45",
    "50-54"            = "50",
    "55-59"            = "55",
    "60-64"            = "60",
    "65-69"            = "65",
    "70-74"            = "70",
    "75 years or more" = "75"
  )
  age <- as.character(ChurnData$Age_range)
  slot <- match(age, names(age_codes))
  known <- !is.na(slot)
  age[known] <- unname(age_codes[slot[known]])
  as.numeric(age)
})

Weekly Fee
For most levels, I used the lower bound of the bucket as the numeric value.
For ‘$0’ and ‘$0 - $0.01’, I made them both 0.

# Map each weekly-fee bucket label to the lower bound of the bucket, in
# dollars. Both "$0" and "$0 - $0.01" are coded as 0.
# Labels that are not in the table are handed to as.numeric() unchanged.
ChurnData$weekly_fee_number <- local({
  fee_codes <- c(
    "$0"              = "0",
    "$0 - $0.01"      = "0",
    "$0.01 - $0.50"   = "0.01",
    "$0.51 - $0.99"   = "0.51",
    "$1.00 - $1.99"   = "1",
    "$2.00 - $2.99"   = "2",
    "$3.00 - $3.99"   = "3",
    "$4.00 - $4.99"   = "4",
    "$5.00 - $5.99"   = "5",
    "$6.00 - $6.99"   = "6",
    "$7.00 - $7.99"   = "7",
    "$8.00 - $8.99"   = "8",
    "$9.00 - $9.99"   = "9",
    "$10.00 - $10.99" = "10"
  )
  fee <- as.character(ChurnData$weekly_fee)
  slot <- match(fee, names(fee_codes))
  known <- !is.na(slot)
  fee[known] <- unname(fee_codes[slot[known]])
  as.numeric(fee)
})

Create a new data frame with selected columns and remove rows containing NA values

# Keep the original Subscriber label, the numeric encodings, and the
# residence/zip columns, then drop every row that has an NA in any of them.
ChurnDataModified <- ChurnData[, c(
  "Subscriber",
  "Subscriber_Number",
  "Home_Ownership_Number",
  "HH_Income_Number",
  "dummy_for_Children_Number",
  "Year_Of_Residence",
  "Age_range_Number",
  "weekly_fee_number",
  "Zip_Code"
)]
ChurnData_2 <- na.omit(object = ChurnDataModified)

Histogram of Subscribers Variable
Subscribers = 1
Non-subscribers = 2

# Histogram of the numeric subscriber code (1 = subscriber, 2 = non-subscriber).
# Only `binwidth` is supplied: ggplot2 ignores `bins` whenever `binwidth` is
# given, so passing both was misleading and had no effect on the plot.
ggplot(data = ChurnData_2, aes(Subscriber_Number)) +
  geom_histogram(binwidth = 0.5) +
  xlab("Subscribers vs. Non-Subscribers") +
  ylab("Count")

image-center

Home Ownership -> Year of Residence -> Subscription Status

# Dodged bars of Year_Of_Residence by home-ownership code, filled by
# subscriber code (1 = subscriber, 2 = non-subscriber).
# The base plot is stored in `bar` first; the layers are then added in a
# separate expression. The two statements were previously fused on one line,
# which does not parse.
bar <- ggplot(
  data = ChurnData_2,
  aes(
    x = Home_Ownership_Number,
    y = Year_Of_Residence,
    group = Subscriber_Number,
    fill = Subscriber_Number
  )
)
bar +
  geom_bar(stat = "identity", position = position_dodge(1), width = 0.5) +
  scale_fill_continuous(
    name = "Subscription",
    breaks = c(1, 2),
    labels = c("Subscribers", "Non-Subscribers")
  ) +
  xlab("Home Ownership") +
  ylab("Year of Residence")

image-center

Histogram of Age Range

# Histogram of the numeric age code (lower bound of each age range).
# Only `binwidth` is supplied: ggplot2 ignores `bins` whenever `binwidth` is
# given, so passing both was misleading and had no effect on the plot.
ggplot(data = ChurnData_2, aes(Age_range_Number)) +
  geom_histogram(binwidth = 2) +
  xlab("Age Range") +
  ylab("Count")

image-center