Data set - Telco Customer (final assignment)¶
For this assignment we will work with data from the telecommunications domain. The dataset (the Telecom-Churn.csv file in the data directory) contains information about the customers of a telecommunications operator, covering both the customers themselves and the services they have activated with the operator.
The goal is to build a predictive model that predicts, based on customer information, whether a customer will leave the telecommunications operator or not (attribute Churn).
Data description:
- CustomerID - customer identifier
- Gender - gender of the customer
- SeniorCitizen - indicates whether the customer is senior or not (1, 0)
- Partner - indicates whether the customer has a partner or not (Yes, No)
- Dependents - indicates whether the customer has dependents or not (Yes, No)
- Tenure - how long the customer has been a customer of the telecommunications company (in months)
- PhoneService - indicates whether the customer has telephone service or not (Yes, No)
- MultipleLines - whether the customer uses multiple lines (numbers) (Yes, No, No phone service)
- InternetService - type of internet connection (DSL, Fiber optic, No)
- OnlineSecurity - indicates whether the user uses the connection security service (Yes, No, No internet service)
- OnlineBackup - indicates whether the user uses the online backup service (Yes, No, No internet service)
- DeviceProtection - indicates whether the customer uses the device protection service (Yes, No, No internet service)
- TechSupport - technical support service (Yes, No, No internet service)
- StreamingTV - streaming TV service (Yes, No, No internet service)
- StreamingMovies - movie streaming service (Yes, No, No internet service)
- Contract - type of contract concluded by the customer (Month-to-month, One year, Two year)
- PaperlessBilling - whether the service of electronic account statements is activated (Yes, No)
- PaymentMethod - payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
- MonthlyCharges - monthly payment amount
- TotalCharges - total payment amount
- Churn - target attribute - whether the user left the operator or not (Yes or No)
Task 1 - Loading data (2p)¶
Load data into a data frame and remove unnecessary attributes that cannot be used in data analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
# loading data
data = pd.read_csv("../data/Telecom-Churn.csv")
data.head()
# drop the customer ID (a pure identifier) and the leftover index column from the CSV export
data.drop(columns=['CustomerID', 'Unnamed: 0'], inplace=True)
data.head()
Task 2 - Basic statistics (4p)¶
- For numeric attributes, calculate basic statistics and visualize the probability distribution of values.
- For nominal/ordinal attributes, calculate the frequencies of the values. Plot histograms for them.
- Calculate the number of missing values for each attribute.
# for numeric attributes
print(data[['Tenure', 'MonthlyCharges', 'TotalCharges']].describe())
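The describe() call covers the summary statistics; the task also asks to visualize the distribution of values, which can be done with one density-normalized histogram per numeric attribute. A minimal sketch (a small stand-in frame is generated here so the snippet runs on its own; in the notebook, `data` and all three numeric columns are already available):

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so this also runs outside the notebook
import matplotlib.pyplot as plt

# stand-in frame; in the notebook `data` already holds the loaded CSV
rng = np.random.default_rng(0)
data = pd.DataFrame({
    "Tenure": rng.integers(0, 72, 200),
    "MonthlyCharges": rng.uniform(20, 120, 200),
})

for col in ["Tenure", "MonthlyCharges"]:
    fig, ax = plt.subplots()
    # density=True normalizes the histogram, approximating the probability distribution
    ax.hist(data[col].dropna(), bins=20, density=True)
    ax.set_title(col)
```

With seaborn the same can be achieved via `sns.histplot(data[col], kde=True)`, which also overlays a kernel density estimate.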
# number of values and histograms for nominal attributes (except target)
for x in ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
          'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
          'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']:
    print(data[x].value_counts())
    print('')
# to draw a histogram of the counts, use countplot(); newer seaborn versions require keyword arguments
sns.countplot(x='Gender', data=data)
# if we want to draw histograms for all attributes at once, for example as follows:
for i in ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
          'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
          'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']:
    plt.figure(i)
    sns.countplot(x=i, data=data)
# number of missing values:
for i in data.columns:
    print('Number of missing values of attribute', i, ':', data[i].isna().sum())
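The loop prints one line per attribute; pandas can also produce the whole table in one vectorized call. A minimal stand-in frame is used here so the snippet runs on its own; in the notebook it is simply `data.isna().sum()`:

```python
import numpy as np
import pandas as pd

# small stand-in frame with one missing value per column
df = pd.DataFrame({"a": [1, np.nan, 3], "b": ["x", None, "y"]})

# isna() marks missing cells; sum() counts them per column
missing = df.isna().sum()
print(missing)
```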
Task 3 - Dependencies between numerical attributes (3p)¶
Calculate the correlation matrix for the numerical attributes and identify which attributes are most correlated.
data_numeric = data[['Tenure', 'MonthlyCharges', 'TotalCharges']]
print(data_numeric.corr())
g = sns.heatmap(data_numeric.corr(), cmap='RdPu', annot= True)
# Tenure and TotalCharges are the most correlated pair (r = 0.83)
# dependence can be visualized with a scatterplot
g = sns.regplot(data=data_numeric, x='Tenure', y='TotalCharges')
Task 4 - Discretization of values (4p)¶
- Discretize the numeric attributes MonthlyCharges and TotalCharges into 5 equally sized intervals.
- Discretize the numeric attribute Tenure into 5 intervals with the same number of examples.
# discretization of MonthlyCharges and TotalCharges
data['MonthlyCharges_ordinal'] = pd.cut(data['MonthlyCharges'], 5)
data['TotalCharges_ordinal'] = pd.cut(data['TotalCharges'], 5)
# discretization of Tenure
data['Tenure_ordinal'] = pd.qcut(data['Tenure'], 5)
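The two functions behave differently: pd.cut produces intervals of equal width (counts per bin vary), while pd.qcut produces intervals with roughly equal counts (bin widths vary). A small sketch on skewed synthetic values where the difference is visible:

```python
import numpy as np
import pandas as pd

values = pd.Series(np.arange(100) ** 2)  # right-skewed, so the difference shows

equal_width = pd.cut(values, 5)    # 5 intervals of equal length
equal_freq = pd.qcut(values, 5)    # 5 intervals with ~equal numbers of examples

print(equal_width.value_counts().sort_index())  # counts pile up in the first bin
print(equal_freq.value_counts().sort_index())   # ~20 examples per bin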
Task 5 - Dependencies between nominal/ordinal attributes (5p)¶
- Use a pivot table to determine the dependency between the attributes Tenure, MonthlyCharges, TotalCharges and the attribute Churn (use the discretized values from task 4). Visualize the dependencies in a convenient way using the Seaborn library.
- Use a pivot table to find out what the dependency is between the attributes Gender, SeniorCitizen and the attribute Churn.
- Use a crosstab to find the frequencies of the different service combinations for the attributes PhoneService, InternetService, StreamingTV, StreamingMovies. Visualize the counts using the Seaborn library.
# contingency tables; Churn holds Yes/No strings, so aggregate with the share of 'Yes' (the churn rate per group)
pd.pivot_table(data, index=["Tenure_ordinal"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
pd.pivot_table(data, index=["TotalCharges_ordinal"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
pd.pivot_table(data, index=["MonthlyCharges_ordinal"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
pd.pivot_table(data, index=["Tenure_ordinal", "MonthlyCharges_ordinal", "TotalCharges_ordinal"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
g = sns.FacetGrid(data, col="Tenure_ordinal", hue="MonthlyCharges_ordinal", palette="Set1")
g = (g.map(plt.hist, "Churn").add_legend())
pd.pivot_table(data, index=["Gender"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
pd.pivot_table(data, index=["SeniorCitizen"], values="Churn", aggfunc=lambda x: (x == 'Yes').mean())
g = sns.countplot(y="Churn", hue="SeniorCitizen", data=data, palette='rainbow')
g = sns.FacetGrid(data, col="Gender", hue="SeniorCitizen", palette="Set1")
g = (g.map(plt.hist, "Churn").add_legend())
pd.crosstab(index=data["StreamingTV"], columns=data["StreamingMovies"])
pd.crosstab(index=data["StreamingTV"], columns=data["PhoneService"])
pd.crosstab(index=data["StreamingMovies"], columns=data["PhoneService"])
pd.crosstab(index=data["InternetService"], columns=data["PhoneService"])
g = sns.countplot(y="InternetService", hue="PhoneService", data=data, palette='rainbow')
g = sns.countplot(y="StreamingMovies", hue="StreamingTV", data=data, palette='Set1')
g = sns.countplot(y="StreamingTV", hue="PhoneService", data=data, palette='Set1')
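Besides countplots, the crosstab counts themselves can be rendered as an annotated heatmap. A sketch on a small stand-in frame (in the notebook, the crosstabs computed above can be passed to `sns.heatmap(ct, annot=True)` for the same effect):

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# stand-in frame; in the notebook the real StreamingTV/StreamingMovies columns are used
df = pd.DataFrame({
    "StreamingTV":     ["Yes", "No", "Yes", "No", "Yes"],
    "StreamingMovies": ["Yes", "Yes", "No", "No", "Yes"],
})
ct = pd.crosstab(df["StreamingTV"], df["StreamingMovies"])

fig, ax = plt.subplots()
ax.imshow(ct.values)  # cell colour encodes the count
ax.set_xticks(range(len(ct.columns)), ct.columns)
ax.set_yticks(range(len(ct.index)), ct.index)
for (i, j), v in np.ndenumerate(ct.values):
    ax.text(j, i, v, ha="center", va="center")  # annotate each cell with its count
```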
Task 6 - Replacement of missing values (5p)¶
- Replace missing values in the MonthlyCharges attribute appropriately, depending on the TotalCharges and Tenure attributes.
- For TotalCharges, think about deriving missing values based on Tenure.
- Replace missing values in the Dependents attribute appropriately, depending on the Gender and Partner attributes.
# replacement of missing MonthlyCharges and TotalCharges
def replace_missing_MonthlyCharges(row):
    TotalCharges = row["TotalCharges"]
    Tenure = row["Tenure"]
    MonthlyCharges = row["MonthlyCharges"]
    if pd.isna(MonthlyCharges):
        if Tenure == 0:  # brand-new customer: avoid division by zero
            return TotalCharges
        return TotalCharges / Tenure
    else:
        return MonthlyCharges
def replace_missing_TotalCharges(row):
    TotalCharges = row["TotalCharges"]
    MonthlyCharges = row["MonthlyCharges"]
    Tenure = row["Tenure"]
    if pd.isna(TotalCharges):
        if Tenure == 0:
            return MonthlyCharges
        else:
            return MonthlyCharges * Tenure
    else:
        return TotalCharges
data['MonthlyCharges'] = data.apply(replace_missing_MonthlyCharges, axis = 1)
data['TotalCharges'] = data.apply(replace_missing_TotalCharges, axis = 1)
print(data['MonthlyCharges'].isna().sum())
print(data['TotalCharges'].isna().sum())
# for Dependents, the counts for the combinations of Gender and Partner are first computed
table = pd.crosstab(index=[data['Gender'], data['Partner']], columns= data['Dependents'])
print(table)
# replace according to the counts: impute the most frequent Dependents value
# for each Gender/Partner combination (per the crosstab, the majority value
# turns out to depend only on Partner, for both genders)
def replace_missing_Dependents(row):
    if pd.isna(row['Dependents']):
        return 'No' if row['Partner'] == 'No' else 'Yes'
    return row['Dependents']
data['Dependents'] = data.apply(replace_missing_Dependents, axis = 1)
Task 7 - Derivation of new attributes (5p)¶
- Create a new attribute TotalMonthCharges whose values are equal to MonthlyCharges * Tenure.
- Create a new attribute TotalRatio which has the value -1 if TotalMonthCharges < TotalCharges, 0 if TotalMonthCharges = TotalCharges, and +1 if TotalMonthCharges > TotalCharges.
# TotalMonthCharges
data['TotalMonthCharges'] = data.eval('MonthlyCharges * Tenure')
# TotalRatio
def total_ratio(row):
    if row['TotalMonthCharges'] < row['TotalCharges']:
        return -1
    elif row['TotalMonthCharges'] == row['TotalCharges']:
        return 0
    else:
        return 1
data['TotalRatio'] = data.apply(total_ratio, axis= 1)
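The row-wise apply works but is slow on larger frames; the same three-way comparison is exactly the sign of the difference, so it can be computed in one vectorized call with np.sign (equivalent form, assuming both columns are numeric):

```python
import numpy as np
import pandas as pd

# small stand-in frame covering all three cases
df = pd.DataFrame({
    "TotalMonthCharges": [10.0, 20.0, 30.0],
    "TotalCharges":      [15.0, 20.0, 25.0],
})

# sign of the difference: -1 if below, 0 if equal, +1 if above
df["TotalRatio"] = np.sign(df["TotalMonthCharges"] - df["TotalCharges"]).astype(int)
print(df["TotalRatio"].tolist())  # → [-1, 0, 1]
```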
Task 8 - Converting data into a form suitable for modeling (5p)¶
Convert the pre-processed data frame into a form suitable for modeling:
- Use the appropriate way to transform categorical attributes
- Use normalization of selected attributes for models for which normalization is required
- Choose which of the attributes (original and created/transformed) to use when creating models.
data = data.drop(columns=["Tenure_ordinal","MonthlyCharges_ordinal","TotalMonthCharges", "TotalCharges_ordinal"])
data['Churn'] = data['Churn'].map({"Yes":1, "No":0})
data['Gender'] = data['Gender'].map({"Female":1, "Male":0})
data['Partner'] = data['Partner'].map({'Yes':1, 'No':0})
data['Dependents'] = data['Dependents'].map({'Yes':1, 'No':0})
data['PhoneService'] = data['PhoneService'].map({'Yes':1, 'No':0})
data['PaperlessBilling'] = data['PaperlessBilling'].map({'Yes':1, 'No':0})
data['MultipleLines'] = data['MultipleLines'].map({'Yes':1, 'No':0, 'No phone service':0})
data['OnlineSecurity'] = data['OnlineSecurity'].map({'Yes':1, 'No':0, 'No internet service':0})
data['OnlineBackup'] = data['OnlineBackup'].map({'Yes':1, 'No':0, 'No internet service':0})
data['DeviceProtection'] = data['DeviceProtection'].map({'Yes':1, 'No':0, 'No internet service':0})
data['TechSupport'] = data['TechSupport'].map({'Yes':1, 'No':0, 'No internet service':0})
data['StreamingTV'] = data['StreamingTV'].map({'Yes':1, 'No':0, 'No internet service':0})
data['StreamingMovies'] = data['StreamingMovies'].map({'Yes':1, 'No':0, 'No internet service':0})
data["Contract"] = data["Contract"].map({"Month-to-month": 0, "One year": 1, "Two year" : 2})
data = pd.get_dummies(data, columns= ['InternetService', 'PaymentMethod' ])
data.head()
# normalize data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normData = pd.DataFrame(scaler.fit_transform(data), index= data.index, columns= data.columns)
normData.head()
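Note that the scaler above rescales every column, including the 0/1 indicator columns (harmless, since those already lie in [0, 1]). If only the genuinely numeric attributes should be normalized, the scaler can be applied to just those columns. A sketch with a small stand-in frame reusing this notebook's column names:

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# stand-in frame; in the notebook these columns come from `data`
df = pd.DataFrame({
    "Tenure": [0, 12, 72],
    "MonthlyCharges": [20.0, 70.0, 120.0],
    "Partner": [1, 0, 1],  # already binary, deliberately left untouched
})

numeric_cols = ["Tenure", "MonthlyCharges"]
df[numeric_cols] = MinMaxScaler().fit_transform(df[numeric_cols])
print(df)
```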
Task 9 - Splitting data for training and testing (2p)¶
- Split the dataset into a feature matrix and a vector of target attribute values
- Divide the data into training and test sets in a ratio of 70/30.
from sklearn.model_selection import train_test_split
X_data = data.drop(columns= 'Churn', axis = 1)
y_data = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size= 0.3, random_state= 1)
X_normData = normData.drop(columns= 'Churn', axis = 1)
y_normData = normData['Churn']
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_normData, y_normData, test_size= 0.3, random_state= 1)
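Since Churn is imbalanced, it may be worth stratifying the split so that the training and test sets keep the same class ratio; `stratify` is a standard train_test_split parameter. A sketch on synthetic data with a 1:3 class ratio:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# stand-in data: 25 positives, 75 negatives
X = pd.DataFrame({"f": range(100)})
y = pd.Series([1] * 25 + [0] * 75)

# stratify=y keeps ~25% positives in both halves of the split
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
print(y_tr.mean(), y_te.mean())
```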
Task 10 - Training classification models and finding optimal parameters (6p)¶
Train classification models of various types (Trees, kNN, Naive Bayes, Random Forests). Use GridSearch to find optimal model parameters of individual model types. Within GridSearch, validate the models by 5-fold cross-validation and use accuracy as a metric for their evaluation.
for i in normData.columns:
    print('Number of missing values', i, ':', normData[i].isna().sum())
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
k = list(range(1, 50))
weights_range = ['uniform', 'distance']
metric_range = ['euclidean', 'manhattan']
param_grid_kNN = dict(n_neighbors=k, weights=weights_range, metric=metric_range)
grid_kNN = GridSearchCV(estimator=knn, param_grid=param_grid_kNN, cv=5, scoring='accuracy')
grid_kNN.fit(X_train_norm, y_train_norm)
print('Best combination of parameters for kNN model is:')
print(grid_kNN.best_params_)
print()
print('Accuracy of kNN model for optimal combination of parameters:')
print(grid_kNN.best_score_)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
criterion_range = ['gini', 'entropy']
max_depth_range = [1, 2, 3, 4, 5, 6]
min_samples_split_range = [2, 4, 6, 8, 10, 12]
min_samples_leaf_range = [1, 5, 10]
# note: the presort parameter was deprecated and then removed in scikit-learn 0.24, so it is not tuned here
param_grid_dt = dict(criterion=criterion_range, max_depth=max_depth_range,
                     min_samples_split=min_samples_split_range,
                     min_samples_leaf=min_samples_leaf_range)
grid_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)
print('Best combination of parameters for decision tree model is:')
print(grid_dt.best_params_)
print()
print('Accuracy of decision tree model for optimal combination of parameters:')
print(grid_dt.best_score_)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
n_estimators_range = [100, 300, 500, 700, 900]
oob_score_range = [True, False]
criterion_range = ['gini', 'entropy']
max_depth_range = [1, 2, 3, 4, 5, 6]
min_samples_split_range = [2, 4, 6]
min_samples_leaf_range = [1, 5, 10]
param_grid_rf = dict(n_estimators=n_estimators_range, oob_score=oob_score_range, criterion=criterion_range, max_depth=max_depth_range, min_samples_split = min_samples_split_range, min_samples_leaf = min_samples_leaf_range)
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
print('Best combination of parameters for random forest model is:')
print(grid_rf.best_params_)
print()
print('Accuracy of random forest model for optimal combination of parameters:')
print(grid_rf.best_score_)
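The task also asks for a Naive Bayes model; GaussianNB has essentially one tunable parameter, var_smoothing, which can be searched the same way. A sketch on synthetic data (in the notebook, fit on X_train and y_train instead):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# stand-in classification data; the notebook would use X_train, y_train
X, y = make_classification(n_samples=300, random_state=0)

# var_smoothing is searched on a log scale, as is customary
param_grid_nb = {"var_smoothing": np.logspace(-9, -3, 7)}
grid_nb = GridSearchCV(GaussianNB(), param_grid_nb, cv=5, scoring="accuracy")
grid_nb.fit(X, y)
print(grid_nb.best_params_, grid_nb.best_score_)
```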
Task 11 - Comparison of models using ROC curves (4p)¶
- Test the models on the test set and compare the models of individual types with the most appropriate parameters using ROC curves and AUC coefficients. Plot the ROC curves using matplotlib and find the best model.
from sklearn.metrics import roc_curve,auc
# k-NN
knn = KNeighborsClassifier(n_neighbors=30, weights='uniform', metric='manhattan')
knn.fit(X_train_norm, y_train_norm)
y_knn = knn.predict(X_test_norm)
# use predicted probabilities, not hard labels, so the ROC curve has more than one threshold point
y_knn_score = knn.predict_proba(X_test_norm)[:, 1]
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test_norm, y_knn_score, pos_label=1)
roc_auc_knn = auc(fpr_knn, tpr_knn)
# decision trees (presort dropped: the parameter was removed in scikit-learn 0.24)
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, min_samples_split=4)
dt.fit(X_train, y_train)
y_dt = dt.predict(X_test)
y_dt_score = dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_dt_score, pos_label=1)
roc_auc_dt = auc(fpr_dt, tpr_dt)
# random forests
rf = RandomForestClassifier(n_estimators=700, oob_score=True, criterion='gini',
                            max_depth=6, min_samples_split=6, min_samples_leaf=10)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
y_rf_score = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_rf_score, pos_label=1)
roc_auc_rf = auc(fpr_rf, tpr_rf)
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_nb = nb.predict(X_test)
y_nb_score = nb.predict_proba(X_test)[:, 1]
fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_test, y_nb_score, pos_label=1)
roc_auc_nb = auc(fpr_nb, tpr_nb)
plt.title('ROC')
plt.plot(fpr_knn, tpr_knn, color='green', label = 'kNN (AUC = %0.2f)' % roc_auc_knn)
plt.plot(fpr_dt, tpr_dt, color='blue', label = 'Decision Trees (AUC = %0.2f)' % roc_auc_dt)
plt.plot(fpr_rf, tpr_rf, color='red', label = 'Random Forests (AUC = %0.2f)' % roc_auc_rf)
plt.plot(fpr_nb, tpr_nb, color='yellow', label = 'Naive Bayes (AUC = %0.2f)' % roc_auc_nb)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal, in gray so it doesn't clash with the tree curve
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TP rate')
plt.xlabel('FP rate')
plt.show()
Task 12 - Evaluation of the model on the test set (4p)¶
- Test the models on the test set and report the values of the metrics (accuracy, precision, recall) and the confusion matrix. Compare the results: which of the models best predicts and detects customers who leave the telecommunications operator?
- Plot the decision tree model (using webgraphviz). Try to derive the classification rule(s) for identifying outgoing customers from the structure of the model.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
# k-NN
print('kNN accuracy: %0.2f' % accuracy_score(y_test_norm, y_knn))
print('kNN precision: %0.2f' % precision_score(y_test_norm, y_knn))
print('kNN recall: %0.2f' % recall_score(y_test_norm, y_knn))
cm_knn = confusion_matrix(y_test_norm, y_knn)
print(cm_knn)
print('Decision tree accuracy: %0.2f' % accuracy_score(y_test, y_dt))
print('Decision tree precision: %0.2f' % precision_score(y_test, y_dt))
print('Decision tree recall: %0.2f' % recall_score(y_test, y_dt))
print('')
cm_dt = confusion_matrix(y_test, y_dt)
print(cm_dt)
print('RF accuracy: %0.2f' % accuracy_score(y_test, y_rf))
print('RF precision: %0.2f' % precision_score(y_test, y_rf))
print('RF recall: %0.2f' % recall_score(y_test, y_rf))
print('')
cm_rf = confusion_matrix(y_test, y_rf)
print(cm_rf)
print('NB accuracy: %0.2f' % accuracy_score(y_test, y_nb))
print('NB precision: %0.2f' % precision_score(y_test, y_nb))
print('NB recall: %0.2f' % recall_score(y_test, y_nb))
print('')
cm_nb = confusion_matrix(y_test, y_nb)
print(cm_nb)
from sklearn import tree
# write the tree in Graphviz .dot format; paste the file contents into webgraphviz to render it
tree.export_graphviz(dt, out_file="decision_tree.txt",
                     feature_names=X_data.columns.values, class_names=['0', '1'])
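If webgraphviz is not at hand, scikit-learn can also render the tree directly with plot_tree (available since scikit-learn 0.21), with no external service needed. A sketch on a small tree fitted to a toy dataset (in the notebook, `dt` and the real feature names would be used):

```python
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

# toy tree; the notebook would plot the fitted `dt` instead
X, y = load_iris(return_X_y=True)
dt = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

fig, ax = plt.subplots(figsize=(12, 6))
# returns the list of text annotations drawn for the nodes
annotations = plot_tree(dt, filled=True, class_names=["0", "1", "2"], ax=ax)
```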