{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Logistic Regression\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import statsmodels.api as sm\n",
"import statsmodels.formula.api as smf\n",
"\n",
"\n",
"from sklearn.metrics import confusion_matrix, classification_report, accuracy_score\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read and Confirm Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('data/ccdefault.csv').round(1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" default | \n",
" student | \n",
" balance | \n",
" income | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" No | \n",
" No | \n",
" 729.5 | \n",
" 44361.6 | \n",
"
\n",
" \n",
" 1 | \n",
" No | \n",
" Yes | \n",
" 817.2 | \n",
" 12106.1 | \n",
"
\n",
" \n",
" 2 | \n",
" No | \n",
" No | \n",
" 1073.5 | \n",
" 31767.1 | \n",
"
\n",
" \n",
" 3 | \n",
" No | \n",
" No | \n",
" 529.3 | \n",
" 35704.5 | \n",
"
\n",
" \n",
" 4 | \n",
" No | \n",
" No | \n",
" 785.7 | \n",
" 38463.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" default student balance income\n",
"0 No No 729.5 44361.6\n",
"1 No Yes 817.2 12106.1\n",
"2 No No 1073.5 31767.1\n",
"3 No No 529.3 35704.5\n",
"4 No No 785.7 38463.5"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# remap default = 'Yes' to 1; 'No' to 0\n",
"df['default'] = np.where(df['default'] == \"Yes\", 1, 0)\n",
"df['student'] = np.where(df['student'] == \"Yes\", 1, 0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# size balance and income to be 100s of $\n",
"df['balance'] = np.round(df['balance']/100,0)\n",
"df['income'] = np.round(df['income']/100,0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" default | \n",
" student | \n",
" balance | \n",
" income | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 7.0 | \n",
" 444.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 8.0 | \n",
" 121.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 11.0 | \n",
" 318.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 5.0 | \n",
" 357.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 8.0 | \n",
" 385.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" default student balance income\n",
"0 0 0 7.0 444.0\n",
"1 0 1 8.0 121.0\n",
"2 0 0 11.0 318.0\n",
"3 0 0 5.0 357.0\n",
"4 0 0 8.0 385.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic Regression "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.078644\n",
" Iterations 10\n"
]
}
],
"source": [
"lr = smf.logit(formula='default ~ balance + C(student)',data=df).fit() "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"Logit Regression Results\n",
"\n",
" Dep. Variable: | default | No. Observations: | 10000 | \n",
"
\n",
"\n",
" Model: | Logit | Df Residuals: | 9997 | \n",
"
\n",
"\n",
" Method: | MLE | Df Model: | 2 | \n",
"
\n",
"\n",
" Date: | Sun, 26 Dec 2021 | Pseudo R-squ.: | 0.4615 | \n",
"
\n",
"\n",
" Time: | 15:34:59 | Log-Likelihood: | -786.44 | \n",
"
\n",
"\n",
" converged: | True | LL-Null: | -1460.3 | \n",
"
\n",
"\n",
" Covariance Type: | nonrobust | LLR p-value: | 2.172e-293 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | z | P>|z| | [0.025 | 0.975] | \n",
"
\n",
"\n",
" Intercept | -10.7703 | 0.371 | -29.019 | 0.000 | -11.498 | -10.043 | \n",
"
\n",
"\n",
" C(student)[T.1] | -0.7004 | 0.147 | -4.761 | 0.000 | -0.989 | -0.412 | \n",
"
\n",
"\n",
" balance | 0.5746 | 0.023 | 24.680 | 0.000 | 0.529 | 0.620 | \n",
"
\n",
"
Possibly complete quasi-separation: A fraction 0.14 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified."
],
"text/plain": [
"\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: default No. Observations: 10000\n",
"Model: Logit Df Residuals: 9997\n",
"Method: MLE Df Model: 2\n",
"Date: Sun, 26 Dec 2021 Pseudo R-squ.: 0.4615\n",
"Time: 15:34:59 Log-Likelihood: -786.44\n",
"converged: True LL-Null: -1460.3\n",
"Covariance Type: nonrobust LLR p-value: 2.172e-293\n",
"===================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------\n",
"Intercept -10.7703 0.371 -29.019 0.000 -11.498 -10.043\n",
"C(student)[T.1] -0.7004 0.147 -4.761 0.000 -0.989 -0.412\n",
"balance 0.5746 0.023 24.680 0.000 0.529 0.620\n",
"===================================================================================\n",
"\n",
"Possibly complete quasi-separation: A fraction 0.14 of observations can be\n",
"perfectly predicted. This might indicate that there is complete\n",
"quasi-separation. In this case some parameters will not be identified.\n",
"\"\"\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate Model"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X = df[['balance','student']]\n",
"y = df['default']\n",
"y_probabilities = lr.predict(X)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"y_hat = list(map(round,y_probabilities))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9735\n"
]
}
],
"source": [
"print(accuracy_score(y,y_hat))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[9618 49]\n",
" [ 216 117]]\n"
]
}
],
"source": [
"print(confusion_matrix(y,y_hat))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.99 0.99 9667\n",
" 1 0.70 0.35 0.47 333\n",
"\n",
" accuracy 0.97 10000\n",
" macro avg 0.84 0.67 0.73 10000\n",
"weighted avg 0.97 0.97 0.97 10000\n",
"\n"
]
}
],
"source": [
"print(classification_report(y,y_hat))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predictions"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# predict new points\n",
"data_new = {'balance': [5.2,10.1,12.3,20.1,22.6], \n",
" 'student': [1,0,1,0,1]}\n",
"df_new = pd.DataFrame(data_new)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df_new['probability'] = lr.predict(df_new).round(2)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" balance | \n",
" student | \n",
" probability | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.2 | \n",
" 1 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 1 | \n",
" 10.1 | \n",
" 0 | \n",
" 0.01 | \n",
"
\n",
" \n",
" 2 | \n",
" 12.3 | \n",
" 1 | \n",
" 0.01 | \n",
"
\n",
" \n",
" 3 | \n",
" 20.1 | \n",
" 0 | \n",
" 0.69 | \n",
"
\n",
" \n",
" 4 | \n",
" 22.6 | \n",
" 1 | \n",
" 0.82 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" balance student probability\n",
"0 5.2 1 0.00\n",
"1 10.1 0 0.01\n",
"2 12.3 1 0.01\n",
"3 20.1 0 0.69\n",
"4 22.6 1 0.82"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Intercept 0.000021\n",
"C(student)[T.1] 0.496375\n",
"balance 1.776450\n",
"dtype: float64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.exp(lr.params) "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Intercept -99.997898\n",
"C(student)[T.1] -50.362525\n",
"balance 77.644961\n",
"dtype: float64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(np.exp(lr.params)-1)*100"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}