{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic Regression\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "\n", "\n", "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and Confirm Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/ccdefault.csv').round(1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
defaultstudentbalanceincome
0NoNo729.544361.6
1NoYes817.212106.1
2NoNo1073.531767.1
3NoNo529.335704.5
4NoNo785.738463.5
\n", "
" ], "text/plain": [ " default student balance income\n", "0 No No 729.5 44361.6\n", "1 No Yes 817.2 12106.1\n", "2 No No 1073.5 31767.1\n", "3 No No 529.3 35704.5\n", "4 No No 785.7 38463.5" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# remap default = 'Yes' to 1; 'No' to 0\n", "df['default'] = np.where(df['default'] == \"Yes\", 1, 0)\n", "df['student'] = np.where(df['student'] == \"Yes\", 1, 0)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# size balance and income to be 100s of $\n", "df['balance'] = np.round(df['balance']/100,0)\n", "df['income'] = np.round(df['income']/100,0)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
defaultstudentbalanceincome
0007.0444.0
1018.0121.0
20011.0318.0
3005.0357.0
4008.0385.0
\n", "
" ], "text/plain": [ " default student balance income\n", "0 0 0 7.0 444.0\n", "1 0 1 8.0 121.0\n", "2 0 0 11.0 318.0\n", "3 0 0 5.0 357.0\n", "4 0 0 8.0 385.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logistic Regression " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.078644\n", " Iterations 10\n" ] } ], "source": [ "# specify the model: default explained by balance and the categorical student flag\n", "logit_spec = smf.logit(formula='default ~ balance + C(student)', data=df)\n", "# fit it; the fitted results object is reused by every cell below as `lr`\n", "lr = logit_spec.fit()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Logit Regression Results
Dep. Variable: default No. Observations: 10000
Model: Logit Df Residuals: 9997
Method: MLE Df Model: 2
Date: Sun, 26 Dec 2021 Pseudo R-squ.: 0.4615
Time: 15:34:59 Log-Likelihood: -786.44
converged: True LL-Null: -1460.3
Covariance Type: nonrobust LLR p-value: 2.172e-293
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
Intercept -10.7703 0.371 -29.019 0.000 -11.498 -10.043
C(student)[T.1] -0.7004 0.147 -4.761 0.000 -0.989 -0.412
balance 0.5746 0.023 24.680 0.000 0.529 0.620


Possibly complete quasi-separation: A fraction 0.14 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified." ], "text/plain": [ "\n", "\"\"\"\n", " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: default No. Observations: 10000\n", "Model: Logit Df Residuals: 9997\n", "Method: MLE Df Model: 2\n", "Date: Sun, 26 Dec 2021 Pseudo R-squ.: 0.4615\n", "Time: 15:34:59 Log-Likelihood: -786.44\n", "converged: True LL-Null: -1460.3\n", "Covariance Type: nonrobust LLR p-value: 2.172e-293\n", "===================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "-----------------------------------------------------------------------------------\n", "Intercept -10.7703 0.371 -29.019 0.000 -11.498 -10.043\n", "C(student)[T.1] -0.7004 0.147 -4.761 0.000 -0.989 -0.412\n", "balance 0.5746 0.023 24.680 0.000 0.529 0.620\n", "===================================================================================\n", "\n", "Possibly complete quasi-separation: A fraction 0.14 of observations can be\n", "perfectly predicted. This might indicate that there is complete\n", "quasi-separation. 
In this case some parameters will not be identified.\n", "\"\"\"" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lr.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate Model" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# NOTE: the model is scored on the same rows it was fitted on (in-sample),\n", "# so the metrics below are not an estimate of out-of-sample performance\n", "X = df[['balance','student']]\n", "y = df['default']\n", "y_probabilities = lr.predict(X)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# label an observation as a default (1) when its fitted probability is above 0.5;\n", "# vectorised equivalent of applying the builtin round() to each probability\n", "y_hat = (y_probabilities > 0.5).astype(int)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9735\n" ] } ], "source": [ "print(accuracy_score(y,y_hat))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[9618 49]\n", " [ 216 117]]\n" ] } ], "source": [ "print(confusion_matrix(y,y_hat))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.98 0.99 0.99 9667\n", " 1 0.70 0.35 0.47 333\n", "\n", " accuracy 0.97 10000\n", " macro avg 0.84 0.67 0.73 10000\n", "weighted avg 0.97 0.97 0.97 10000\n", "\n" ] } ], "source": [ "print(classification_report(y,y_hat))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predictions" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# predict new points\n", "data_new = {'balance': [5.2,10.1,12.3,20.1,22.6], \n", " 'student': [1,0,1,0,1]}\n", "df_new = pd.DataFrame(data_new)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "df_new['probability'] = lr.predict(df_new).round(2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
balancestudentprobability
05.210.00
110.100.01
212.310.01
320.100.69
422.610.82
\n", "
" ], "text/plain": [ " balance student probability\n", "0 5.2 1 0.00\n", "1 10.1 0 0.01\n", "2 12.3 1 0.01\n", "3 20.1 0 0.69\n", "4 22.6 1 0.82" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Intercept 0.000021\n", "C(student)[T.1] 0.496375\n", "balance 1.776450\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# exponentiate the fitted logit coefficients (log-odds scale -> odds scale)\n", "odds_ratios = np.exp(lr.params)\n", "odds_ratios" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Intercept -99.997898\n", "C(student)[T.1] -50.362525\n", "balance 77.644961\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# same exponentiated coefficients, expressed as a percent difference from 1\n", "odds_pct_change = (np.exp(lr.params) - 1) * 100\n", "odds_pct_change" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.11" } }, "nbformat": 4, "nbformat_minor": 4 }