{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf\n",
    "\n",
    "\n",
    "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read and Confirm Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data/ccdefault.csv').round(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>default</th>\n",
       "      <th>student</th>\n",
       "      <th>balance</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>729.5</td>\n",
       "      <td>44361.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>817.2</td>\n",
       "      <td>12106.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>1073.5</td>\n",
       "      <td>31767.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>529.3</td>\n",
       "      <td>35704.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>785.7</td>\n",
       "      <td>38463.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  default student  balance   income\n",
       "0      No      No    729.5  44361.6\n",
       "1      No     Yes    817.2  12106.1\n",
       "2      No      No   1073.5  31767.1\n",
       "3      No      No    529.3  35704.5\n",
       "4      No      No    785.7  38463.5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remap default = 'Yes' to 1; 'No' to 0\n",
    "df['default'] = np.where(df['default'] == \"Yes\", 1, 0)\n",
    "df['student'] = np.where(df['student'] == \"Yes\", 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# size balance and income to be 100s of $\n",
    "df['balance'] = np.round(df['balance']/100,0)\n",
    "df['income'] = np.round(df['income']/100,0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>default</th>\n",
       "      <th>student</th>\n",
       "      <th>balance</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>444.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>121.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>318.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>385.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   default  student  balance  income\n",
       "0        0        0      7.0   444.0\n",
       "1        0        1      8.0   121.0\n",
       "2        0        0     11.0   318.0\n",
       "3        0        0      5.0   357.0\n",
       "4        0        0      8.0   385.0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic Regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.078644\n",
      "         Iterations 10\n"
     ]
    }
   ],
   "source": [
    "lr = smf.logit(formula='default ~ balance + C(student)',data=df).fit() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>Logit Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>        <td>default</td>     <th>  No. Observations:  </th>   <td> 10000</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                 <td>Logit</td>      <th>  Df Residuals:      </th>   <td>  9997</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>                 <td>MLE</td>       <th>  Df Model:          </th>   <td>     2</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>            <td>Sun, 26 Dec 2021</td> <th>  Pseudo R-squ.:     </th>   <td>0.4615</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                <td>15:34:59</td>     <th>  Log-Likelihood:    </th>  <td> -786.44</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>converged:</th>             <td>True</td>       <th>  LL-Null:           </th>  <td> -1460.3</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>     <td>nonrobust</td>    <th>  LLR p-value:       </th> <td>2.172e-293</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "         <td></td>            <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th>       <td>  -10.7703</td> <td>    0.371</td> <td>  -29.019</td> <td> 0.000</td> <td>  -11.498</td> <td>  -10.043</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>C(student)[T.1]</th> <td>   -0.7004</td> <td>    0.147</td> <td>   -4.761</td> <td> 0.000</td> <td>   -0.989</td> <td>   -0.412</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>balance</th>         <td>    0.5746</td> <td>    0.023</td> <td>   24.680</td> <td> 0.000</td> <td>    0.529</td> <td>    0.620</td>\n",
       "</tr>\n",
       "</table><br/><br/>Possibly complete quasi-separation: A fraction 0.14 of observations can be<br/>perfectly predicted. This might indicate that there is complete<br/>quasi-separation. In this case some parameters will not be identified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                           Logit Regression Results                           \n",
       "==============================================================================\n",
       "Dep. Variable:                default   No. Observations:                10000\n",
       "Model:                          Logit   Df Residuals:                     9997\n",
       "Method:                           MLE   Df Model:                            2\n",
       "Date:                Sun, 26 Dec 2021   Pseudo R-squ.:                  0.4615\n",
       "Time:                        15:34:59   Log-Likelihood:                -786.44\n",
       "converged:                       True   LL-Null:                       -1460.3\n",
       "Covariance Type:            nonrobust   LLR p-value:                2.172e-293\n",
       "===================================================================================\n",
       "                      coef    std err          z      P>|z|      [0.025      0.975]\n",
       "-----------------------------------------------------------------------------------\n",
       "Intercept         -10.7703      0.371    -29.019      0.000     -11.498     -10.043\n",
       "C(student)[T.1]    -0.7004      0.147     -4.761      0.000      -0.989      -0.412\n",
       "balance             0.5746      0.023     24.680      0.000       0.529       0.620\n",
       "===================================================================================\n",
       "\n",
       "Possibly complete quasi-separation: A fraction 0.14 of observations can be\n",
       "perfectly predicted. This might indicate that there is complete\n",
       "quasi-separation. In this case some parameters will not be identified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[['balance','student']]\n",
    "y = df['default']\n",
    "y_probabilities = lr.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_hat = list(map(round,y_probabilities))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9735\n"
     ]
    }
   ],
   "source": [
    "print(accuracy_score(y,y_hat))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[9618   49]\n",
      " [ 216  117]]\n"
     ]
    }
   ],
   "source": [
    "print(confusion_matrix(y,y_hat))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.98      0.99      0.99      9667\n",
      "           1       0.70      0.35      0.47       333\n",
      "\n",
      "    accuracy                           0.97     10000\n",
      "   macro avg       0.84      0.67      0.73     10000\n",
      "weighted avg       0.97      0.97      0.97     10000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(y,y_hat))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict new points\n",
    "data_new = {'balance': [5.2,10.1,12.3,20.1,22.6], \n",
    "        'student': [1,0,1,0,1]}\n",
    "df_new = pd.DataFrame(data_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_new['probability'] = lr.predict(df_new).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>balance</th>\n",
       "      <th>student</th>\n",
       "      <th>probability</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.2</td>\n",
       "      <td>1</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10.1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20.1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>22.6</td>\n",
       "      <td>1</td>\n",
       "      <td>0.82</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   balance  student  probability\n",
       "0      5.2        1         0.00\n",
       "1     10.1        0         0.01\n",
       "2     12.3        1         0.01\n",
       "3     20.1        0         0.69\n",
       "4     22.6        1         0.82"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept          0.000021\n",
       "C(student)[T.1]    0.496375\n",
       "balance            1.776450\n",
       "dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(lr.params) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept         -99.997898\n",
       "C(student)[T.1]   -50.362525\n",
       "balance            77.644961\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(np.exp(lr.params)-1)*100"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}