#Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Parenthesised so the multi-name import can wrap across lines; a line ending
# in a bare trailing comma followed by a newline is a SyntaxError.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import cross_val_score
Project Summary: KNN Model for Loan Risk Classification
Objective The goal of this project is to predict loan repayment risk using the K-Nearest
Neighbors (KNN) algorithm. Our model classifies loans as either:-
• 1 (Fully Paid - Good Loan )
• 0 (Charged Off - Bad Loan)
We achieve this by analyzing borrower characteristics and financial indicators. The model helps
measure the likelihood of loan default, which supports the lender's decision-making process.
Steps
1. Data Preprocessing & Feature Selection: Load the dataset and identify missing
values. Drop irrelevant features. Encode categorical variables. Convert the target,
loan_status, to binary (1 = Fully Paid, 0 = Charged Off). Scale numerical
features. Select the most relevant features.
2. Train-Test Split: Split the data into training (80%) and testing (20%) sets, and
re-check that loan_status is still binary at this point. (I hit errors earlier, so I
confirm it again here even though it may not be strictly necessary.)
3. KNN Model Training: Train a KNN classifier with k=5 as a starting point. The evaluated
model accuracy was 93.93%.
We noticed that the recall for Charged Off loans was low.
4. Optimizing k for Better Performance: We tuned the value of k (number of neighbors) using
cross-validation and found that k=3 achieved a higher accuracy. Retraining the model with
k=3 gave an accuracy of 94%.
# Load the loan dataset into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred once from all of its values.
# The default chunked parsing raised a DtypeWarning on this file
# ("Columns (0,45) have mixed types").
df = pd.read_csv(r"C:\Users\User\Downloads\loan.csv", low_memory=False)
# Preview the first five rows.
df.head()
C:\Users\User\AppData\Local\Temp\ipykernel_1768\975024387.py:2:
DtypeWarning: Columns (0,45) have mixed types. Specify dtype option on
import or set low_memory=False.
df=pd.read_csv(r"C:\Users\User\Downloads\loan.csv")
id loan_amnt funded_amnt funded_amnt_inv term
int_rate \
0 NaN 5000.0 5000.0 4975.0 36 months 10.65%
1 NaN 2500.0 2500.0 2500.0 60 months 15.27%
2 NaN 2400.0 2400.0 2400.0 36 months 15.96%
3 NaN 10000.0 10000.0 10000.0 36 months 13.49%
4 NaN 3000.0 3000.0 3000.0 60 months 12.69%
installment grade sub_grade emp_title ... \
0 162.87 B B2 NaN ...
1 59.83 C C4 Ryder ...
2 84.33 C C5 NaN ...
3 339.31 C C1 AIR RESOURCES BOARD ...
4 67.79 B B5 University Medical Group ...
last_credit_pull_d collections_12_mths_ex_med policy_code
application_type \
0 Jul-2017 0.0 1.0
INDIVIDUAL
1 Oct-2016 0.0 1.0
INDIVIDUAL
2 Jun-2017 0.0 1.0
INDIVIDUAL
3 Apr-2016 0.0 1.0
INDIVIDUAL
4 Jan-2017 0.0 1.0
INDIVIDUAL
acc_now_delinq chargeoff_within_12_mths delinq_amnt
pub_rec_bankruptcies \
0 0.0 0.0 0.0
0.0
1 0.0 0.0 0.0
0.0
2 0.0 0.0 0.0
0.0
3 0.0 0.0 0.0
0.0
4 0.0 0.0 0.0
0.0
tax_liens hardship_flag
, 0 0.0 N
1 0.0 N
2 0.0 N
3 0.0 N
4 0.0 N
[5 rows x 56 columns]
# Print a summary of the DataFrame: index range, column names,
# non-null counts per column, and column dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42538 entries, 0 to 42537
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3 non-null object
1 loan_amnt 42535 non-null float64
2 funded_amnt 42535 non-null float64
3 funded_amnt_inv 42535 non-null float64
4 term 42535 non-null object
5 int_rate 42535 non-null object
6 installment 42535 non-null float64
7 grade 42535 non-null object
8 sub_grade 42535 non-null object
9 emp_title 39909 non-null object
10 emp_length 41423 non-null object
11 home_ownership 42535 non-null object
12 annual_inc 42531 non-null float64
13 verification_status 42535 non-null object
14 issue_d 42535 non-null object
15 loan_status 42535 non-null object
16 pymnt_plan 42535 non-null object
17 desc 29240 non-null object
18 purpose 42535 non-null object
19 title 42522 non-null object
20 zip_code 42535 non-null object
21 addr_state 42535 non-null object
22 dti 42535 non-null float64
23 delinq_2yrs 42506 non-null float64
24 earliest_cr_line 42506 non-null object
25 inq_last_6mths 42506 non-null float64
26 mths_since_last_delinq 15609 non-null float64
27 mths_since_last_record 3651 non-null float64
28 open_acc 42506 non-null float64
29 pub_rec 42506 non-null float64
30 revol_bal 42535 non-null float64
31 revol_util 42445 non-null object
32 total_acc 42506 non-null float64
33 initial_list_status 42535 non-null object
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Parenthesised so the multi-name import can wrap across lines; a line ending
# in a bare trailing comma followed by a newline is a SyntaxError.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import cross_val_score
Project Summary: KNN Model for Loan Risk Classification
Objective The goal of this project is to predict loan repayment risk using the K-Nearest
Neighbors (KNN) algorithm. Our model classifies loans as either:-
• 1 (Fully Paid - Good Loan )
• 0 (Charged Off - Bad Loan)
We achieve this by analyzing borrower characteristics and financial indicators. The model helps
measure the likelihood of loan default, which supports the lender's decision-making process.
Steps
1. Data Preprocessing & Feature Selection: Load the dataset and identify missing
values. Drop irrelevant features. Encode categorical variables. Convert the target,
loan_status, to binary (1 = Fully Paid, 0 = Charged Off). Scale numerical
features. Select the most relevant features.
2. Train-Test Split: Split the data into training (80%) and testing (20%) sets, and
re-check that loan_status is still binary at this point. (I hit errors earlier, so I
confirm it again here even though it may not be strictly necessary.)
3. KNN Model Training: Train a KNN classifier with k=5 as a starting point. The evaluated
model accuracy was 93.93%.
We noticed that the recall for Charged Off loans was low.
4. Optimizing k for Better Performance: We tuned the value of k (number of neighbors) using
cross-validation and found that k=3 achieved a higher accuracy. Retraining the model with
k=3 gave an accuracy of 94%.
# Load the loan dataset into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred once from all of its values.
# The default chunked parsing raised a DtypeWarning on this file
# ("Columns (0,45) have mixed types").
df = pd.read_csv(r"C:\Users\User\Downloads\loan.csv", low_memory=False)
# Preview the first five rows.
df.head()
C:\Users\User\AppData\Local\Temp\ipykernel_1768\975024387.py:2:
DtypeWarning: Columns (0,45) have mixed types. Specify dtype option on
import or set low_memory=False.
df=pd.read_csv(r"C:\Users\User\Downloads\loan.csv")
id loan_amnt funded_amnt funded_amnt_inv term
int_rate \
0 NaN 5000.0 5000.0 4975.0 36 months 10.65%
1 NaN 2500.0 2500.0 2500.0 60 months 15.27%
2 NaN 2400.0 2400.0 2400.0 36 months 15.96%
3 NaN 10000.0 10000.0 10000.0 36 months 13.49%
4 NaN 3000.0 3000.0 3000.0 60 months 12.69%
installment grade sub_grade emp_title ... \
0 162.87 B B2 NaN ...
1 59.83 C C4 Ryder ...
2 84.33 C C5 NaN ...
3 339.31 C C1 AIR RESOURCES BOARD ...
4 67.79 B B5 University Medical Group ...
last_credit_pull_d collections_12_mths_ex_med policy_code
application_type \
0 Jul-2017 0.0 1.0
INDIVIDUAL
1 Oct-2016 0.0 1.0
INDIVIDUAL
2 Jun-2017 0.0 1.0
INDIVIDUAL
3 Apr-2016 0.0 1.0
INDIVIDUAL
4 Jan-2017 0.0 1.0
INDIVIDUAL
acc_now_delinq chargeoff_within_12_mths delinq_amnt
pub_rec_bankruptcies \
0 0.0 0.0 0.0
0.0
1 0.0 0.0 0.0
0.0
2 0.0 0.0 0.0
0.0
3 0.0 0.0 0.0
0.0
4 0.0 0.0 0.0
0.0
tax_liens hardship_flag
, 0 0.0 N
1 0.0 N
2 0.0 N
3 0.0 N
4 0.0 N
[5 rows x 56 columns]
# Print a summary of the DataFrame: index range, column names,
# non-null counts per column, and column dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42538 entries, 0 to 42537
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3 non-null object
1 loan_amnt 42535 non-null float64
2 funded_amnt 42535 non-null float64
3 funded_amnt_inv 42535 non-null float64
4 term 42535 non-null object
5 int_rate 42535 non-null object
6 installment 42535 non-null float64
7 grade 42535 non-null object
8 sub_grade 42535 non-null object
9 emp_title 39909 non-null object
10 emp_length 41423 non-null object
11 home_ownership 42535 non-null object
12 annual_inc 42531 non-null float64
13 verification_status 42535 non-null object
14 issue_d 42535 non-null object
15 loan_status 42535 non-null object
16 pymnt_plan 42535 non-null object
17 desc 29240 non-null object
18 purpose 42535 non-null object
19 title 42522 non-null object
20 zip_code 42535 non-null object
21 addr_state 42535 non-null object
22 dti 42535 non-null float64
23 delinq_2yrs 42506 non-null float64
24 earliest_cr_line 42506 non-null object
25 inq_last_6mths 42506 non-null float64
26 mths_since_last_delinq 15609 non-null float64
27 mths_since_last_record 3651 non-null float64
28 open_acc 42506 non-null float64
29 pub_rec 42506 non-null float64
30 revol_bal 42535 non-null float64
31 revol_util 42445 non-null object
32 total_acc 42506 non-null float64
33 initial_list_status 42535 non-null object