Baseline for DCRCL

Baseline for the challenge DCRCL

Open In Colab

Import necessary packages

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

Download data

In [ ]:
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-public-datasets/aicrowd_educational_dcrcl/data/public/test.csv
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-public-datasets/aicrowd_educational_dcrcl/data/public/train.csv

Load Data

In [2]:
train_data = pd.read_csv('train.csv')

Analyse Data

In [3]:
train_data.head()
Out[3]:
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
0 30000 2 2 1 38 0 0 0 0 0 ... 22810 25772 26360 1650 1700 1400 3355 1146 0 0
1 170000 1 4 1 28 0 0 0 -1 -1 ... 11760 0 4902 14000 5695 11760 0 4902 6000 0
2 340000 1 1 2 38 0 0 0 -1 -1 ... 1680 1920 9151 5000 7785 1699 1920 9151 187000 0
3 140000 2 2 2 29 0 0 0 2 0 ... 65861 64848 64936 3000 8600 6 2500 2500 2500 0
4 130000 2 2 1 42 2 2 2 0 0 ... 126792 103497 96991 6400 0 4535 3900 4300 3700 1

5 rows × 24 columns

In [4]:
train_data.describe()
Out[4]:
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
count 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000 ... 25500.000000 25500.000000 25500.000000 25500.000000 2.550000e+04 25500.000000 25500.000000 25500.000000 25500.000000 25500.000000
mean 167436.458039 1.604667 1.852824 1.551961 35.503333 -0.016275 -0.131882 -0.166706 -0.218667 -0.264157 ... 43139.224941 40252.920588 38846.415529 5690.801373 5.986709e+03 5246.605294 4829.790078 4810.296706 5187.016549 0.220902
std 129837.118639 0.488932 0.791803 0.522754 9.235048 1.126813 1.196710 1.192883 1.168375 1.132166 ... 64214.508636 60789.101393 59397.443604 17070.733348 2.402498e+04 18117.236738 16021.336645 15505.873498 17568.450557 0.414863
min 10000.000000 1.000000 0.000000 0.000000 21.000000 -2.000000 -2.000000 -2.000000 -2.000000 -2.000000 ... -170000.000000 -81334.000000 -209051.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 50000.000000 1.000000 1.000000 1.000000 28.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... 2360.000000 1779.250000 1280.000000 1000.000000 8.635000e+02 390.000000 292.750000 256.750000 113.750000 0.000000
50% 140000.000000 2.000000 2.000000 2.000000 34.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 19033.000000 18085.000000 17129.000000 2100.000000 2.010000e+03 1800.000000 1500.000000 1500.000000 1500.000000 0.000000
75% 240000.000000 2.000000 2.000000 2.000000 42.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 54084.750000 50080.750000 49110.500000 5006.000000 5.000000e+03 4507.000000 4001.250000 4024.000000 4000.000000 0.000000
max 1000000.000000 2.000000 6.000000 3.000000 79.000000 8.000000 8.000000 8.000000 8.000000 8.000000 ... 891586.000000 927171.000000 961664.000000 873552.000000 1.684259e+06 896040.000000 621000.000000 426529.000000 527143.000000 1.000000

8 rows × 24 columns

Split Data into Train and Validation

In [5]:
X = train_data.drop('default payment next month',1)
y = train_data['default payment next month']
# Validation testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Define the Classifier and Train

In [6]:
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
/home/gera/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[6]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Predict on Validation

In [7]:
y_pred = classifier.predict(X_val)
In [8]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})
df1 = df.head(25)
df1
Out[8]:
Actual Predicted
6913 0 0
11124 0 0
25100 1 0
2764 0 0
23216 0 0
17269 0 0
3073 0 0
8184 0 0
2595 0 0
5483 0 0
6508 0 0
11776 0 0
5306 0 0
18846 0 0
19854 0 0
2463 0 0
5304 0 0
23739 0 0
20427 0 0
20263 0 0
9578 0 0
14164 0 0
5107 0 0
5160 0 0
8450 0 0

Evaluate the Performance

In [9]:
print('F1 score Score:', metrics.f1_score(y_val, y_pred))  
print('ROC AUC Score:', metrics.roc_auc_score(y_val, y_pred))
F1 score Error: 0.0
ROC AUC Error: 0.49975062344139654

Load Test Set

In [10]:
test_data = pd.read_csv('test.csv')

Predict Test Set

In [11]:
y_test = classifier.predict(test_data)
In [12]:
df = pd.DataFrame(y_test,columns=['default payment next month'])
df.to_csv('submission.csv',index=False)

To participate in the challenge click here