# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline


# import data 
df = pd.read_csv('LoanData.csv')


# to find out the datatype, shape, stats of the data
df.shape

(113937, 81)


# to choose ten variables to study the patterns amongst them.

new_variables = ['Term','BorrowerRate','LoanOriginalAmount','EstimatedReturn','EmploymentStatus','TotalProsperLoans','AmountDelinquent','StatedMonthlyIncome','OnTimeProsperPayments','MonthlyLoanPayment']
df_new = df[new_variables]
df_new


df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype
---  ------                 --------------   -----
 0   Term                   113937 non-null  int64
 1   BorrowerRate           113937 non-null  float64
 2   LoanOriginalAmount     113937 non-null  int64
 3   EstimatedReturn        84853 non-null   float64
 4   EmploymentStatus       111682 non-null  object
 5   TotalProsperLoans      22085 non-null   float64
 6   AmountDelinquent       106315 non-null  float64
 7   StatedMonthlyIncome    113937 non-null  float64
 8   OnTimeProsperPayments  22085 non-null   float64
 9   MonthlyLoanPayment     113937 non-null  float64
dtypes: float64(7), int64(2), object(1)
memory usage: 8.3+ MB


df_new.shape

(113937, 10)


df_new.describe()


# to find out the different kinds of status.
pd.value_counts(df_new.EmploymentStatus).plot.bar();


df_unemployed = df_new[df.EmploymentStatus.isin(["Not employed", "Retired"])] #to select only variables as unemployed
df_employed = df_new[df.EmploymentStatus.isin(["Employed", "Full-time", "Self-employed", "Part-time"])]


df_others = df_new[df.EmploymentStatus.isin(["Not available", "Other", "No_Status"])]
df_hasincome = df_others.loc[df_others['StatedMonthlyIncome'] != 0]
df_employed = df_employed.append(df_hasincome, ignore_index = True)

df_noincome = df_others.loc[df_others['StatedMonthlyIncome'] == 0]
df_unemployed = df_unemployed.append(df_noincome, ignore_index = True)


num_na = df_new.isna().sum()
base_colour = sb.color_palette()[5]
sb.barplot(num_na.index.values, num_na, color = base_colour);
plt.xticks(rotation = 90);


# to fillna with 0.

df_new['EmploymentStatus'].fillna('No_Status', inplace=True);
df_new.fillna(0, inplace = True);
df_employed.fillna(0, inplace = True);
df_unemployed.fillna(0, inplace = True);

C:\Users\ajeet_8zakkbd\anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
C:\Users\ajeet_8zakkbd\anaconda3\lib\site-packages\pandas\core\frame.py:4153: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


df_employed.shape, df_unemployed.shape

((110019, 10), (1663, 10))


# two plots to see how the interest rates are distributed between unemployed and employed.

plt.figure(figsize = (10, 5))

plt.subplot(1,2,1)
bins = np.arange(0, df_employed['BorrowerRate'].max()+0.01, 0.01)
plt.hist(data = df_employed, x = 'BorrowerRate', bins = bins, rwidth = 0.9)
plt.xlabel('Interest Rate-Employed');

plt.subplot(1,2,2)
bins = np.arange(0, df_unemployed['BorrowerRate'].max()+0.01, 0.01)
plt.hist(data = df_unemployed, x = 'BorrowerRate', bins = bins, rwidth = 0.9)
plt.xlabel('Interest Rate-Unemployed');


# to find out the different Terms allocated to the loans.

plt.subplot(1,3,1)
sorted_counts = df_new.Term.value_counts()
plt.pie(sorted_counts, labels = sorted_counts.index, startangle = 90, counterclock = False, wedgeprops = {'width' : 0.4});
plt.axis('square');

plt.subplot(1,3,2)
employed = df_employed.Term.value_counts()
plt.pie(employed, labels = employed.index, startangle = 90, counterclock = False, wedgeprops = {'width' : 0.4});
plt.axis('square');

plt.subplot(1,3,3)
unemployed = df_unemployed.Term.value_counts()
plt.pie(unemployed, labels = unemployed.index, startangle = 90, counterclock = False, wedgeprops = {'width' : 0.4});
plt.axis('square');


# to find out the distribution of salaries.

plt.figure(figsize = (20,10))

plt.subplot(2, 1, 1);
log_data = np.log10(df_employed['StatedMonthlyIncome']);
log_bin_edges = np.arange(0, log_data.max()+0.05, 0.05);
plt.hist(log_data, bins = log_bin_edges, rwidth = 0.9);
plt.xlabel('log(values)-employed');

plt.subplot(2, 1, 2);
log_data = np.log10(df_unemployed['StatedMonthlyIncome']);
log_bin_edges = np.arange(0, log_data.max()+0.05, 0.05);
plt.hist(log_data, bins = log_bin_edges, rwidth = 0.9);
plt.xlabel('log(values)-unemployed');

C:\Users\ajeet_8zakkbd\anaconda3\lib\site-packages\pandas\core\series.py:679: RuntimeWarning: divide by zero encountered in log10
  result = getattr(ufunc, method)(*inputs, **kwargs)


# Hist plots to show intial loan amount.
plt.figure(figsize = (20,10))
plt.subplot(2,1,1)
bins = np.arange(1000, df_employed['LoanOriginalAmount'].max()+500, 500)
plt.hist(df_employed['LoanOriginalAmount'], rwidth = 0.9, bins= bins);
plt.xlabel('Original Loan Amount-Unemployed');

plt.subplot(2,1,2)
bins = np.arange(1000, df_unemployed['LoanOriginalAmount'].max()+500, 500)
plt.hist(df_unemployed['LoanOriginalAmount'], rwidth = 0.9, bins= bins);
plt.xlabel('Original Loan Amount-Unemployed');


# to plot estimated returns using the raw data and the adjusted data. 
plt.figure(figsize = (15,5))
base_colour = sb.color_palette()[7]

# the numbers to the power of ten.
plt.subplot(1, 3, 1)
power_data = np.power(df_new['EstimatedReturn'], 2)
power_bin_edges = np.arange(0, power_data.max()+0.01, 0.01)
plt.hist(power_data, bins = power_bin_edges, color = base_colour, rwidth = 0.9);
plt.xlabel('power(values)')

#the bin edges on the left begin with a negative value.
plt.subplot(1, 3, 2)
bins = np.arange(-0.2, df_new['EstimatedReturn'].max()+0.05, 0.05)
plt.hist(data = df_new, x = 'EstimatedReturn', color = base_colour, bins = bins, rwidth = 0.9);

#the bin edges on the left begin with a '0'.
plt.subplot(1, 3, 3)
bins = np.arange(0, df_new['EstimatedReturn'].max()+0.05, 0.05)
plt.hist(data = df_new, x = 'EstimatedReturn', color = base_colour, bins = bins, rwidth = 0.9);

plt.show()


# to see the current delinquencies (at the time of listing)

plt.subplot(2,1,1)
bins = np.arange(0, df_employed['LoanCurrentDaysDelinquent'].max()+100, 100)
df_employed['LoanCurrentDaysDelinquent'].plot.hist(bins = bins, figsize = (20,10));
tick_locs = [0, 50, 100, 500, 1000, 1500, 2000, 2500];
plt.xticks(tick_locs, tick_locs);
plt.xlabel('Current Delinquent-Unemployed');

plt.subplot(2,1,2)
bins = np.arange(0, df_unemployed['LoanCurrentDaysDelinquent'].max()+50, 50)
df_unemployed['LoanCurrentDaysDelinquent'].plot.hist(bins = bins, figsize = (20,10));
tick_locs = [0, 50, 100, 500, 1000, 1500, 2000, 2500];
plt.xticks(tick_locs, tick_locs);
plt.xlabel('Current Delinquent-Unemployed');


# to plot dollar amount.
plt.figure(figsize = (20,10))

plt.subplot(2,1,1)
bins = np.arange(0, df_new['AmountDelinquent'].max()+ 1, 1);
plt.hist(data = df_new, x = 'AmountDelinquent', color = sb.color_palette()[6]);

# to log the values.
plt.subplot(2,1,2)
log_data = np.log10(df_new['AmountDelinquent']);
log_bin_edges = np.arange(0, log_data.max()+0.05, 0.05);
plt.hist(log_data, bins = log_bin_edges, rwidth = 0.9, color = sb.color_palette()[6]);
plt.xlabel('log(values)');


# to plot the number of on-time payments.
plt.figure(figsize = (10,5))

plt.subplot(2,1,1)
plt.hist(data = df_new, x = 'OnTimeProsperPayments', color = sb.color_palette()[4]);

plt.subplot(2,1,2)
bins = np.arange(0, df_new['OnTimeProsperPayments'].max()+ 1, 1)
plt.hist(data = df_new, x = 'OnTimeProsperPayments', color = sb.color_palette()[4], bins= bins);
tick_locs = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100];
plt.xticks(tick_locs, tick_locs);


df_employed.plot.scatter(y = 'AmountDelinquent', x = 'OnTimeProsperPayments', alpha = 0.5);
plt.xlabel('On Time Prosper Payments - Employed');
df_unemployed.plot.scatter(y = 'AmountDelinquent', x = 'OnTimeProsperPayments', alpha = 0.5);
plt.xlabel('On Time Prosper Payments - Unemployed');


plt.figure(figsize = (15, 5));

g = sb.PairGrid(data = df_unemployed, vars = ['BorrowerRate', 'OnTimeProsperPayments']);
g.map_diag(plt.hist);
g.map_offdiag(plt.scatter);

g = sb.PairGrid(data = df_employed, vars = ['BorrowerRate', 'OnTimeProsperPayments']);
g.map_diag(plt.hist);
g.map_offdiag(plt.scatter);

<Figure size 1080x360 with 0 Axes>


g = sb.FacetGrid(data = df_employed, col = 'EmploymentStatus', height=4, aspect=0.9);
g.map(plt.hist, "LoanOriginalAmount");

g = sb.FacetGrid(data = df_unemployed, col = 'EmploymentStatus', height=4, aspect=0.9);
g.map(plt.hist, "LoanOriginalAmount");


g = sb.FacetGrid(data = df_unemployed, col = 'EmploymentStatus', height=4, aspect=0.9);
g.map(plt.hist, "StatedMonthlyIncome");


g = sb.PairGrid(data = df_unemployed, x_vars = ['MonthlyLoanPayment', 'LoanOriginalAmount'],
                y_vars = ['EmploymentStatus']);
g.map(sb.violinplot, inner = None);


g = sb.FacetGrid(data = df_unemployed, col = 'EmploymentStatus', size = 5)
g.map(plt.hist,'MonthlyLoanPayment');
g.add_legend();

g = sb.FacetGrid(data = df_employed, col = 'EmploymentStatus', size = 5)
g.map(plt.hist,'MonthlyLoanPayment');
g.add_legend();


# to see the distribution of income and loan payment in their specified term.

# to find out the portion of employed and non-employed
g = sb.FacetGrid(data = df_employed, hue = 'EmploymentStatus', height = 5, margin_titles = True);
g.map(plt.scatter, 'Term', 'MonthlyLoanPayment');
g.add_legend();

# the other data that aren't properly classfied to the status.
g = sb.FacetGrid(data = df_unemployed, hue = 'EmploymentStatus', height = 5, margin_titles = True);
g.map(plt.scatter, 'Term', 'MonthlyLoanPayment');
g.add_legend();


plt.figure(figsize = (15, 5));
g = sb.PairGrid(data = df_unemployed, vars = ['AmountDelinquent', 'OnTimeProsperPayments', 'MonthlyLoanPayment']);
g.map_diag(plt.hist);
g.map_offdiag(plt.scatter);

<Figure size 1080x360 with 0 Axes>


g = sb.PairGrid(data = df_unemployed, x_vars = ['Term', 'AmountDelinquent', 'LoanOriginalAmount', 'TotalProsperLoans', 'MonthlyLoanPayment'],
                y_vars = ['EmploymentStatus']);
g.map(sb.violinplot, inner = None);

g = sb.PairGrid(data = df_employed, x_vars = ['Term', 'AmountDelinquent', 'LoanOriginalAmount', 'TotalProsperLoans', 'MonthlyLoanPayment'],
                y_vars = ['EmploymentStatus']);
g.map(sb.violinplot, inner = None);


!jupyter nbconvert "Exploration of Loan Data from Prosper.ipynb" --to html --output-dir='C:\Users\Ajeet\OneDrive\Projects\Blogging\ajeethaa.github.io'

C:\Users\Ajeet\anaconda3\lib\site-packages\traitlets\traitlets.py:2195: FutureWarning: Supporting extra quotes around Unicode is deprecated in traitlets 5.0. Use 'C:\\Users\\Ajeet\\OneDrive\\Projects\\Blogging\\ajeethaa.github.io' instead of "'C:\\Users\\Ajeet\\OneDrive\\Projects\\Blogging\\ajeethaa.github.io'" – or use CUnicode.
  warn(
[NbConvertApp] Converting notebook Exploration of Loan Data from Prosper.ipynb to html
[NbConvertApp] Writing 1192321 bytes to C:\Users\Ajeet\OneDrive\Projects\Blogging\ajeethaa.github.io\Exploration of Loan Data from Prosper.html

	Term	BorrowerRate	LoanOriginalAmount	EstimatedReturn	EmploymentStatus	TotalProsperLoans	AmountDelinquent	StatedMonthlyIncome	OnTimeProsperPayments	MonthlyLoanPayment
0	36	0.1580	9425	NaN	Self-employed	NaN	472.0	3083.333333	NaN	330.43
1	36	0.0920	10000	0.05470	Employed	NaN	0.0	6125.000000	NaN	318.93
2	36	0.2750	3001	NaN	Not available	NaN	NaN	2083.333333	NaN	123.32
3	36	0.0974	10000	0.06000	Employed	NaN	10056.0	2875.000000	NaN	321.45
4	36	0.2085	15000	0.09066	Employed	1.0	0.0	9583.333333	11.0	563.97
...	...	...	...	...	...	...	...	...	...	...
113932	36	0.1864	10000	0.09500	Employed	NaN	0.0	4333.333333	NaN	364.74
113933	36	0.1110	2000	0.08070	Employed	3.0	5062.0	8041.666667	60.0	65.57
113934	60	0.2150	10000	0.08578	Employed	NaN	0.0	2875.000000	NaN	273.35
113935	60	0.2605	15000	0.15950	Full-time	1.0	0.0	3875.000000	16.0	449.55
113936	36	0.1039	2000	0.06081	Employed	NaN	257.0	4583.333333	NaN	64.90

	Term	BorrowerRate	LoanOriginalAmount	EstimatedReturn	TotalProsperLoans	AmountDelinquent	StatedMonthlyIncome	OnTimeProsperPayments	MonthlyLoanPayment
count	113937.000000	113937.000000	113937.00000	84853.000000	22085.000000	106315.000000	1.139370e+05	22085.000000	113937.000000
mean	40.830248	0.192764	8337.01385	0.096068	1.421100	984.507059	5.608026e+03	22.271949	272.475783
std	10.436212	0.074818	6245.80058	0.030403	0.764042	7158.270157	7.478497e+03	18.830425	192.697812
min	12.000000	0.000000	1000.00000	-0.182700	0.000000	0.000000	0.000000e+00	0.000000	0.000000
25%	36.000000	0.134000	4000.00000	0.074080	1.000000	0.000000	3.200333e+03	9.000000	131.620000
50%	36.000000	0.184000	6500.00000	0.091700	1.000000	0.000000	4.666667e+03	15.000000	217.740000
75%	36.000000	0.250000	12000.00000	0.116600	2.000000	0.000000	6.825000e+03	32.000000	371.580000
max	60.000000	0.497500	35000.00000	0.283700	8.000000	463881.000000	1.750003e+06	141.000000	2251.510000

Exploration of Loan Data from Prosper¶

My First Data Analysis Project¶

Dataset¶

Summary of Findings¶

Key Insights for Presentation¶

Preliminary Wrangling¶

Data structure of the dataset¶

The main features of interest¶

Univariate Exploration¶

Distribution of the variables¶

Bivariate Exploration¶

The relationships in the dataset¶

Multivariate Exploration¶

Conclusion¶

References:¶