Python Coding
Banking Assignment
The project utilizes Jupyter Notebook to perform multiple data analysis about banks.
#import data
%cd C:\\Users\\janet\\Desktop\\Columbia\\Spring 2018\\Python 5210\\Ad Campaign Assignment
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df=pd.read_csv("banklist.csv")
print(len(df))
df.info( )
df.head( )
#Check missing values and identify top distribution
df.isnull( ).sum( )
df['Acquiring Institution'].value_counts( ).head(5)
A_city=df['City'].drop_duplicates( )
A_city.str.startswith('A').value_counts( )
city_split = df['City'].str.split( ) #find the length of splitted that is equal to two
city_split.str.len( ).value_counts( )
city_list = df.copy( )
list_2words = city_list.loc[city_list.loc[:,'City'].str.split(' ').map(lambda x : len(x))== 2,'City']
print(list_2words.unique( ) )
#subset data frame to analyze certain part of data
bins = 70
plt.title('CERT Distribution')
plt.xlabel('CERT')
plt.ylabel('Frequency')
x=df['CERT'].dropna( )
#note, to use the matplotlib method, you have to first remove missing values before
#passing in the series object as parameter for plotting
plt.hist(x, bins, alpha=0.5, histtype='bar', ec='black',color='indigo')
plt.show( )
ten_perc=df['CERT'].describe(percentiles=[.1])['10%']
ten_perc
df_subset = df[df['CERT']<ten_perc]
df_subset.head( )
sub_2 = df_subset[df_subset['ST'].map(lambda x : df_subset['ST'].value_counts( )[x]) > 1]
sub_2.head( )
cert_info = sub_2.pivot_table(values = 'CERT', index=['ST'],aggfunc=[np.min,np.max,np.mean])
cert_info.rename(columns={"amin":"min_CERT","amax":"max_CERT","mean":"mean_CERT"},inplace=True)
cert_info = cert_info.swaplevel(0,1,axis=1)
cert_info = cert_info['CERT']
cert_info
