Prepare the Titanic data for plotting:
%matplotlib inline importpandasaspdimportnumpyasnpimportpylabaspltimportseaborn# Set the global default size of matplotlib figuresplt.rc('figure',figsize=(10,5))# Set seaborn aesthetic parameters to defaultsseaborn.set()
df_train = pd.read_csv('../data/titanic/train.csv')


def clean_data(df):
    """Return a cleaned copy of a Titanic passenger frame with derived features.

    The returned frame keeps every original column (``Embarked`` and ``Age``
    are left as read, so their missing values stay visible) and adds:

    * ``Sex_Val``: ``Sex`` encoded as an integer, numbered in the sorted
      order of the values found in the column.
    * ``Embarked_Val_<port>``: one dummy column per port of embarkation,
      with missing ports counted as ``'S'``.
    * ``AgeFill``: ``Age`` with gaps filled by the median age of the
      passenger's (``Sex_Val``, ``Pclass``) group.
    * ``FamilySize``: ``SibSp + Parch``.

    Missing ``Fare`` values are replaced in place by the mean fare.

    Args:
        df: DataFrame with the columns ``Sex``, ``Embarked``, ``Fare``,
            ``Age``, ``Pclass``, ``SibSp`` and ``Parch``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is never half-updated.
    df = df.copy()

    # Transform Sex from a string to a number representation.
    sexes = np.sort(df['Sex'].unique())
    genders_mapping = dict(zip(sexes, range(len(sexes))))
    df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)

    # Since the vast majority of passengers embarked in 'S', missing ports
    # are assigned to 'S'. The fill happens on a temporary Series *before*
    # the dummy columns are built, so those rows get Embarked_Val_S set
    # instead of an all-zero dummy row.
    embarked = df['Embarked'].fillna('S')
    embarked_dummies = pd.get_dummies(embarked, prefix='Embarked_Val')
    df = pd.concat([df, embarked_dummies], axis=1)

    # Fill missing Fare values with the average fare. Only the Fare column
    # is touched; missing values in other columns are left alone.
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # To keep Age intact, the filled ages go into a separate AgeFill column.
    # The median is used instead of the mean because the Age histogram
    # seems to be right skewed. transform() returns a Series aligned with
    # df's own index, so it can be handed straight to fillna().
    group_median_age = df.groupby(['Sex_Val', 'Pclass'])['Age'].transform('median')
    df['AgeFill'] = df['Age'].fillna(group_median_age)

    # FamilySize is the sum of Parch (parents or children on board) and
    # SibSp (siblings or spouses on board).
    df['FamilySize'] = df['SibSp'] + df['Parch']

    return df


df_train = clean_data(df_train)
# Size of matplotlib figures that contain subplots
figsize_with_subplots = (10, 10)

# One figure holding a 3x2 grid of plots
fig = plt.figure(figsize=figsize_with_subplots)
fig_dims = (3, 2)

# Top left: death and survival counts
ax_survived = plt.subplot2grid(fig_dims, (0, 0))
survived_counts = df_train['Survived'].value_counts()
survived_counts.plot(kind='bar',
                     title='Death and Survival Counts',
                     color='r',
                     align='center',
                     ax=ax_survived)

# Top right: passenger class counts
ax_pclass = plt.subplot2grid(fig_dims, (0, 1))
pclass_counts = df_train['Pclass'].value_counts()
pclass_counts.plot(kind='bar', title='Passenger Class Counts', ax=ax_pclass)

# Middle left: gender counts, with horizontal tick labels
ax_sex = plt.subplot2grid(fig_dims, (1, 0))
sex_counts = df_train['Sex'].value_counts()
sex_counts.plot(kind='bar', title='Gender Counts', ax=ax_sex)
plt.xticks(rotation=0)

# Middle right: ports of embarkation counts
ax_embarked = plt.subplot2grid(fig_dims, (1, 1))
embarked_counts = df_train['Embarked'].value_counts()
embarked_counts.plot(kind='bar', title='Ports of Embarkation Counts', ax=ax_embarked)

# Bottom left: histogram of the recorded ages
ax_age = plt.subplot2grid(fig_dims, (2, 0))
df_train['Age'].hist(ax=ax_age)
plt.title('Age Histogram')
<matplotlib.text.Text at 0x118cf7358>
# Get the unique values of FamilySize and its maximum
family_sizes = np.sort(df_train['FamilySize'].unique())
family_size_max = int(family_sizes.max())

# FamilySize of the passengers who died (df1) and who survived (df2)
df1 = df_train[df_train['Survived'] == 0]['FamilySize']
df2 = df_train[df_train['Survived'] == 1]['FamilySize']

# FamilySize is a whole number, so use one unit-wide bin per value:
# family_size_max + 1 bins over [0, family_size_max + 1] puts every bin
# edge on an integer, and the bar for size k spans [k, k + 1).
plt.hist([df1, df2],
         bins=family_size_max + 1,
         range=(0, family_size_max + 1),
         stacked=True)
plt.legend(('Died', 'Survived'), loc='best')
plt.title('Survivors by Family Size')
# Plot survival rate by Pclass (all passengers)
pclass_xt = pd.crosstab(df_train['Pclass'], df_train['Survived'])
# Normalize the cross tab so that each row (passenger class) sums to 1:
pclass_xt_pct = pclass_xt.div(pclass_xt.sum(axis=1).astype(float), axis=0)
pclass_xt_pct.plot(kind='bar',
                   stacked=True,
                   title='Survival Rate by Passenger Classes')
plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')

# Plot survival rate by Pclass for female passengers.
# Both crosstab inputs come from the same subset, so the result does not
# depend on pandas aligning a subset against the full frame's index.
females_df = df_train[df_train['Sex'] == 'female']
females_xt = pd.crosstab(females_df['Pclass'], females_df['Survived'])
females_xt_pct = females_xt.div(females_xt.sum(axis=1).astype(float), axis=0)
females_xt_pct.plot(kind='bar',
                    stacked=True,
                    title='Female Survival Rate by Passenger Class')
plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')

# Plot survival rate by Pclass for male passengers
males_df = df_train[df_train['Sex'] == 'male']
males_xt = pd.crosstab(males_df['Pclass'], males_df['Survived'])
males_xt_pct = males_xt.div(males_xt.sum(axis=1).astype(float), axis=0)
males_xt_pct.plot(kind='bar',
                  stacked=True,
                  title='Male Survival Rate by Passenger Class')
plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')
<matplotlib.text.Text at 0x1194800b8>
# Set up a grid of plots: scatter on top (axes[0]), histogram below (axes[1])
fig, axes = plt.subplots(2, 1, figsize=figsize_with_subplots)

# Histogram of the recorded Age values segmented by Survived.
# Passengers with no recorded age are dropped explicitly rather than
# passing NaN into hist(); the upper end of the range comes from AgeFill.
df1 = df_train[df_train['Survived'] == 0]['Age'].dropna()
df2 = df_train[df_train['Survived'] == 1]['Age'].dropna()
# Series.max() skips NaN, which the builtin max() does not do reliably.
max_age = df_train['AgeFill'].max()
# Roughly one bin per decade. The bin count must be an integer (a float
# such as 80.0 / 10 is rejected), and at least one bin is required.
age_bins = max(1, int(max_age // 10))
axes[1].hist([df1, df2], bins=age_bins, range=(1, max_age), stacked=True)
axes[1].legend(('Died', 'Survived'), loc='best')
axes[1].set_title('Survivors by Age Groups Histogram')
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Count')

# Scatter plot of Survived against AgeFill
axes[0].scatter(df_train['Survived'], df_train['AgeFill'])
axes[0].set_title('Survivors by Age Plot')
axes[0].set_xlabel('Survived')
axes[0].set_ylabel('Age')
# Sorted distinct passenger classes present in the data
passenger_classes = np.sort(df_train['Pclass'].unique())

# Draw one kernel density estimate of AgeFill per class, all on the same axes
for pclass in passenger_classes:
    in_class = df_train['Pclass'] == pclass
    class_ages = df_train.loc[in_class, 'AgeFill']
    class_ages.plot.kde()

plt.title('Age Density Plot by Passenger Class')
plt.xlabel('Age')
plt.legend(('1st Class', '2nd Class', '3rd Class'), loc='best')