def remove_outliers_by_iqr(df, colname, iqr_threshold):
    """Drop null rows and IQR-based outliers for a single column.

    Rows where ``df[colname]`` is null are removed first. From the
    remaining rows, any row whose value lies below
    ``Q1 - iqr_threshold * IQR`` or above ``Q3 + iqr_threshold * IQR`` is
    removed as well (values exactly on a fence are kept). A short summary
    of how many rows were dropped at each step is printed.

    Args:
        df: Table to filter; used through a pandas-style DataFrame
            interface (boolean indexing, ``.shape``, ``.index``).
        colname: Label of the column the outlier test is applied to.
        iqr_threshold: Multiplier applied to the IQR to place the fences;
            converted with ``float()`` before use.

    Returns:
        The filtered table; ``df`` itself is not modified.
    """
    multiplier = float(iqr_threshold)

    # Step 1: discard rows with no value in the target column.
    non_null = df[df[colname].notna()]
    values = non_null[colname]

    # Step 2: place the fences from the quartiles of the non-null values.
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - multiplier * spread
    upper_fence = upper_quartile + multiplier * spread

    # Step 3: keep every row that is neither below nor above the fences.
    too_low = values < lower_fence
    too_high = values > upper_fence
    within_fences = non_null[~(too_low | too_high)]

    # Bookkeeping for the printed report.
    non_null_count = values.count()
    number_nas = len(df.index) - non_null_count
    number_outliers = non_null_count - within_fences[colname].count()

    print('Dataframe shape before removing outliers: ', df.shape)
    print(number_nas, 'rows where column', colname, 'are NULL are removed')
    print('Dataframe shape after removing NAs in the column', colname, ':',
          non_null.shape)
    print(number_outliers, 'outliers are removed')
    print('Dataframe shape after removing NAs and ourliers in the column',
          colname, 'by', iqr_threshold, '* IQR:', within_fences.shape)
    return within_fences


df2 = remove_outliers_by_iqr(df, 'area', 1.5)
# Remove outliers by Z-score
def remove_outliers_by_z(df, colname, z_score_threshold):
    """Drop null rows and z-score-based outliers for a single column.

    Rows where ``df[colname]`` is null are removed first. The absolute
    z-score of the remaining values is then computed with
    ``scipy.stats.zscore`` and every row whose absolute z-score is greater
    than or equal to ``z_score_threshold`` is removed. A short summary of
    how many rows were dropped at each step is printed.

    Rows whose z-score is undefined (NaN) are kept. This happens, for
    example, when every non-null value in the column is identical: no
    value deviates from the mean, so none of them is an outlier.

    Args:
        df: Table to filter; used through a pandas-style DataFrame
            interface (boolean indexing, ``.shape``, ``.index``).
        colname: Label of the column the outlier test is applied to.
        z_score_threshold: Absolute z-score at or above which a row is
            treated as an outlier; converted with ``float()`` before use.

    Returns:
        The filtered table; ``df`` itself is not modified.
    """
    # Function-scope imports: the module's import section is not assumed
    # to provide these names.
    from scipy import stats
    import numpy as np

    df_drop_na = df[df[colname].notnull()]

    z = np.abs(stats.zscore(df_drop_na[colname]))
    # Build the mask as "not an outlier" instead of "z below threshold":
    # any comparison with NaN is False, so `z < threshold` would discard
    # every row of a zero-variance column, while `~(z >= threshold)`
    # keeps rows whose z-score is undefined.
    is_outlier = z >= float(z_score_threshold)
    df_drop_na_zoutlier = df_drop_na[~is_outlier]

    non_null_count = df_drop_na[colname].count()
    number_nas = len(df.index) - non_null_count
    number_outliers = non_null_count - df_drop_na_zoutlier[colname].count()

    print('Dataframe shape before removing outliers: ', df.shape)
    print(number_nas, 'rows where column', colname, 'are NULL are removed')
    print('Dataframe shape after removing NAs in the column', colname, ':',
          df_drop_na.shape)
    print(number_outliers, 'outliers are removed')
    print('Dataframe shape after removing NAs and ourliers in the column',
          colname, 'by Z score threshold', z_score_threshold, ':',
          df_drop_na_zoutlier.shape)
    return df_drop_na_zoutlier


df3 = remove_outliers_by_z(df, 'area', 3)