Source code for prepropy.eda

import pandas as pd
import altair as alt


[docs]def eda(df, target): """ Generates a dictionary to access summary statistics of the given data frame Parameters -------- df : pandas.DataFrame input dataframe to be analyzed target : string target column name Returns -------- dict access summary statistics of the given data frame. cor the correlation map Examples -------- >>> from propropy import eda >>> url1 = "https://archive.ics.uci.edu/ml/machine-learning-databases/" >>> url2 = "wine-quality/winequality-red.csv" >>> url = url1+url2 >>> df = pd.read_csv(url, ";") >>> target = "quality" >>> res = eda(df,quality) """ # Check the dataframe input if not isinstance(df, pd.DataFrame): raise TypeError("Input data must be an instance of DataFrame") # Create an empty dictionary res = {} # obtain statistical information df_fea = df.drop(target, 1) num_fea = df_fea.select_dtypes("number").columns.to_list() cat_fea = list(set(list(df_fea.columns)) - set(num_fea)) key_null = list(df_fea.isnull().sum().index) val_null = list(df_fea.isnull().sum().values) res["nb_missing_values"] = list(zip(key_null, val_null)) res["nb_cat_features"] = len(cat_fea) res["cat_features_name"] = cat_fea res["nb_num_features"] = len(num_fea) res["num_features_name"] = num_fea res["nb_class"] = len(list(set(df[target]))) class_count = df[target].value_counts(normalize=True).values res["class_ratio"] = list(class_count.round(4)) # Create a pair plots with Altair color_lab = target + ":N" chart = ( alt.Chart(df) .mark_circle() .encode( alt.X(alt.repeat("column"), type="quantitative"), alt.Y(alt.repeat("row"), type="quantitative"), color=color_lab, ) .properties(width=100, height=100) .repeat(row=num_fea, column=num_fea) ) res["pairplot"] = chart return res