# Source code for prepropy.imputation

import pandas as pd
import numpy as np


class imputation:
    """
    Column-wise imputer for missing values in a pandas DataFrame.

    The imputer is first fitted on a DataFrame (``fit``), which computes one
    replacement value per column, and can then fill the missing values of any
    DataFrame with the same number of columns (``fill``).

    Parameters
    --------
    method: str
        statistic used to compute the replacement values. One of
        "mean", "median" or "most_frequent".

    Attributes
    --------
    method: str
        the method passed at construction time.
    values: numpy array or None
        one replacement value per column, in column order. None until
        ``fit`` has been called.

    Raises
    --------
    KeyError
        if ``method`` is not one of the supported methods.

    Examples
    --------
    >>> test_df = pd.DataFrame([[np.nan,2,3],[2,np.nan,4],[5,6,7]])
    >>> imputer = imputation('mean')
    """

    def __init__(self, method):
        """
        Initialize the class

        Parameters
        --------
        method: str
            method we wish to do the imputing. One of "mean", "median"
            or "most_frequent".

        Raises
        --------
        KeyError
            if ``method`` is not one of the supported methods.
        """
        # KeyError is kept (rather than ValueError) so existing callers that
        # catch it keep working.
        if method not in ("mean", "median", "most_frequent"):
            raise KeyError("Method must be one of mean, median, most_frequent")
        self.method = method
        self.values = None

    def fit(self, data):
        """
        Calculates the value to be imputed for each column

        Parameters
        --------
        data: pandas.core.frame.DataFrame
            a pandas dataframe

        Returns
        --------
        The fitted instance of the imputation class (``self``), so calls
        can be chained.

        Raises
        --------
        TypeError
            if ``data`` is not a DataFrame, or if the method is "mean" or
            "median" and some column is not numeric.
        ValueError
            if ``data`` is empty.

        Examples
        --------
        >>> test_df = pd.DataFrame([[np.nan,2,3],[2,np.nan,4],[5,6,7]])
        >>> imputer = imputation('mean')
        >>> imputer = imputer.fit(test_df)
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input data must be a Pandas Dataframe")
        if data.empty:
            raise ValueError("DataFrame cannot be empty")

        if self.method in ("mean", "median"):
            # Both statistics are only defined for numeric columns.
            numeric_columns = data.select_dtypes(include=np.number).shape[1]
            if data.shape[1] != numeric_columns:
                raise TypeError("All values in dataframe must be numeric")
            if self.method == "mean":
                self.values = data.mean().values
            else:
                self.values = data.median().values
        elif self.method == "most_frequent":
            modes = data.mode()
            if modes.empty:
                # Every cell is missing, so no column has a mode; record NaN
                # for each column instead of failing on an empty result.
                self.values = np.full(data.shape[1], np.nan)
            else:
                # The first row holds one mode per column (ties are resolved
                # by pandas' ordering of the modes).
                self.values = modes.values[0]
        return self

    def fill(self, data_for_fill):
        """
        Fills the missing values in each column

        Parameters
        --------
        data_for_fill: pandas.core.frame.DataFrame
            a pandas dataframe whose missing values we wish to fill. It is
            not modified.

        Returns
        --------
        A new dataframe with the missing values imputed

        Raises
        --------
        TypeError
            if ``fit`` has not been called yet, or if the number of columns
            differs from the number of fitted values.

        Examples
        --------
        >>> test_df = pd.DataFrame([[np.nan,2,3],[2,np.nan,4],[5,6,7]])
        >>> imputer = imputation('mean')
        >>> imputer = imputer.fit(test_df)
        >>> new = imputer.fill(test_df)
        >>> test_df2 = pd.DataFrame([[1,10,8],[5,2,6],[np.nan,3,np.nan]])
        >>> new2 = imputer.fill(test_df2)
        """
        if self.values is None:
            # TypeError matches what the unfitted call raised before, but
            # with an actionable message.
            raise TypeError("Imputer must be fitted with fit() before fill()")
        if len(self.values) != data_for_fill.shape[1]:
            raise TypeError("Columns are not Equal")

        data = data_for_fill.copy()
        for position, value in enumerate(self.values):
            if pd.isna(value):
                # Nothing usable was learned for this column.
                continue
            column = data.iloc[:, position]
            if not column.isna().any():
                continue
            # Assign the filled column back explicitly: an in-place fillna on
            # the temporary selection would not reliably update ``data``.
            data.iloc[:, position] = column.fillna(value)
        return data