# Source code for prepropy.imputation
import pandas as pd
import numpy as np
class imputation:
    """
    Impute missing values in a pandas DataFrame, column by column.

    One fill value per column is learned by ``fit`` and substituted for
    the missing entries of a DataFrame by ``fill``.

    Parameters
    ----------
    method : str
        Statistic used as the fill value; one of ``"mean"``, ``"median"``
        or ``"most_frequent"``.

    Attributes
    ----------
    method : str
        The imputation strategy given at construction.
    values : numpy.ndarray or None
        Per-column fill values computed by ``fit``; ``None`` until ``fit``
        has been called.

    Raises
    ------
    KeyError
        If ``method`` is not one of the supported strategies.

    Examples
    --------
    >>> test_df = pd.DataFrame([[np.nan, 2, 3], [2, np.nan, 4], [5, 6, 7]])
    >>> imputer = imputation('mean')
    """

    # Supported strategies; anything else is rejected by __init__.
    _METHODS = ("mean", "median", "most_frequent")

    def __init__(self, method: str) -> None:
        """
        Initialize the imputer.

        Parameters
        ----------
        method : str
            Statistic used as the fill value; one of ``"mean"``,
            ``"median"`` or ``"most_frequent"``.

        Raises
        ------
        KeyError
            If ``method`` is not one of the supported strategies.
        """
        if method not in self._METHODS:
            # KeyError (not ValueError) is kept so that existing callers
            # catching it continue to work.
            raise KeyError("Method must be one of mean, median, most_frequent")
        self.method = method
        self.values = None

    def fit(self, data: pd.DataFrame) -> "imputation":
        """
        Compute the fill value for each column of ``data``.

        The result is stored in ``self.values`` in column order.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame to learn the fill values from. For ``"mean"`` and
            ``"median"`` every column must have a numeric dtype.

        Returns
        -------
        imputation
            This instance, so calls can be chained.

        Raises
        ------
        TypeError
            If ``data`` is not a DataFrame, or if a non-numeric column is
            present while the method is ``"mean"`` or ``"median"``.
        ValueError
            If ``data`` is empty.

        Examples
        --------
        >>> test_df = pd.DataFrame([[np.nan, 2, 3], [2, np.nan, 4], [5, 6, 7]])
        >>> imputer = imputation('mean').fit(test_df)
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input data must be a Pandas Dataframe")
        if data.empty:
            raise ValueError("DataFrame cannot be empty")

        if self.method == "most_frequent":
            modes = data.mode()
            if len(modes):
                # The first row holds the first mode of every column.
                self.values = modes.values[0]
            else:
                # Every column is entirely missing, so no mode exists;
                # use NaN fill values, as mean/median would produce.
                self.values = np.full(data.shape[1], np.nan)
        elif self.method in ("mean", "median"):
            numeric_width = data.select_dtypes(include=np.number).shape[1]
            if data.shape[1] != numeric_width:
                raise TypeError("All values in dataframe must be numeric")
            if self.method == "mean":
                statistic = data.mean()
            else:
                statistic = data.median()
            self.values = statistic.values
        return self

    def fill(self, data_for_fill: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``data_for_fill`` with missing values imputed.

        Column ``i`` is filled with ``self.values[i]``; matching is by
        position, not by column label. The input frame is not modified.

        Parameters
        ----------
        data_for_fill : pandas.DataFrame
            Frame whose missing values should be filled. It must have as
            many columns as the frame passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            A new frame with the same index and column labels as
            ``data_for_fill`` and the missing values filled.

        Raises
        ------
        TypeError
            If ``fit`` has not been called, if ``data_for_fill`` is not a
            DataFrame, or if the number of columns differs from the
            number of fitted values.

        Examples
        --------
        >>> test_df = pd.DataFrame([[np.nan, 2, 3], [2, np.nan, 4], [5, 6, 7]])
        >>> imputer = imputation('mean').fit(test_df)
        >>> new = imputer.fill(test_df)
        >>> test_df2 = pd.DataFrame([[1, 10, 8], [5, 2, 6], [np.nan, 3, np.nan]])
        >>> new2 = imputer.fill(test_df2)
        """
        if self.values is None:
            # TypeError is what the unguarded len(None) used to raise;
            # the type is kept and only the message is made explicit.
            raise TypeError("fit must be called before fill")
        if not isinstance(data_for_fill, pd.DataFrame):
            raise TypeError("Input data must be a Pandas Dataframe")
        if len(self.values) != data_for_fill.shape[1]:
            raise TypeError("Columns are not Equal")

        # Build the result from freshly filled columns. An in-place fillna
        # on ``frame.iloc[:, i]`` acts on a temporary object and is not
        # guaranteed to write back to the frame (it never does under
        # pandas Copy-on-Write).
        filled_columns = [
            data_for_fill.iloc[:, position].fillna(value)
            for position, value in enumerate(self.values)
        ]
        filled = pd.concat(
            filled_columns, axis=1, keys=range(len(filled_columns))
        )
        # Restore the original labels (this also handles duplicate or
        # unnamed columns, which positional keys sidestep above).
        filled.columns = data_for_fill.columns
        return filled