Source code for dfsummarizer.funcs

# -*- coding: utf-8 -*-
from io import StringIO
import pandas as pd 
import numpy as np
import operator
import math
import os

from .config import max_filesize
from .FlajoletMartin import FMEstimator
 
"""
    dfsummarizer.funcs: Core functions of the dfsummarizer package.
        analyse_df( pandas_dataframe): return a sumamry dataframe of the input dataframe
        analyse_df_in_chunks(path_to_dataset): Read the dataset in chunks and provide a summary
"""


########################################################################################
[docs]def analyse_file(path_to_file):
    df = load_complete_dataframe(path_to_file)
    summary = analyse_df(df)
    return summary


########################################################################################
[docs]def analyse_df(df):
    """
        Given a pandas dataframe that is already in memory we generate a table of summary
        statistics and descriptors.
    """
    colnames = df.columns
    records = len(df)
    df = coerce_dates(df)
    rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))

    for name in colnames:
        nacount = len(df[df[name].isna()])
        napercent = round(100*nacount/records,1)
        uniques = df[name].unique().tolist()
        if np.nan in uniques :
            uniques.remove(np.nan)
        unicount = len(uniques)
        unipercent = round(100*unicount/records,1)
        mode = df[name].mode(dropna=False)[0]
        #if (type(mode) == float) & np.isnan(mode):
        #    mode = "NaN" 
        if mode != mode:
            mode = "NaN" 
        #valtype = infer_type(str(type(df.loc[1,name])), unicount, uniques)
        valtype = infer_type_2( df.loc[:,name], 0, unicount, uniques)

        if (valtype == "Char") :
            lenvec = df[name].apply(lambda x: len_or_null(x))
            themin = round(lenvec.min(),3) # "-"
            themean = round(lenvec.mean(),3) #"-"
            themax = round(lenvec.max(),3) #"-"
        elif (valtype == "Bool") :
            newvec = df[name].apply(lambda x: booleanize(x))
            themin = round(newvec.min(),3)
            themean = round(newvec.mean(),3)
            themax = round(newvec.max(),3)
        else:
            if (valtype != "Date") :
                themin = round(df[name].min(),3)
                themean = round(df[name].mean(),3)
                themax = round(df[name].max(),3)
            else :
                themin = str(df[name].min())[0:10]
                themean = str(df[name].mean())[0:10] #"-"
                themax = str(df[name].max())[0:10]

        values_to_add = {
            'Name':name, 
            'Type':valtype,
            'Mode':mode,
            'Unique Vals':unicount, 
            'Unique':unipercent, 
            'Nulls':napercent, 
            'Min':themin, 
            'Mean': themean, 
            'Max':themax
        }
        rez = rez.append(values_to_add, ignore_index=True)

    return rez

########################################################################################
[docs]def analyse_file_in_chunks(path_to_file):
    """
        Given a path to a large dataset we will iteratively load it in chunks and build
        out the statistics necessary to summarise the whole dataset.
    """
    fsize = os.stat(path_to_file).st_size
    sample_prop = max_filesize / fsize 
    line_count = count_lines(path_to_file)
    chunks = round(line_count * sample_prop)
    temp = {}
    data_iterator = pd.read_csv(path_to_file, chunksize=chunks, low_memory=False)
    total_chunks = 0
    for index, chunk in enumerate(data_iterator, start=0):
        startpoint = 0 + (index*chunks)
        total_chunks = index + 1
        temp = update_temp_summary(temp, chunk, startpoint)
    summary = generate_final_summary(temp, total_chunks)
    return summary


########################################################################################
[docs]def generate_final_summary(temp, total_chunks):
    rez = pd.DataFrame(columns=('Name', 'Mode', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
    for name in temp.keys():
        col = temp[name]
        total = col['nulls'] + col['nonnulls']
        mode = max(col['val_counts'].items(), key=operator.itemgetter(1))[0]
        unicount = col['uniques'].estimate()
        if unicount > total:
            uniprop = 1.0
            unicount = total
        else:
            uniprop = unicount / total
        unipercent = round(100 * uniprop, 1)
        napercent = round((100 * col['nulls']) / total, 1)
        if (col['type'] != "Date") :
            themean = col['sum'] / total
        else:
            themean = col['mean'] 
        values_to_add = {
            'Name':name, 
            'Mode':mode,
            'Type': col['type'],
            'Unique Vals':unicount,
            'Unique':unipercent,
            'Nulls':napercent,
            'Min': col['min'],
            'Mean': themean, 
            'Max': col['max']
        }
        rez = rez.append(values_to_add, ignore_index=True)
    return rez

########################################################################################
[docs]def clean_dict(df, col):
    temp = df[col].value_counts(dropna=False)
    indeces = temp.index
    newie = []
    for i in indeces:
        if np.isnan(i):
            newie.append("NaN")
        else:
            newie.append(i)
    temp.index = newie
    return temp.to_dict()

[docs]def combine_dicts(a, b, op=operator.add):
    return {**a, **b, **{k: op(a[k], b[k]) for k in a.keys() & b}}

########################################################################################
[docs]def update_temp_summary(temp, df, startpoint):
    colnames = df.columns
    records = len(df)
    df = coerce_dates(df)
    for name in colnames:
        if name in temp:
            rez = temp[name]
        else: 
            rez = { "type":[], "val_counts":{}, "sum":0, "mean":np.nan, 
                    "min":np.nan, "max":np.nan, 
                    "uniques":FMEstimator(), "nulls":0, 
                    "nonnulls":0
                   }
        nacount = len(df[df[name].isna()])
        nonnulls = len(df) - nacount
        val_counts = clean_dict(df, name)
        uniques = df[name].unique().tolist()
        if np.nan in uniques :
            uniques.remove(np.nan)
        unicount = len(uniques)
        uniprop = unicount / len(df)
        valtype = infer_type_2( df.loc[:,name], startpoint, unicount, uniques)
        if (valtype == "Char") :
            lenvec = df[name].apply(lambda x: len_or_null(x))
            themin = round(lenvec.min(),3) # "-"
            thesum = round(lenvec.sum(),3) #"-"
            themax = round(lenvec.max(),3) #"-"
        elif (valtype == "Bool") :
            newvec = df[name].apply(lambda x: booleanize(x))
            themin = round(newvec.min(),3)
            thesum = round(newvec.sum(),3)
            themax = round(newvec.max(),3)
        else:
            if (valtype != "Date") :
                themin = round(df[name].min(),3)
                thesum = round(df[name].sum(),3)
                themax = round(df[name].max(),3)
            else :
                themin = str(df[name].min())[0:10]
                themean = df[name].mean()
                themax = str(df[name].max())[0:10] 

        rez['type'] = valtype
        if (valtype != "Date") :
            rez['sum'] = rez['sum'] + thesum
        else:
            if isNaN(rez['mean']):
                rez['mean'] = themean
        #    else:
        #        rez['mean'] = rez['mean'] + (rez['mean'] - themean)/2
        # ABOVE IS OFF FOR THE MOMENT - i.e Keep the first mean
        rez['nulls'] = rez['nulls'] + nacount
        if isNaN( rez['min'] ) or themin < rez['min']:
            rez['min'] = themin
        if isNaN( rez['max'] ) or themax > rez['max']:
            rez['max'] = themax
        rez['uniques'].update_all(uniques)
        #rez['uniques'] += uniprop
        rez['val_counts'] = combine_dicts(rez['val_counts'], val_counts)
        rez['nonnulls'] = rez['nonnulls'] + nonnulls
        temp[name] = rez
    return temp

########################################################################################
[docs]def extract_file_extension(path_to_file):
    return os.path.splitext(path_to_file)[1]

########################################################################################
[docs]def load_complete_dataframe(path_to_file):
    """
        We load the entire dataset into memory, using the file extension to determine
        the expected format. We are using encoding='latin1' because it ppears to 
        permit loading of the largest variety of files.
        Representation of strings may not be perfect, but is not important for generating a
        summarization of the entire dataset.
    """
    extension = extract_file_extension(path_to_file).lower()
    if extension == ".csv":
        df = pd.read_csv(path_to_file, encoding='latin1', low_memory=False)
        return df
    if extension == ".tsv":
        df = pd.read_csv(path_to_file, encoding='latin1', sep='\t', low_memory=False)
        return df
    if extension == ".xls" or extension == ".xlsx" or extension == ".odf" :
        df = pd.read_excel(path_to_file)
        return df

    raise ValueError("Unsupported File Type")


########################################################################################
[docs]def infer_type_2( thecolumn, startpoint, unicount, uniques):
    thetype = get_first_non_null_type(thecolumn, startpoint)
    return infer_type(thetype, unicount, uniques)


########################################################################################
[docs]def get_first_non_null_type(thecolumn, startpoint):
    thetype = ""
    index = startpoint
    while thetype == "":
        temptype = str(type(thecolumn[index]))
        tempval = thecolumn[index]
        if tempval == np.nan:
            thetype = ""
        elif tempval is None:
            thetype = ""
        elif temptype == "<class 'pandas._libs.tslibs.nattype.NaTType'>":
            thetype = ""
        else:
            thetype = temptype
        index = index + 1 
    return thetype


########################################################################################
[docs]def infer_type(thetype, unicount, uniques):
     valtype = "Char"
     if thetype == "<class 'numpy.float64'>" :
        valtype = "Float"
     if thetype == "<class 'numpy.int64'>" :
        valtype = "Int"
     if thetype == "<class 'pandas._libs.tslib.Timestamp'>" :
        valtype = "Date"
     if thetype == "<class 'pandas._libs.tslibs.timestamps.Timestamp'>" :
        valtype = "Date"
     if thetype == "<class 'numpy.bool_'>":
        valtype = "Bool"
     if thetype == "<class 'bool'>":
        valtype = "Bool"

     # Additional Inference of Booleans by strings with 2 unique values 
     # and common names as additional criteria
     if (valtype == "Char") :
         if unicount == 2:
            temp = [x.lower() for x in uniques if x is not None]
            temp.sort()
            if (temp == ['no', 'yes']):
                valtype = "Bool"
            if (temp == ['n', 'y']):
                valtype = "Bool"
            if (temp == ['false', 'true']):
                valtype = "Bool"
            if (temp == ['f', 't']):
                valtype = "Bool"
     return valtype


########################################################################################
[docs]def count_lines(path_to_file):
    """
    Return a count of total lines in a file. In a way that filesize is irrelevant
    """
    count = 0
    for line in open(path_to_file): count += 1
    return count


########################################################################################
[docs]def len_or_null(val):
    """ 
       Alternative len function that will simply return numpy.NA for invalid values 
       This is need to get sensible results when running len over a column that may contain nulls
    """
    try:
        return len(val)
    except:
        return np.nan


########################################################################################
[docs]def isNaN(num):
    return num != num


########################################################################################
[docs]def booleanize(x):
    if isNaN(x) :
        return x
    elif x is None :
        return x
    elif str(type(x)) == "<class 'bool'>": 
        return x
    else :
        x = x.lower()

    if x == "yes" or x == "y" or x == "true" or x == "t" or x == 1:
        return 1
    else :
        return 0


########################################################################################
[docs]def coerce_dates(df):
    return df.apply(
        lambda col: pd.to_datetime(col, errors='ignore')
        if col.dtypes == object
        else col,
        axis=0
    )


########################################################################################
[docs]def print_latex(summary):
    print("\\begin{table}[h!]")
    print(" \\begin{center}")
    print("   \\caption{Data Summary Table}")
    print("   \\label{tab:table1}")
    print("   \\begin{tabular}{l|l|r|r|r|r|r} ")
    print("    \\textbf{Name} & \\textbf{Type} & \\textbf{Unique Vals \%} & \\textbf{Nulls \%} & \\textbf{Mode} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
    print("      \\hline")
    for i in range(len(summary)):
        print("      ", summary.loc[i,"Name"], 
              "&", summary.loc[i,"Type"], 
              "&", summary.loc[i,"Unique Vals"], "%" 
              "&", summary.loc[i,"Nulls"], "%" 
              "&", summary.loc[i,"Mode"], 
              "&", summary.loc[i,"Min"], 
              "&", summary.loc[i,"Mean"], 
              "&", summary.loc[i,"Max"], "\\\\")
    print("    \\end{tabular}")
    print("  \\end{center}")
    print("\\end{table}")


########################################################################################
[docs]def get_spaces(spacer):
    rez = ""
    for i in range(spacer):
        rez = rez + " "
    return rez


########################################################################################
[docs]def get_type_spacer(t):
    if (t == "Int") :
        return "    "
    if (t == "Char") :
        return "   "
    if (t == "Date") :
        return "   "
    if (t == "Float") :
        return "  "
    return "   "


########################################################################################
[docs]def get_percent_spacer(p):
    if (p==100.0):
       return " "
    elif (p>=10): 
        return "  " 
    else: 
        return "   "


########################################################################################
[docs]def get_padded_number(n):
    if (n == "-"):
        return "     -     "
    if (str(n).replace('.','',1).replace('-','',1).isdigit()):
        if (n<0):
            adjus = -1
        else:
            adjus = 0
        if (abs(n)<10):
            return get_spaces(8 - after_decimal(n) + adjus) + str(n) + " "
        if (abs(n)<100):
            return get_spaces(7 - after_decimal(n) + adjus) + str(n)+ " "
        if (abs(n)<1000):
            return get_spaces(6 - after_decimal(n) + adjus) + str(n)+ " "
        if (abs(n)<10000):
            return get_spaces(5 - after_decimal(n) + adjus) + str(n)+ " "
        if (abs(n)<100000):
            return get_spaces(4 - after_decimal(n) + adjus) + str(n)+ " "
        if (abs(n)<1000000):
            return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " "
        if (abs(n)<10000000):
            return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " "
        else:
            number = "{:.2e}".format(n)
            return get_spaces(2 + adjus) + number + " "
    else:
        return str(n) + " "


########################################################################################
[docs]def after_decimal(n):
    arr = str(n).split(".")
    if( len(arr)==2 ):
        return len(arr[1])
    else:
        return -1

########################################################################################
[docs]def print_csv(s):
    output = StringIO()
    s.to_csv(output, index=False)
    print(output.getvalue())

########################################################################################
[docs]def get_padded_val2(val, spacer):
    filler = ""
    if spacer > 10:
        filler = (get_spaces(spacer - 10))
    if type(val) == int:
        return filler + get_padded_number(val) 
    if type(val) == float:
        return filler + get_padded_number(val) 
    else:
        printval = str(val)
        return (get_spaces(spacer - len(printval)) + printval + " ")

[docs]def get_padded_val(val):
    return str(val) + " - " + str(type(val))

########################################################################################
[docs]def print_markdown(s):
    longest_name = max(s["Name"].apply(lambda x: len_or_null(x)))
    if(longest_name>4):
        name_spacer = longest_name+2
    else:
        name_spacer = 6

    longest_mode = max(s["Mode"].apply(lambda x: len_or_null(str(x))))
    if(longest_mode>10):
        mode_spacer = longest_mode
    else:
        mode_spacer = 10

    print("| Name ", get_spaces(name_spacer-6), 
        "| Type   | Unique Vals | Nulls   | Mode ", get_spaces(mode_spacer-4),
        "|  Min       |  Mean      |  Max       |", sep="")
    print("| ---- ", get_spaces(name_spacer-6), 
        "| ------ | ----------- | ------- | ---- ", get_spaces(mode_spacer-4),
        "|  ---       |  ----      |  ---       |", sep="")
    for i in range(len(s)):
        print("| ", s.loc[i,"Name"], 
            get_spaces(name_spacer - len(s.loc[i,"Name"]) - 1 ), 
            "| ", s.loc[i,"Type"], get_type_spacer(s.loc[i,"Type"]),
            "|  ", get_padded_number(s.loc[i,"Unique Vals"]), 
            "| ", get_percent_spacer(s.loc[i,"Nulls"]), s.loc[i,"Nulls"],"% ", 
            "| ", get_padded_val2(s.loc[i,"Mode"], mode_spacer), 
            "| ", get_padded_number(s.loc[i,"Min"]), 
            "| ", get_padded_number(s.loc[i,"Mean"]),
            "| ", get_padded_number(s.loc[i,"Max"]), "|", sep="")


########################################################################################
[docs]def round_down(n, decimals=0):
    """
    Round down a number to a specifed number of decimal places
    """
    multiplier = 10 ** decimals
    return math.floor(n * multiplier) / multiplier