Source code for dfsummarizer.funcs

# -*- coding: utf-8 -*-
from io import StringIO
import pandas as pd 
import numpy as np
import operator
import math
import os

from .config import max_filesize
from .FlajoletMartin import FMEstimator
 
"""
    dfsummarizer.funcs: Core functions of the dfsummarizer package.
        analyse_df( pandas_dataframe): return a sumamry dataframe of the input dataframe
        analyse_df_in_chunks(path_to_dataset): Read the dataset in chunks and provide a summary
"""


########################################################################################
[docs]def analyse_file(path_to_file): df = load_complete_dataframe(path_to_file) summary = analyse_df(df) return summary
########################################################################################
[docs]def analyse_df(df): """ Given a pandas dataframe that is already in memory we generate a table of summary statistics and descriptors. """ colnames = df.columns records = len(df) df = coerce_dates(df) rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) for name in colnames: nacount = len(df[df[name].isna()]) napercent = round(100*nacount/records,1) uniques = df[name].unique().tolist() if np.nan in uniques : uniques.remove(np.nan) unicount = len(uniques) unipercent = round(100*unicount/records,1) mode = df[name].mode(dropna=False)[0] #if (type(mode) == float) & np.isnan(mode): # mode = "NaN" if mode != mode: mode = "NaN" #valtype = infer_type(str(type(df.loc[1,name])), unicount, uniques) valtype = infer_type_2( df.loc[:,name], 0, unicount, uniques) if (valtype == "Char") : lenvec = df[name].apply(lambda x: len_or_null(x)) themin = round(lenvec.min(),3) # "-" themean = round(lenvec.mean(),3) #"-" themax = round(lenvec.max(),3) #"-" elif (valtype == "Bool") : newvec = df[name].apply(lambda x: booleanize(x)) themin = round(newvec.min(),3) themean = round(newvec.mean(),3) themax = round(newvec.max(),3) else: if (valtype != "Date") : themin = round(df[name].min(),3) themean = round(df[name].mean(),3) themax = round(df[name].max(),3) else : themin = str(df[name].min())[0:10] themean = str(df[name].mean())[0:10] #"-" themax = str(df[name].max())[0:10] values_to_add = { 'Name':name, 'Type':valtype, 'Mode':mode, 'Unique Vals':unicount, 'Unique':unipercent, 'Nulls':napercent, 'Min':themin, 'Mean': themean, 'Max':themax } rez = rez.append(values_to_add, ignore_index=True) return rez
########################################################################################
[docs]def analyse_file_in_chunks(path_to_file): """ Given a path to a large dataset we will iteratively load it in chunks and build out the statistics necessary to summarise the whole dataset. """ fsize = os.stat(path_to_file).st_size sample_prop = max_filesize / fsize line_count = count_lines(path_to_file) chunks = round(line_count * sample_prop) temp = {} data_iterator = pd.read_csv(path_to_file, chunksize=chunks, low_memory=False) total_chunks = 0 for index, chunk in enumerate(data_iterator, start=0): startpoint = 0 + (index*chunks) total_chunks = index + 1 temp = update_temp_summary(temp, chunk, startpoint) summary = generate_final_summary(temp, total_chunks) return summary
########################################################################################
[docs]def generate_final_summary(temp, total_chunks): rez = pd.DataFrame(columns=('Name', 'Mode', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) for name in temp.keys(): col = temp[name] total = col['nulls'] + col['nonnulls'] mode = max(col['val_counts'].items(), key=operator.itemgetter(1))[0] unicount = col['uniques'].estimate() if unicount > total: uniprop = 1.0 unicount = total else: uniprop = unicount / total unipercent = round(100 * uniprop, 1) napercent = round((100 * col['nulls']) / total, 1) if (col['type'] != "Date") : themean = col['sum'] / total else: themean = col['mean'] values_to_add = { 'Name':name, 'Mode':mode, 'Type': col['type'], 'Unique Vals':unicount, 'Unique':unipercent, 'Nulls':napercent, 'Min': col['min'], 'Mean': themean, 'Max': col['max'] } rez = rez.append(values_to_add, ignore_index=True) return rez
########################################################################################
[docs]def clean_dict(df, col): temp = df[col].value_counts(dropna=False) indeces = temp.index newie = [] for i in indeces: if np.isnan(i): newie.append("NaN") else: newie.append(i) temp.index = newie return temp.to_dict()
[docs]def combine_dicts(a, b, op=operator.add): return {**a, **b, **{k: op(a[k], b[k]) for k in a.keys() & b}}
########################################################################################
[docs]def update_temp_summary(temp, df, startpoint): colnames = df.columns records = len(df) df = coerce_dates(df) for name in colnames: if name in temp: rez = temp[name] else: rez = { "type":[], "val_counts":{}, "sum":0, "mean":np.nan, "min":np.nan, "max":np.nan, "uniques":FMEstimator(), "nulls":0, "nonnulls":0 } nacount = len(df[df[name].isna()]) nonnulls = len(df) - nacount val_counts = clean_dict(df, name) uniques = df[name].unique().tolist() if np.nan in uniques : uniques.remove(np.nan) unicount = len(uniques) uniprop = unicount / len(df) valtype = infer_type_2( df.loc[:,name], startpoint, unicount, uniques) if (valtype == "Char") : lenvec = df[name].apply(lambda x: len_or_null(x)) themin = round(lenvec.min(),3) # "-" thesum = round(lenvec.sum(),3) #"-" themax = round(lenvec.max(),3) #"-" elif (valtype == "Bool") : newvec = df[name].apply(lambda x: booleanize(x)) themin = round(newvec.min(),3) thesum = round(newvec.sum(),3) themax = round(newvec.max(),3) else: if (valtype != "Date") : themin = round(df[name].min(),3) thesum = round(df[name].sum(),3) themax = round(df[name].max(),3) else : themin = str(df[name].min())[0:10] themean = df[name].mean() themax = str(df[name].max())[0:10] rez['type'] = valtype if (valtype != "Date") : rez['sum'] = rez['sum'] + thesum else: if isNaN(rez['mean']): rez['mean'] = themean # else: # rez['mean'] = rez['mean'] + (rez['mean'] - themean)/2 # ABOVE IS OFF FOR THE MOMENT - i.e Keep the first mean rez['nulls'] = rez['nulls'] + nacount if isNaN( rez['min'] ) or themin < rez['min']: rez['min'] = themin if isNaN( rez['max'] ) or themax > rez['max']: rez['max'] = themax rez['uniques'].update_all(uniques) #rez['uniques'] += uniprop rez['val_counts'] = combine_dicts(rez['val_counts'], val_counts) rez['nonnulls'] = rez['nonnulls'] + nonnulls temp[name] = rez return temp
########################################################################################
[docs]def extract_file_extension(path_to_file): return os.path.splitext(path_to_file)[1]
########################################################################################
[docs]def load_complete_dataframe(path_to_file): """ We load the entire dataset into memory, using the file extension to determine the expected format. We are using encoding='latin1' because it ppears to permit loading of the largest variety of files. Representation of strings may not be perfect, but is not important for generating a summarization of the entire dataset. """ extension = extract_file_extension(path_to_file).lower() if extension == ".csv": df = pd.read_csv(path_to_file, encoding='latin1', low_memory=False) return df if extension == ".tsv": df = pd.read_csv(path_to_file, encoding='latin1', sep='\t', low_memory=False) return df if extension == ".xls" or extension == ".xlsx" or extension == ".odf" : df = pd.read_excel(path_to_file) return df raise ValueError("Unsupported File Type")
########################################################################################
[docs]def infer_type_2( thecolumn, startpoint, unicount, uniques): thetype = get_first_non_null_type(thecolumn, startpoint) return infer_type(thetype, unicount, uniques)
########################################################################################
[docs]def get_first_non_null_type(thecolumn, startpoint): thetype = "" index = startpoint while thetype == "": temptype = str(type(thecolumn[index])) tempval = thecolumn[index] if tempval == np.nan: thetype = "" elif tempval is None: thetype = "" elif temptype == "<class 'pandas._libs.tslibs.nattype.NaTType'>": thetype = "" else: thetype = temptype index = index + 1 return thetype
########################################################################################
[docs]def infer_type(thetype, unicount, uniques): valtype = "Char" if thetype == "<class 'numpy.float64'>" : valtype = "Float" if thetype == "<class 'numpy.int64'>" : valtype = "Int" if thetype == "<class 'pandas._libs.tslib.Timestamp'>" : valtype = "Date" if thetype == "<class 'pandas._libs.tslibs.timestamps.Timestamp'>" : valtype = "Date" if thetype == "<class 'numpy.bool_'>": valtype = "Bool" if thetype == "<class 'bool'>": valtype = "Bool" # Additional Inference of Booleans by strings with 2 unique values # and common names as additional criteria if (valtype == "Char") : if unicount == 2: temp = [x.lower() for x in uniques if x is not None] temp.sort() if (temp == ['no', 'yes']): valtype = "Bool" if (temp == ['n', 'y']): valtype = "Bool" if (temp == ['false', 'true']): valtype = "Bool" if (temp == ['f', 't']): valtype = "Bool" return valtype
########################################################################################
[docs]def count_lines(path_to_file): """ Return a count of total lines in a file. In a way that filesize is irrelevant """ count = 0 for line in open(path_to_file): count += 1 return count
########################################################################################
[docs]def len_or_null(val): """ Alternative len function that will simply return numpy.NA for invalid values This is need to get sensible results when running len over a column that may contain nulls """ try: return len(val) except: return np.nan
########################################################################################
[docs]def isNaN(num): return num != num
########################################################################################
[docs]def booleanize(x): if isNaN(x) : return x elif x is None : return x elif str(type(x)) == "<class 'bool'>": return x else : x = x.lower() if x == "yes" or x == "y" or x == "true" or x == "t" or x == 1: return 1 else : return 0
########################################################################################
[docs]def coerce_dates(df): return df.apply( lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0 )
######################################################################################## ########################################################################################
[docs]def get_spaces(spacer): rez = "" for i in range(spacer): rez = rez + " " return rez
########################################################################################
[docs]def get_type_spacer(t): if (t == "Int") : return " " if (t == "Char") : return " " if (t == "Date") : return " " if (t == "Float") : return " " return " "
########################################################################################
[docs]def get_percent_spacer(p): if (p==100.0): return " " elif (p>=10): return " " else: return " "
########################################################################################
[docs]def get_padded_number(n): if (n == "-"): return " - " if (str(n).replace('.','',1).replace('-','',1).isdigit()): if (n<0): adjus = -1 else: adjus = 0 if (abs(n)<10): return get_spaces(8 - after_decimal(n) + adjus) + str(n) + " " if (abs(n)<100): return get_spaces(7 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<1000): return get_spaces(6 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<10000): return get_spaces(5 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<100000): return get_spaces(4 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<1000000): return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<10000000): return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " " else: number = "{:.2e}".format(n) return get_spaces(2 + adjus) + number + " " else: return str(n) + " "
########################################################################################
[docs]def after_decimal(n): arr = str(n).split(".") if( len(arr)==2 ): return len(arr[1]) else: return -1
######################################################################################## ########################################################################################
[docs]def get_padded_val2(val, spacer): filler = "" if spacer > 10: filler = (get_spaces(spacer - 10)) if type(val) == int: return filler + get_padded_number(val) if type(val) == float: return filler + get_padded_number(val) else: printval = str(val) return (get_spaces(spacer - len(printval)) + printval + " ")
[docs]def get_padded_val(val): return str(val) + " - " + str(type(val))
######################################################################################## ########################################################################################
[docs]def round_down(n, decimals=0): """ Round down a number to a specifed number of decimal places """ multiplier = 10 ** decimals return math.floor(n * multiplier) / multiplier