# -*- coding: utf-8 -*-
from io import StringIO
import pandas as pd
import numpy as np
import operator
import math
import os
from .config import max_filesize
from .FlajoletMartin import FMEstimator
"""
dfsummarizer.funcs: Core functions of the dfsummarizer package.
analyse_df( pandas_dataframe): return a sumamry dataframe of the input dataframe
analyse_df_in_chunks(path_to_dataset): Read the dataset in chunks and provide a summary
"""
########################################################################################
[docs]def analyse_file(path_to_file):
df = load_complete_dataframe(path_to_file)
summary = analyse_df(df)
return summary
########################################################################################
[docs]def analyse_df(df):
"""
Given a pandas dataframe that is already in memory we generate a table of summary
statistics and descriptors.
"""
colnames = df.columns
records = len(df)
df = coerce_dates(df)
rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
for name in colnames:
nacount = len(df[df[name].isna()])
napercent = round(100*nacount/records,1)
uniques = df[name].unique().tolist()
if np.nan in uniques :
uniques.remove(np.nan)
unicount = len(uniques)
unipercent = round(100*unicount/records,1)
mode = df[name].mode(dropna=False)[0]
#if (type(mode) == float) & np.isnan(mode):
# mode = "NaN"
if mode != mode:
mode = "NaN"
#valtype = infer_type(str(type(df.loc[1,name])), unicount, uniques)
valtype = infer_type_2( df.loc[:,name], 0, unicount, uniques)
if (valtype == "Char") :
lenvec = df[name].apply(lambda x: len_or_null(x))
themin = round(lenvec.min(),3) # "-"
themean = round(lenvec.mean(),3) #"-"
themax = round(lenvec.max(),3) #"-"
elif (valtype == "Bool") :
newvec = df[name].apply(lambda x: booleanize(x))
themin = round(newvec.min(),3)
themean = round(newvec.mean(),3)
themax = round(newvec.max(),3)
else:
if (valtype != "Date") :
themin = round(df[name].min(),3)
themean = round(df[name].mean(),3)
themax = round(df[name].max(),3)
else :
themin = str(df[name].min())[0:10]
themean = str(df[name].mean())[0:10] #"-"
themax = str(df[name].max())[0:10]
values_to_add = {
'Name':name,
'Type':valtype,
'Mode':mode,
'Unique Vals':unicount,
'Unique':unipercent,
'Nulls':napercent,
'Min':themin,
'Mean': themean,
'Max':themax
}
rez = rez.append(values_to_add, ignore_index=True)
return rez
########################################################################################
[docs]def analyse_file_in_chunks(path_to_file):
"""
Given a path to a large dataset we will iteratively load it in chunks and build
out the statistics necessary to summarise the whole dataset.
"""
fsize = os.stat(path_to_file).st_size
sample_prop = max_filesize / fsize
line_count = count_lines(path_to_file)
chunks = round(line_count * sample_prop)
temp = {}
data_iterator = pd.read_csv(path_to_file, chunksize=chunks, low_memory=False)
total_chunks = 0
for index, chunk in enumerate(data_iterator, start=0):
startpoint = 0 + (index*chunks)
total_chunks = index + 1
temp = update_temp_summary(temp, chunk, startpoint)
summary = generate_final_summary(temp, total_chunks)
return summary
########################################################################################
[docs]def generate_final_summary(temp, total_chunks):
rez = pd.DataFrame(columns=('Name', 'Mode', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
for name in temp.keys():
col = temp[name]
total = col['nulls'] + col['nonnulls']
mode = max(col['val_counts'].items(), key=operator.itemgetter(1))[0]
unicount = col['uniques'].estimate()
if unicount > total:
uniprop = 1.0
unicount = total
else:
uniprop = unicount / total
unipercent = round(100 * uniprop, 1)
napercent = round((100 * col['nulls']) / total, 1)
if (col['type'] != "Date") :
themean = col['sum'] / total
else:
themean = col['mean']
values_to_add = {
'Name':name,
'Mode':mode,
'Type': col['type'],
'Unique Vals':unicount,
'Unique':unipercent,
'Nulls':napercent,
'Min': col['min'],
'Mean': themean,
'Max': col['max']
}
rez = rez.append(values_to_add, ignore_index=True)
return rez
########################################################################################
[docs]def clean_dict(df, col):
temp = df[col].value_counts(dropna=False)
indeces = temp.index
newie = []
for i in indeces:
if np.isnan(i):
newie.append("NaN")
else:
newie.append(i)
temp.index = newie
return temp.to_dict()
[docs]def combine_dicts(a, b, op=operator.add):
return {**a, **b, **{k: op(a[k], b[k]) for k in a.keys() & b}}
########################################################################################
[docs]def update_temp_summary(temp, df, startpoint):
colnames = df.columns
records = len(df)
df = coerce_dates(df)
for name in colnames:
if name in temp:
rez = temp[name]
else:
rez = { "type":[], "val_counts":{}, "sum":0, "mean":np.nan,
"min":np.nan, "max":np.nan,
"uniques":FMEstimator(), "nulls":0,
"nonnulls":0
}
nacount = len(df[df[name].isna()])
nonnulls = len(df) - nacount
val_counts = clean_dict(df, name)
uniques = df[name].unique().tolist()
if np.nan in uniques :
uniques.remove(np.nan)
unicount = len(uniques)
uniprop = unicount / len(df)
valtype = infer_type_2( df.loc[:,name], startpoint, unicount, uniques)
if (valtype == "Char") :
lenvec = df[name].apply(lambda x: len_or_null(x))
themin = round(lenvec.min(),3) # "-"
thesum = round(lenvec.sum(),3) #"-"
themax = round(lenvec.max(),3) #"-"
elif (valtype == "Bool") :
newvec = df[name].apply(lambda x: booleanize(x))
themin = round(newvec.min(),3)
thesum = round(newvec.sum(),3)
themax = round(newvec.max(),3)
else:
if (valtype != "Date") :
themin = round(df[name].min(),3)
thesum = round(df[name].sum(),3)
themax = round(df[name].max(),3)
else :
themin = str(df[name].min())[0:10]
themean = df[name].mean()
themax = str(df[name].max())[0:10]
rez['type'] = valtype
if (valtype != "Date") :
rez['sum'] = rez['sum'] + thesum
else:
if isNaN(rez['mean']):
rez['mean'] = themean
# else:
# rez['mean'] = rez['mean'] + (rez['mean'] - themean)/2
# ABOVE IS OFF FOR THE MOMENT - i.e Keep the first mean
rez['nulls'] = rez['nulls'] + nacount
if isNaN( rez['min'] ) or themin < rez['min']:
rez['min'] = themin
if isNaN( rez['max'] ) or themax > rez['max']:
rez['max'] = themax
rez['uniques'].update_all(uniques)
#rez['uniques'] += uniprop
rez['val_counts'] = combine_dicts(rez['val_counts'], val_counts)
rez['nonnulls'] = rez['nonnulls'] + nonnulls
temp[name] = rez
return temp
########################################################################################
########################################################################################
[docs]def load_complete_dataframe(path_to_file):
"""
We load the entire dataset into memory, using the file extension to determine
the expected format. We are using encoding='latin1' because it ppears to
permit loading of the largest variety of files.
Representation of strings may not be perfect, but is not important for generating a
summarization of the entire dataset.
"""
extension = extract_file_extension(path_to_file).lower()
if extension == ".csv":
df = pd.read_csv(path_to_file, encoding='latin1', low_memory=False)
return df
if extension == ".tsv":
df = pd.read_csv(path_to_file, encoding='latin1', sep='\t', low_memory=False)
return df
if extension == ".xls" or extension == ".xlsx" or extension == ".odf" :
df = pd.read_excel(path_to_file)
return df
raise ValueError("Unsupported File Type")
########################################################################################
[docs]def infer_type_2( thecolumn, startpoint, unicount, uniques):
thetype = get_first_non_null_type(thecolumn, startpoint)
return infer_type(thetype, unicount, uniques)
########################################################################################
[docs]def get_first_non_null_type(thecolumn, startpoint):
thetype = ""
index = startpoint
while thetype == "":
temptype = str(type(thecolumn[index]))
tempval = thecolumn[index]
if tempval == np.nan:
thetype = ""
elif tempval is None:
thetype = ""
elif temptype == "<class 'pandas._libs.tslibs.nattype.NaTType'>":
thetype = ""
else:
thetype = temptype
index = index + 1
return thetype
########################################################################################
[docs]def infer_type(thetype, unicount, uniques):
valtype = "Char"
if thetype == "<class 'numpy.float64'>" :
valtype = "Float"
if thetype == "<class 'numpy.int64'>" :
valtype = "Int"
if thetype == "<class 'pandas._libs.tslib.Timestamp'>" :
valtype = "Date"
if thetype == "<class 'pandas._libs.tslibs.timestamps.Timestamp'>" :
valtype = "Date"
if thetype == "<class 'numpy.bool_'>":
valtype = "Bool"
if thetype == "<class 'bool'>":
valtype = "Bool"
# Additional Inference of Booleans by strings with 2 unique values
# and common names as additional criteria
if (valtype == "Char") :
if unicount == 2:
temp = [x.lower() for x in uniques if x is not None]
temp.sort()
if (temp == ['no', 'yes']):
valtype = "Bool"
if (temp == ['n', 'y']):
valtype = "Bool"
if (temp == ['false', 'true']):
valtype = "Bool"
if (temp == ['f', 't']):
valtype = "Bool"
return valtype
########################################################################################
[docs]def count_lines(path_to_file):
"""
Return a count of total lines in a file. In a way that filesize is irrelevant
"""
count = 0
for line in open(path_to_file): count += 1
return count
########################################################################################
[docs]def len_or_null(val):
"""
Alternative len function that will simply return numpy.NA for invalid values
This is need to get sensible results when running len over a column that may contain nulls
"""
try:
return len(val)
except:
return np.nan
########################################################################################
[docs]def isNaN(num):
return num != num
########################################################################################
[docs]def booleanize(x):
if isNaN(x) :
return x
elif x is None :
return x
elif str(type(x)) == "<class 'bool'>":
return x
else :
x = x.lower()
if x == "yes" or x == "y" or x == "true" or x == "t" or x == 1:
return 1
else :
return 0
########################################################################################
[docs]def coerce_dates(df):
return df.apply(
lambda col: pd.to_datetime(col, errors='ignore')
if col.dtypes == object
else col,
axis=0
)
########################################################################################
[docs]def print_latex(summary):
print("\\begin{table}[h!]")
print(" \\begin{center}")
print(" \\caption{Data Summary Table}")
print(" \\label{tab:table1}")
print(" \\begin{tabular}{l|l|r|r|r|r|r} ")
print(" \\textbf{Name} & \\textbf{Type} & \\textbf{Unique Vals \%} & \\textbf{Nulls \%} & \\textbf{Mode} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
print(" \\hline")
for i in range(len(summary)):
print(" ", summary.loc[i,"Name"],
"&", summary.loc[i,"Type"],
"&", summary.loc[i,"Unique Vals"], "%"
"&", summary.loc[i,"Nulls"], "%"
"&", summary.loc[i,"Mode"],
"&", summary.loc[i,"Min"],
"&", summary.loc[i,"Mean"],
"&", summary.loc[i,"Max"], "\\\\")
print(" \\end{tabular}")
print(" \\end{center}")
print("\\end{table}")
########################################################################################
[docs]def get_spaces(spacer):
rez = ""
for i in range(spacer):
rez = rez + " "
return rez
########################################################################################
[docs]def get_type_spacer(t):
if (t == "Int") :
return " "
if (t == "Char") :
return " "
if (t == "Date") :
return " "
if (t == "Float") :
return " "
return " "
########################################################################################
[docs]def get_percent_spacer(p):
if (p==100.0):
return " "
elif (p>=10):
return " "
else:
return " "
########################################################################################
[docs]def get_padded_number(n):
if (n == "-"):
return " - "
if (str(n).replace('.','',1).replace('-','',1).isdigit()):
if (n<0):
adjus = -1
else:
adjus = 0
if (abs(n)<10):
return get_spaces(8 - after_decimal(n) + adjus) + str(n) + " "
if (abs(n)<100):
return get_spaces(7 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<1000):
return get_spaces(6 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<10000):
return get_spaces(5 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<100000):
return get_spaces(4 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<1000000):
return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<10000000):
return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " "
else:
number = "{:.2e}".format(n)
return get_spaces(2 + adjus) + number + " "
else:
return str(n) + " "
########################################################################################
[docs]def after_decimal(n):
arr = str(n).split(".")
if( len(arr)==2 ):
return len(arr[1])
else:
return -1
########################################################################################
[docs]def print_csv(s):
output = StringIO()
s.to_csv(output, index=False)
print(output.getvalue())
########################################################################################
[docs]def get_padded_val2(val, spacer):
filler = ""
if spacer > 10:
filler = (get_spaces(spacer - 10))
if type(val) == int:
return filler + get_padded_number(val)
if type(val) == float:
return filler + get_padded_number(val)
else:
printval = str(val)
return (get_spaces(spacer - len(printval)) + printval + " ")
[docs]def get_padded_val(val):
return str(val) + " - " + str(type(val))
########################################################################################
[docs]def print_markdown(s):
longest_name = max(s["Name"].apply(lambda x: len_or_null(x)))
if(longest_name>4):
name_spacer = longest_name+2
else:
name_spacer = 6
longest_mode = max(s["Mode"].apply(lambda x: len_or_null(str(x))))
if(longest_mode>10):
mode_spacer = longest_mode
else:
mode_spacer = 10
print("| Name ", get_spaces(name_spacer-6),
"| Type | Unique Vals | Nulls | Mode ", get_spaces(mode_spacer-4),
"| Min | Mean | Max |", sep="")
print("| ---- ", get_spaces(name_spacer-6),
"| ------ | ----------- | ------- | ---- ", get_spaces(mode_spacer-4),
"| --- | ---- | --- |", sep="")
for i in range(len(s)):
print("| ", s.loc[i,"Name"],
get_spaces(name_spacer - len(s.loc[i,"Name"]) - 1 ),
"| ", s.loc[i,"Type"], get_type_spacer(s.loc[i,"Type"]),
"| ", get_padded_number(s.loc[i,"Unique Vals"]),
"| ", get_percent_spacer(s.loc[i,"Nulls"]), s.loc[i,"Nulls"],"% ",
"| ", get_padded_val2(s.loc[i,"Mode"], mode_spacer),
"| ", get_padded_number(s.loc[i,"Min"]),
"| ", get_padded_number(s.loc[i,"Mean"]),
"| ", get_padded_number(s.loc[i,"Max"]), "|", sep="")
########################################################################################
[docs]def round_down(n, decimals=0):
"""
Round down a number to a specifed number of decimal places
"""
multiplier = 10 ** decimals
return math.floor(n * multiplier) / multiplier