from handwriting_sample.base import HandwritingDataBase
from handwriting_sample.validator.exceptions import PenStatusException, NegativeValueException
class HandwritingSampleValidator(HandwritingDataBase):
    """Class implementing handwriting data validator.

    The validator relies on ``COLUMNS``, ``PEN_STATUS`` and ``log`` being
    provided by ``HandwritingDataBase`` (not defined in this class).
    """

    # ------------------ #
    # Validation methods #
    # ------------------ #

    # TODO: idea: make library specific exceptions

    @classmethod
    def validate_data(cls, df_data, verbose=False):
        """Validate input data and return them in the pre-defined column order.

        The checks are performed in this order: column names, missing values,
        numerical datatype, pen status values, negative values. Afterwards any
        in-air movement on the boundaries of the recording is removed.

        :param df_data: input data frame; its column names are lower-cased
            in place (the caller's object is renamed as well)
        :param verbose: if True, the boundary trimming steps are logged
            via ``cls.log``
        :return: new data frame with columns ordered as ``cls.COLUMNS`` and
            without leading/trailing in-air samples; the index is renumbered
            from zero only if samples were removed. The frame is empty if the
            input holds no on-surface sample.
        :raises ValueError: on missing, unwanted or duplicated columns, on
            empty values and on non-numerical values
        :raises PenStatusException: if pen status holds a value other than 0/1
        :raises NegativeValueException: if any column holds a negative value
        """
        # Set column names to lower case
        df_data.columns = [x.lower() for x in df_data.columns]
        columns = df_data.columns

        # Check for missing time-series. Names are compared (not only counts),
        # so a frame with the right number of wrongly named columns is
        # rejected here instead of failing later with a KeyError.
        present = set(columns)
        missing = [name for name in cls.COLUMNS if name not in present]
        if missing:
            raise ValueError(
                "Input data are missing the following mandatory time-series (columns): "
                f"{missing}")

        # Check for unwanted time-series (unknown names and repeated names)
        expected = set(cls.COLUMNS)
        seen = set()
        unwanted = []
        for name in columns:
            if name not in expected or name in seen:
                unwanted.append(name)
            seen.add(name)
        if unwanted:
            raise ValueError(
                "Input data have unwanted time-series that are not expected in the data: "
                f"{unwanted}")

        # Check for missing values
        null_counts = df_data.isnull().sum()
        if null_counts.sum() > 0:
            raise ValueError(
                "Empty values in input data. Please inspect your input and replace the emtpy values. \n"
                "The following table shows the count of emtpy values in particular columns: \n"
                f"{null_counts}")

        # Check if the values are numerical
        for column_name in columns:
            if not all(isinstance(x, (int, float)) for x in df_data[column_name]):
                raise ValueError(f"Datatype in time-series ['{column_name}'] is not numerical")

        # Order the columns based on the pre-defined order. The explicit copy
        # guarantees that the in-place trimming below never touches the rows
        # of the caller's frame.
        df_data = df_data[cls.COLUMNS].copy()

        # Check if pen status contain only 0,1 values
        for index, value in enumerate(df_data[cls.PEN_STATUS]):
            if value not in (0, 1):
                raise PenStatusException(value, index)

        # Check if data contains negative values
        non_negative = (df_data >= 0).all(axis=0)
        negative_values_column_names = list(non_negative[~non_negative].index.values)
        if negative_values_column_names:
            raise NegativeValueException(negative_values_column_names)

        # Remove any in-air movement on the boundaries (both helpers work in place)
        cls._remove_first_in_air_data(df_data, verbose)
        cls._remove_last_in_air_data(df_data, verbose)

        # TODO: validate data range
        return df_data

    # --------------- #
    # Utility methods #
    # --------------- #

    @staticmethod
    def _drop_rows_by_position(df, start, stop):
        """Drop the rows at positions ``[start, stop)`` in place and renumber the index."""
        # Renumber first so that positions and labels coincide; dropping by
        # label is then safe for any incoming index (non-default, duplicated).
        df.reset_index(drop=True, inplace=True)
        df.drop(df.index[start:stop], inplace=True)
        # drop=True keeps the old index from being added as a data column
        df.reset_index(drop=True, inplace=True)

    @classmethod
    def _remove_first_in_air_data(cls, df, verbose):
        """Removes unwanted in-air movement at the beginning of writing.

        Works in place: the leading run of samples with pen status 0 is
        dropped from ``df`` and the index is renumbered from zero.

        :param df: data frame containing the ``cls.PEN_STATUS`` column
        :param verbose: if True, progress is logged via ``cls.log``
        """
        if verbose:
            cls.log("Check if data contains first in-air movement (unwanted before writing)")

        # Nothing to trim (and no first sample to inspect) in an empty frame
        if df.empty:
            return

        # Check if the first sample has any in air movement
        if df[cls.PEN_STATUS].iloc[0] == 1:
            if verbose:
                cls.log("Data do not contain any in-air movement at the beginning")
            return

        # Remove in-air data at the beginning
        if verbose:
            cls.log("Data contains in-air movement at the beginning")

        # Count the leading in-air samples, stop at the first other sample
        count = 0
        for value in df[cls.PEN_STATUS]:
            if value != 0:
                break
            count += 1

        cls._drop_rows_by_position(df, 0, count)
        if verbose:
            cls.log(f"Removed first {count} in-air samples")

    @classmethod
    def _remove_last_in_air_data(cls, df, verbose):
        """Removes unwanted in-air movement at the end of writing.

        Works in place: the trailing run of samples with pen status 0 is
        dropped from ``df`` and the index is renumbered from zero.

        :param df: data frame containing the ``cls.PEN_STATUS`` column
        :param verbose: if True, progress is logged via ``cls.log``
        """
        if verbose:
            cls.log("Check if data contains last in-air movement (unwanted after writing)")

        # Nothing to trim (and no last sample to inspect) in an empty frame,
        # e.g. when the whole recording was in-air and got removed already
        if df.empty:
            return

        # Check if last sample has any in-air movement
        if df[cls.PEN_STATUS].iloc[-1] == 1:
            if verbose:
                cls.log("Data do not contain any in-air movement at the end")
            return

        # Remove in-air data at the end
        if verbose:
            cls.log("Data contains in-air movement at the end")

        # Count the trailing in-air samples, walking backwards
        count = 0
        for value in df[cls.PEN_STATUS].iloc[::-1]:
            if value != 0:
                break
            count += 1

        total = len(df)
        cls._drop_rows_by_position(df, total - count, total)
        if verbose:
            cls.log(f"Removed last {count} in-air samples")