# Source code for handwriting_sample.validator.interface

from handwriting_sample.base import HandwritingDataBase
from handwriting_sample.validator.exceptions import PenStatusException, NegativeValueException


class HandwritingSampleValidator(HandwritingDataBase):
    """Class implementing handwriting data validator.

    The methods below read ``cls.COLUMNS``, ``cls.PEN_STATUS`` and call
    ``cls.log``; none of them is defined in this class (presumably they are
    inherited from ``HandwritingDataBase`` - confirm against the base class).
    """

    # ------------------ #
    # Validation methods #
    # ------------------ #
    # TODO: idea: make library specific exceptions

    @classmethod
    def validate_data(cls, df_data, verbose=False):
        """Validate the input data and return them ordered and trimmed.

        The checks are performed in this order: column names, missing values,
        numerical values, pen status values, negative values. Afterwards the
        in-air samples (pen status 0) at the beginning and at the end of the
        data are removed.

        Note that the column names of ``df_data`` are lower-cased in place on
        the passed object; the rows of the passed object are never modified,
        the trimmed data are returned as a new data frame.

        :param df_data: data frame with one column per time-series
        :param verbose: if True, the trimming steps are reported via ``cls.log``
        :return: new data frame with the columns ordered as ``cls.COLUMNS``,
                 no boundary in-air samples and an index renumbered from 0
                 whenever samples were removed (the frame is empty if the
                 input has no rows or only in-air samples)
        :raises ValueError: if a mandatory column is missing, an unexpected or
                            duplicated column is present, a value is missing
                            or a value is not numerical
        :raises PenStatusException: if pen status contains other values than 0, 1
        :raises NegativeValueException: if any column contains a negative value
        """

        # Set column names to lower case
        df_data.columns = [x.lower() for x in df_data.columns]

        expected = list(cls.COLUMNS)
        present = list(df_data.columns)

        # Check for missing time-series (compare the names, not only the count,
        # so that wrongly named columns are reported instead of failing later)
        missing = [name for name in expected if name not in present]
        if missing:
            raise ValueError(
                f"Input data are missing the following mandatory time-series (columns): "
                f"{missing}")

        # Check for unwanted time-series (unknown names and repeated names)
        unwanted = []
        seen = set()
        for name in present:
            if name not in expected or name in seen:
                unwanted.append(name)
            seen.add(name)
        if unwanted:
            raise ValueError(
                f"Input data have unwanted time-series that are not expected in the data: "
                f"{unwanted}")

        # Check for missing values:
        null_counts = df_data.isnull().sum()
        if null_counts.sum() > 0:
            raise ValueError(
                f"Empty values in input data. Please inspect your input and replace the emtpy values. \n"
                f"The following table shows the count of emtpy values in particular columns: \n"
                f"{null_counts}")

        # Check if the values are numerical
        for column_name in present:
            if not all(isinstance(x, (int, float)) for x in df_data[column_name]):
                raise ValueError(f"Datatype in time-series ['{column_name}'] is not numerical")

        # Order the columns based on the pre-defined order; the explicit copy
        # guarantees that the in-place trimming below works on our own frame
        df_data = df_data[expected].copy()

        # Check if pen status contain only 0,1 values
        for index, value in enumerate(df_data[cls.PEN_STATUS]):
            if value not in (0, 1):
                raise PenStatusException(value, index)

        # Check if data contains negative values
        is_non_negative = (df_data >= 0).all(axis=0)
        negative_values_column_names = list(is_non_negative[~is_non_negative].index.values)

        if negative_values_column_names:
            raise NegativeValueException(negative_values_column_names)

        # Remove any in-air movement on the boundaries
        cls._remove_first_in_air_data(df_data, verbose)
        cls._remove_last_in_air_data(df_data, verbose)

        # TODO: validate data range
        return df_data

    # --------------- #
    # Utility methods #
    # --------------- #

    @staticmethod
    def _count_leading_zeros(values):
        """Returns the number of consecutive items equal to 0 at the start of values"""
        count = 0
        for value in values:
            if value != 0:
                break
            count += 1
        return count

    @staticmethod
    def _drop_rows_by_position(df, start, stop):
        """Drops rows at positions [start, stop) in place and renumbers the index from 0"""
        # DataFrame.drop works with labels, so make the labels positional first;
        # drop=True prevents the old index from being inserted as a new column
        df.reset_index(drop=True, inplace=True)
        df.drop(df.index[start:stop], inplace=True)
        df.reset_index(drop=True, inplace=True)

    @classmethod
    def _remove_first_in_air_data(cls, df, verbose):
        """Removes unwanted in-air movement at the beginning of writing

        The data frame is modified in place; nothing is returned.
        An empty data frame is left untouched.
        """
        if verbose:
            cls.log("Check if data contains first in-air movement (unwanted before writing)")

        pen_status = df[cls.PEN_STATUS]

        # Nothing to inspect in empty data
        if pen_status.empty:
            return

        # Check if the first sample has any in air movement
        if pen_status.iloc[0] == 1:
            if verbose:
                cls.log("Data do not contain any in-air movement at the beginning")
            return

        # Remove in-air data at the beginning
        if verbose:
            cls.log("Data contains in-air movement at the beginning")

        # Count first, drop afterwards (the frame is not modified while iterated)
        count = cls._count_leading_zeros(pen_status)
        cls._drop_rows_by_position(df, 0, count)

        if verbose:
            cls.log(f"Removed first {count} in-air samples")

    @classmethod
    def _remove_last_in_air_data(cls, df, verbose):
        """Removes unwanted in-air movement at the end of writing

        The data frame is modified in place; nothing is returned.
        An empty data frame is left untouched.
        """
        if verbose:
            cls.log("Check if data contains last in-air movement (unwanted after writing)")

        pen_status = df[cls.PEN_STATUS]

        # Nothing to inspect in empty data (e.g. all samples were in-air
        # and have already been removed from the beginning)
        if pen_status.empty:
            return

        # Check if last sample has any in-air movement
        if pen_status.iloc[-1] == 1:
            if verbose:
                cls.log("Data do not contain any in-air movement at the end")
            return

        # Remove in-air data at the end
        if verbose:
            cls.log("Data contains in-air movement at the end")

        # Count the in-air samples from the end, then drop them by position
        count = cls._count_leading_zeros(pen_status.iloc[::-1])
        total = len(df)
        cls._drop_rows_by_position(df, total - count, total)

        if verbose:
            cls.log(f"Removed last {count} in-air samples")