Skip to content

Dataframe validator

DataFrameMissingColumnError

Bases: ValueError

Raise this when there's a DataFrame column error.

Source code in quinn/dataframe_validator.py
11
12
class DataFrameMissingColumnError(ValueError):
    """Signal that one or more required columns are absent from a DataFrame."""

DataFrameMissingStructFieldError

Bases: ValueError

Raise this when a DataFrame schema is missing a required StructField.

Source code in quinn/dataframe_validator.py
15
16
class DataFrameMissingStructFieldError(ValueError):
    """Raise this when a DataFrame schema is missing a required StructField."""

DataFrameProhibitedColumnError

Bases: ValueError

Raise this when a DataFrame includes prohibited columns.

Source code in quinn/dataframe_validator.py
19
20
class DataFrameProhibitedColumnError(ValueError):
    """Signal that a DataFrame contains one or more prohibited columns."""

validate_absence_of_columns(df, prohibited_col_names)

Validate that none of the prohibited column names are present among specified DataFrame columns.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing columns to be checked.

required
prohibited_col_names list[str]

List of prohibited column names.

required

Raises:

Type Description
DataFrameProhibitedColumnError

If the prohibited column names are present among the specified DataFrame columns.

Source code in quinn/dataframe_validator.py
76
77
78
79
80
81
82
83
84
85
86
87
88
def validate_absence_of_columns(df: DataFrame, prohibited_col_names: list[str]) -> None:
    """Ensure that a DataFrame contains none of the given prohibited columns.

    :param df: DataFrame whose column names are inspected.
    :param prohibited_col_names: Column names that must not appear in ``df``.
    :raises DataFrameProhibitedColumnError: If at least one prohibited column
    name appears among the columns of ``df``.
    """
    present_columns = df.columns
    # Offenders are collected in DataFrame column order, which is the order
    # they appear in the error message.
    offending = [name for name in present_columns if name in prohibited_col_names]
    if not offending:
        return

    raise DataFrameProhibitedColumnError(
        f"The {offending} columns are not allowed to be included in the DataFrame with the following columns {present_columns}",
    )

validate_presence_of_columns(df, required_col_names)

Validate the presence of column names in a DataFrame.

Parameters:

Name Type Description Default
df DataFrame

A spark DataFrame.

required
required_col_names list[str]

List of the required column names for the DataFrame.

required

Returns:

Type Description
None

None.

Raises:

Type Description
DataFrameMissingColumnError

if any of the requested column names are not present in the DataFrame.

Source code in quinn/dataframe_validator.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def validate_presence_of_columns(df: DataFrame, required_col_names: list[str]) -> None:
    """Ensure that every required column name exists in a DataFrame.

    :param df: A spark DataFrame.
    :type df: DataFrame
    :param required_col_names: Column names that ``df`` must contain.
    :type required_col_names: :py:class:`list` of :py:class:`str`
    :return: None.
    :raises DataFrameMissingColumnError: if at least one of the required column
    names is absent from the DataFrame.
    """
    present_columns = df.columns
    # Absent names are reported in the order the caller listed them.
    absent = [name for name in required_col_names if name not in present_columns]
    if not absent:
        return

    raise DataFrameMissingColumnError(
        f"The {absent} columns are not included in the DataFrame with the following columns {present_columns}",
    )

validate_schema(df, required_schema, ignore_nullable=False)

Validate that a given DataFrame has a given StructType as its schema.

Parameters:

Name Type Description Default
df DataFrame

DataFrame to validate

required
required_schema StructType

StructType required for the DataFrame

required
ignore_nullable bool

(Optional) A flag indicating whether the nullable attribute of fields should be ignored during validation

False

Raises:

Type Description
DataFrameMissingStructFieldError

if any StructFields from the required schema are not included in the DataFrame schema

Source code in quinn/dataframe_validator.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def validate_schema(
    df: DataFrame,
    required_schema: StructType,
    ignore_nullable: bool = False,
) -> None:
    """Validate that a DataFrame's schema contains every field of a required schema.

    Both schemas are deep-copied before comparison, so neither ``df.schema``
    nor ``required_schema`` is modified by this function.

    :param df: DataFrame to validate
    :type df: DataFrame
    :param required_schema: StructType required for the DataFrame
    :type required_schema: StructType
    :param ignore_nullable: (Optional) When true, the ``nullable`` attribute of
    each field yielded by iterating the two schemas is blanked out before the
    comparison, so it does not affect the result
    :type ignore_nullable: bool, optional

    :raises DataFrameMissingStructFieldError: if any StructFields from the required
    schema are not included in the DataFrame schema
    """
    actual_fields = copy.deepcopy(df.schema)
    expected_fields = copy.deepcopy(required_schema)

    if ignore_nullable:
        # Give both sides the same placeholder so field equality no longer
        # depends on nullability. Only the copies are touched.
        # NOTE(review): this resets only the fields yielded by iterating each
        # schema; nested fields appear to keep their nullable flag - confirm.
        for schema in (actual_fields, expected_fields):
            for field in schema:
                field.nullable = None

    absent_fields = [field for field in expected_fields if field not in actual_fields]
    if not absent_fields:
        return

    raise DataFrameMissingStructFieldError(
        f"The {absent_fields} StructFields are not included in the DataFrame with the following StructFields {actual_fields}",
    )