Skip to content

Dataframe comparer

DataFramesNotEqualError

Bases: Exception

The DataFrames are not equal

Source code in chispa/dataframe_comparer.py
59
60
61
62
class DataFramesNotEqualError(Exception):
    """The DataFrames are not equal"""

    pass

are_dfs_equal(df1, df2)

Source code in chispa/dataframe_comparer.py
115
116
117
118
119
120
def are_dfs_equal(df1: DataFrame, df2: DataFrame) -> bool:
    if df1.schema != df2.schema:
        return False
    if df1.collect() != df2.collect():
        return False
    return True

assert_approx_df_equality(df1, df2, precision, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, ignore_columns=None, formats=None)

Source code in chispa/dataframe_comparer.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def assert_approx_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    precision: float,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    if transforms is None:
        transforms = []
    if ignore_column_order:
        transforms.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        transforms.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        transforms.append(_sort_df_for_row_order_comparison)

    df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
    df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

    assert_schema_equality(df1.schema, df2.schema, ignore_nullable)

    if precision != 0:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_approx_equal,
            {"precision": precision, "allow_nan_equality": allow_nan_equality},
            formats=formats,
        )
    elif allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(), df2.collect(), are_rows_equal_enhanced, {"allow_nan_equality": True}, formats=formats
        )
    else:
        assert_basic_rows_equality(df1.collect(), df2.collect(), formats=formats)

assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None, formats=None)

Source code in chispa/dataframe_comparer.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def assert_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    underline_cells: bool = False,
    ignore_metadata: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    if transforms is None:
        transforms = []
    if ignore_column_order:
        transforms.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        transforms.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        transforms.append(_sort_df_for_row_order_comparison)

    df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
    df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

    assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata)

    if allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_equal_enhanced,
            {"allow_nan_equality": True},
            underline_cells=underline_cells,
            formats=formats,
        )
    else:
        assert_basic_rows_equality(
            df1.collect(),
            df2.collect(),
            underline_cells=underline_cells,
            formats=formats,
        )