Skip to content

Index

__all__ = ('Chispa', 'Color', 'ColumnsNotEqualError', 'DataFramesNotEqualError', 'DefaultFormats', 'Format', 'FormattingConfig', 'SchemasNotEqualError', 'Style', 'assert_approx_column_equality', 'assert_approx_df_equality', 'assert_basic_rows_equality', 'assert_column_equality', 'assert_df_equality') module-attribute

Chispa

Source code in chispa/__init__.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class Chispa:
    """Bundle a formatting configuration with the DataFrame comparison helper.

    The configuration is resolved once, when the object is created, and is then
    handed to the module-level ``assert_df_equality`` on every call.
    """

    def __init__(self, formats: FormattingConfig | None = None) -> None:
        """Resolve ``formats`` into a ``FormattingConfig`` and store it.

        Args:
            formats: A ``FormattingConfig`` is stored as-is. A falsy value selects
                a default ``FormattingConfig()``. Any other object is converted
                with ``FormattingConfig._from_arbitrary_dataclass``.
        """
        if not formats:
            formats = FormattingConfig()
        elif not isinstance(formats, FormattingConfig):
            formats = FormattingConfig._from_arbitrary_dataclass(formats)
        self.formats = formats

    def assert_df_equality(
        self,
        df1: DataFrame,
        df2: DataFrame,
        ignore_nullable: bool = False,
        transforms: list[Callable] | None = None,  # type: ignore[type-arg]
        allow_nan_equality: bool = False,
        ignore_column_order: bool = False,
        ignore_row_order: bool = False,
        underline_cells: bool = False,
        ignore_metadata: bool = False,
        ignore_columns: list[str] | None = None,
    ) -> None:
        """Delegate to the module-level ``assert_df_equality`` using ``self.formats``.

        Every argument is forwarded unchanged; the stored formatting
        configuration is supplied as the ``formats`` argument.
        """
        return assert_df_equality(
            df1,
            df2,
            ignore_nullable=ignore_nullable,
            transforms=transforms,
            allow_nan_equality=allow_nan_equality,
            ignore_column_order=ignore_column_order,
            ignore_row_order=ignore_row_order,
            underline_cells=underline_cells,
            ignore_metadata=ignore_metadata,
            ignore_columns=ignore_columns,
            formats=self.formats,
        )

formats = FormattingConfig() instance-attribute

__init__(formats=None)

Source code in chispa/__init__.py
25
26
27
28
29
30
31
def __init__(self, formats: FormattingConfig | None = None) -> None:
    """Resolve ``formats`` into a ``FormattingConfig`` and store it on the instance.

    Args:
        formats: A ``FormattingConfig`` is stored as-is. A falsy value selects a
            default ``FormattingConfig()``. Any other object is converted with
            ``FormattingConfig._from_arbitrary_dataclass``.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)
    self.formats = formats

assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None)

Source code in chispa/__init__.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def assert_df_equality(
    self,
    df1: DataFrame,
    df2: DataFrame,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    underline_cells: bool = False,
    ignore_metadata: bool = False,
    ignore_columns: list[str] | None = None,
) -> None:
    """Delegate to the module-level ``assert_df_equality`` using ``self.formats``.

    Every argument is forwarded unchanged; the formatting configuration stored
    on the instance is supplied as the ``formats`` argument.
    """
    return assert_df_equality(
        df1,
        df2,
        ignore_nullable=ignore_nullable,
        transforms=transforms,
        allow_nan_equality=allow_nan_equality,
        ignore_column_order=ignore_column_order,
        ignore_row_order=ignore_row_order,
        underline_cells=underline_cells,
        ignore_metadata=ignore_metadata,
        ignore_columns=ignore_columns,
        formats=self.formats,
    )

Color

Bases: str, Enum

Enum for terminal colors. Each color is represented by its corresponding ANSI escape code.

Source code in chispa/formatting/formats.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class Color(str, Enum):
    """
    Enum for terminal colors.
    Each color is represented by its corresponding ANSI escape code.

    The class mixes in ``str``, so every member is also a string that compares
    equal to its escape sequence.
    """

    # Escape codes 30-37.
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    PURPLE = "\033[35m"
    CYAN = "\033[36m"
    LIGHT_GRAY = "\033[37m"
    # Escape codes 90-97 (DARK_GRAY, the LIGHT_* variants and WHITE).
    DARK_GRAY = "\033[90m"
    LIGHT_RED = "\033[91m"
    LIGHT_GREEN = "\033[92m"
    LIGHT_YELLOW = "\033[93m"
    LIGHT_BLUE = "\033[94m"
    LIGHT_PURPLE = "\033[95m"
    LIGHT_CYAN = "\033[96m"
    WHITE = "\033[97m"

BLACK = '\x1b[30m' class-attribute instance-attribute

BLUE = '\x1b[34m' class-attribute instance-attribute

CYAN = '\x1b[36m' class-attribute instance-attribute

DARK_GRAY = '\x1b[90m' class-attribute instance-attribute

GREEN = '\x1b[32m' class-attribute instance-attribute

LIGHT_BLUE = '\x1b[94m' class-attribute instance-attribute

LIGHT_CYAN = '\x1b[96m' class-attribute instance-attribute

LIGHT_GRAY = '\x1b[37m' class-attribute instance-attribute

LIGHT_GREEN = '\x1b[92m' class-attribute instance-attribute

LIGHT_PURPLE = '\x1b[95m' class-attribute instance-attribute

LIGHT_RED = '\x1b[91m' class-attribute instance-attribute

LIGHT_YELLOW = '\x1b[93m' class-attribute instance-attribute

PURPLE = '\x1b[35m' class-attribute instance-attribute

RED = '\x1b[31m' class-attribute instance-attribute

WHITE = '\x1b[97m' class-attribute instance-attribute

YELLOW = '\x1b[33m' class-attribute instance-attribute

ColumnsNotEqualError

Bases: Exception

The columns are not equal

Source code in chispa/column_comparer.py
 9
10
11
12
class ColumnsNotEqualError(Exception):
    """Raised by the column comparison helpers when two columns are not equal."""

DataFramesNotEqualError

Bases: Exception

The DataFrames are not equal

Source code in chispa/dataframe_comparer.py
54
55
56
57
class DataFramesNotEqualError(Exception):
    """Raised by the row comparison helpers when two DataFrames are not equal."""

DefaultFormats dataclass

This class is now deprecated and should be removed in a future release.

Source code in chispa/default_formats.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
@dataclass
class DefaultFormats:
    """
    This class is now deprecated and should be removed in a future release.

    Each field holds a list of lower-case color/style names. Creating an
    instance emits a ``DeprecationWarning``; use
    ``chispa.formatting.FormattingConfig`` instead.
    """

    mismatched_rows: list[str] = field(default_factory=lambda: ["red"])
    matched_rows: list[str] = field(default_factory=lambda: ["blue"])
    mismatched_cells: list[str] = field(default_factory=lambda: ["red", "underline"])
    matched_cells: list[str] = field(default_factory=lambda: ["blue"])

    def __post_init__(self) -> None:
        """Emit the deprecation warning every time an instance is created."""
        # stacklevel=3 skips this method and the dataclass-generated ``__init__``
        # so the warning is attributed to the code that instantiated the class
        # instead of to this module.
        warnings.warn(
            "DefaultFormats is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
            DeprecationWarning,
            stacklevel=3,
        )

matched_cells = field(default_factory=lambda: ['blue']) class-attribute instance-attribute

matched_rows = field(default_factory=lambda: ['blue']) class-attribute instance-attribute

mismatched_cells = field(default_factory=lambda: ['red', 'underline']) class-attribute instance-attribute

mismatched_rows = field(default_factory=lambda: ['red']) class-attribute instance-attribute

__init__(mismatched_rows=['red'], matched_rows=['blue'], mismatched_cells=['red', 'underline'], matched_cells=['blue'])

__post_init__()

Source code in chispa/default_formats.py
18
19
20
21
def __post_init__(self) -> None:
    """Emit the deprecation warning every time an instance is created."""
    # stacklevel=3 skips this method and the dataclass-generated ``__init__``
    # so the warning is attributed to the code that instantiated the class
    # instead of to this module.
    warnings.warn(
        "DefaultFormats is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
        DeprecationWarning,
        stacklevel=3,
    )

Format dataclass

Data class to represent text formatting with color and style.

Attributes:

Name Type Description
color Color | None

The color for the text.

style list[Style] | None

A list of styles for the text.

Source code in chispa/formatting/formats.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
@dataclass
class Format:
    """
    Data class to represent text formatting with color and style.

    Attributes:
        color (Color | None): The color for the text.
        style (list[Style] | None): A list of styles for the text.
    """

    color: Color | None = None
    style: list[Style] | None = None

    @classmethod
    def from_dict(cls, format_dict: dict[str, str | list[str]]) -> Format:
        """
        Create a Format instance from a dictionary.

        Args:
            format_dict (dict): A dictionary with keys 'color' and/or 'style'.
                'color' is a color name; 'style' is a style name or a list of
                style names. Names are upper-cased before lookup, so matching
                is case-insensitive.

        Returns:
            Format: The parsed format. A missing key leaves the corresponding
            attribute as None.

        Raises:
            ValueError: If the input is not a dictionary, contains keys other
                than 'color' and 'style', or names an unknown color or style.
            TypeError: If the value for 'color' is a list.
        """
        if not isinstance(format_dict, dict):
            raise ValueError("Input must be a dictionary")

        valid_keys = {"color", "style"}
        invalid_keys = set(format_dict) - valid_keys
        if invalid_keys:
            raise ValueError(f"Invalid keys in format dictionary: {invalid_keys}. Valid keys are {valid_keys}")

        # Only a single color is supported, so a list is rejected explicitly.
        if isinstance(format_dict.get("color"), list):
            raise TypeError("The value for key 'color' should be a string, not a list!")
        color = cls._get_color_enum(format_dict.get("color"))  # type: ignore[arg-type]

        # A lone style name is normalised to a one-element list.
        style = format_dict.get("style")
        if isinstance(style, str):
            styles = [cls._get_style_enum(style)]
        elif isinstance(style, list):
            styles = [cls._get_style_enum(s) for s in style]
        else:
            styles = None

        return cls(color=color, style=styles)  # type: ignore[arg-type]

    @classmethod
    def from_list(cls, values: list[str]) -> Format:
        """
        Create a Format instance from a list of strings.

        Args:
            values (list[str]): A list of strings representing colors and styles.
                Each entry must exactly match a lower-cased color or style name.
                When several colors are given, the last one is kept.

        Returns:
            Format: The parsed format; ``style`` is None when no style was given.

        Raises:
            ValueError: If an element is not a string or is not a known
                color or style name.
        """
        if not all(isinstance(value, str) for value in values):
            raise ValueError("All elements in the list must be strings")

        color = None
        styles = []
        valid_colors = [c.name.lower() for c in Color]
        valid_styles = [s.name.lower() for s in Style]

        for value in values:
            if value in valid_colors:
                color = Color[value.upper()]
            elif value in valid_styles:
                styles.append(Style[value.upper()])
            else:
                raise ValueError(
                    f"Invalid value: {value}. Valid values are colors: {valid_colors} and styles: {valid_styles}"
                )

        return cls(color=color, style=styles if styles else None)

    @staticmethod
    def _get_color_enum(color: Color | str | None) -> Color | None:
        """Return ``color`` as a ``Color`` member.

        A ``Color`` is returned unchanged, a string is looked up by its
        upper-cased name, and any other value yields None.

        Raises:
            ValueError: If the string does not name a ``Color`` member.
        """
        if isinstance(color, Color):
            return color
        elif isinstance(color, str):
            try:
                return Color[color.upper()]
            except KeyError:
                valid_colors = [c.name.lower() for c in Color]
                # ``from None`` hides the internal KeyError, which adds nothing
                # to the message shown to the caller.
                raise ValueError(f"Invalid color name: {color}. Valid color names are {valid_colors}") from None
        return None

    @staticmethod
    def _get_style_enum(style: Style | str | None) -> Style | None:
        """Return ``style`` as a ``Style`` member.

        A ``Style`` is returned unchanged, a string is looked up by its
        upper-cased name, and any other value yields None.

        Raises:
            ValueError: If the string does not name a ``Style`` member.
        """
        if isinstance(style, Style):
            return style
        elif isinstance(style, str):
            try:
                return Style[style.upper()]
            except KeyError:
                valid_styles = [f.name.lower() for f in Style]
                # ``from None`` hides the internal KeyError, which adds nothing
                # to the message shown to the caller.
                raise ValueError(f"Invalid style name: {style}. Valid style names are {valid_styles}") from None
        return None

color = None class-attribute instance-attribute

style = None class-attribute instance-attribute

__init__(color=None, style=None)

from_dict(format_dict) classmethod

Create a Format instance from a dictionary.

Parameters:

Name Type Description Default
format_dict dict

A dictionary with keys 'color' and/or 'style'.

required
Source code in chispa/formatting/formats.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
@classmethod
def from_dict(cls, format_dict: dict[str, str | list[str]]) -> Format:
    """
    Build a Format from a dictionary description.

    Args:
        format_dict (dict): May contain the keys 'color' (a color name) and
            'style' (a style name or a list of style names).

    Raises:
        ValueError: If the input is not a dictionary or has keys other than
            'color' and 'style'.
        TypeError: If the value for 'color' is a list.
    """
    if not isinstance(format_dict, dict):
        raise ValueError("Input must be a dictionary")

    valid_keys = {"color", "style"}
    invalid_keys = set(format_dict).difference(valid_keys)
    if invalid_keys:
        raise ValueError(f"Invalid keys in format dictionary: {invalid_keys}. Valid keys are {valid_keys}")

    raw_color = format_dict.get("color")
    if isinstance(raw_color, list):
        raise TypeError("The value for key 'color' should be a string, not a list!")
    # The color is resolved before the styles so that an invalid color is
    # reported first.
    parsed_color = cls._get_color_enum(raw_color)  # type: ignore[arg-type]

    raw_style = format_dict.get("style")
    if isinstance(raw_style, str):
        # A lone style name is treated as a one-element list.
        raw_style = [raw_style]
    parsed_styles = [cls._get_style_enum(name) for name in raw_style] if isinstance(raw_style, list) else None

    return cls(color=parsed_color, style=parsed_styles)  # type: ignore[arg-type]

from_list(values) classmethod

Create a Format instance from a list of strings.

Parameters:

Name Type Description Default
values list[str]

A list of strings representing colors and styles.

required
Source code in chispa/formatting/formats.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
@classmethod
def from_list(cls, values: list[str]) -> Format:
    """
    Build a Format from a flat list of color and style names.

    Args:
        values (list[str]): Lower-cased color and/or style names. When several
            colors are present, the last one is kept.

    Raises:
        ValueError: If an element is not a string or is not a known color or
            style name.
    """
    if any(not isinstance(item, str) for item in values):
        raise ValueError("All elements in the list must be strings")

    valid_colors = [c.name.lower() for c in Color]
    valid_styles = [s.name.lower() for s in Style]

    chosen_color = None
    chosen_styles = []
    for item in values:
        if item in valid_colors:
            chosen_color = Color[item.upper()]
            continue
        if item in valid_styles:
            chosen_styles.append(Style[item.upper()])
            continue
        raise ValueError(
            f"Invalid value: {item}. Valid values are colors: {valid_colors} and styles: {valid_styles}"
        )

    # An empty style list is reported as None.
    return cls(color=chosen_color, style=chosen_styles or None)

FormattingConfig

Class to manage and parse formatting configurations.

Source code in chispa/formatting/formatting_config.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class FormattingConfig:
    """
    Class to manage and parse formatting configurations.

    Holds one ``Format`` for each of: mismatched rows, matched rows,
    mismatched cells and matched cells.
    """

    # Names of the keys a format dictionary may contain.
    VALID_KEYS: ClassVar = {"color", "style"}

    # NOTE(review): the default ``Format`` objects below are created once, when
    # the function is defined, and ``_parse_format`` returns ``Format`` arguments
    # unchanged, so every default-constructed config shares the same ``Format``
    # instances. Treat them as read-only.
    def __init__(
        self,
        mismatched_rows: Format | dict[str, str | list[str]] = Format(Color.RED),
        matched_rows: Format | dict[str, str | list[str]] = Format(Color.BLUE),
        mismatched_cells: Format | dict[str, str | list[str]] = Format(Color.RED, [Style.UNDERLINE]),
        matched_cells: Format | dict[str, str | list[str]] = Format(Color.BLUE),
    ):
        """
        Initializes the FormattingConfig with given or default formatting.

        Each of the arguments can be provided as a `Format` object or a dictionary with the following keys:
        - 'color': A string representing a color name, which should be one of the valid colors:
            ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray',
            'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue',
            'light_purple', 'light_cyan', 'white'].
        - 'style': A string or list of strings representing styles, which should be one of the valid styles:
            ['bold', 'underline', 'blink', 'invert', 'hide'].

        Args:
            mismatched_rows (Format | dict): Format or dictionary for mismatched rows.
            matched_rows (Format | dict): Format or dictionary for matched rows.
            mismatched_cells (Format | dict): Format or dictionary for mismatched cells.
            matched_cells (Format | dict): Format or dictionary for matched cells.

        Raises:
            ValueError: If the dictionary contains invalid keys or values.
        """
        self.mismatched_rows: Format = self._parse_format(mismatched_rows)
        self.matched_rows: Format = self._parse_format(matched_rows)
        self.mismatched_cells: Format = self._parse_format(mismatched_cells)
        self.matched_cells: Format = self._parse_format(matched_cells)

    def _parse_format(self, format: Format | dict[str, str | list[str]]) -> Format:
        """Return ``format`` as a ``Format``.

        A ``Format`` is returned unchanged and a dict is converted with
        ``Format.from_dict``.

        Raises:
            ValueError: If ``format`` is neither a ``Format`` nor a dict.
        """
        if isinstance(format, Format):
            return format
        elif isinstance(format, dict):
            return Format.from_dict(format)
        raise ValueError("Invalid format type. Must be Format or dict.")

    @classmethod
    def _from_arbitrary_dataclass(cls, instance: Any) -> FormattingConfig:
        """
        Converts an instance of an arbitrary class with specified fields to a FormattingConfig instance.
        This method is purely for backwards compatibility and should be removed in a future release,
        together with the `DefaultFormats` class.

        The instance must expose ``mismatched_rows``, ``matched_rows``,
        ``mismatched_cells`` and ``matched_cells`` attributes, each a list of
        names accepted by ``Format.from_list``.
        """

        if not isinstance(instance, DefaultFormats):
            # stacklevel=3 skips this helper and the entry point that called it,
            # so the warning is attributed to the calling code rather than to
            # this module.
            warnings.warn(
                "Using an arbitrary dataclass is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
                DeprecationWarning,
                stacklevel=3,
            )

        mismatched_rows = Format.from_list(instance.mismatched_rows)
        matched_rows = Format.from_list(instance.matched_rows)
        mismatched_cells = Format.from_list(instance.mismatched_cells)
        matched_cells = Format.from_list(instance.matched_cells)

        return cls(
            mismatched_rows=mismatched_rows,
            matched_rows=matched_rows,
            mismatched_cells=mismatched_cells,
            matched_cells=matched_cells,
        )

VALID_KEYS = {'color', 'style'} class-attribute instance-attribute

matched_cells = self._parse_format(matched_cells) instance-attribute

matched_rows = self._parse_format(matched_rows) instance-attribute

mismatched_cells = self._parse_format(mismatched_cells) instance-attribute

mismatched_rows = self._parse_format(mismatched_rows) instance-attribute

__init__(mismatched_rows=Format(Color.RED), matched_rows=Format(Color.BLUE), mismatched_cells=Format(Color.RED, [Style.UNDERLINE]), matched_cells=Format(Color.BLUE))

Initializes the FormattingConfig with given or default formatting.

Each of the arguments can be provided as a Format object or a dictionary with the following keys:

- 'color': A string representing a color name, which should be one of the valid colors: ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray', 'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue', 'light_purple', 'light_cyan', 'white'].
- 'style': A string or list of strings representing styles, which should be one of the valid styles: ['bold', 'underline', 'blink', 'invert', 'hide'].

Parameters:

Name Type Description Default
mismatched_rows Format | dict

Format or dictionary for mismatched rows.

Format(RED)
matched_rows Format | dict

Format or dictionary for matched rows.

Format(BLUE)
mismatched_cells Format | dict

Format or dictionary for mismatched cells.

Format(RED, [UNDERLINE])
matched_cells Format | dict

Format or dictionary for matched cells.

Format(BLUE)

Raises:

Type Description
ValueError

If the dictionary contains invalid keys or values.

Source code in chispa/formatting/formatting_config.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def __init__(
    self,
    mismatched_rows: Format | dict[str, str | list[str]] = Format(Color.RED),
    matched_rows: Format | dict[str, str | list[str]] = Format(Color.BLUE),
    mismatched_cells: Format | dict[str, str | list[str]] = Format(Color.RED, [Style.UNDERLINE]),
    matched_cells: Format | dict[str, str | list[str]] = Format(Color.BLUE),
):
    """
    Set up the four formats used when rendering a comparison.

    Every argument is either a `Format` object or a dictionary with these keys:
    - 'color': one color name out of
        ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray',
        'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue',
        'light_purple', 'light_cyan', 'white'].
    - 'style': one style name, or a list of style names, out of
        ['bold', 'underline', 'blink', 'invert', 'hide'].

    Args:
        mismatched_rows (Format | dict): Formatting for mismatched rows.
        matched_rows (Format | dict): Formatting for matched rows.
        mismatched_cells (Format | dict): Formatting for mismatched cells.
        matched_cells (Format | dict): Formatting for matched cells.

    Raises:
        ValueError: If the dictionary contains invalid keys or values.
    """
    # Each argument goes through the same normalisation step, in declaration order.
    to_format = self._parse_format
    self.mismatched_rows: Format = to_format(mismatched_rows)
    self.matched_rows: Format = to_format(matched_rows)
    self.mismatched_cells: Format = to_format(mismatched_cells)
    self.matched_cells: Format = to_format(matched_cells)

SchemasNotEqualError

Bases: Exception

The schemas are not equal

Source code in chispa/schema_comparer.py
14
15
16
17
class SchemasNotEqualError(Exception):
    """Signals that two schemas under comparison are not equal."""

Style

Bases: str, Enum

Enum for text styles. Each style is represented by its corresponding ANSI escape code.

Source code in chispa/formatting/formats.py
33
34
35
36
37
38
39
40
41
42
43
class Style(str, Enum):
    """
    Enum for text styles.
    Each style is represented by its corresponding ANSI escape code.

    The class mixes in ``str``, so every member is also a string that compares
    equal to its escape sequence.
    """

    # Escape codes 1, 4, 5, 7 and 8 respectively.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    BLINK = "\033[5m"
    INVERT = "\033[7m"
    HIDE = "\033[8m"

BOLD = '\x1b[1m' class-attribute instance-attribute

HIDE = '\x1b[8m' class-attribute instance-attribute

INVERT = '\x1b[7m' class-attribute instance-attribute

UNDERLINE = '\x1b[4m' class-attribute instance-attribute

assert_approx_column_equality(df, col_name1, col_name2, precision)

Source code in chispa/column_comparer.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def assert_approx_column_equality(df: DataFrame, col_name1: str, col_name2: str, precision: float) -> None:
    """Assert that two columns of ``df`` are equal within ``precision``, row by row.

    Two values match when both are None, or when neither is None and their
    absolute difference is strictly less than ``precision``.

    Args:
        df: DataFrame holding both columns.
        col_name1: Name of the first column.
        col_name2: Name of the second column.
        precision: Exclusive upper bound on the allowed absolute difference.

    Raises:
        ColumnsNotEqualError: If at least one row does not match. The message is
            a table of every row, with matching rows wrapped by ``blue``.
    """
    rows = df.select(col_name1, col_name2).collect()
    table = PrettyTable([col_name1, col_name2])
    mismatch_found = False
    for row in rows:
        left, right = row[0], row[1]
        if left is None or right is None:
            # None only matches None.
            values_match = left is None and right is None
        else:
            # Two real values match when they differ by less than the threshold.
            values_match = abs(left - right) < precision
        if values_match:
            table.add_row([blue(str(left)), blue(str(right))])
        else:
            mismatch_found = True
            table.add_row([str(left), str(right)])
    if mismatch_found:
        raise ColumnsNotEqualError("\n" + table.get_string())

assert_approx_df_equality(df1, df2, precision, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, ignore_columns=None, formats=None)

Source code in chispa/dataframe_comparer.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def assert_approx_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    precision: float,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    """Assert that two DataFrames have equal schemas and approximately equal rows.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        precision: Tolerance handed to ``are_rows_approx_equal``. When it is 0
            the rows are compared exactly instead.
        ignore_nullable: Forwarded to ``assert_schema_equality``.
        transforms: Callables applied, in order, to both DataFrames before
            comparison. The list passed in is not modified.
        allow_nan_equality: Forwarded to the row comparison function.
        ignore_column_order: Select the columns in sorted order before comparing.
        ignore_row_order: Apply ``_sort_df_for_row_order_comparison`` before comparing.
        ignore_columns: Column names dropped from both DataFrames before comparing.
        formats: Formatting configuration. A falsy value selects the default;
            a non-``FormattingConfig`` object is converted with
            ``FormattingConfig._from_arbitrary_dataclass``.

    Raises:
        Whatever the schema and row comparison helpers raise when the
        DataFrames differ.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    # Work on a private copy: the implicit transforms appended below must not
    # leak into a list the caller owns and may reuse for another comparison.
    transforms = list(transforms) if transforms is not None else []
    if ignore_column_order:
        transforms.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        transforms.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        transforms.append(_sort_df_for_row_order_comparison)

    # Apply every transform, in order, to both sides.
    df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
    df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

    assert_schema_equality(df1.schema, df2.schema, ignore_nullable)

    if precision != 0:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_approx_equal,
            {"precision": precision, "allow_nan_equality": allow_nan_equality},
            formats=formats,
        )
    elif allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(), df2.collect(), are_rows_equal_enhanced, {"allow_nan_equality": True}, formats=formats
        )
    else:
        assert_basic_rows_equality(df1.collect(), df2.collect(), formats=formats)

assert_basic_rows_equality(rows1, rows2, underline_cells=False, formats=None)

Source code in chispa/rows_comparer.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def assert_basic_rows_equality(
    rows1: list[Row], rows2: list[Row], underline_cells: bool = False, formats: FormattingConfig | None = None
) -> None:
    """Compare two row lists position by position and raise if they differ.

    Args:
        rows1: Rows of the first DataFrame.
        rows2: Rows of the second DataFrame.
        underline_cells: Accepted by the signature but not read in this function.
        formats: Formatting configuration. A falsy value selects the default;
            a non-``FormattingConfig`` object is converted with
            ``FormattingConfig._from_arbitrary_dataclass``.

    Raises:
        chispa.DataFramesNotEqualError: If a row is missing on one side or a
            cell differs. The message is a two-column table of all rows.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    if rows1 != rows2:
        table = PrettyTable(["df1", "df2"])
        found_difference = False

        for left_row, right_row in zip_longest(rows1, rows2):
            # A row present on one side only is a mismatch for the whole row.
            if left_row is None and right_row is not None:
                found_difference = True
                table.add_row([None, format_string(str(right_row), formats.mismatched_rows)])
                continue
            if left_row is not None and right_row is None:
                found_difference = True
                table.add_row([format_string(str(left_row), formats.mismatched_rows), None])
                continue

            # Both rows present: compare and render them cell by cell.
            left_cells = []
            right_cells = []
            for left_name, right_name in zip_longest(left_row.__fields__, right_row.__fields__):
                left_value = left_row[left_name]
                right_value = right_row[right_name]
                if left_value != right_value:
                    found_difference = True
                    cell_format = formats.mismatched_cells
                else:
                    cell_format = formats.matched_cells
                left_cells.append(format_string(f"{left_name}={left_value}", cell_format))
                right_cells.append(format_string(f"{right_name}={right_value}", cell_format))

            table.add_row([", ".join(left_cells), ", ".join(right_cells)])

        if found_difference:
            raise chispa.DataFramesNotEqualError("\n" + table.get_string())

assert_column_equality(df, col_name1, col_name2)

Source code in chispa/column_comparer.py
15
16
17
18
19
20
21
22
23
24
25
26
27
def assert_column_equality(df: DataFrame, col_name1: str, col_name2: str) -> None:
    """Assert that two columns of ``df`` hold equal values in every row.

    Args:
        df: DataFrame holding both columns.
        col_name1: Name of the first column.
        col_name2: Name of the second column.

    Raises:
        ColumnsNotEqualError: If the columns differ. The message is a table of
            every row, with equal rows wrapped by ``blue``.
    """
    rows = df.select(col_name1, col_name2).collect()
    left_values = [row[0] for row in rows]
    right_values = [row[1] for row in rows]
    if left_values == right_values:
        return

    table = PrettyTable([col_name1, col_name2])
    for left, right in zip(left_values, right_values):
        if left == right:
            table.add_row([blue(str(left)), blue(str(right))])
        else:
            table.add_row([str(left), str(right)])
    raise ColumnsNotEqualError("\n" + table.get_string())

assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None, formats=None)

Source code in chispa/dataframe_comparer.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def assert_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    underline_cells: bool = False,
    ignore_metadata: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    """Assert that two DataFrames have equal schemas and equal rows.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        ignore_nullable: Forwarded to ``assert_schema_equality``.
        transforms: Callables applied, in order, to both DataFrames before
            comparison. The list passed in is not modified.
        allow_nan_equality: When true, rows are compared with
            ``are_rows_equal_enhanced`` instead of the basic row comparison.
        ignore_column_order: Select the columns in sorted order before comparing.
        ignore_row_order: Apply ``_sort_df_for_row_order_comparison`` before comparing.
        underline_cells: Forwarded to the row comparison helper.
        ignore_metadata: Forwarded to ``assert_schema_equality``.
        ignore_columns: Column names dropped from both DataFrames before comparing.
        formats: Formatting configuration. A falsy value selects the default;
            a non-``FormattingConfig`` object is converted with
            ``FormattingConfig._from_arbitrary_dataclass``.

    Raises:
        Whatever the schema and row comparison helpers raise when the
        DataFrames differ.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    # Work on a private copy: the implicit transforms appended below must not
    # leak into a list the caller owns and may reuse for another comparison.
    transforms = list(transforms) if transforms is not None else []
    if ignore_column_order:
        transforms.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        transforms.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        transforms.append(_sort_df_for_row_order_comparison)

    # Apply every transform, in order, to both sides.
    df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
    df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

    assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata)

    if allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_equal_enhanced,
            {"allow_nan_equality": True},
            underline_cells=underline_cells,
            formats=formats,
        )
    else:
        assert_basic_rows_equality(
            df1.collect(),
            df2.collect(),
            underline_cells=underline_cells,
            formats=formats,
        )