Index

`__all__ = ('Chispa', 'Color', 'ColumnsNotEqualError', 'DataFramesNotEqualError', 'DefaultFormats', 'Format', 'FormattingConfig', 'SchemasNotEqualError', 'Style', 'assert_approx_column_equality', 'assert_approx_df_equality', 'assert_basic_rows_equality', 'assert_column_equality', 'assert_df_equality')` `module-attribute` ¶

`Chispa` ¶

Source code in chispa/__init__.py

class Chispa:
    """Bind a formatting configuration to DataFrame equality assertions.

    The configuration resolved at construction time is kept on ``self.formats``
    and handed to the module-level ``assert_df_equality`` on every call.
    """

    def __init__(self, formats: FormattingConfig | None = None) -> None:
        """Resolve ``formats`` into a ``FormattingConfig`` and store it.

        Args:
            formats: A ``FormattingConfig`` is stored as-is. A falsy value
                (such as ``None``) produces a default ``FormattingConfig()``.
                Any other object is converted through
                ``FormattingConfig._from_arbitrary_dataclass``.
        """
        if not formats:
            config = FormattingConfig()
        else:
            config = (
                formats
                if isinstance(formats, FormattingConfig)
                else FormattingConfig._from_arbitrary_dataclass(formats)
            )
        self.formats = config

    def assert_df_equality(
        self,
        df1: DataFrame,
        df2: DataFrame,
        ignore_nullable: bool = False,
        transforms: list[Callable] | None = None,  # type: ignore[type-arg]
        allow_nan_equality: bool = False,
        ignore_column_order: bool = False,
        ignore_row_order: bool = False,
        underline_cells: bool = False,
        ignore_metadata: bool = False,
        ignore_columns: list[str] | None = None,
    ) -> None:
        """Compare two DataFrames using this instance's formatting configuration.

        Every argument is forwarded unchanged to the module-level
        ``assert_df_equality``; ``self.formats`` is supplied as its ``formats``
        argument.
        """
        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        return assert_df_equality(
            df1,
            df2,
            ignore_nullable=ignore_nullable,
            transforms=transforms,
            allow_nan_equality=allow_nan_equality,
            ignore_column_order=ignore_column_order,
            ignore_row_order=ignore_row_order,
            underline_cells=underline_cells,
            ignore_metadata=ignore_metadata,
            ignore_columns=ignore_columns,
            formats=self.formats,
        )

`formats = FormattingConfig()` `instance-attribute` ¶

`__init__(formats=None)` ¶

Source code in chispa/__init__.py

def __init__(self, formats: FormattingConfig | None = None) -> None:
    """Resolve ``formats`` into a ``FormattingConfig`` and store it.

    Args:
        formats: A ``FormattingConfig`` is stored as-is. A falsy value (such
            as ``None``) produces a default ``FormattingConfig()``. Any other
            object is converted through
            ``FormattingConfig._from_arbitrary_dataclass``.
    """
    if not formats:
        config = FormattingConfig()
    else:
        config = (
            formats
            if isinstance(formats, FormattingConfig)
            else FormattingConfig._from_arbitrary_dataclass(formats)
        )
    self.formats = config

`assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None)` ¶

Source code in chispa/__init__.py

def assert_df_equality(
    self,
    df1: DataFrame,
    df2: DataFrame,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    underline_cells: bool = False,
    ignore_metadata: bool = False,
    ignore_columns: list[str] | None = None,
) -> None:
    """Compare two DataFrames using this instance's formatting configuration.

    Every argument is forwarded unchanged to the module-level
    ``assert_df_equality``; ``self.formats`` is supplied as its ``formats``
    argument.
    """
    # As a method of ``Chispa`` this name resolves to the module-level
    # function of the same name, not to the method itself.
    return assert_df_equality(
        df1,
        df2,
        ignore_nullable=ignore_nullable,
        transforms=transforms,
        allow_nan_equality=allow_nan_equality,
        ignore_column_order=ignore_column_order,
        ignore_row_order=ignore_row_order,
        underline_cells=underline_cells,
        ignore_metadata=ignore_metadata,
        ignore_columns=ignore_columns,
        formats=self.formats,
    )

`Color` ¶

Bases: str, Enum

Enum for terminal colors. Each color is represented by its corresponding ANSI escape code.

Source code in chispa/formatting/formats.py

class Color(str, Enum):
    """
    Enum for terminal colors.
    Each color is represented by its corresponding ANSI escape code.

    Members are also ``str`` instances, so a member compares equal to its
    escape sequence.
    """

    # Codes 30-37.
    BLACK = "\x1b[30m"
    RED = "\x1b[31m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    BLUE = "\x1b[34m"
    PURPLE = "\x1b[35m"
    CYAN = "\x1b[36m"
    LIGHT_GRAY = "\x1b[37m"

    # Codes 90-97.
    DARK_GRAY = "\x1b[90m"
    LIGHT_RED = "\x1b[91m"
    LIGHT_GREEN = "\x1b[92m"
    LIGHT_YELLOW = "\x1b[93m"
    LIGHT_BLUE = "\x1b[94m"
    LIGHT_PURPLE = "\x1b[95m"
    LIGHT_CYAN = "\x1b[96m"
    WHITE = "\x1b[97m"

`BLACK = '\x1b[30m'` `class-attribute` `instance-attribute` ¶

`BLUE = '\x1b[34m'` `class-attribute` `instance-attribute` ¶

`CYAN = '\x1b[36m'` `class-attribute` `instance-attribute` ¶

`DARK_GRAY = '\x1b[90m'` `class-attribute` `instance-attribute` ¶

`GREEN = '\x1b[32m'` `class-attribute` `instance-attribute` ¶

`LIGHT_BLUE = '\x1b[94m'` `class-attribute` `instance-attribute` ¶

`LIGHT_CYAN = '\x1b[96m'` `class-attribute` `instance-attribute` ¶

`LIGHT_GRAY = '\x1b[37m'` `class-attribute` `instance-attribute` ¶

`LIGHT_GREEN = '\x1b[92m'` `class-attribute` `instance-attribute` ¶

`LIGHT_PURPLE = '\x1b[95m'` `class-attribute` `instance-attribute` ¶

`LIGHT_RED = '\x1b[91m'` `class-attribute` `instance-attribute` ¶

`LIGHT_YELLOW = '\x1b[93m'` `class-attribute` `instance-attribute` ¶

`PURPLE = '\x1b[35m'` `class-attribute` `instance-attribute` ¶

`RED = '\x1b[31m'` `class-attribute` `instance-attribute` ¶

`WHITE = '\x1b[97m'` `class-attribute` `instance-attribute` ¶

`YELLOW = '\x1b[33m'` `class-attribute` `instance-attribute` ¶

`ColumnsNotEqualError` ¶

Bases: Exception

The columns are not equal

Source code in chispa/column_comparer.py

class ColumnsNotEqualError(Exception):
    """The columns are not equal

    Raised by ``assert_column_equality`` and ``assert_approx_column_equality``
    with a rendered comparison table as the message.
    """

`DataFramesNotEqualError` ¶

Bases: Exception

The DataFrames are not equal

Source code in chispa/dataframe_comparer.py

class DataFramesNotEqualError(Exception):
    """The DataFrames are not equal

    Raised by ``assert_basic_rows_equality`` with a rendered comparison table
    as the message.
    """

`DefaultFormats` `dataclass` ¶

This class is deprecated and will be removed in a future release. Use `chispa.formatting.FormattingConfig` instead.

Source code in chispa/default_formats.py

@dataclass
class DefaultFormats:
    """
    This class is deprecated and will be removed in a future release.

    Use `chispa.formatting.FormattingConfig` instead. Each field holds a list
    of color/style names; instantiating the class emits a
    ``DeprecationWarning``.
    """

    # Each default is produced by a factory so instances never share a list.
    mismatched_rows: list[str] = field(default_factory=lambda: ["red"])
    matched_rows: list[str] = field(default_factory=lambda: ["blue"])
    mismatched_cells: list[str] = field(default_factory=lambda: ["red", "underline"])
    matched_cells: list[str] = field(default_factory=lambda: ["blue"])

    def __post_init__(self) -> None:
        """Emit the deprecation warning, attributed to the instantiating code."""
        # stacklevel=3 skips this method and the dataclass-generated __init__,
        # so the warning points at the line that constructs DefaultFormats.
        warnings.warn(
            "DefaultFormats is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
            DeprecationWarning,
            stacklevel=3,
        )

`matched_cells = field(default_factory=lambda: ['blue'])` `class-attribute` `instance-attribute` ¶

`matched_rows = field(default_factory=lambda: ['blue'])` `class-attribute` `instance-attribute` ¶

`mismatched_cells = field(default_factory=lambda: ['red', 'underline'])` `class-attribute` `instance-attribute` ¶

`mismatched_rows = field(default_factory=lambda: ['red'])` `class-attribute` `instance-attribute` ¶

`__init__(mismatched_rows=['red'], matched_rows=['blue'], mismatched_cells=['red', 'underline'], matched_cells=['blue'])` ¶

`__post_init__()` ¶

Source code in chispa/default_formats.py

def __post_init__(self) -> None:
    """Emit the deprecation warning, attributed to the instantiating code."""
    # stacklevel=3 skips this method and the dataclass-generated __init__ that
    # calls it, so the warning points at the line constructing DefaultFormats.
    warnings.warn(
        "DefaultFormats is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
        DeprecationWarning,
        stacklevel=3,
    )

`Format` `dataclass` ¶

Data class to represent text formatting with color and style.

Attributes:

Name	Type	Description
`color`	`Color \| None`	The color for the text.
`style`	`list[Style] \| None`	A list of styles for the text.

Source code in chispa/formatting/formats.py

@dataclass
class Format:
    """
    Data class to represent text formatting with color and style.

    Attributes:
        color (Color | None): The color for the text.
        style (list[Style] | None): A list of styles for the text.
    """

    color: Color | None = None
    style: list[Style] | None = None

    @classmethod
    def from_dict(cls, format_dict: dict[str, str | list[str]]) -> Format:
        """
        Create a Format instance from a dictionary.

        Args:
            format_dict (dict): A dictionary with keys 'color' and/or 'style'.
                'color' is a color name; 'style' is a style name or a list of
                style names. Names are upper-cased before the enum lookup.

        Returns:
            Format: The parsed format; a missing key leaves that attribute ``None``.

        Raises:
            ValueError: If the input is not a dictionary, has keys other than
                'color' and 'style', or names an unknown color or style.
            TypeError: If the value for 'color' is a list.
        """
        if not isinstance(format_dict, dict):
            raise ValueError("Input must be a dictionary")

        allowed = {"color", "style"}
        unexpected = set(format_dict) - allowed
        if unexpected:
            raise ValueError(f"Invalid keys in format dictionary: {unexpected}. Valid keys are {allowed}")

        raw_color = format_dict.get("color")
        if isinstance(raw_color, list):
            raise TypeError("The value for key 'color' should be a string, not a list!")
        color = cls._get_color_enum(raw_color)  # type: ignore[arg-type]

        raw_style = format_dict.get("style")
        if isinstance(raw_style, str):
            # A single style name is wrapped so ``style`` is always a list.
            styles = [cls._get_style_enum(raw_style)]
        elif isinstance(raw_style, list):
            styles = [cls._get_style_enum(name) for name in raw_style]
        else:
            styles = None

        return cls(color=color, style=styles)  # type: ignore[arg-type]

    @classmethod
    def from_list(cls, values: list[str]) -> Format:
        """
        Create a Format instance from a list of strings.

        Each value must equal the lower-cased name of a ``Color`` or ``Style``
        member (the comparison is case-sensitive). When several colors are
        given, the last one wins.

        Args:
            values (list[str]): A list of strings representing colors and styles.

        Returns:
            Format: The parsed format; ``style`` is ``None`` when no style was given.

        Raises:
            ValueError: If an element is not a string or is neither a valid
                color nor a valid style.
        """
        if any(not isinstance(value, str) for value in values):
            raise ValueError("All elements in the list must be strings")

        valid_colors = [c.name.lower() for c in Color]
        valid_styles = [s.name.lower() for s in Style]
        chosen_color = None
        chosen_styles = []

        for value in values:
            if value in valid_colors:
                chosen_color = Color[value.upper()]
                continue
            if value in valid_styles:
                chosen_styles.append(Style[value.upper()])
                continue
            raise ValueError(
                f"Invalid value: {value}. Valid values are colors: {valid_colors} and styles: {valid_styles}"
            )

        return cls(color=chosen_color, style=chosen_styles or None)

    @staticmethod
    def _get_color_enum(color: Color | str | None) -> Color | None:
        """Return ``color`` as a ``Color`` member.

        A ``Color`` is returned unchanged, a string is looked up by its
        upper-cased name, and anything else yields ``None``.

        Raises:
            ValueError: If the string does not name a ``Color`` member.
        """
        if isinstance(color, Color):
            return color
        elif isinstance(color, str):
            try:
                return Color[color.upper()]
            except KeyError:
                valid_colors = [c.name.lower() for c in Color]
                # The message already names the bad value; suppress the
                # internal KeyError so it does not clutter the traceback.
                raise ValueError(f"Invalid color name: {color}. Valid color names are {valid_colors}") from None
        return None

    @staticmethod
    def _get_style_enum(style: Style | str | None) -> Style | None:
        """Return ``style`` as a ``Style`` member.

        A ``Style`` is returned unchanged, a string is looked up by its
        upper-cased name, and anything else yields ``None``.

        Raises:
            ValueError: If the string does not name a ``Style`` member.
        """
        if isinstance(style, Style):
            return style
        elif isinstance(style, str):
            try:
                return Style[style.upper()]
            except KeyError:
                valid_styles = [f.name.lower() for f in Style]
                # The message already names the bad value; suppress the
                # internal KeyError so it does not clutter the traceback.
                raise ValueError(f"Invalid style name: {style}. Valid style names are {valid_styles}") from None
        return None

`color = None` `class-attribute` `instance-attribute` ¶

`style = None` `class-attribute` `instance-attribute` ¶

`__init__(color=None, style=None)` ¶

`from_dict(format_dict)` `classmethod` ¶

Create a Format instance from a dictionary.

Parameters:

Name	Type	Description	Default
`format_dict`	`dict`	A dictionary with keys 'color' and/or 'style'.	required

Source code in chispa/formatting/formats.py

@classmethod
def from_dict(cls, format_dict: dict[str, str | list[str]]) -> Format:
    """
    Create a Format instance from a dictionary.

    Args:
        format_dict (dict): A dictionary with keys 'color' and/or 'style'.
            'style' may be a single style name or a list of style names.

    Raises:
        ValueError: If the input is not a dictionary or has keys other than
            'color' and 'style'.
        TypeError: If the value for 'color' is a list.
    """
    if not isinstance(format_dict, dict):
        raise ValueError("Input must be a dictionary")

    allowed = {"color", "style"}
    unexpected = set(format_dict) - allowed
    if unexpected:
        raise ValueError(f"Invalid keys in format dictionary: {unexpected}. Valid keys are {allowed}")

    raw_color = format_dict.get("color")
    if isinstance(raw_color, list):
        raise TypeError("The value for key 'color' should be a string, not a list!")
    color = cls._get_color_enum(raw_color)  # type: ignore[arg-type]

    raw_style = format_dict.get("style")
    if isinstance(raw_style, str):
        # A single style name is wrapped so ``style`` is always a list.
        styles = [cls._get_style_enum(raw_style)]
    elif isinstance(raw_style, list):
        styles = [cls._get_style_enum(name) for name in raw_style]
    else:
        styles = None

    return cls(color=color, style=styles)  # type: ignore[arg-type]

`from_list(values)` `classmethod` ¶

Create a Format instance from a list of strings.

Parameters:

Name	Type	Description	Default
`values`	`list[str]`	A list of strings representing colors and styles.	required

Source code in chispa/formatting/formats.py

@classmethod
def from_list(cls, values: list[str]) -> Format:
    """
    Create a Format instance from a list of strings.

    Each value must equal the lower-cased name of a ``Color`` or ``Style``
    member (the comparison is case-sensitive). When several colors are given,
    the last one wins.

    Args:
        values (list[str]): A list of strings representing colors and styles.

    Raises:
        ValueError: If an element is not a string or is neither a valid color
            nor a valid style.
    """
    if any(not isinstance(value, str) for value in values):
        raise ValueError("All elements in the list must be strings")

    valid_colors = [c.name.lower() for c in Color]
    valid_styles = [s.name.lower() for s in Style]
    chosen_color = None
    chosen_styles = []

    for value in values:
        if value in valid_colors:
            chosen_color = Color[value.upper()]
            continue
        if value in valid_styles:
            chosen_styles.append(Style[value.upper()])
            continue
        raise ValueError(
            f"Invalid value: {value}. Valid values are colors: {valid_colors} and styles: {valid_styles}"
        )

    return cls(color=chosen_color, style=chosen_styles or None)

`FormattingConfig` ¶

Class to manage and parse formatting configurations.

Source code in chispa/formatting/formatting_config.py

class FormattingConfig:
    """
    Class to manage and parse formatting configurations.

    Holds one ``Format`` for each of: mismatched rows, matched rows,
    mismatched cells and matched cells.
    """

    VALID_KEYS: ClassVar = {"color", "style"}

    # NOTE(review): the default ``Format`` objects below are created once, when
    # the function is defined, and ``_parse_format`` returns ``Format`` inputs
    # unchanged, so configs built from defaults share the same instances.
    def __init__(
        self,
        mismatched_rows: Format | dict[str, str | list[str]] = Format(Color.RED),
        matched_rows: Format | dict[str, str | list[str]] = Format(Color.BLUE),
        mismatched_cells: Format | dict[str, str | list[str]] = Format(Color.RED, [Style.UNDERLINE]),
        matched_cells: Format | dict[str, str | list[str]] = Format(Color.BLUE),
    ):
        """
        Initializes the FormattingConfig with given or default formatting.

        Each of the arguments can be provided as a `Format` object or a dictionary with the following keys:

        - 'color': A string representing a color name, which should be one of the valid colors:
            ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray',
            'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue',
            'light_purple', 'light_cyan', 'white'].
        - 'style': A string or list of strings representing styles, which should be one of the valid styles:
            ['bold', 'underline', 'blink', 'invert', 'hide'].

        Args:
            mismatched_rows (Format | dict): Format or dictionary for mismatched rows.
            matched_rows (Format | dict): Format or dictionary for matched rows.
            mismatched_cells (Format | dict): Format or dictionary for mismatched cells.
            matched_cells (Format | dict): Format or dictionary for matched cells.

        Raises:
            ValueError: If the dictionary contains invalid keys or values.
        """
        parse = self._parse_format
        self.mismatched_rows: Format = parse(mismatched_rows)
        self.matched_rows: Format = parse(matched_rows)
        self.mismatched_cells: Format = parse(mismatched_cells)
        self.matched_cells: Format = parse(matched_cells)

    def _parse_format(self, spec: Format | dict[str, str | list[str]]) -> Format:
        """Return ``spec`` as a ``Format``.

        A ``Format`` is returned unchanged and a dictionary goes through
        ``Format.from_dict``; any other type raises ``ValueError``.
        """
        if isinstance(spec, Format):
            return spec
        if isinstance(spec, dict):
            return Format.from_dict(spec)
        raise ValueError("Invalid format type. Must be Format or dict.")

    @classmethod
    def _from_arbitrary_dataclass(cls, instance: Any) -> FormattingConfig:
        """
        Converts an instance of an arbitrary class with specified fields to a FormattingConfig instance.
        This method is purely for backwards compatibility and should be removed in a future release,
        together with the `DefaultFormats` class.

        The instance must expose ``mismatched_rows``, ``matched_rows``,
        ``mismatched_cells`` and ``matched_cells``; each is parsed with
        ``Format.from_list``. A ``DeprecationWarning`` is emitted here unless
        the instance is a ``DefaultFormats``.
        """
        if not isinstance(instance, DefaultFormats):
            warnings.warn(
                "Using an arbitrary dataclass is deprecated. Use `chispa.formatting.FormattingConfig` instead.",
                DeprecationWarning,
            )

        field_names = ("mismatched_rows", "matched_rows", "mismatched_cells", "matched_cells")
        parsed = {name: Format.from_list(getattr(instance, name)) for name in field_names}
        return cls(**parsed)

`VALID_KEYS = {'color', 'style'}` `class-attribute` `instance-attribute` ¶

`matched_cells = self._parse_format(matched_cells)` `instance-attribute` ¶

`matched_rows = self._parse_format(matched_rows)` `instance-attribute` ¶

`mismatched_cells = self._parse_format(mismatched_cells)` `instance-attribute` ¶

`mismatched_rows = self._parse_format(mismatched_rows)` `instance-attribute` ¶

`__init__(mismatched_rows=Format(Color.RED), matched_rows=Format(Color.BLUE), mismatched_cells=Format(Color.RED, [Style.UNDERLINE]), matched_cells=Format(Color.BLUE))` ¶

Initializes the FormattingConfig with given or default formatting.

Each of the arguments can be provided as a `Format` object or a dictionary with the following keys:

- 'color': A string representing a color name, which should be one of the valid colors: ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray', 'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue', 'light_purple', 'light_cyan', 'white'].
- 'style': A string or list of strings representing styles, which should be one of the valid styles: ['bold', 'underline', 'blink', 'invert', 'hide'].

Parameters:

Name	Type	Description	Default
`mismatched_rows`	`Format \| dict`	Format or dictionary for mismatched rows.	`Format(RED)`
`matched_rows`	`Format \| dict`	Format or dictionary for matched rows.	`Format(BLUE)`
`mismatched_cells`	`Format \| dict`	Format or dictionary for mismatched cells.	`Format(RED, [UNDERLINE])`
`matched_cells`	`Format \| dict`	Format or dictionary for matched cells.	`Format(BLUE)`

Raises:

Type	Description
`ValueError`	If the dictionary contains invalid keys or values.

Source code in chispa/formatting/formatting_config.py

def __init__(
    self,
    mismatched_rows: Format | dict[str, str | list[str]] = Format(Color.RED),
    matched_rows: Format | dict[str, str | list[str]] = Format(Color.BLUE),
    mismatched_cells: Format | dict[str, str | list[str]] = Format(Color.RED, [Style.UNDERLINE]),
    matched_cells: Format | dict[str, str | list[str]] = Format(Color.BLUE),
):
    """
    Initializes the FormattingConfig with given or default formatting.

    Each of the arguments can be provided as a `Format` object or a dictionary with the following keys:

    - 'color': A string representing a color name, which should be one of the valid colors:
        ['black', 'red', 'green', 'yellow', 'blue', 'purple', 'cyan', 'light_gray',
        'dark_gray', 'light_red', 'light_green', 'light_yellow', 'light_blue',
        'light_purple', 'light_cyan', 'white'].
    - 'style': A string or list of strings representing styles, which should be one of the valid styles:
        ['bold', 'underline', 'blink', 'invert', 'hide'].

    Args:
        mismatched_rows (Format | dict): Format or dictionary for mismatched rows.
        matched_rows (Format | dict): Format or dictionary for matched rows.
        mismatched_cells (Format | dict): Format or dictionary for mismatched cells.
        matched_cells (Format | dict): Format or dictionary for matched cells.

    Raises:
        ValueError: If the dictionary contains invalid keys or values.
    """
    # Every argument goes through the same parser, so each attribute ends up
    # holding a ``Format``.
    parse = self._parse_format
    self.mismatched_rows: Format = parse(mismatched_rows)
    self.matched_rows: Format = parse(matched_rows)
    self.mismatched_cells: Format = parse(mismatched_cells)
    self.matched_cells: Format = parse(matched_cells)

`SchemasNotEqualError` ¶

Bases: Exception

The schemas are not equal

Source code in chispa/schema_comparer.py

class SchemasNotEqualError(Exception):
    """The schemas are not equal

    NOTE(review): presumably raised by the schema comparison helpers; the
    raising code is not shown alongside this class.
    """

`Style` ¶

Bases: str, Enum

Enum for text styles. Each style is represented by its corresponding ANSI escape code.

Source code in chispa/formatting/formats.py

class Style(str, Enum):
    """
    Enum for text styles.
    Each style is represented by its corresponding ANSI escape code.

    Members are also ``str`` instances, so a member compares equal to its
    escape sequence.
    """

    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
    BLINK = "\x1b[5m"
    INVERT = "\x1b[7m"
    HIDE = "\x1b[8m"

`BLINK = '\x1b[5m'` `class-attribute` `instance-attribute` ¶

`BOLD = '\x1b[1m'` `class-attribute` `instance-attribute` ¶

`HIDE = '\x1b[8m'` `class-attribute` `instance-attribute` ¶

`INVERT = '\x1b[7m'` `class-attribute` `instance-attribute` ¶

`UNDERLINE = '\x1b[4m'` `class-attribute` `instance-attribute` ¶

`assert_approx_column_equality(df, col_name1, col_name2, precision)` ¶

Source code in chispa/column_comparer.py

def assert_approx_column_equality(df: DataFrame, col_name1: str, col_name2: str, precision: float) -> None:
    """Assert that two columns of ``df`` are approximately equal, row by row.

    A row matches when both values are ``None`` or when the absolute
    difference between them is strictly less than ``precision``. A row where
    exactly one value is ``None`` never matches.

    Args:
        df: DataFrame holding both columns.
        col_name1: Name of the first column.
        col_name2: Name of the second column.
        precision: Exclusive upper bound on the allowed absolute difference.

    Raises:
        ColumnsNotEqualError: If any row does not match. The message is a
            table of all rows, with matching rows wrapped by ``blue``.
    """
    collected = df.select(col_name1, col_name2).collect()
    table = PrettyTable([col_name1, col_name2])
    mismatch_found = False

    for row in collected:
        left, right = row[0], row[1]
        left_text, right_text = str(left), str(right)

        if left is None or right is None:
            # Two missing values match; a single missing value never does.
            matches = left is None and right is None
        else:
            # Below the threshold counts as approximately equal.
            matches = abs(left - right) < precision

        if matches:
            table.add_row([blue(left_text), blue(right_text)])
        else:
            mismatch_found = True
            table.add_row([left_text, right_text])

    if mismatch_found:
        raise ColumnsNotEqualError("\n" + table.get_string())

`assert_approx_df_equality(df1, df2, precision, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, ignore_columns=None, formats=None)` ¶

Source code in chispa/dataframe_comparer.py

def assert_approx_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    precision: float,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    """Assert that two DataFrames have equal schemas and approximately equal rows.

    Both DataFrames go through the same chain of transforms (the caller's
    ``transforms`` first, then the optional column-sorting, column-dropping
    and row-sorting steps) before their schemas and collected rows are
    compared.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        precision: Passed to the approximate row comparison. When it is 0 the
            comparison falls back to the NaN-aware or basic row comparison.
        ignore_nullable: Forwarded to ``assert_schema_equality``.
        transforms: Callables applied in order to both DataFrames, each taking
            and returning a DataFrame. The caller's list is not modified.
        allow_nan_equality: Forwarded to the row comparison; when ``precision``
            is 0 it selects the NaN-aware comparison.
        ignore_column_order: Select the columns in sorted order before comparing.
        ignore_row_order: Sort both DataFrames with
            ``_sort_df_for_row_order_comparison`` before comparing.
        ignore_columns: Column names dropped from both DataFrames.
        formats: Formatting used when reporting differences. A falsy value
            gives the default ``FormattingConfig()``; a non-``FormattingConfig``
            object is converted via ``FormattingConfig._from_arbitrary_dataclass``.

    Mismatches are reported by the schema and row comparison helpers this
    function delegates to.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    # Work on a private copy: the steps below append to the list, and the
    # caller's list must not grow as a side effect of this call.
    transforms = list(transforms) if transforms is not None else []
    if ignore_column_order:
        transforms.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        transforms.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        transforms.append(_sort_df_for_row_order_comparison)

    df1 = reduce(lambda acc, fn: fn(acc), transforms, df1)
    df2 = reduce(lambda acc, fn: fn(acc), transforms, df2)

    assert_schema_equality(df1.schema, df2.schema, ignore_nullable)

    if precision != 0:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_approx_equal,
            {"precision": precision, "allow_nan_equality": allow_nan_equality},
            formats=formats,
        )
    elif allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(), df2.collect(), are_rows_equal_enhanced, {"allow_nan_equality": True}, formats=formats
        )
    else:
        assert_basic_rows_equality(df1.collect(), df2.collect(), formats=formats)

`assert_basic_rows_equality(rows1, rows2, underline_cells=False, formats=None)` ¶

Source code in chispa/rows_comparer.py

def assert_basic_rows_equality(
    rows1: list[Row], rows2: list[Row], underline_cells: bool = False, formats: FormattingConfig | None = None
) -> None:
    """Assert that two lists of rows are equal, reporting differences per cell.

    Rows are paired positionally. A row with no counterpart is rendered with
    ``formats.mismatched_rows``; otherwise each field is rendered as
    ``name=value`` with ``formats.mismatched_cells`` or
    ``formats.matched_cells``.

    Args:
        rows1: Rows shown in the ``df1`` column.
        rows2: Rows shown in the ``df2`` column.
        underline_cells: Accepted but not used by this function body.
        formats: Formatting for the report. A falsy value gives the default
            ``FormattingConfig()``; a non-``FormattingConfig`` object is
            converted via ``FormattingConfig._from_arbitrary_dataclass``.

    Raises:
        chispa.DataFramesNotEqualError: If any row or cell differs. The
            message is the rendered comparison table.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    if rows1 == rows2:
        return

    table = PrettyTable(["df1", "df2"])
    mismatch_found = False

    for left, right in zip_longest(rows1, rows2):
        # zip_longest pads the shorter list with None, marking a missing row.
        if left is None and right is not None:
            table.add_row([None, format_string(str(right), formats.mismatched_rows)])
            mismatch_found = True
            continue
        if left is not None and right is None:
            table.add_row([format_string(str(left), formats.mismatched_rows), None])
            mismatch_found = True
            continue

        left_cells = []
        right_cells = []
        for left_name, right_name in zip_longest(left.__fields__, right.__fields__):
            left_value = left[left_name]
            right_value = right[right_name]
            if left_value != right_value:
                mismatch_found = True
                cell_format = formats.mismatched_cells
            else:
                cell_format = formats.matched_cells
            left_cells.append(format_string(f"{left_name}={left_value}", cell_format))
            right_cells.append(format_string(f"{right_name}={right_value}", cell_format))

        table.add_row([", ".join(left_cells), ", ".join(right_cells)])

    if mismatch_found:
        raise chispa.DataFramesNotEqualError("\n" + table.get_string())

`assert_column_equality(df, col_name1, col_name2)` ¶

Source code in chispa/column_comparer.py

def assert_column_equality(df: DataFrame, col_name1: str, col_name2: str) -> None:
    """Assert that two columns of ``df`` hold equal values in every row.

    Args:
        df: DataFrame holding both columns.
        col_name1: Name of the first column.
        col_name2: Name of the second column.

    Raises:
        ColumnsNotEqualError: If the collected columns differ. The message is
            a table of all rows, with equal rows wrapped by ``blue``.
    """
    collected = df.select(col_name1, col_name2).collect()
    left_values = [row[0] for row in collected]
    right_values = [row[1] for row in collected]
    if left_values == right_values:
        return

    table = PrettyTable([col_name1, col_name2])
    for left, right in zip(left_values, right_values):
        if left == right:
            table.add_row([blue(str(left)), blue(str(right))])
        else:
            table.add_row([str(left), str(right)])
    raise ColumnsNotEqualError("\n" + table.get_string())

`assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None, formats=None)` ¶

Source code in chispa/dataframe_comparer.py

def assert_df_equality(
    df1: DataFrame,
    df2: DataFrame,
    ignore_nullable: bool = False,
    transforms: list[Callable] | None = None,  # type: ignore[type-arg]
    allow_nan_equality: bool = False,
    ignore_column_order: bool = False,
    ignore_row_order: bool = False,
    underline_cells: bool = False,
    ignore_metadata: bool = False,
    ignore_columns: list[str] | None = None,
    formats: FormattingConfig | None = None,
) -> None:
    """Assert that two DataFrames have equal schemas and equal rows.

    Both DataFrames are first passed through the same pipeline of
    transforms (the caller's ``transforms`` followed by any transforms
    implied by the ``ignore_*`` options), then their schemas are compared,
    and finally their collected rows are compared.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        ignore_nullable: Forwarded to ``assert_schema_equality``.
        transforms: Optional callables, each taking and returning a
            DataFrame, applied in order to both inputs before comparison.
            The sequence passed in is never modified.
        allow_nan_equality: When true, rows are compared through
            ``assert_generic_rows_equality`` using ``are_rows_equal_enhanced``
            with ``{"allow_nan_equality": True}``; otherwise
            ``assert_basic_rows_equality`` is used.
        ignore_column_order: When true, both DataFrames are re-selected with
            their columns in sorted name order.
        ignore_row_order: When true, both DataFrames are passed through
            ``_sort_df_for_row_order_comparison``.
        underline_cells: Forwarded to the row comparison.
        ignore_metadata: Forwarded to ``assert_schema_equality``.
        ignore_columns: Column names dropped from both DataFrames before
            comparison.
        formats: Formatting configuration forwarded to the row comparison.
            A falsy value selects ``FormattingConfig()``; a value that is not
            a ``FormattingConfig`` is converted with
            ``FormattingConfig._from_arbitrary_dataclass``.

    Raises:
        Propagates whatever the schema and row assertion helpers raise when
        the DataFrames differ.
    """
    if not formats:
        formats = FormattingConfig()
    elif not isinstance(formats, FormattingConfig):
        formats = FormattingConfig._from_arbitrary_dataclass(formats)

    # Work on a private copy: the option-driven transforms appended below must
    # not leak into the caller's list (which may be reused across assertions).
    pipeline = [] if transforms is None else list(transforms)
    if ignore_column_order:
        pipeline.append(lambda df: df.select(sorted(df.columns)))
    if ignore_columns:
        pipeline.append(lambda df: df.drop(*ignore_columns))
    if ignore_row_order:
        pipeline.append(_sort_df_for_row_order_comparison)

    # Apply the identical pipeline, in order, to each side.
    df1 = reduce(lambda acc, fn: fn(acc), pipeline, df1)
    df2 = reduce(lambda acc, fn: fn(acc), pipeline, df2)

    # Schemas are checked on the transformed frames, before any row is collected.
    assert_schema_equality(df1.schema, df2.schema, ignore_nullable, ignore_metadata)

    if allow_nan_equality:
        assert_generic_rows_equality(
            df1.collect(),
            df2.collect(),
            are_rows_equal_enhanced,
            {"allow_nan_equality": True},
            underline_cells=underline_cells,
            formats=formats,
        )
    else:
        assert_basic_rows_equality(
            df1.collect(),
            df2.collect(),
            underline_cells=underline_cells,
            formats=formats,
        )

Index

Chispa ¶

formats = FormattingConfig() instance-attribute ¶

__init__(formats=None) ¶

assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None) ¶

Color ¶

BLACK = '\x1b[30m' class-attribute instance-attribute ¶

BLUE = '\x1b[34m' class-attribute instance-attribute ¶

CYAN = '\x1b[36m' class-attribute instance-attribute ¶

DARK_GRAY = '\x1b[90m' class-attribute instance-attribute ¶

GREEN = '\x1b[32m' class-attribute instance-attribute ¶

LIGHT_BLUE = '\x1b[94m' class-attribute instance-attribute ¶

LIGHT_CYAN = '\x1b[96m' class-attribute instance-attribute ¶

LIGHT_GRAY = '\x1b[37m' class-attribute instance-attribute ¶

LIGHT_GREEN = '\x1b[92m' class-attribute instance-attribute ¶

LIGHT_PURPLE = '\x1b[95m' class-attribute instance-attribute ¶

LIGHT_RED = '\x1b[91m' class-attribute instance-attribute ¶

LIGHT_YELLOW = '\x1b[93m' class-attribute instance-attribute ¶

PURPLE = '\x1b[35m' class-attribute instance-attribute ¶

RED = '\x1b[31m' class-attribute instance-attribute ¶

WHITE = '\x1b[97m' class-attribute instance-attribute ¶

YELLOW = '\x1b[33m' class-attribute instance-attribute ¶

ColumnsNotEqualError ¶

DataFramesNotEqualError ¶

DefaultFormats dataclass ¶

matched_cells = field(default_factory=lambda: ['blue']) class-attribute instance-attribute ¶

matched_rows = field(default_factory=lambda: ['blue']) class-attribute instance-attribute ¶

mismatched_cells = field(default_factory=lambda: ['red', 'underline']) class-attribute instance-attribute ¶

mismatched_rows = field(default_factory=lambda: ['red']) class-attribute instance-attribute ¶

__init__(mismatched_rows=lambda: ['red'](), matched_rows=lambda: ['blue'](), mismatched_cells=lambda: ['red', 'underline'](), matched_cells=lambda: ['blue']()) ¶

__post_init__() ¶

Format dataclass ¶

color = None class-attribute instance-attribute ¶

style = None class-attribute instance-attribute ¶

__init__(color=None, style=None) ¶

from_dict(format_dict) classmethod ¶

from_list(values) classmethod ¶

FormattingConfig ¶

VALID_KEYS = {'color', 'style'} class-attribute instance-attribute ¶

matched_cells = self._parse_format(matched_cells) instance-attribute ¶

matched_rows = self._parse_format(matched_rows) instance-attribute ¶

mismatched_cells = self._parse_format(mismatched_cells) instance-attribute ¶

mismatched_rows = self._parse_format(mismatched_rows) instance-attribute ¶

__init__(mismatched_rows=Format(Color.RED), matched_rows=Format(Color.BLUE), mismatched_cells=Format(Color.RED, [Style.UNDERLINE]), matched_cells=Format(Color.BLUE)) ¶

SchemasNotEqualError ¶

Style ¶

BLINK = '\x1b[5m' class-attribute instance-attribute ¶

BOLD = '\x1b[1m' class-attribute instance-attribute ¶

HIDE = '\x1b[8m' class-attribute instance-attribute ¶

INVERT = '\x1b[7m' class-attribute instance-attribute ¶

UNDERLINE = '\x1b[4m' class-attribute instance-attribute ¶

assert_approx_column_equality(df, col_name1, col_name2, precision) ¶

assert_approx_df_equality(df1, df2, precision, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, ignore_columns=None, formats=None) ¶

assert_basic_rows_equality(rows1, rows2, underline_cells=False, formats=None) ¶

assert_column_equality(df, col_name1, col_name2) ¶

assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None, formats=None) ¶

`Chispa` ¶

`formats = FormattingConfig()` `instance-attribute` ¶

`__init__(formats=None)` ¶

`assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None)` ¶

`Color` ¶

`BLACK = '\x1b[30m'` `class-attribute` `instance-attribute` ¶

`BLUE = '\x1b[34m'` `class-attribute` `instance-attribute` ¶

`CYAN = '\x1b[36m'` `class-attribute` `instance-attribute` ¶

`DARK_GRAY = '\x1b[90m'` `class-attribute` `instance-attribute` ¶

`GREEN = '\x1b[32m'` `class-attribute` `instance-attribute` ¶

`LIGHT_BLUE = '\x1b[94m'` `class-attribute` `instance-attribute` ¶

`LIGHT_CYAN = '\x1b[96m'` `class-attribute` `instance-attribute` ¶

`LIGHT_GRAY = '\x1b[37m'` `class-attribute` `instance-attribute` ¶

`LIGHT_GREEN = '\x1b[92m'` `class-attribute` `instance-attribute` ¶

`LIGHT_PURPLE = '\x1b[95m'` `class-attribute` `instance-attribute` ¶

`LIGHT_RED = '\x1b[91m'` `class-attribute` `instance-attribute` ¶

`LIGHT_YELLOW = '\x1b[93m'` `class-attribute` `instance-attribute` ¶

`PURPLE = '\x1b[35m'` `class-attribute` `instance-attribute` ¶

`RED = '\x1b[31m'` `class-attribute` `instance-attribute` ¶

`WHITE = '\x1b[97m'` `class-attribute` `instance-attribute` ¶

`YELLOW = '\x1b[33m'` `class-attribute` `instance-attribute` ¶

`ColumnsNotEqualError` ¶

`DataFramesNotEqualError` ¶

`DefaultFormats` `dataclass` ¶

`matched_cells = field(default_factory=lambda: ['blue'])` `class-attribute` `instance-attribute` ¶

`matched_rows = field(default_factory=lambda: ['blue'])` `class-attribute` `instance-attribute` ¶

`mismatched_cells = field(default_factory=lambda: ['red', 'underline'])` `class-attribute` `instance-attribute` ¶

`mismatched_rows = field(default_factory=lambda: ['red'])` `class-attribute` `instance-attribute` ¶

`__init__(mismatched_rows=lambda: ['red'](), matched_rows=lambda: ['blue'](), mismatched_cells=lambda: ['red', 'underline'](), matched_cells=lambda: ['blue']())` ¶

`__post_init__()` ¶

`Format` `dataclass` ¶

`color = None` `class-attribute` `instance-attribute` ¶

`style = None` `class-attribute` `instance-attribute` ¶

`__init__(color=None, style=None)` ¶

`from_dict(format_dict)` `classmethod` ¶

`from_list(values)` `classmethod` ¶

`FormattingConfig` ¶

`VALID_KEYS = {'color', 'style'}` `class-attribute` `instance-attribute` ¶

`matched_cells = self._parse_format(matched_cells)` `instance-attribute` ¶

`matched_rows = self._parse_format(matched_rows)` `instance-attribute` ¶

`mismatched_cells = self._parse_format(mismatched_cells)` `instance-attribute` ¶

`mismatched_rows = self._parse_format(mismatched_rows)` `instance-attribute` ¶

`__init__(mismatched_rows=Format(Color.RED), matched_rows=Format(Color.BLUE), mismatched_cells=Format(Color.RED, [Style.UNDERLINE]), matched_cells=Format(Color.BLUE))` ¶

`SchemasNotEqualError` ¶

`Style` ¶

`BLINK = '\x1b[5m'` `class-attribute` `instance-attribute` ¶

`BOLD = '\x1b[1m'` `class-attribute` `instance-attribute` ¶

`HIDE = '\x1b[8m'` `class-attribute` `instance-attribute` ¶

`INVERT = '\x1b[7m'` `class-attribute` `instance-attribute` ¶

`UNDERLINE = '\x1b[4m'` `class-attribute` `instance-attribute` ¶

`assert_approx_column_equality(df, col_name1, col_name2, precision)` ¶

`assert_approx_df_equality(df1, df2, precision, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, ignore_columns=None, formats=None)` ¶

`assert_basic_rows_equality(rows1, rows2, underline_cells=False, formats=None)` ¶

`assert_column_equality(df, col_name1, col_name2)` ¶

`assert_df_equality(df1, df2, ignore_nullable=False, transforms=None, allow_nan_equality=False, ignore_column_order=False, ignore_row_order=False, underline_cells=False, ignore_metadata=False, ignore_columns=None, formats=None)` ¶