Skip to content

Instantly share code, notes, and snippets.

@krassowski
Last active October 19, 2022 14:09
  • Select an option

Select an option

Revisions

  1. krassowski revised this gist Jan 22, 2021. 1 changed file with 10 additions and 1 deletion.
    11 changes: 10 additions & 1 deletion pandas_explorer.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    # Copyright (c) 2021 Michał Krassowski.
    # Distributed under the terms of the Modified BSD License.
    from sidecar import Sidecar
    from ipywidgets import widgets
    from IPython.display import display, update_display, HTML
    @@ -58,7 +60,14 @@ def show_frame(

    if row_filter:
    df = df[
    df.apply(lambda row: row.astype(str).apply(contains, substring=row_filter, fuzzy=row_filter_fuzzy).any(), axis=1)
    df.apply(
    lambda row: (
    row
    .astype(str)
    .apply(contains, substring=row_filter, fuzzy=row_filter_fuzzy)
    .any()
    ),
    axis=1)
    ]
    if len(df) < max_rows and not filtered_from:
    filtered_from = max_rows
  2. krassowski created this gist Jan 22, 2021.
    6 changes: 6 additions & 0 deletions example.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,6 @@
    from pandas_explorer import pandas_explorer
    from pandas import read_csv

    # Iris measurements, read from the CSV published in the seaborn-data repository.
    iris_csv_url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
    iris = read_csv(iris_csv_url)

    # Hand the loaded frame to the explorer, using 'Iris' as its title.
    pandas_explorer(iris, title='Iris')
    145 changes: 145 additions & 0 deletions pandas_explorer.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,145 @@
    from sidecar import Sidecar
    from ipywidgets import widgets
    from IPython.display import display, update_display, HTML
    from types import SimpleNamespace
    from pandas import DataFrame, option_context
    import string
    import re


    def pandas_explorer(data: DataFrame, title='Explorer', default_rows=30, drop_index=False):
        """Open an interactive DataFrame explorer in a sidecar panel.

        The panel contains a text filter over rows, sliders limiting how many
        rows and columns are rendered, a sort-column selector and a checkbox
        toggling the index. Occurrences of the filter text are wrapped in
        ``<b>`` tags in the rendered table.

        Filtering is case-insensitive. A row is kept when at least one of its
        cells (converted to ``str``) contains the query. In "fuzzy" mode the
        query is first split on punctuation and spaces, and a cell matches when
        it contains every resulting token; a query made only of separators is
        matched literally.

        Args:
            data: Frame to explore. ``reset_index`` is applied first, so the
                frame passed in is not modified.
            title: Title given to the ``Sidecar``.
            default_rows: Initial value of the "Max rows" slider.
            drop_index: Forwarded to ``reset_index(drop=...)``; when false the
                original index is kept as regular column(s).
        """
        data = data.reset_index(drop=drop_index)

        # Fuzzy queries are tokenised on any punctuation character or a space.
        token_separator = re.compile(
            '|'.join(re.escape(char) for char in string.punctuation + ' ')
        )

        # The attribute names double as the keyword arguments of `show_frame`
        # (see `vars(table_widgets)` below), so they must stay in sync with it.
        table_widgets = SimpleNamespace(
            row_filter=widgets.Text(description='Rows filter'),
            row_filter_fuzzy=widgets.Checkbox(description='Fuzzy', value=True),
            show_index=widgets.Checkbox(description='Index', value=False),
            max_rows=widgets.IntSlider(value=default_rows, description='Max rows', min=0, max=len(data)),
            max_columns=widgets.IntSlider(value=10, description='Max columns', min=0, max=len(data.columns)),
            sort_column=widgets.Dropdown(options=[None, *data.columns], description='Sort'),
            sort_ascending=widgets.Checkbox(description='Ascending')
        )
        # "Max rows" value in effect when a filter first returned fewer rows
        # than requested; used to restore the slider once more rows match again.
        filtered_from = None

        def split(text: str):
            """Return the non-empty tokens of `text`."""
            return [token for token in token_separator.split(text) if token]

        def search_terms(query: str, fuzzy: bool):
            """Return the list of strings a cell has to contain to match `query`."""
            if fuzzy:
                tokens = split(query)
                if tokens:
                    return tokens
            # Exact mode, or a fuzzy query consisting only of separators.
            return [query]

        def contains(value: str, terms):
            """Tell whether `value` contains every term, ignoring case."""
            # todo case sensitivity option?
            haystack = value.lower()
            return all(term.lower() in haystack for term in terms)

        def highlight(value, terms, marker='b', options=''):
            """Wrap each case-insensitive occurrence of any term in a `marker` tag.

            All terms are substituted in a single pass so that a later term can
            never match inside the markup inserted for an earlier one. The
            matched text keeps its original casing.
            """
            # NOTE: cell text is inserted into the HTML as-is (not escaped).
            needles = sorted(
                (re.escape(term) for term in terms if term),
                key=len, reverse=True  # prefer the longest alternative
            )
            if not needles:
                return value
            opening = f'<{marker} {options}>' if options else f'<{marker}>'
            return re.sub(
                '|'.join(needles),
                lambda match: f'{opening}{match.group(0)}</{marker}>',
                str(value),
                flags=re.IGNORECASE
            )

        def hide_axis(styler, axis, subset=None):
            """Hide `subset` of `axis` ('index' or 'columns'); all of it if None.

            Uses `Styler.hide` when the installed pandas provides it and falls
            back to the older `hide_index`/`hide_columns` methods otherwise.
            """
            if hasattr(styler, 'hide'):
                return styler.hide(subset, axis=axis)
            if axis == 'columns':
                return styler.hide_columns(subset)
            return styler.hide_index()

        def show_frame(
            row_filter: str, row_filter_fuzzy: bool,
            show_index: bool, max_rows: int, max_columns: int,
            sort_column: str, sort_ascending: bool
        ):
            """Render the frame for the current widget values.

            Also adjusts the "Max rows" slider so that its maximum follows the
            number of rows that pass the filter.
            """
            nonlocal filtered_from
            df = data.copy()
            # Computed once per refresh rather than once per cell.
            terms = search_terms(row_filter, row_filter_fuzzy) if row_filter else []

            if terms:
                # Column-wise conversion keeps each column's own text form
                # (a row-wise conversion would upcast mixed numeric rows) and
                # still produces a boolean mask when the frame has no rows.
                matching_cells = df.astype(str).apply(
                    lambda column: column.map(lambda cell: contains(cell, terms))
                )
                df = df[matching_cells.any(axis=1)]
                if len(df) < max_rows and not filtered_from:
                    filtered_from = max_rows

            def highlight_matches(value):
                if not terms:
                    return value
                return highlight(value, terms)

            columns_to_hide = list(df.columns)[max_columns:]

            notes = []

            if max_rows < len(df):
                notes.append(f'{len(df) - max_rows} rows hidden')
            if columns_to_hide:
                notes.append(f'{len(columns_to_hide)} columns hidden')

            if sort_column is not None:
                df = df.sort_values(sort_column, ascending=sort_ascending)

            try:
                styled = (
                    df.head(max_rows).style
                    .format(highlight_matches)
                    .set_caption(' '.join(notes))
                )
                if columns_to_hide:
                    styled = hide_axis(styled, 'columns', columns_to_hide)
                if not show_index:
                    styled = hide_axis(styled, 'index')
                displayed = display(styled)
            except ValueError as error:
                if 'style is not supported for non-unique indices' not in str(error):
                    raise
                # Styling is unavailable: show the plain frame instead, limited
                # positionally to the same leading columns the styled view keeps.
                with option_context('display.max_rows', max_rows):
                    displayed = display(df.head(max_rows).iloc[:, :max_columns])

            if filtered_from is not None and len(df) != 0:
                current_value = min(filtered_from, len(df))
                if len(df) > filtered_from:
                    # Enough rows again: the remembered value is restored below
                    # and no longer needed afterwards.
                    filtered_from = None
                # Raise the maximum first so that the value is not clamped.
                table_widgets.max_rows.max = current_value
                table_widgets.max_rows.value = current_value

            table_widgets.max_rows.max = len(df)

            return displayed

        sc = Sidecar(title=title)

        # https://github.com/jupyter-widgets/jupyterlab-sidecar/issues/25
        sc_out = widgets.Output(layout={'overflow': 'scroll', 'max-width': '100%'})
        with sc:
            display(sc_out)

        with sc_out:
            out = widgets.interactive_output(
                show_frame,
                vars(table_widgets)
            )
            ui = widgets.VBox([
                widgets.HBox([table_widgets.row_filter, table_widgets.row_filter_fuzzy]),
                widgets.HBox([table_widgets.max_rows, table_widgets.max_columns]),
                widgets.HBox([
                    table_widgets.sort_column,
                    table_widgets.sort_ascending,
                    table_widgets.show_index,
                ]),
            ])
            out_box = widgets.Box(
                [out],
                # sadly hard-coded to allow for scroll as sidecar has layout issues
                layout=widgets.Layout(max_width='600px', max_height='1300px')
            )

            display(
                widgets.VBox([
                    ui,
                    out_box
                ])
            )