2

I have a dataframe

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: 16 rows, four categorical columns, an integer id, and four
# float timing columns. Key order here defines the DataFrame column order.
records = {
    'Gen': (['M'] * 4 + ['F'] * 4) * 2,
    'Site': [
        'FRX', 'FX', 'FRX', 'FRX', 'FRX', 'FX', 'FRX', 'FX',
        'FX', 'FX', 'FX', 'FRX', 'FRX', 'FRX', 'FRX', 'FRX',
    ],
    'Type': ['L'] * 8 + ['R'] * 8,
    'UID': [1001, 1002, 1003, 1004] * 4,
    'color': [
        'R', 'R', 'G', 'G', 'B', 'G', 'B', 'B',
        'R', 'G', 'R', 'G', 'B', 'B', 'R', 'G',
    ],
    'Time2': [
        150.78, 162.34, 188.53, 197.69, 208.07, 217.76, 229.48, 139.51,
        146.87, 182.54, 189.57, 199.97, 229.28, 244.73, 269.91, 249.19,
    ],
    'Time3': [
        250.78, 262.34, 288.53, 297.69, 308.07, 317.7, 329.81, 339.15,
        346.87, 382.54, 369.59, 399.97, 329.28, 347.73, 369.91, 349.12,
    ],
    'Time4': [
        240.18, 232.14, 258.53, 276.69, 338.07, 307.74, 359.16, 339.25,
        365.87, 392.48, 399.97, 410.75, 429.08, 448.39, 465.15, 469.33,
    ],
    'Time5': [
        270.84, 282.14, 298.53, 306.69, 318.73, 327.47, 369.63, 389.59,
        398.75, 432.18, 449.78, 473.55, 494.85, 509.39, 515.52, 539.23,
    ],
}
# Pass the key order explicitly so the column order is pinned.
df = pd.DataFrame(records, columns=list(records))
# Print a summary of the columns and their dtypes.
df.info()

enter image description here

I want to write a function that takes in a dataframe and does the following:

  1. countplots for columns with the object dtype (4 countplots, for the Gen, Site, Type and color columns)

  2. boxplots for columns with the float dtype (4 boxplots, for the Time2, ..., Time5 columns)

  3. export the graphs to one or more PDF files, with two graphs per page

My attempt :

# I am open to other approaches
def data_explorer(data):
    """Plot the object and float columns of *data* into one multi-page PDF.

    Object-dtype columns are drawn as seaborn countplots and float-dtype
    columns as seaborn boxplots. Columns of any other dtype are skipped.
    The plots are written to ``data_exploration.pdf`` in column order,
    two plots per page.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataframe whose columns are plotted.

    Returns
    -------
    None
        The only output is the PDF file (nothing is written when no column
        qualifies for plotting).
    """
    plots_per_page = 2

    # Pair every plottable column with the seaborn function that draws it.
    jobs = []
    for col in data.columns:
        # 1. countplots for columns with the object dtype
        if data[col].dtype == 'object':
            jobs.append((col, sns.countplot))
        # 2. boxplots for columns with the float dtype
        elif data[col].dtype == 'float':
            jobs.append((col, sns.boxplot))
        else:
            print("skip integer dtype")

    if not jobs:
        return

    # 3. save the graphs as one pdf - two graphs per page.
    # A single plt.savefig() inside the column loop would overwrite the file
    # on every iteration; PdfPages appends one page per figure instead.
    with PdfPages('data_exploration.pdf') as pdf:
        for start in range(0, len(jobs), plots_per_page):
            page = jobs[start:start + plots_per_page]
            # squeeze=False keeps `axes` a 2-D array for any plots_per_page.
            fig, axes = plt.subplots(
                plots_per_page, 1, figsize=(8.27, 11.69), squeeze=False
            )
            axes = axes.ravel()
            for (col, plot), ax in zip(page, axes):
                # Draw on an explicit axes so plots never overlap.
                plot(data=data, x=col, ax=ax)
            # Hide the unused axes on a partially filled last page.
            for ax in axes[len(page):]:
                ax.set_visible(False)
            fig.tight_layout()
            pdf.savefig(fig)
            # Release the figure so open figures do not accumulate.
            plt.close(fig)


Please note: The final output should have a total of 8 graphs.

0

1 Answer 1

2
  • The main issue is that the plots should be saved together as a group in a figure, rather than saving each column separately.
  • Adjust figsize=(15, 30) as needed.

Option 1: 4 figures with 2 plots per page

  1. Select all the columns of the dataframe by dtype with .select_dtypes
  2. Separate the columns into chunks based on the number of plots per page using a list comprehension. Adjust the chunk size n as needed.
  3. Iterate through each group of columns
  4. Create a figure with a number of rows equal to the number of plots per page
  5. Add the plots to the figure and save the figure
def data_explorer(df):
    """Save countplots and boxplots of *df*, ``n`` plots per PDF file.

    Object-dtype columns are drawn as seaborn countplots and float-dtype
    columns as seaborn boxplots. Columns are split into groups of ``n``
    (object columns first, then float columns); each group becomes one
    figure saved as ``data_exploration_<col>_<col>.pdf``.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe whose columns are plotted.

    Returns
    -------
    None
        The only output is the set of PDF files.
    """
    # plots per page; adjust as needed
    n = 2

    # Each dtype selection is paired with the seaborn function that draws it.
    plot_specs = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for subset, plot in plot_specs:
        columns = subset.columns
        # split columns into groups of n
        for start in range(0, len(columns), n):
            cols = columns[start:start + n]
            # squeeze=False keeps `axes` an array even when n == 1, where
            # plt.subplots would otherwise return a single Axes object.
            fig, axes = plt.subplots(n, 1, figsize=(15, 30), squeeze=False)
            axes = axes.ravel()
            for col, ax in zip(cols, axes):
                plot(data=subset[[col]], x=col, ax=ax)
            # Hide leftover axes when the last group has fewer than n columns.
            for ax in axes[len(cols):]:
                ax.set_visible(False)
            # map(str, ...) lets non-string column labels form a file name.
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # Release the figure so open figures do not accumulate.
            plt.close(fig)


data_explorer(df)

Option 2: 2 figures with 4 plots per page

  1. Select all the columns of the dataframe by dtype with .select_dtypes
  2. Create a figure to match the number of plots per page, equal to the total number of columns per group.
  3. Add each group of columns to a plot figure, and save the figure.
def data_explorer(df):
    """Save one figure of countplots and one figure of boxplots for *df*.

    Object-dtype columns are drawn as seaborn countplots in one figure and
    float-dtype columns as seaborn boxplots in another. Each figure is a
    two-column grid with as many rows as its columns require, saved as
    ``data_exploration_<col>_<col>_....pdf``. A dtype group with no columns
    produces no file.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe whose columns are plotted.

    Returns
    -------
    None
        The only output is the PDF files.
    """
    ncols = 2

    # Each dtype selection is paired with the seaborn function that draws it.
    plot_specs = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for subset, plot in plot_specs:
        columns = subset.columns
        if columns.empty:
            continue

        # Ceiling division: a fixed 2x2 grid would silently drop any column
        # beyond the fourth, because zip() stops at the shorter input.
        nrows = -(-len(columns) // ncols)
        # 15 inches per row gives the (20, 30) size for four columns.
        fig, axes = plt.subplots(
            nrows, ncols, figsize=(20, 15 * nrows), squeeze=False
        )
        axes = axes.ravel()
        for col, ax in zip(columns, axes):
            plot(data=subset[[col]], x=col, ax=ax)
        # Hide grid cells that have no column to show.
        for ax in axes[len(columns):]:
            ax.set_visible(False)
        # map(str, ...) lets non-string column labels form a file name.
        fig.savefig(f'data_exploration_{"_".join(map(str, columns))}.pdf')
        # Release the figure so open figures do not accumulate.
        plt.close(fig)


data_explorer(df)

enter image description here

enter image description here

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.