I've had this recently and have written a function to extract the boxplot values from the boxplot as a pandas dataframe.
The function is:
def get_box_plot_data(labels, bp):
rows_list = []
for i in range(len(labels)):
dict1 = {}
dict1['label'] = labels[i]
dict1['lower_whisker'] = bp['whiskers'][i*2].get_ydata()[1]
dict1['lower_quartile'] = bp['boxes'][i].get_ydata()[1]
dict1['median'] = bp['medians'][i].get_ydata()[1]
dict1['upper_quartile'] = bp['boxes'][i].get_ydata()[2]
dict1['upper_whisker'] = bp['whiskers'][(i*2)+1].get_ydata()[1]
rows_list.append(dict1)
return pd.DataFrame(rows_list)
And is called by passing an array of labels (the ones that you would pass to the boxplot plotting function) and the data returned by the boxplot function itself.
For example:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def get_box_plot_data(labels, bp):
rows_list = []
for i in range(len(labels)):
dict1 = {}
dict1['label'] = labels[i]
dict1['lower_whisker'] = bp['whiskers'][i*2].get_ydata()[1]
dict1['lower_quartile'] = bp['boxes'][i].get_ydata()[1]
dict1['median'] = bp['medians'][i].get_ydata()[1]
dict1['upper_quartile'] = bp['boxes'][i].get_ydata()[2]
dict1['upper_whisker'] = bp['whiskers'][(i*2)+1].get_ydata()[1]
rows_list.append(dict1)
return pd.DataFrame(rows_list)
data1 = np.random.normal(loc = 0, scale = 1, size = 1000)
data2 = np.random.normal(loc = 5, scale = 1, size = 1000)
data3 = np.random.normal(loc = 10, scale = 1, size = 1000)
labels = ['data1', 'data2', 'data3']
bp = plt.boxplot([data1, data2, data3], labels=labels)
print(get_box_plot_data(labels, bp))
plt.show()
Outputs the following from get_box_plot_data
:
label lower_whisker lower_quartile median upper_quartile upper_whisker
0 data1 -2.491652 -0.587869 0.047543 0.696750 2.559301
1 data2 2.351567 4.310068 4.984103 5.665910 7.489808
2 data3 7.227794 9.278931 9.947674 10.661581 12.733275
And produces the following plot: