# Untitled

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


# Example data: Var1 ascends 1..10, Var2 descends 10..1, and the first five
# rows belong to cohort "A" while the last five belong to cohort "B".
_ascending = list(range(1, 11))
_descending = list(range(10, 0, -1))
_cohort_labels = ["A"] * 5 + ["B"] * 5

vpca_df = pd.DataFrame(
    {"Var1": _ascending, "Var2": _descending, "Cohort": _cohort_labels}
)

def box_scatter(x: str, df: pd.DataFrame, *, cohort_col="Cohort", groups=("A", "B")) -> None:
  """
  Draw a horizontal box plot with every observation overlaid, split by cohort,
  and show a Mann-Whitney U test p-value for the two cohorts in the title.

  Args:
      x (str): Name of the column in ``df`` to visualize and test.
      df (pandas.DataFrame): Data containing column ``x`` and ``cohort_col``.
      cohort_col (str): Name of the column holding the cohort labels.
          Defaults to ``"Cohort"``.
      groups (tuple): The two cohort labels whose ``x`` values are compared
          by the test. Defaults to ``("A", "B")``.

  Returns:
      None. The figure is displayed with ``plt.show()``.

  Raises:
      ValueError: If ``groups`` does not contain exactly two labels.
  """
  if len(groups) != 2:
    raise ValueError(f"groups must contain exactly two cohort labels, got {groups!r}")
  first_label, second_label = groups

  sns.set_theme(style="ticks")

  # Create the figure; the x axis keeps its default (linear) scale.
  f, ax = plt.subplots(figsize=(7, 6))

  # Horizontal boxes, one per cohort. whis=[0, 100] stretches the whiskers
  # over the full data range so no observation is flagged as an outlier.
  # The axes are passed explicitly so drawing never depends on whichever
  # axes happens to be "current".
  sns.boxplot(
      df,
      x=x,
      y=cohort_col,
      hue=cohort_col,
      whis=[0, 100],
      width=0.6,
      palette="vlag",
      ax=ax,
  )

  # Add points to show each observation
  sns.stripplot(df, x=x, y=cohort_col, size=4, ax=ax)

  # Mann-Whitney U test (rank-based, non-parametric) between the two cohorts,
  # run with SciPy's default settings. Missing values are dropped first so a
  # single NaN does not turn the whole result into NaN.
  first_values = df.loc[df[cohort_col] == first_label, x].dropna()
  second_values = df.loc[df[cohort_col] == second_label, x].dropna()
  if first_values.empty or second_values.empty:
    # The test is undefined without observations in both cohorts; still draw
    # the plot and report the p-value as NaN instead of failing.
    pval = float("nan")
  else:
    _, pval = stats.mannwhitneyu(first_values, second_values)

  # Tweak the visual presentation
  ax.xaxis.grid(True)
  ax.set(ylabel="")
  sns.despine(ax=ax, trim=True, left=True)

  # Add the statistical test results to the plot title
  ax.set_title(f"{x} distribution (p-value: {pval:.4f})")
  plt.show()


# Plot every variable column with its statistical test. The cohort column is
# excluded by name rather than by position, so the loop keeps working if the
# column order of the frame changes.
for col in [name for name in vpca_df.columns if name != "Cohort"]:
  box_scatter(col, vpca_df)
# Editor is loading...