2  Create Example Data

2.1 Get random data

In this simulated experiment, a group of researchers is studying the effects of two different treatments (A and B) on a biological response (e.g., gene expression level) in three cell lines (“HEK293”, “MCF7”, “A549”). For each combination of cell line and treatment, 50 cells are measured in each of 3 biological replicates, and an untreated control group is included as a baseline. The researchers aim to evaluate how each treatment affects gene expression across the different cell lines.

library(ggplot2)
library(dplyr)
# Fix the RNG state so every run of the simulation yields the same numbers
set.seed(123)

# Experimental factors ----
cell_lines <- c("HEK293", "MCF7", "A549")
treatments <- c("Control", "TreatmentA", "TreatmentB")

# Design size ----
n <- 50  # cells measured per replicate
i <- 3   # biological replicates

# Full factorial design: one row for every combination of
# cell line x treatment x cell x replicate (3 * 3 * 50 * 3 = 1350 rows).
# expand.grid() varies the first factor fastest.
data <- expand.grid(
  CellLine  = cell_lines,
  Treatment = treatments,
  CellID    = seq_len(n),
  Replicate = seq_len(i)
)


# Simulate the two measured responses, one value per row of the design grid.
#
# Every case_when() branch draws a full-length vector (n() = number of rows)
# and a row keeps the draw of the first branch whose condition it matches.
# Keep the branches in this order: reordering them would likely change the
# simulated values obtained for a given seed.
#
# GeneExpression
#   Control     baseline level,                 N(mean 50, sd 5)
#   TreatmentA  raised level for all cell lines, N(mean 70, sd 8)
#   TreatmentB  cell-line dependent:
#                 HEK293 N(60, 7), MCF7 N(55, 6), A549 N(65, 9)
data <- data %>%
  mutate(
    GeneExpression = case_when(
      Treatment == "Control" ~
        rnorm(n(), mean = 50, sd = 5),
      Treatment == "TreatmentA" ~
        rnorm(n(), mean = 70, sd = 8),
      Treatment == "TreatmentB" & CellLine == "HEK293" ~
        rnorm(n(), mean = 60, sd = 7),
      Treatment == "TreatmentB" & CellLine == "MCF7" ~
        rnorm(n(), mean = 55, sd = 6),
      Treatment == "TreatmentB" & CellLine == "A549" ~
        rnorm(n(), mean = 65, sd = 9)
    )
  ) %>%
  # ProteinLevel = slope * GeneExpression + N(mean 100, sd 10) noise, where
  # the slope depends on the treatment (and, for TreatmentA, on the cell line)
  mutate(
    ProteinLevel = case_when(
      Treatment == "Control" ~
        GeneExpression * 1 + rnorm(n(), mean = 100, sd = 10),
      Treatment == "TreatmentA" & CellLine == "HEK293" ~
        GeneExpression * 0.2 + rnorm(n(), mean = 100, sd = 10),
      Treatment == "TreatmentA" & CellLine == "MCF7" ~
        GeneExpression * 1.1 + rnorm(n(), mean = 100, sd = 10),
      Treatment == "TreatmentA" & CellLine == "A549" ~
        GeneExpression * 0.9 + rnorm(n(), mean = 100, sd = 10),
      Treatment == "TreatmentB" ~
        GeneExpression * 1.1 + rnorm(n(), mean = 100, sd = 10)
    )
  )

2.2 Write Data

# Ensure the output folder exists; showWarnings = FALSE silences the warning
# dir.create() would otherwise raise (e.g. when the folder is already there)
dir.create(path = "./data", showWarnings = FALSE)

# Export the simulated data set as CSV, without the row-name column
write.csv(
  x = data,
  file = "./data/biological_lab_data.csv",
  row.names = FALSE
)

# Wipe every object from the workspace so that the following step has to
# reload the data from the CSV file (attached packages are not affected)
rm(list = ls())

2.3 Load Data

# Read the simulated data set back in from the CSV file written earlier.
# NOTE: with the read.csv() defaults of R >= 4.0 the text columns (CellLine,
# Treatment) are returned as character vectors; convert them with factor()
# if a specific level order is needed.
data <- read.csv(file = "./data/biological_lab_data.csv")