Add CITE-seq dataset

257749a2 · Jeffrey Pullin · 22d56804 · 257749a2 · 257749a2 · 257749a2
Commit 257749a2 authored 3 years ago by Jeffrey Pullin
--- a/code/download-raw-data.R
+++ b/code/download-raw-data.R
@@ -11,6 +11,7 @@ suppressMessages({
  library(SingleCellExperiment)
  library(readr)
  library(dplyr)
+  library(Seurat)
 })

 raw_lawlor <- LawlorPancreasData()
@@ -101,3 +102,48 @@ colData(ss3_pbmc) <- colData(ss3_pbmc) |>
  DataFrame()

 saveRDS(ss3_pbmc, here::here("data", "raw_data", "ss3_pbmc.rds"))
+
+# CITE-seq data (CBMC RNA and ADT UMI matrices from the GSE100866 files)
+
+# Adapted from the Seurat multimodal vignette:
+# https://satijalab.org/seurat/articles/multimodal_vignette.html
+
+cbmc_rna <- as.sparse(
+  read.csv(file = here::here("data", "downloaded_data",
+                             "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz"),
+           sep = ",",
+           header = TRUE,
+           row.names = 1)
+)
+
+# Keep only the 100 most highly expressed mouse genes and strip the 'HUMAN_'
+# prefix from the human gene names (CollapseSpeciesExpressionMatrix defaults).
+cbmc_rna <- CollapseSpeciesExpressionMatrix(cbmc_rna)
+
+# Load in the ADT UMI matrix
+cbmc_adt <- as.sparse(
+  read.csv(file = here::here("data", "downloaded_data",
+                             "GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz"),
+           sep = ",",
+           header = TRUE,
+           row.names = 1)
+)
+# Build the Seurat object from the RNA counts, then attach the ADT assay.
+cbmc <- CreateSeuratObject(counts = cbmc_rna)
+
+adt_assay <- CreateAssayObject(counts = cbmc_adt)
+cbmc[["ADT"]] <- adt_assay
+# Normalise, pick variable features, scale, then PCA, cluster and embed.
+cbmc <- NormalizeData(cbmc)
+cbmc <- FindVariableFeatures(cbmc)
+cbmc <- ScaleData(cbmc)
+cbmc <- RunPCA(cbmc, verbose = FALSE)
+cbmc <- FindNeighbors(cbmc, dims = 1:30)
+cbmc <- FindClusters(cbmc, resolution = 0.2, verbose = FALSE)
+cbmc <- RunUMAP(cbmc, dims = 1:30)
+cbmc <- RunTSNE(cbmc, dims = 1:30)
+DimPlot(cbmc, label = TRUE)
+# NOTE(review): under Rscript the plot above likely goes to Rplots.pdf.
+citeseq <- as.SingleCellExperiment(cbmc)
+
+saveRDS(citeseq, here::here("data", "raw_data", "citeseq.rds"))
--- a/code/prep-citeseq.R
+++ b/code/prep-citeseq.R
+# This script prepares and saves the `citeseq` data.
+# Command-line arguments: raw_data (input RDS path), data (output RDS path).
+args <- R.utils::commandArgs(
+  trailingOnly = TRUE,
+  asValues = TRUE
+)
+
+suppressMessages({
+  library(TENxPBMCData)
+  library(scater)
+  library(Seurat)
+  library(scran)
+})
+
+set.seed(3112022)
+
+# Extensive processing of this data occurs in code/download-raw-data.R;
+# perhaps it should be moved here.
+citeseq <- readRDS(args$raw_data)
+# Replace the row data with a single column holding the feature names.
+rowData(citeseq) <- DataFrame(value = rownames(citeseq))
+
+# Keep the top 2000 highly variable genes (Poisson-based variance model).
+dec_citeseq <- modelGeneVarByPoisson(citeseq)
+top_citeseq <- getTopHVGs(dec_citeseq, n = 2000)
+citeseq <- citeseq[top_citeseq, ]
+
+# Use the resolution-0.2 cluster assignments as labels (not annotated).
+colLabels(citeseq) <- factor(citeseq$RNA_snn_res.0.2)
+
+saveRDS(citeseq, args$data)
--- a/config.yaml
+++ b/config.yaml
@@ -10,6 +10,7 @@ all_data_ids:
  - "paul"
  - "zhao"
  - "ss3_pbmc"
+  - "citeseq"
 general_sim_data_ids: ["zeisel", "pbmc3k", "lawlor", "paul"]
 time_sim_data_ids: ["pbmc3k"]