Building more complex Seurat objects #
In most cases, we will want to go beyond reading one sample into Seurat and performing biological analyses. We often want to create a Seurat object that has multiple samples (e.g. biological replicates or patients). These samples will also have unique pieces of metadata, such as HPV- PBMC or HPV+ TIL from our head and neck cancer dataset.
Prerequisites #
This analysis assumes that you have download the samples described in the GEO download tutorial under utilities.
Load packages #
First, we will load the necessary packages.
library(Seurat)
## Attaching SeuratObject
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(patchwork)
Read the individual files into a list #
The data I downloaded using the tutorial are located in /home/rstudio/docker_rstudio/data/geo_download. We will create a simple for loop to read all the files into individual variables stored in a list.
full_path <- "/home/rstudio/docker_rstudio/data/geo_download/"
raw_files <- list.files(paste(full_path))
# Not the metadata... we will use this in a second
raw_files <- raw_files[2:5]
raw_files
## [1] "HD_PBMC_1" "HD_PBMC_2" "HD_PBMC_3" "HD_PBMC_4"
# Create list for Seurat object
raw_list <- vector("list",length=length(raw_files))
raw_list # empty list of length 4
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
# Loop to read filtered feature barcode matrices into R data into R
for (i in 1:length(raw_list)) {
raw_list[[i]] <- Read10X(paste(full_path,raw_files[i],sep=""))
}
dim(raw_list[[1]]) # 33694 genes by 2445 cells
## [1] 33694 2445
raw_list[[1]][1:50,1:10] # sparse matrix with counts
## 50 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names 'AAACCTGCACAGACTT-1', 'AAACCTGCATCGGTTA-1', 'AAACCTGTCAAGCCTA-1' ... ]]
##
## RP11-34P13.3 . . . . . . . . . .
## FAM138A . . . . . . . . . .
## OR4F5 . . . . . . . . . .
## RP11-34P13.7 . . . . . . . . . .
## RP11-34P13.8 . . . . . . . . . .
## RP11-34P13.14 . . . . . . . . . .
## RP11-34P13.9 . . . . . . . . . .
## FO538757.3 . . . . . . . . . .
## FO538757.2 . . . . . . 1 . . .
## AP006222.2 1 . . . . . . . . .
## RP5-857K21.15 . . . . . . . . . .
## RP4-669L17.2 . . . . . . . . . .
## RP4-669L17.10 . . . . . . . . . .
## OR4F29 . . . . . . . . . .
## RP5-857K21.4 . . . . . . . . . .
## RP5-857K21.2 . . . . . . . . . .
## OR4F16 . . . . . . . . . .
## RP11-206L10.4 . . . . . . . . . .
## RP11-206L10.9 . . . . . . . . . .
## FAM87B . . . . . . . . . .
## LINC00115 . . . . . . . . . .
## FAM41C . . . . . . . . . .
## RP11-54O7.16 . . . . . . . . . .
## RP11-54O7.1 . . . . . . . . . .
## RP11-54O7.2 . . . . . . . . . .
## RP11-54O7.3 . . . . . . . . . .
## SAMD11 . . . . . . . . . .
## NOC2L . . . 2 . . . . . .
## KLHL17 . . . . . . . . . .
## PLEKHN1 . . . . . . . . . .
## PERM1 . . . . . . . . . .
## RP11-54O7.17 . . . . . . . . . .
## HES4 . . . . . . . . . .
## ISG15 . 4 1 . . . 1 1 . .
## RP11-54O7.11 . . . . . . . . . .
## AGRN . . . . . . . . . .
## RP11-54O7.18 . . . . . . . . . .
## RNF223 . . . . . . . . . .
## C1orf159 . . . . . . . . . .
## LINC01342 . . . . . . . . . .
## RP11-465B22.8 . . . . . . . . . .
## TTLL10-AS1 . . . . . . . . . .
## TTLL10 . . . . . . . . . .
## TNFRSF18 . . . . . . . . . .
## TNFRSF4 . . . 1 1 . . 1 . .
## SDF4 . . . . . . . 1 . .
## B3GALT6 . . . . . . . . . .
## FAM132A . . . . . . . . . .
## RP5-902P8.12 . . . . . . . . . .
## UBE2J2 . . . . . . . . . .
# Name each item in the list by its title
names(raw_list) <- raw_files
names(raw_list)
## [1] "HD_PBMC_1" "HD_PBMC_2" "HD_PBMC_3" "HD_PBMC_4"
Read in metadata #
Before we construct the combined object, we will need the metadata.
We can read that in from the tsv file using readr::read_tsv (readr is a package, and the “::” lets us call a function from that package without loading the package)
# Read in the metadata
metadata_file <- readr::read_tsv("/home/rstudio/docker_rstudio/data/geo_download/GSE139324_metadata.tsv")
## Rows: 63 Columns: 45
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (41): sample_accession, title, geo_accession, status, submission_date, l...
## dbl (4): channel_count, taxid_ch1, contact_zip/postal_code, data_row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Re-name a few columns for easy use
colnames(metadata_file)[c(43,45)] <- c("disease_state","tissue")
# Filter to only our samples of interest
metadata_file_filtered <- metadata_file %>%
filter(title %in% raw_files)
# Make sure the order of names matches the raw data list
identical(metadata_file_filtered$title,names(raw_list))
## [1] TRUE
# Create an empty list of Seurat objects
ser_list <- vector("list",length=length(raw_list))
# Loop to Create individual Seurat objects and include metadata
for (i in 1:length(ser_list)) {
# Create Seurat Object
ser_list[[i]] <- CreateSeuratObject(raw_list[[i]])
# Create metadata for each object
meta_add <- data.frame(matrix(data=NA,nrow=ncol(ser_list[[i]]),ncol=3))
colnames(meta_add) <- c("title","disease_state","tissue")
rownames(meta_add) <- colnames(ser_list[[i]])
meta_add$title <- metadata_file_filtered[i,"title"] %>% pull()
meta_add$disease_state <- metadata_file_filtered[i,"disease_state"] %>% pull()
meta_add$tissue <- metadata_file_filtered[i,"tissue"] %>% pull()
# Add Metadata
ser_list[[i]] <- AddMetaData(ser_list[[i]],metadata=meta_add)
}
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# Check out our list of Seurat objects
ser_list
## [[1]]
## An object of class Seurat
## 33694 features across 2445 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## [[2]]
## An object of class Seurat
## 33694 features across 2436 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## [[3]]
## An object of class Seurat
## 33694 features across 1767 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## [[4]]
## An object of class Seurat
## 33694 features across 2315 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
Merge list of Seurat objects into a single seurat object #
ser_merged <- merge(ser_list[[1]],ser_list[2:4])
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
ser_merged@meta.data$title %>%
table()
## .
## HD_PBMC_1 HD_PBMC_2 HD_PBMC_3 HD_PBMC_4
## 2445 2436 1767 2315
ser_merged@meta.data %>%
select(title,disease_state) %>%
table()
## disease_state
## title Healthy donor
## HD_PBMC_1 2445
## HD_PBMC_2 2436
## HD_PBMC_3 1767
## HD_PBMC_4 2315
ser_merged@meta.data %>%
select(title,tissue) %>%
table()
## tissue
## title peripheral blood
## HD_PBMC_1 2445
## HD_PBMC_2 2436
## HD_PBMC_3 1767
## HD_PBMC_4 2315
Analysis of all 4 samples #
As per the shortcut in the PBMC vignette, here’s a quick analysis of the 4 healthy donor samples we merged into one Seurat object.
ser_merged <- ser_merged %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA()
## Centering and scaling data matrix
## PC_ 1
## Positive: RPS29, IL32, TRAC, LTB, RPS18, TRBC2, IL7R, RPS12, TRBC1, RPS2
## EEF1A1, CCR7, CD247, CD69, PEBP1, TRAT1, GZMM, MYC, AQP3, TSHZ2
## CTSW, FHIT, NGFRAP1, CDC25B, RPS26, HIST1H4C, FKBP11, NELL2, CD8B, CD40LG
## Negative: CST3, FCN1, LYZ, CSTA, LST1, S100A9, CTSS, TYROBP, S100A8, MNDA
## FGL2, FTL, SAT1, TYMP, FTH1, VCAN, PSAP, FOS, LGALS1, FCER1G
## S100A12, SERPINA1, CFD, CYBB, NEAT1, SPI1, MS4A6A, AIF1, HLA-DRA, GPX1
## PC_ 2
## Positive: LTB, CCR7, RPS12, RPS18, EEF1A1, RPS2, RPLP1, CD79A, MYC, MS4A1
## IGHM, IL7R, NCF1, IGHD, LINC00926, BANK1, BIRC3, VPREB3, RPS29, FHIT
## TSHZ2, TCL1A, VIM, CD22, IGKC, TNFRSF13C, NGFRAP1, TRAC, FCRLA, FCER2
## Negative: NKG7, CST7, GZMA, PRF1, GNLY, KLRD1, GZMB, FGFBP2, HOPX, CTSW
## KLRF1, CCL5, SPON2, GZMH, TRDC, FCGR3A, CLIC3, CCL4, MATK, KLRB1
## ADGRG1, MYO1F, PFN1, S1PR5, C12orf75, CMC1, IL2RB, TTC38, PRSS23, C1orf21
## PC_ 3
## Positive: TRAC, IL7R, IL32, VIM, S100A6, S100A12, VCAN, S100A8, ANXA1, S100A9
## CXCL8, TRAT1, S100A4, TRBC1, MCL1, CD14, CH17-373J23.1, RGCC, LYZ, IL1B
## AIF1, NEAT1, FHIT, RBP7, NFKBIZ, RP11-160E2.6, TSHZ2, CSTA, DUSP6, TSPO
## Negative: CD79A, MS4A1, IGHM, CD79B, IGKC, LINC00926, IGHD, BANK1, TCL1A, VPREB3
## SPIB, HLA-DQA1, CD22, FAM129C, BLNK, HLA-DPA1, CD74, FCRLA, HLA-DPB1, FCER2
## TSPAN13, HLA-DOB, HLA-DQB1, TNFRSF13C, IGLC2, BLK, JCHAIN, CD19, CD24, RP11-693J15.5
## PC_ 4
## Positive: CDKN1C, HES4, CKB, CSF1R, TCF7L2, MS4A4A, SIGLEC10, CTD-2006K23.1, HMOX1, FCGR3A
## MS4A7, VMO1, BATF3, LINC01272, LRRC25, ZNF703, IFITM3, FAM110A, LYPD2, ICAM4
## CTSL, CDH23, TPPP3, RHOC, C1QA, LILRB2, PILRA, BID, OAS1, CD68
## Negative: CXCL8, VCAN, S100A12, IL1B, RP11-160E2.6, CH17-373J23.1, NFKBIA, FOSB, CCL3, FOS
## S100A8, IER3, CTD-3252C9.4, TNFAIP3, JUN, RP11-1143G9.4, EGR1, NCF1, JUND, LUCAT1
## S100A9, ITGAM, CD14, LINC00936, MT-CO1, MCL1, MS4A6A, KLF10, CYP1B1, CLEC4E
## PC_ 5
## Positive: CD79B, MS4A1, CD79A, LINC00926, IGHD, FCGR3A, CDKN1C, VPREB3, CD22, FCER2
## BANK1, MS4A7, SIGLEC10, IGHM, HES4, FCRL1, TNFRSF13C, CKB, MTSS1, TCF7L2
## LINC01272, HLA-DOB, FCRLA, RP11-693J15.5, CD19, CD24, RALGPS2, CD72, SWAP70, HMOX1
## Negative: SERPINF1, PLD4, GAS6, LRRC26, PPP1R14B, CLEC4C, LILRA4, ITM2C, TPM2, FCER1A
## DERL3, SCT, PTCRA, SMPD3, C1orf186, JCHAIN, LINC00996, SCN9A, IL3RA, LILRB4
## PTPRS, CCDC50, RP11-117D22.2, MZB1, UGCG, RP11-73G16.2, DNASE1L3, LAMP5, MAP1A, TNFRSF21
ElbowPlot(ser_merged)
ser_merged <- ser_merged %>%
FindNeighbors(.,dims=1:15) %>%
FindClusters(.,res=c(0.3,0.5,0.7)) %>%
RunUMAP(.,dims=1:10)
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8963
## Number of edges: 319663
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9303
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8963
## Number of edges: 319663
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9083
## Number of communities: 16
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8963
## Number of edges: 319663
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8883
## Number of communities: 17
## Elapsed time: 0 seconds
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:09:30 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:09:30 Read 8963 rows and found 10 numeric columns
## 19:09:30 Using Annoy for neighbor search, n_neighbors = 30
## 19:09:30 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:09:30 Writing NN index file to temp file /tmp/RtmpZYOKxd/file44aa2cf3936a
## 19:09:30 Searching Annoy index using 1 thread, search_k = 3000
## 19:09:32 Annoy recall = 100%
## 19:09:32 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 19:09:33 Initializing from normalized Laplacian + noise (using irlba)
## 19:09:33 Commencing optimization for 500 epochs, with 368458 positive edges
## 19:09:38 Optimization finished
Plot a UMAP of clusters by sample #
DimPlot(ser_merged,group.by="RNA_snn_res.0.3",label=T)
DimPlot(ser_merged,group.by="RNA_snn_res.0.3",split.by="title",label=T,
ncol=2)
Save output #
save_path <- "/home/rstudio/docker_rstudio/data/"
saveRDS(ser_merged,file=paste(save_path,"ser_merged_4_hd_pbmc_231027.rds",sep=""))