scGOclust_vignette

1. Load packages

scGOclust is a package that leverages Gene Ontology to analyse the functional profile of cells with scRNA-seq data.

# load required libraries

library(Seurat)
#> Attaching SeuratObject
library(pheatmap)
library(httr)

## if (!require("devtools")) install.packages("devtools")

## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("YY-SONG0718/scGOclust", upgrade_dependencies = FALSE)

library(scGOclust)

2. Load input data

# Get a gene-to-GO biological process (BP) term mapping table for each species,
# removing electronically inferred records.

# Ensembl sometimes reports that its SSL certificate has expired (a known
# issue). The call below works around it by switching off peer verification.
# NOTE(review): set_config() changes the global httr configuration, so
# certificate checking stays off for later httr requests in this session --
# only run it if the Ensembl query actually fails, and consider calling
# httr::reset_config() afterwards.
httr::set_config(httr::config(ssl_verifypeer = FALSE)) 

# Live queries against Ensembl, kept for reference (not run in this vignette):
#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))

# For convenience, load the example mapping tables instead
data(mmu_tbl)
data(dme_tbl)
# Load the gene expression raw count objects
data(mmu_subset)
data(dme_subset)
# List the objects now available in the workspace
ls()
#> [1] "dme_subset" "dme_tbl"    "mmu_subset" "mmu_tbl"

3. Build GO BP profile

## Build one Seurat object per species that uses GO BP terms as features.
## Genes are matched to the mapping table through 'external_gene_name'.

# mmusculus
mmu_go_obj <- makeGOSeurat(
  ensembl_to_GO = mmu_tbl,
  feature_type = 'external_gene_name',
  seurat_obj = mmu_subset
)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> time used: 0.67 secs
#> returning GO Seurat object

# dmelanogaster
dme_go_obj <- makeGOSeurat(
  ensembl_to_GO = dme_tbl,
  feature_type = 'external_gene_name',
  seurat_obj = dme_subset
)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> time used: 0.21 secs
#> returning GO Seurat object

4. Calculate cell type average GO BP profile

# cell_type_col names the column of seurat_obj@meta.data that holds the
# cell type annotation; the column name differs between the two datasets

mmu_ct_go <- getCellTypeGO(
  go_seurat_obj = mmu_go_obj,
  cell_type_col = 'cell_type_annotation'
)
#> perform normalization and log1p for mmu_go_obj
#> Centering and scaling data matrix
dme_ct_go <- getCellTypeGO(
  go_seurat_obj = dme_go_obj,
  cell_type_col = 'annotation'
)
#> perform normalization and log1p for dme_go_obj
#> Centering and scaling data matrix

5. Calculate within-species cell type functional similarity

# Within each species, correlate the cell type average BP profiles
# (Pearson's correlation coefficient) and draw the result as a heatmap

mmu_corr <- cellTypeGOCorr(
  cell_type_go = mmu_ct_go,
  corr_method = 'pearson'
)
pheatmap(mmu_corr)

dme_corr <- cellTypeGOCorr(
  cell_type_go = dme_ct_go,
  corr_method = 'pearson'
)
pheatmap(dme_corr)

5. Calculate cross-species cell type functional similarity


# Pearson's correlation coefficient between cell type average BP profiles
# of the two species

corr <- crossSpeciesCellTypeGOCorr(
  species_1 = 'mmusculus',
  species_2 = 'dmelanogaster',
  cell_type_go_sp1 = mmu_ct_go,
  cell_type_go_sp2 = dme_ct_go,
  corr_method = 'pearson'
)

# heatmap of the cross-species cell type profile correlations

pheatmap(corr, width = 9, height = 10)


# same heatmap, scaled by column
pheatmap(corr, scale = 'column', width = 9, height = 10)


# sheatmap tries to place cells with higher values on the diagonal, which
# helps when the cross-species cell type similarity signal is less clear
# NOTE(review): the 0.5 shift is presumably there to keep the values passed
# to slanter non-negative -- confirm against the slanter documentation

slanter::sheatmap(corr + 0.5, width = 9, height = 10)


# scaling by row or column shows relative similarity

slanter::sheatmap(corr + 0.5, scale = 'column', width = 9, height = 10)

6. Dimensional reduction and UMAP visualization of cells with GO profile


# treat the cell-by-GO BP profile as a count matrix and run the analysis on it
mmu_go_analyzed <- analyzeGOSeurat(
  go_seurat_obj = mmu_go_obj,
  cell_type_col = 'cell_type_annotation'
)
#> perform normalization and log1p for mmu_go_obj
#> Computing nearest neighbor graph
#> Computing SNN
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 219
#> Number of edges: 9890
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4728
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 23:44:45 UMAP embedding parameters a = 0.9922 b = 1.112
#> 23:44:45 Read 219 rows and found 50 numeric columns
#> 23:44:45 Using Annoy for neighbor search, n_neighbors = 30
#> 23:44:45 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 23:44:45 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpsSBDIl/filee12716dbcd69
#> 23:44:45 Searching Annoy index using 1 thread, search_k = 3000
#> 23:44:45 Annoy recall = 100%
#> 23:44:45 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 23:44:46 Initializing from normalized Laplacian + noise (using irlba)
#> 23:44:46 Commencing optimization for 500 epochs, with 7820 positive edges
#> 23:44:47 Optimization finished
# UMAP of the analyzed cell-by-GO BP profile; cells are labeled with the
# annotation column of meta.data that was specified above

DimPlot(mmu_go_analyzed, label = TRUE) +
  NoLegend()

dme_go_analyzed <- analyzeGOSeurat(
  go_seurat_obj = dme_go_obj,
  cell_type_col = 'annotation'
)
#> perform normalization and log1p for dme_go_obj
#> Computing nearest neighbor graph
#> Computing SNN
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 180
#> Number of edges: 6418
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5467
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> 23:44:47 UMAP embedding parameters a = 0.9922 b = 1.112
#> 23:44:47 Read 180 rows and found 50 numeric columns
#> 23:44:47 Using Annoy for neighbor search, n_neighbors = 30
#> 23:44:47 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 23:44:47 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpsSBDIl/filee12755a8512d
#> 23:44:47 Searching Annoy index using 1 thread, search_k = 3000
#> 23:44:47 Annoy recall = 100%
#> 23:44:48 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 23:44:48 Initializing from normalized Laplacian + noise (using irlba)
#> 23:44:48 Commencing optimization for 500 epochs, with 6610 positive edges
#> 23:44:49 Optimization finished
# UMAP of the analyzed dme_go_obj profile, labeled and without a legend
DimPlot(dme_go_analyzed, label = TRUE) +
  NoLegend()

7. Get co-up and co-down regulated terms between pairs of cell types


## the Wilcoxon signed rank test makes this step take a few minutes

ct_shared_go <- getCellTypeSharedGO(
  species_1 = 'mmusculus',
  species_2 = 'dmelanogaster',
  analyzed_go_seurat_sp1 = mmu_go_analyzed,
  analyzed_go_seurat_sp2 = dme_go_analyzed,
  cell_type_col_sp1 = 'cell_type_annotation',
  cell_type_col_sp2 = 'annotation'
)

head(ct_shared_go)

# look up the shared GO terms of one specific pair of cell types

getCellTypeSharedTerms(
  shared_go = ct_shared_go,
  cell_type_sp1 = 'intestine_Enteroendocrine cell',
  cell_type_sp2 = 'enteroendocrine cell',
  return_full = FALSE
)

# print R version, platform and package versions used to build this vignette
utils::sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] scGOclust_0.1.2    httr_1.4.5         pheatmap_1.0.12    SeuratObject_4.1.3
#> [5] Seurat_4.3.0      
#> 
#> loaded via a namespace (and not attached):
#>   [1] BiocFileCache_2.2.1    plyr_1.8.8             igraph_1.4.1          
#>   [4] lazyeval_0.2.2         sp_1.6-0               splines_4.1.2         
#>   [7] listenv_0.9.0          scattermore_0.8        GenomeInfoDb_1.30.1   
#>  [10] ggplot2_3.4.1          digest_0.6.31          htmltools_0.5.5       
#>  [13] fansi_1.0.4            magrittr_2.0.3         memoise_2.0.1         
#>  [16] tensor_1.5             cluster_2.1.3          ROCR_1.0-11           
#>  [19] limma_3.50.3           globals_0.16.2         Biostrings_2.62.0     
#>  [22] matrixStats_0.63.0     spatstat.sparse_3.0-1  prettyunits_1.1.1     
#>  [25] colorspace_2.1-0       rappdirs_0.3.3         blob_1.2.3            
#>  [28] ggrepel_0.9.3          xfun_0.38              dplyr_1.1.1           
#>  [31] crayon_1.5.2           RCurl_1.98-1.6         jsonlite_1.8.4        
#>  [34] progressr_0.13.0       spatstat.data_3.0-1    survival_3.3-1        
#>  [37] zoo_1.8-11             glue_1.6.2             slanter_0.2-0         
#>  [40] polyclip_1.10-4        gtable_0.3.3           zlibbioc_1.40.0       
#>  [43] XVector_0.34.0         leiden_0.4.3           future.apply_1.10.0   
#>  [46] BiocGenerics_0.40.0    abind_1.4-5            scales_1.2.1          
#>  [49] DBI_1.1.2              spatstat.random_3.1-4  miniUI_0.1.1.1        
#>  [52] Rcpp_1.0.10            progress_1.2.2         viridisLite_0.4.1     
#>  [55] xtable_1.8-4           reticulate_1.28        bit_4.0.4             
#>  [58] stats4_4.1.2           htmlwidgets_1.6.2      RColorBrewer_1.1-3    
#>  [61] ellipsis_0.3.2         ica_1.0-3              farver_2.1.1          
#>  [64] pkgconfig_2.0.3        XML_3.99-0.9           dbplyr_2.1.1          
#>  [67] sass_0.4.5             uwot_0.1.14            deldir_1.0-6          
#>  [70] utf8_1.2.3             labeling_0.4.2         tidyselect_1.2.0      
#>  [73] rlang_1.1.0            reshape2_1.4.4         later_1.3.0           
#>  [76] AnnotationDbi_1.56.2   munsell_0.5.0          tools_4.1.2           
#>  [79] cachem_1.0.7           cli_3.6.1              generics_0.1.3        
#>  [82] RSQLite_2.2.18         ggridges_0.5.4         evaluate_0.20         
#>  [85] stringr_1.5.0          fastmap_1.1.1          yaml_2.3.7            
#>  [88] goftest_1.2-3          knitr_1.42             bit64_4.0.5           
#>  [91] fitdistrplus_1.1-8     purrr_1.0.1            RANN_2.6.1            
#>  [94] KEGGREST_1.34.0        pbapply_1.7-0          future_1.32.0         
#>  [97] nlme_3.1-157           mime_0.12              pracma_2.4.2          
#> [100] xml2_1.3.3             biomaRt_2.50.3         compiler_4.1.2        
#> [103] rstudioapi_0.13        filelock_1.0.2         curl_5.0.0            
#> [106] plotly_4.10.1          png_0.1-8              spatstat.utils_3.0-2  
#> [109] tibble_3.2.1           bslib_0.4.2            stringi_1.7.12        
#> [112] highr_0.10             lattice_0.20-45        Matrix_1.5-1          
#> [115] vctrs_0.6.1            networkD3_0.4          pillar_1.9.0          
#> [118] lifecycle_1.0.3        spatstat.geom_3.1-0    lmtest_0.9-40         
#> [121] jquerylib_0.1.4        RcppAnnoy_0.0.20       data.table_1.14.8     
#> [124] cowplot_1.1.1          bitops_1.0-7           irlba_2.3.5.1         
#> [127] httpuv_1.6.9           patchwork_1.1.2        R6_2.5.1              
#> [130] promises_1.2.0.1       KernSmooth_2.23-20     gridExtra_2.3         
#> [133] IRanges_2.28.0         parallelly_1.35.0      codetools_0.2-18      
#> [136] assertthat_0.2.1       MASS_7.3-57            withr_2.5.0           
#> [139] sctransform_0.3.5      GenomeInfoDbData_1.2.7 S4Vectors_0.32.4      
#> [142] hms_1.1.1              parallel_4.1.2         grid_4.1.2            
#> [145] tidyr_1.3.0            rmarkdown_2.21         Rtsne_0.16            
#> [148] spatstat.explore_3.1-0 Biobase_2.54.0         shiny_1.7.4