Preparation

Set up the R environment by loading the required R libraries.

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(ape)
library(Seurat)
## Registered S3 method overwritten by 'R.oo':
##   method        from       
##   throw.default R.methodsS3

Data import

Import the data and apply some simple preprocessing.

## chromosome information
## Two-column table: chromosome name and the coordinate used as its end
## (`stop`). The names are re-levelled just below, so they are read as plain
## strings here.
chrsize <- read.table("hg19.chr_size", header = FALSE, stringsAsFactors = FALSE,
                      col.names = c("chr", "stop"))

## Fixed chromosome order chr1..chr22, chrX, chrY, chrM. Any name outside this
## list becomes NA in the factor.
chrsize_levels <- paste0("chr", c(1:22, "X", "Y", "M"))
chrsize$chr <- factor(chrsize$chr, levels = chrsize_levels)

## Keep chr1..chr22 and chrX only. `%in%` is FALSE for NA, so rows whose name
## was not in the level list are dropped together with chrY and chrM.
chrsize_keep <- setdiff(chrsize_levels, c("chrY", "chrM"))
chrsize <- chrsize[chrsize$chr %in% chrsize_keep, ]

## Every chromosome bar starts at coordinate 0.
chrsize$start <- 0

## Chromosome order shared by all three SE tables (chr1..chr22, chrX, chrY).
se_chr_levels <- paste0("chr", c(1:22, "X", "Y"))

## Number of distinct samples contributing to each merged SE.
##   ids: vector of comma-separated ID strings, one element per SE.
## Returns an unnamed integer vector of the same length as `ids`.
## Each ID is truncated to its first 13 characters before counting distinct
## values (presumably the sample identifier prefix -- confirm against the
## ID format of the input files).
count_se_samples <- function(ids) {
  vapply(
    strsplit(as.character(ids), ",", fixed = TRUE),
    function(parts) length(unique(substr(parts, 1, 13))),
    integer(1)
  )
}

## Read one merged-SE BED file and prepare it for plotting.
##   path: path to a 5-column table (chr, start, stop, samples, id).
## Returns a data frame ordered by chromosome, with `chr` as a factor using
## `se_chr_levels` and `samples` recomputed from the `id` column (the value
## stored in the file is overwritten).
read_se_bed <- function(path) {
  se <- read.table(path, header = FALSE, stringsAsFactors = FALSE,
                   col.names = c("chr", "start", "stop", "samples", "id"))
  se$chr <- factor(se$chr, levels = se_chr_levels)
  se <- se[order(se$chr), ]
  se$samples <- count_se_samples(se$id)
  se
}

## SE of tissues
SE.ti <- read_se_bed("enc.tissue.SE_merged.bed")

## SE of cell lines
SE.cl <- read_se_bed("enc.cell_line.SE_merged.bed")

## SE of primary cells
SE.pc <- read_se_bed("enc.primary_cell.SE_merged.bed")

## For hierarchical clustering
## Sample metadata, restricted to ENCODE cell lines, primary cells and tissues.
sampleInfo <- read.csv2("SE_samples.csv")
wanted_types <- c("Cell line", "Primary cell",  "Tissue")
is_wanted <- sampleInfo$Biosample.type %in% wanted_types &
  sampleInfo$Data.sources %in% "ENCODE"
sampleInfo <- sampleInfo[is_wanted, , drop = FALSE]

## Pairwise Jaccard table (V1, V2 = sample IDs; V3 = index), keeping only the
## pairs where both samples passed the metadata filter.
Jaccard <- read.table("ENCODE_network/Jaccard.txt", stringsAsFactors = FALSE)
both_kept <- Jaccard$V1 %in% sampleInfo$Sample.ID &
  Jaccard$V2 %in% sampleInfo$Sample.ID
Jaccard <- Jaccard[both_kept, , drop = FALSE]

## For PCA analysis
## SE-by-sample matrix, keeping only the columns of retained samples.
SE.occur <- read.table("ENCODE_network/enc_SE_merged.cnt.matrix")
kept_columns <- is.element(colnames(SE.occur), sampleInfo$Sample.ID)
SE.occur <- SE.occur[, kept_columns]

SE Landscape

The landscape of SEs on each chromosome is shown below. The vast majority of SEs are not shared across different cell types.

## Colour ramp shared by the fill and outline scales.
se_palette <- rev(rainbow(10)[1:8])

## One horizontal track of SE tiles, centred on each SE midpoint (in Mb) and
## coloured by the number of samples sharing the SE.
se_track <- function(se, track_y) {
  geom_tile(data = se,
            aes(x = (start + stop) / 2 / 1000000, y = track_y,
                fill = samples, color = samples))
}

## Strip away everything except the x axis line and the chromosome labels.
landscape_theme <- theme(
  axis.title.y = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks.y = element_blank(),
  axis.line.y = element_blank(),
  axis.line.x = element_line(colour = "black"),
  panel.background = element_blank(),
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  strip.text.y = element_text(angle = 180),
  strip.background = element_blank()
)

## One facet row per chromosome: a grey bar spanning the chromosome, overlaid
## with three tracks (tissues at y = 0.5, cell lines at 1.5, primary cells
## at 2.5).
ggplot(data = chrsize,
       aes(xmin = start / 1000000, xmax = stop / 1000000, ymin = 0, ymax = 3)) +
  facet_grid(chr ~ ., switch = "y") +
  geom_rect(fill = "grey95", color = "grey95") +
  se_track(SE.ti, 0.5) +
  se_track(SE.cl, 1.5) +
  se_track(SE.pc, 2.5) +
  scale_fill_gradientn(colours = se_palette, trans = "log10") +
  scale_color_gradientn(colours = se_palette, trans = "log10") +
  xlab("Position on chromosome (Mb)") +
  labs(fill = "Shared by # samples", color = "Shared by # samples") +
  landscape_theme

Hierarchical clustering

Hierarchical clustering of the samples based on pair-wise co-occurrence of SEs.

## Build the square sample-by-sample Jaccard matrix by explicit (row, column)
## lookup, so the result does not depend on the row order of the pair table.
## Columns follow the first ID column of `Jaccard`, rows the second.
jaccard_ids <- sort(unique(c(as.character(Jaccard[, 1]),
                             as.character(Jaccard[, 2]))))
n_samples <- length(jaccard_ids)
Jaccard.mat <- matrix(NA_real_, nrow = n_samples, ncol = n_samples,
                      dimnames = list(jaccard_ids, jaccard_ids))
pair_index <- cbind(match(Jaccard[, 2], jaccard_ids),
                    match(Jaccard[, 1], jaccard_ids))
Jaccard.mat[pair_index] <- Jaccard[, 3]

## If a pair is listed in one direction only, mirror it; an unlisted diagonal
## entry is the similarity of a sample with itself, i.e. 1.
unfilled <- is.na(Jaccard.mat)
Jaccard.mat[unfilled] <- t(Jaccard.mat)[unfilled]
diag(Jaccard.mat)[is.na(diag(Jaccard.mat))] <- 1
## Fail early on an incomplete pair table.
stopifnot(!anyNA(Jaccard.mat))

## Restrict the metadata to the clustered samples and put its rows in exactly
## the matrix order, so that label i describes row/column i.
sample.id <- data.frame(Sample.ID = jaccard_ids)
sampleInfo <- merge(sample.id, sampleInfo, by = "Sample.ID")
sampleInfo <- sampleInfo[match(jaccard_ids, sampleInfo$Sample.ID), ]
stopifnot(!anyNA(sampleInfo$Sample.ID))

## Tip labels: biosample type, first 10 characters of the tissue type, and a
## running index that keeps the labels unique.
tip_labels <- paste(sampleInfo$Biosample.type,
                    substr(sampleInfo$Tissue.type, 1, 10),
                    seq_len(ncol(Jaccard.mat)), sep = "_")
colnames(Jaccard.mat) <- tip_labels
rownames(Jaccard.mat) <- colnames(Jaccard.mat)

## Distance = 1 - sqrt(Jaccard); Ward clustering cut into three groups, drawn
## as a fan-shaped tree with one tip colour per group.
mat_dist <- as.dist(1 - sqrt(Jaccard.mat))
hc <- hclust(mat_dist, method = "ward.D2")
colors <- c("#FF00FF", "#8000FF", "#FF8000")
clus3 <- cutree(hc, 3)
plot(as.phylo(hc), type = "fan", tip.color = colors[clus3], cex = 0.6)

Principal component analysis

Principal component analysis of the occurrence of SEs in the investigated samples. The first two principal components are shown in the plot.

## Restrict the metadata to the samples in SE.occur and order its rows like
## the SE.occur columns; merge() alone returns rows sorted by Sample.ID.
sample.id <- data.frame(Sample.ID = colnames(SE.occur))
sampleInfo <- merge(sample.id, sampleInfo, by = "Sample.ID")
sampleInfo <- sampleInfo[match(colnames(SE.occur), sampleInfo$Sample.ID), ]
## Every SE.occur column must have a metadata row.
stopifnot(!anyNA(sampleInfo$Sample.ID))

# using Seurat for PCA
obj <- CreateSeuratObject(SE.occur, assay = "SE")
# Use every SE as a feature, and feed the raw matrix to PCA unchanged by
# storing it as the "scaled" data. Accessors are used instead of writing to
# the assay slots directly.
VariableFeatures(obj) <- rownames(obj)
obj <- SetAssayData(obj, slot = "scale.data",
                    new.data = as.matrix(GetAssayData(obj, slot = "counts")))
# Seurat v3 selects features with `features` and controls printing with
# `verbose`; the v2-style `pc.genes` / `do.print` arguments and the
# `obj@var.genes` slot are not part of this API. The top loadings of the
# first PCs are still printed (verbose defaults to TRUE); pass
# `verbose = FALSE` to silence them.
obj <- RunPCA(object = obj, features = VariableFeatures(obj))
## PC_ 1 
## Positive:  chr2-166421703-166504582, chr8-81776390-81793533, chr10-42383114-42397173, chr1-30721449-30734189, chr3-103209547-103219033, chr11-39239590-39245279, chr11-86901001-86913049, chr10-115603202-115617190, chr2-242743346-242777521, chr6-168567636-168583993 
##     chr15-45196707-45216056, chr9-90802437-90814884, chr2-223977387-223997135, chr8-127919066-127931147, chr11-102785327-102809961, chr13-21770057-21775046, chr10-71793488-71815578, chr12-17522467-17527121, chr4-117101251-117118475, chr5-144621479-144627988 
##     chr12-12389150-12405955, chr4-150265177-150271637, chr15-97594817-97637782, chr16-8620435-8632860, chr14-57508534-57539176, chr2-226656618-226661338, chr3-34296517-34337616, chr14-63055028-63082388, chr5-145375770-145385868, chr10-110948607-110964291 
## Negative:  chr1-234610051-235192382, chr2-43054277-43604458, chr1-149783661-149861972, chr5-171951314-172387461, chr1-8042255-8668389, chr11-65169424-65293668, chr2-238025629-238654437, chr17-79287388-79496032, chr17-75079810-75492078, chr20-52180179-52579524 
##     chr2-232435233-232593236, chr22-46390744-46514656, chr20-48836950-49212251, chr22-36616046-36905388, chr11-11963882-12311106, chr10-80787186-81120833, chr12-125092737-125438341, chr7-130551474-130811428, chr1-145073235-145292974, chr7-5558027-5750219 
##     chr10-3762505-4127780, chr17-38213886-38293387, chr5-149777956-150057665, chr12-124829518-125078062, chr9-130696640-130883276, chr9-72965156-73122240, chr14-68911015-69182461, chr10-73972293-74128717, chr1-27002723-27136764, chr14-50308058-50585122 
## PC_ 2 
## Positive:  chr17-79287388-79496032, chr2-232435233-232593236, chr2-43054277-43604458, chr1-149783661-149861972, chr17-38213886-38293387, chr22-41738045-41846014, chr19-42744319-42791747, chr10-134177600-134460043, chr7-5558027-5750219, chr12-58207429-58297497 
##     chr1-2107059-2263623, chr20-35074896-35180260, chr9-130696640-130883276, chr17-48914346-49066625, chr11-113895366-114133874, chr8-38557958-38666550, chr5-148746350-148846783, chr2-66646090-66835375, chr9-131864504-131966327, chr1-27002723-27136764 
##     chr11-119167282-119253784, chr12-57449155-57582529, chr1-10681342-10886360, chr10-126656950-126852012, chr2-174762474-174928264, chr19-16362565-16584689, chr1-164526202-164776187, chr8-38175192-38330619, chr1-207961675-208116788, chr19-48092091-48249734 
## Negative:  chr20-52180179-52579524, chr12-76223928-76432927, chr11-11963882-12311106, chr12-13244020-13378157, chr2-161036644-161351389, chr10-3762505-4127780, chr12-46736228-46972021, chr11-121998354-122091940, chr8-128793698-128984602, chr1-94012053-94248992 
##     chr2-36555701-36826369, chr7-92237974-92485564, chr11-95765800-96077401, chr12-52526850-52685291, chr21-36165691-36432151, chr12-66204990-66378976, chr1-67979201-68322599, chr1-214534683-214788853, chr10-33387536-33667930, chr12-65980541-66069346 
##     chr17-57822089-57953405, chr3-171756078-172043341, chr10-33212002-33337288, chr20-45881084-46250308, chr14-61750166-62136627, chr3-149003339-149133711, chr7-55080902-55238061, chr5-14137984-14285271, chr8-118805533-119139654, chr14-51928223-52084667 
## PC_ 3 
## Positive:  chr14-50308058-50585122, chr1-149783661-149861972, chr20-52180179-52579524, chr4-40151421-40339107, chr5-42981248-43050805, chr6-159029585-159293279, chr16-11636074-11919163, chr2-196996112-197160488, chr14-22878347-23053746, chr14-61750166-62136627 
##     chr2-70294838-70372074, chr21-34657990-34829079, chr11-65169424-65293668, chr2-64835970-65093395, chr17-62952499-63103462, chr4-185230731-185401813, chr17-61991008-62210507, chr15-75063015-75111821, chr17-56394563-56426848, chr17-73628003-73785724 
##     chrX-12964119-13052506, chr13-99843528-100041108, chr5-150378949-150639046, chr17-37892036-38054004, chr17-76680092-76823376, chr6-27739618-27809589, chr11-3845230-4029753, chr1-26551206-26655259, chr3-46956050-47079673, chr13-30902132-31050921 
## Negative:  chr10-29748217-30188048, chr11-11963882-12311106, chr2-238025629-238654437, chr12-2040792-2543978, chr5-148746350-148846783, chr8-124462285-124743503, chr4-169398456-169852902, chr3-187918733-188071492, chr1-85954066-86102776, chr7-134373321-134653428 
##     chr21-47439437-47564351, chr3-159477846-159666635, chr10-123746043-124013168, chr5-171951314-172387461, chr10-88408298-88591220, chr14-55030187-55276565, chr1-201404025-201578930, chr11-68764101-68857662, chr9-133630964-133763855, chr3-99551354-99701008 
##     chr3-57860735-58122482, chr2-33233409-33638371, chr2-36555701-36826369, chr8-38175192-38330619, chr11-119167282-119253784, chr2-1576127-1841989, chr1-8042255-8668389, chr1-203408193-203542290, chr5-146761781-146890764, chr3-134026398-134126541 
## PC_ 4 
## Positive:  chr2-33233409-33638371, chr14-22878347-23053746, chr11-95765800-96077401, chr2-196996112-197160488, chr12-11800725-12060196, chr15-85922308-86332558, chr3-187918733-188071492, chr20-43146161-43378352, chr19-16362565-16584689, chr3-151900196-152162028 
##     chr11-72629770-73137985, chr10-29748217-30188048, chr7-101434256-101689718, chr2-238025629-238654437, chr3-16320102-16566364, chr9-132613230-132836938, chr20-55947556-56082461, chr1-85954066-86102776, chr18-60746093-61014803, chr5-71401030-71607145 
##     chr12-2040792-2543978, chr1-120488069-120697349, chr9-97533568-97718653, chr10-26718387-26818735, chr1-198563799-198699870, chr4-40151421-40339107, chr15-101631431-101808005, chr5-88019240-88249803, chr18-13351849-13630652, chr18-56494640-56763252 
## Negative:  chr12-52526850-52685291, chr1-149783661-149861972, chr12-53240197-53407679, chr17-73628003-73785724, chr14-68911015-69182461, chr22-46390744-46514656, chr12-6375774-6514845, chr2-85114821-85203184, chr8-134202536-134354869, chr11-34606605-34708425 
##     chr10-73972293-74128717, chr10-105301977-105684589, chr19-11172240-11321305, chr17-48914346-49066625, chr17-79287388-79496032, chr1-201974297-202136423, chr12-76223928-76432927, chr21-44720317-44943783, chr1-16425986-16554013, chr17-17704339-17789196 
##     chr20-48836950-49212251, chr1-151890956-151980362, chr6-10372185-10432729, chr7-130551474-130811428, chr15-90536059-90778807, chr18-67946326-68196329, chr6-43728126-44052993, chr17-39765942-39862962, chr19-41208215-41231665, chr12-46736228-46972021 
## PC_ 5 
## Positive:  chr10-29748217-30188048, chr20-55947556-56082461, chr12-2040792-2543978, chr7-24850363-25022192, chr19-16362565-16584689, chr4-40151421-40339107, chr7-101434256-101689718, chr10-63482249-63874678, chr8-124462285-124743503, chr20-43146161-43378352 
##     chr2-173881734-174097506, chr4-144255760-144381376, chr10-88408298-88591220, chr14-59642956-59846076, chr2-28789090-29020323, chr5-52613920-52736716, chr3-73539202-73684272, chr10-71068273-71268191, chr12-52526850-52685291, chr9-522234-762996 
##     chr17-47642611-47862223, chr3-5163490-5256079, chr2-43054277-43604458, chr19-13093213-13217525, chr22-36121376-36319042, chr20-42733314-42858528, chr20-25170517-25318548, chr11-68764101-68857662, chr4-75222975-75324549, chr10-76929346-77036941 
## Negative:  chr11-95765800-96077401, chr8-128793698-128984602, chr16-29119830-29395565, chr7-47322089-47721406, chr21-46709650-46912365, chr2-102604838-102783705, chr16-11636074-11919163, chr16-89358961-89576370, chr3-11473793-11693721, chr19-1062413-1219498 
##     chr2-238025629-238654437, chr5-95033116-95227808, chr22-31598812-31704951, chr7-22599287-22771738, chr3-171756078-172043341, chr4-185230731-185401813, chr10-16975584-17131503, chr17-76218462-76425802, chr16-88217272-88316903, chr6-143139852-143275021 
##     chr15-78265092-78381598, chr9-127004676-127088085, chr1-85954066-86102776, chr9-37908158-38083981, chr9-112775234-112978986, chr5-42981248-43050805, chr4-2786592-2897361, chr13-114755129-114928596, chr6-43728126-44052993, chr11-121998354-122091940
## First two principal components, one row per sample (row names are the
## sample IDs used as column names when the Seurat object was created).
pc_scores <- Embeddings(obj, reduction = "pca")[, 1:2]

## Attach the biosample type by sample ID rather than by row position, so a
## differently ordered metadata table cannot mislabel points.
meta_row <- match(rownames(pc_scores), sampleInfo$Sample.ID)
stopifnot(!anyNA(meta_row))
pca.df <- data.frame(pc_scores,
                     celltype = sampleInfo$Biosample.type[meta_row])

## Colours are named so each biosample type keeps its colour regardless of
## factor level order.
celltype_colours <- c("Cell line" = "#8000FF",
                      "Primary cell" = "#FF00FF",
                      "Tissue" = "#FF8000")
ggplot(pca.df, aes(PC_1, PC_2, color = celltype)) + geom_point() +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.title = element_blank()) +
  xlab("PC1") + ylab("PC2")

Show the analysis environment

## Report the R version, platform, and attached/loaded packages.
print(utils::sessionInfo())
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Seurat_3.0.1  ape_5.3       cowplot_0.9.4 ggplot2_3.1.1
## 
## loaded via a namespace (and not attached):
##  [1] tsne_0.1-3          nlme_3.1-140        bitops_1.0-6       
##  [4] RColorBrewer_1.1-2  httr_1.4.0          sctransform_0.2.0  
##  [7] tools_3.6.0         R6_2.4.0            irlba_2.3.3        
## [10] KernSmooth_2.23-15  lazyeval_0.2.2      colorspace_1.4-1   
## [13] npsurv_0.4-0        withr_2.1.2         gridExtra_2.3      
## [16] tidyselect_0.2.5    compiler_3.6.0      plotly_4.9.0       
## [19] labeling_0.3        caTools_1.17.1.2    scales_1.0.0       
## [22] lmtest_0.9-37       ggridges_0.5.1      pbapply_1.4-0      
## [25] stringr_1.4.0       digest_0.6.19       rmarkdown_1.13     
## [28] R.utils_2.8.0       pkgconfig_2.0.2     htmltools_0.3.6    
## [31] bibtex_0.4.2        htmlwidgets_1.3     rlang_0.3.4        
## [34] rstudioapi_0.10     zoo_1.8-6           jsonlite_1.6       
## [37] ica_1.0-2           gtools_3.8.1        dplyr_0.8.1        
## [40] R.oo_1.22.0         magrittr_1.5        Matrix_1.2-17      
## [43] Rcpp_1.0.1          munsell_0.5.0       reticulate_1.12    
## [46] R.methodsS3_1.7.1   stringi_1.4.3       yaml_2.2.0         
## [49] gbRd_0.4-11         MASS_7.3-51.4       gplots_3.0.1.1     
## [52] Rtsne_0.15          plyr_1.8.4          grid_3.6.0         
## [55] parallel_3.6.0      gdata_2.18.0        listenv_0.7.0      
## [58] ggrepel_0.8.1       crayon_1.3.4        lattice_0.20-38    
## [61] splines_3.6.0       SDMTools_1.1-221.1  knitr_1.23         
## [64] pillar_1.4.1        igraph_1.2.4.1      future.apply_1.2.0 
## [67] reshape2_1.4.3      codetools_0.2-16    glue_1.3.1         
## [70] evaluate_0.14       lsei_1.2-0          metap_1.1          
## [73] data.table_1.12.2   png_0.1-7           Rdpack_0.11-0      
## [76] gtable_0.3.0        RANN_2.6.1          purrr_0.3.2        
## [79] tidyr_0.8.3         future_1.13.0       assertthat_0.2.1   
## [82] xfun_0.7            rsvd_1.0.1          survival_2.44-1.1  
## [85] viridisLite_0.3.0   tibble_2.1.3        cluster_2.0.9      
## [88] globals_0.12.4      fitdistrplus_1.0-14 ROCR_1.0-7