Preparation

Set up the R environment by including a few necessary R libraries

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(reshape2)
library(RColorBrewer)

Data import

# Sample annotation table. read.csv2() uses ";" as the field separator and
# "," as the decimal mark.
sampleInfo <- read.csv2(file = "SE_samples.csv")

# SE and TE region tables, read with the read.table() defaults (fields split
# on whitespace). Later code refers to their columns by position (4 and 5).
SE_enc <- read.table(file = "SE_enc.bed.gz")
TE_enc <- read.table(file = "TE_enc.bed.gz")

SE vs TE: median size

First, compute the median size of SEs/TEs in each cell type/tissue, and format the result as a data frame annotated with the sample information.

# Median of column 5 (used as the region size) for each value of column 4
# (used as the sample ID). tapply() coerces the grouping vector to a factor
# itself, so no explicit conversion is needed.
SE_enc.median_size <- tapply(SE_enc[[5]], SE_enc[[4]], median)
SE_enc.median_size.df <- data.frame(
  Sample.ID = names(SE_enc.median_size),
  type = "SE",
  size = SE_enc.median_size
)
TE_enc.median_size <- tapply(TE_enc[[5]], TE_enc[[4]], median)
TE_enc.median_size.df <- data.frame(
  Sample.ID = names(TE_enc.median_size),
  type = "TE",
  size = TE_enc.median_size
)

# Stack the SE and TE rows into one long table.
median_size <- rbind(SE_enc.median_size.df, TE_enc.median_size.df)

# Attach the sample annotation (matching on Sample.ID) and keep only the rows
# whose biosample type is a cell line, primary cell, or tissue.
data <- subset(
  merge(median_size, sampleInfo, by = "Sample.ID"),
  Biosample.type %in% c("Cell line", "Primary cell", "Tissue")
)

Use box plots to compare the median size of SEs and TEs, with the samples split into cell lines, primary cells, and tissues. Here, the median size is in bp, and the values are plotted on a log scale.

# Box plot of the per-sample median SE/TE size (log10 y axis), one panel per
# biosample type, with every sample overlaid as a jittered point.
ggplot(data, aes(x = type, y = size, color = type)) +
  # Hide the box layer's outlier points: the jitter layer below already draws
  # every sample, so showing them here would plot those samples twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0 restricts the jitter to the x direction, so each point stays at
  # its true y value.
  geom_jitter(alpha = 0.3, size = 0.3, height = 0) +
  facet_grid(~ Biosample.type) +
  # Use the second and third colours of the "Set1" palette for SE and TE.
  scale_colour_manual(values = brewer.pal(7, "Set1")[2:3]) +
  scale_y_log10() +
  theme(legend.title = element_blank()) +
  xlab("") + ylab("Median size (bp)")

Here, the values are plotted on a linear scale.

# Box plot of the per-sample median SE/TE size (linear y axis), one panel per
# biosample type, with every sample overlaid as a jittered point.
ggplot(data, aes(x = type, y = size, color = type)) +
  # Hide the box layer's outlier points: the jitter layer below already draws
  # every sample, so showing them here would plot those samples twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0 restricts the jitter to the x direction, so each point stays at
  # its true y value.
  geom_jitter(alpha = 0.3, size = 0.3, height = 0) +
  facet_grid(~ Biosample.type) +
  # Use the second and third colours of the "Set1" palette for SE and TE.
  scale_colour_manual(values = brewer.pal(7, "Set1")[2:3]) +
  theme(legend.title = element_blank()) +
  xlab("") + ylab("Median size (bp)")

SE vs TE: numbers

Get the number of SEs/TEs in each cell type/tissue, and format the data as a data frame with additional information.

# Count the rows of each table per value of column 4 (used as the sample ID).
# c() turns the one-dimensional table into a plain named integer vector.
.SE.num <- table(SE_enc[[4]])
SE.num <- data.frame(
  Sample.ID = names(.SE.num),
  number = c(.SE.num),
  type = "SE"
)
.TE.num <- table(TE_enc[[4]])
TE.num <- data.frame(
  Sample.ID = names(.TE.num),
  number = c(.TE.num),
  type = "TE"
)

# Stack the SE and TE counts into one long table.
num <- rbind(SE.num, TE.num)

# Attach the sample annotation (matching on Sample.ID) and keep only the rows
# whose biosample type is a cell line, primary cell, or tissue.
data <- subset(
  merge(num, sampleInfo, by = "Sample.ID"),
  Biosample.type %in% c("Cell line", "Primary cell", "Tissue")
)

Use box plots to compare the number of SEs and TEs, with the samples split into cell lines, primary cells, and tissues. Here, the values are plotted on a log scale.

# Box plot of the per-sample SE/TE counts (log10 y axis), one panel per
# biosample type, with every sample overlaid as a jittered point.
ggplot(data, aes(x = type, y = number, color = type)) +
  # Hide the box layer's outlier points: the jitter layer below already draws
  # every sample, so showing them here would plot those samples twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0 restricts the jitter to the x direction, so each point stays at
  # its true count.
  geom_jitter(alpha = 0.3, size = 0.3, height = 0) +
  scale_y_log10() +
  facet_grid(~ Biosample.type) +
  # Use the second and third colours of the "Set1" palette for SE and TE.
  scale_colour_manual(values = brewer.pal(7, "Set1")[2:3]) +
  theme(legend.title = element_blank()) +
  xlab("") + ylab("Number")

Here, the values are plotted on a linear scale.

# Box plot of the per-sample SE/TE counts (linear y axis), one panel per
# biosample type, with every sample overlaid as a jittered point.
ggplot(data, aes(x = type, y = number, color = type)) +
  # Hide the box layer's outlier points: the jitter layer below already draws
  # every sample, so showing them here would plot those samples twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0 restricts the jitter to the x direction, so each point stays at
  # its true count.
  geom_jitter(alpha = 0.3, size = 0.3, height = 0) +
  facet_grid(~ Biosample.type) +
  # Use the second and third colours of the "Set1" palette for SE and TE.
  scale_colour_manual(values = brewer.pal(7, "Set1")[2:3]) +
  theme(legend.title = element_blank()) +
  xlab("") + ylab("Number")

Show the analysis environment

# Print the R version, platform, and attached/loaded package versions used
# for this analysis.
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] RColorBrewer_1.1-2 reshape2_1.4.3     cowplot_0.9.4     
## [4] ggplot2_3.1.1     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1       pillar_1.4.1     compiler_3.6.0   plyr_1.8.4      
##  [5] tools_3.6.0      digest_0.6.19    evaluate_0.14    tibble_2.1.3    
##  [9] gtable_0.3.0     pkgconfig_2.0.2  rlang_0.3.4      rstudioapi_0.10 
## [13] yaml_2.2.0       xfun_0.7         withr_2.1.2      stringr_1.4.0   
## [17] dplyr_0.8.1      knitr_1.23       grid_3.6.0       tidyselect_0.2.5
## [21] glue_1.3.1       R6_2.4.0         rmarkdown_1.13   purrr_0.3.2     
## [25] magrittr_1.5     scales_1.0.0     htmltools_0.3.6  assertthat_0.2.1
## [29] colorspace_1.4-1 labeling_0.3     stringi_1.4.3    lazyeval_0.2.2  
## [33] munsell_0.5.0    crayon_1.3.4