The following script represents the transcription factor motif-based analysis performed in our recent paper “An OCT2 / OCA-B / MEF2B Ternary Complex Controls the Activity and Architecture of an Essential Locus Control Region for Normal and Malignant Germinal Center B-cells”. This script runs through the following steps:
Please feel free to email me with any questions, comments or suggestions.
info at jchellmuth.com
library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
# make values object for manual scales:
color.values <- c("TSS"="#228B22","non-essential enhancer (NE)"="#808080","essential enhancer (EE)"="#0064C8")
Here, we use DNase-seq peaks to define constituent enhancers (CE) at the BCL6 LCR. First, we overlap DNase-seq peaks with the genomic ranges covered by our screen. Then, we focus on the top quartile peaks to avoid spurious regulatory elements. DNase-seq peaks are extended to 2kb to reflect all sorrounding TF biding events, histone marks and functional data (CRISPRi). Finally, overlapping CEs are merged.
# import DNase-seq peaks (Encode PCI-LY7 repliacte 1)
gr.peaks.all <- import("ENCFF328RHP.bed",format = "narrowPeak")
# get peaks in regions covered by screen:
gr.regions <- import("screen_targeted_regions.bed")
gr.peaks <- subsetByOverlaps(gr.peaks.all,gr.regions[gr.regions$name!="BCL6-TSS"])
# Note: BCL6-TSS range is exlcuded from overlap as this regulatory element will be defined manually
# top25% DNAse seq peaks:
gr.ce <- as(gr.peaks[gr.peaks$signalValue>quantile(gr.peaks$signalValue,0.75)],"UCSCData")
# limit the ensuing analysis to the LCR sensu stricto:
lcr <- makeGRangesFromDataFrame(data.frame(chr="chr3",start="187887600",end="187995200"))
gr.ce <- subsetByOverlaps(gr.ce,lcr)
# extend to 2kb and reduce overlapping ranges:
gr.ce <- as(reduce(promoters(gr.ce,upstream=(1000-75),downstream=(1000+75))),"UCSCData")
# note that two CE's are <100bp of each other. These are merged to one CE here:
distanceToNearest(gr.ce) %>% data.frame()
## queryHits subjectHits distance
## 1 1 2 14920
## 2 2 3 14740
## 3 3 4 4900
## 4 4 5 60
## 5 5 4 60
## 6 6 7 2600
## 7 7 6 2600
## 8 8 9 2980
## 9 9 10 1860
## 10 10 11 540
## 11 11 10 540
gr.ce[4] <- range(gr.ce[4:5])
gr.ce <- gr.ce[-5]
# Add BCL6 TSS (chr3:187745727)
# UCSC: ENST00000406870.6
# Refseq: NM_001706
# make GRanges object for BCL6 TSS - here defined at 2Kb downstream
gr.tss <- GRanges(seqnames = "chr3",ranges = IRanges(start = 187745727-2000,width = 2000))
# merge to CE granges
gr.ce <- c(gr.ce,gr.tss)
# name ranges
gr.ce$re <- c(paste0("CE",1:(length(gr.ce)-1)),"TSS")
names(gr.ce) <- gr.ce$re
## factorize:
gr.ce$re <- factor(gr.ce$re,levels = c("control","TSS",paste0("CE",1:(length(unique(gr.ce$re))-1))))
Load growth score / library data.
# load library file with normalized and cleaned count data (lifted over to hg38):
load("../Script2.1_normalization/hg38_library_counts_norm_clean_centered.Rda")
# save control gRNAs for later usage in statistical testing:
lib.contr <- libm[libm$category=="non-targeting control",]
# remove control gRNAs (which do not have genomic coordinates):
lib <- libm[-grep("control",libm$category),]
# make GRanges object for library (omitting dispensable columns):
gr.lib <- makeGRangesFromDataFrame(lib[,grep("start|end|norm|r1|r2|chromStart|chromEnd|category",colnames(lib),invert = T)],
start.field = "base_after_cut",end.field = "base_after_cut",keep.extra.columns = T)
Overlap gRNA data (lib) with RE/constituent enhancer coordinates (gr.re) to allow analysis of all gRNA growth scores per RE.
# select overlapping gRNAs (incl growth scores)
gr.ov <- subsetByOverlaps(gr.lib,gr.ce)
# assign names of overlapping REs:
hits <- findOverlaps(query = gr.lib,subject = gr.ce)
gr.ov$re <- gr.ce[subjectHits(hits)]$re
gr.ov$category <- gr.ce[subjectHits(hits)]$category
# make df for plots:
df.ov <- data.frame(gr.ov)
Cutoff to distinguish core-enhancer from non-essential enhancers: mean growth score of enhancer is smaller than the 1% quantile of neg. control gRNAs
# define cutoff:
cutoff <- quantile(lib.contr$gs[lib.contr$category=="non-targeting control"],probs=c(0.01,0.99))
# categorize enhancers:
t.summary <- df.ov %>% group_by(re) %>% summarize(median.gs=median(gs),mean.gs=mean(gs))
essential <- t.summary %>% filter(mean.gs<cutoff[1]) %>% pull(re)
## assign RE categories:
gr.ce$category <- "non-essential enhancer (NE)"
gr.ce$category[gr.ce$re %in% essential] <- "essential enhancer (EE)"
gr.ce$category[gr.ce$re=="TSS"] <- "TSS"
# bind to overlapped growth score data:
df.ov <- left_join(df.ov,data.frame(mcols(gr.ce)),by=c("re"="re"))
# boxplot
ggplot(df.ov,aes(x=re,y=gs,fill=category))+
scale_x_discrete()+ # note: required in order to add annotate first
annotate("rect",ymin=cutoff[1],ymax=cutoff[2],xmin=-Inf,xmax=Inf,alpha=0.3,color="grey")+
geom_hline(yintercept = 0,color="darkgrey")+
geom_boxplot(outlier.shape = NA,width=0.6)+
coord_cartesian(ylim = c(-4,0.4))+
scale_fill_manual(values = color.values)+
labs(x="",fill="",y="CRISPRi\ngrowth score")+
theme_linedraw()+
theme(panel.grid = element_blank(),strip.background = element_rect(fill = "grey85"))+
theme(axis.text.x = element_text(angle=45,hjust=1))
# add colors to categorized regulatory elements
mcols(gr.ce) <- left_join(data.frame(mcols(gr.ce)),data.frame(itemRgb=color.values,category=names(color.values)))
## Joining, by = "category"
## Warning: Column `category` joining character vector and factor, coercing into
## character vector
# add trackline
gr.ce@trackLine <- as('track type=bed name="LCR constituent enhancers categorized" description="LCR constituent enhancers with 2kb TSS - categorized" visibility=squish itemRgb=on',"BasicTrackLine")
# export bed
export(gr.ce,con="LCR.constituent.enhancers.categorized.bed")
# save Rda
save(gr.ce,file="LCR.constituent.enhancers.categorized.Rda")
### BASH ##########################################################################################
# for bed to bigBed conversion, remove header and sort
tail -n +2 LCR.constituent.enhancers.categorized.bed | sort -k1,1 -k2,2n > LCR.constituent.enhancers.categorized.sorted.bed
/Applications/kentUtils/bedToBigBed LCR.constituent.enhancers.categorized.sorted.bed ~/UCSC/hg38.chrom.sizes LCR.constituent.enhancers.categorized.bb
# to display categorized CEs as highlights (shaded areas) in UCSC, paste the following output into UCSC link:
paste0("&highlight=",paste0(paste0("hg38.chr3%3A",start(gr.ce),"-",end(gr.ce),"%23",gsub("#","",gr.ce$itemRgb)),collapse = "%7C"))
## [1] "&highlight=hg38.chr3%3A187900071-187903310%230064C8%7Chg38.chr3%3A187918231-187920230%23808080%7Chg38.chr3%3A187934971-187936970%23808080%7Chg38.chr3%3A187941871-187947250%23808080%7Chg38.chr3%3A187957831-187959830%230064C8%7Chg38.chr3%3A187962431-187964430%23808080%7Chg38.chr3%3A187968751-187970750%230064C8%7Chg38.chr3%3A187973731-187975950%23808080%7Chg38.chr3%3A187977811-187980750%230064C8%7Chg38.chr3%3A187981291-187983290%23808080%7Chg38.chr3%3A187743727-187745726%23228B22"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rtracklayer_1.44.4 GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [4] IRanges_2.18.3 S4Vectors_0.22.1 BiocGenerics_0.30.0
## [7] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
## [10] purrr_0.3.3 readr_1.3.1 tidyr_1.0.0
## [13] tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.44.0 httr_1.4.1
## [3] jsonlite_1.6 modelr_0.1.5
## [5] assertthat_0.2.1 GenomeInfoDbData_1.2.1
## [7] cellranger_1.1.0 Rsamtools_2.0.3
## [9] yaml_2.2.0 pillar_1.4.2
## [11] backports_1.1.5 lattice_0.20-38
## [13] glue_1.3.1 digest_0.6.23
## [15] XVector_0.24.0 rvest_0.3.5
## [17] colorspace_1.4-1 htmltools_0.4.0
## [19] Matrix_1.2-18 XML_3.98-1.20
## [21] pkgconfig_2.0.3 broom_0.5.2
## [23] haven_2.2.0 zlibbioc_1.30.0
## [25] scales_1.1.0 BiocParallel_1.18.1
## [27] generics_0.0.2 farver_2.0.1
## [29] withr_2.1.2 SummarizedExperiment_1.14.1
## [31] lazyeval_0.2.2 cli_1.1.0
## [33] magrittr_1.5 crayon_1.3.4
## [35] readxl_1.3.1 evaluate_0.14
## [37] fs_1.3.1 nlme_3.1-142
## [39] xml2_1.2.2 tools_3.6.1
## [41] hms_0.5.2 lifecycle_0.1.0
## [43] matrixStats_0.55.0 munsell_0.5.0
## [45] reprex_0.3.0 DelayedArray_0.10.0
## [47] Biostrings_2.52.0 compiler_3.6.1
## [49] rlang_0.4.2 grid_3.6.1
## [51] RCurl_1.95-4.12 rstudioapi_0.10
## [53] bitops_1.0-6 labeling_0.3
## [55] rmarkdown_1.18 gtable_0.3.0
## [57] DBI_1.0.0 R6_2.4.1
## [59] GenomicAlignments_1.20.1 lubridate_1.7.4
## [61] knitr_1.26 zeallot_0.1.0
## [63] stringi_1.4.3 Rcpp_1.0.3
## [65] vctrs_0.2.0 dbplyr_1.4.2
## [67] tidyselect_0.2.5 xfun_0.11