Introduction

The following script represents the transcription factor motif-based analysis performed in our recent paper “An OCT2 / OCA-B / MEF2B Ternary Complex Controls the Activity and Architecture of an Essential Locus Control Region for Normal and Malignant Germinal Center B-cells”. This script runs through the following steps:

Please feel free to email me with any questions, comments or suggestions.
info at jchellmuth.com

Load required libraries and set color values

library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
# make values object for manual scales:
color.values <- c("TSS"="#228B22","non-essential enhancer (NE)"="#808080","essential enhancer (EE)"="#0064C8")

Constituent enhancer definition

Here, we use DNase-seq peaks to define constituent enhancers (CE) at the BCL6 LCR. First, we overlap DNase-seq peaks with the genomic ranges covered by our screen. Then, we focus on the top quartile peaks to avoid spurious regulatory elements. DNase-seq peaks are extended to 2kb to reflect all sorrounding TF biding events, histone marks and functional data (CRISPRi). Finally, overlapping CEs are merged.

# import DNase-seq peaks (Encode PCI-LY7 repliacte 1)
gr.peaks.all <- import("ENCFF328RHP.bed",format = "narrowPeak")

# get peaks in regions covered by screen:
gr.regions <- import("screen_targeted_regions.bed")
gr.peaks <- subsetByOverlaps(gr.peaks.all,gr.regions[gr.regions$name!="BCL6-TSS"])
# Note: BCL6-TSS range is exlcuded from overlap as this regulatory element will be defined manually

# top25% DNAse seq peaks:
gr.ce <- as(gr.peaks[gr.peaks$signalValue>quantile(gr.peaks$signalValue,0.75)],"UCSCData")

# limit the ensuing analysis to the LCR sensu stricto:
lcr <- makeGRangesFromDataFrame(data.frame(chr="chr3",start="187887600",end="187995200"))
gr.ce <- subsetByOverlaps(gr.ce,lcr)

# extend to 2kb and reduce overlapping ranges:
gr.ce <- as(reduce(promoters(gr.ce,upstream=(1000-75),downstream=(1000+75))),"UCSCData")

# note that two CE's are <100bp of each other. These are merged to one CE here:
distanceToNearest(gr.ce) %>% data.frame()
##    queryHits subjectHits distance
## 1          1           2    14920
## 2          2           3    14740
## 3          3           4     4900
## 4          4           5       60
## 5          5           4       60
## 6          6           7     2600
## 7          7           6     2600
## 8          8           9     2980
## 9          9          10     1860
## 10        10          11      540
## 11        11          10      540
gr.ce[4] <- range(gr.ce[4:5])
gr.ce <- gr.ce[-5] 

# Add BCL6 TSS (chr3:187745727)
# UCSC: ENST00000406870.6
# Refseq: NM_001706
# make GRanges object for BCL6 TSS - here defined at 2Kb downstream
gr.tss <- GRanges(seqnames = "chr3",ranges = IRanges(start = 187745727-2000,width = 2000))
# merge to CE granges
gr.ce <- c(gr.ce,gr.tss)
# name ranges
gr.ce$re <- c(paste0("CE",1:(length(gr.ce)-1)),"TSS")
names(gr.ce) <- gr.ce$re
## factorize:
gr.ce$re <- factor(gr.ce$re,levels = c("control","TSS",paste0("CE",1:(length(unique(gr.ce$re))-1))))

Categorize constituent enhancers

Load growth score / library data.

# load library file with normalized and cleaned count data (lifted over to hg38):
load("../Script2.1_normalization/hg38_library_counts_norm_clean_centered.Rda")
# save control gRNAs for later usage in statistical testing:
lib.contr <- libm[libm$category=="non-targeting control",]
# remove control gRNAs (which do not have genomic coordinates):
lib <- libm[-grep("control",libm$category),]
# make GRanges object for library (omitting dispensable columns):
gr.lib <- makeGRangesFromDataFrame(lib[,grep("start|end|norm|r1|r2|chromStart|chromEnd|category",colnames(lib),invert = T)],
                                   start.field = "base_after_cut",end.field = "base_after_cut",keep.extra.columns = T)

Overlap gRNA data (lib) with RE/constituent enhancer coordinates (gr.re) to allow analysis of all gRNA growth scores per RE.

# select overlapping gRNAs (incl growth scores)
gr.ov <- subsetByOverlaps(gr.lib,gr.ce)
# assign names of overlapping REs:
hits <- findOverlaps(query = gr.lib,subject = gr.ce)
gr.ov$re <- gr.ce[subjectHits(hits)]$re
gr.ov$category <- gr.ce[subjectHits(hits)]$category
# make df for plots:
df.ov <- data.frame(gr.ov)

Plots

Cutoff to distinguish core-enhancer from non-essential enhancers: mean growth score of enhancer is smaller than the 1% quantile of neg. control gRNAs

# define cutoff:
cutoff <- quantile(lib.contr$gs[lib.contr$category=="non-targeting control"],probs=c(0.01,0.99))

# categorize enhancers:
t.summary <- df.ov %>% group_by(re) %>% summarize(median.gs=median(gs),mean.gs=mean(gs))
essential <- t.summary %>% filter(mean.gs<cutoff[1]) %>% pull(re)

## assign RE categories:
gr.ce$category <- "non-essential enhancer (NE)"
gr.ce$category[gr.ce$re %in% essential] <- "essential enhancer (EE)"
gr.ce$category[gr.ce$re=="TSS"] <- "TSS"

# bind to overlapped growth score data:
df.ov <- left_join(df.ov,data.frame(mcols(gr.ce)),by=c("re"="re"))

# boxplot
ggplot(df.ov,aes(x=re,y=gs,fill=category))+
  scale_x_discrete()+ # note: required in order to add annotate first
  annotate("rect",ymin=cutoff[1],ymax=cutoff[2],xmin=-Inf,xmax=Inf,alpha=0.3,color="grey")+
  geom_hline(yintercept = 0,color="darkgrey")+
  geom_boxplot(outlier.shape = NA,width=0.6)+
  coord_cartesian(ylim = c(-4,0.4))+
  scale_fill_manual(values = color.values)+
  labs(x="",fill="",y="CRISPRi\ngrowth score")+
  theme_linedraw()+
  theme(panel.grid = element_blank(),strip.background = element_rect(fill = "grey85"))+
  theme(axis.text.x = element_text(angle=45,hjust=1))

Export bed files

# add colors to categorized regulatory elements
mcols(gr.ce) <- left_join(data.frame(mcols(gr.ce)),data.frame(itemRgb=color.values,category=names(color.values)))
## Joining, by = "category"
## Warning: Column `category` joining character vector and factor, coercing into
## character vector
# add trackline
gr.ce@trackLine <- as('track type=bed name="LCR constituent enhancers categorized" description="LCR constituent enhancers with 2kb TSS - categorized" visibility=squish itemRgb=on',"BasicTrackLine")
# export bed
export(gr.ce,con="LCR.constituent.enhancers.categorized.bed")
# save Rda
save(gr.ce,file="LCR.constituent.enhancers.categorized.Rda")

bedToBigBeds for UCSC TrackHub

### BASH ##########################################################################################
# for bed to bigBed conversion, remove header and sort
tail -n +2 LCR.constituent.enhancers.categorized.bed | sort -k1,1 -k2,2n > LCR.constituent.enhancers.categorized.sorted.bed
/Applications/kentUtils/bedToBigBed LCR.constituent.enhancers.categorized.sorted.bed ~/UCSC/hg38.chrom.sizes LCR.constituent.enhancers.categorized.bb

UCSC track highlights

# to display categorized CEs as highlights (shaded areas) in UCSC, paste the following output into UCSC link:
paste0("&highlight=",paste0(paste0("hg38.chr3%3A",start(gr.ce),"-",end(gr.ce),"%23",gsub("#","",gr.ce$itemRgb)),collapse = "%7C"))
## [1] "&highlight=hg38.chr3%3A187900071-187903310%230064C8%7Chg38.chr3%3A187918231-187920230%23808080%7Chg38.chr3%3A187934971-187936970%23808080%7Chg38.chr3%3A187941871-187947250%23808080%7Chg38.chr3%3A187957831-187959830%230064C8%7Chg38.chr3%3A187962431-187964430%23808080%7Chg38.chr3%3A187968751-187970750%230064C8%7Chg38.chr3%3A187973731-187975950%23808080%7Chg38.chr3%3A187977811-187980750%230064C8%7Chg38.chr3%3A187981291-187983290%23808080%7Chg38.chr3%3A187743727-187745726%23228B22"

Session info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rtracklayer_1.44.4   GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 
##  [4] IRanges_2.18.3       S4Vectors_0.22.1     BiocGenerics_0.30.0 
##  [7] forcats_0.4.0        stringr_1.4.0        dplyr_0.8.3         
## [10] purrr_0.3.3          readr_1.3.1          tidyr_1.0.0         
## [13] tibble_2.1.3         ggplot2_3.2.1        tidyverse_1.3.0     
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.44.0              httr_1.4.1                 
##  [3] jsonlite_1.6                modelr_0.1.5               
##  [5] assertthat_0.2.1            GenomeInfoDbData_1.2.1     
##  [7] cellranger_1.1.0            Rsamtools_2.0.3            
##  [9] yaml_2.2.0                  pillar_1.4.2               
## [11] backports_1.1.5             lattice_0.20-38            
## [13] glue_1.3.1                  digest_0.6.23              
## [15] XVector_0.24.0              rvest_0.3.5                
## [17] colorspace_1.4-1            htmltools_0.4.0            
## [19] Matrix_1.2-18               XML_3.98-1.20              
## [21] pkgconfig_2.0.3             broom_0.5.2                
## [23] haven_2.2.0                 zlibbioc_1.30.0            
## [25] scales_1.1.0                BiocParallel_1.18.1        
## [27] generics_0.0.2              farver_2.0.1               
## [29] withr_2.1.2                 SummarizedExperiment_1.14.1
## [31] lazyeval_0.2.2              cli_1.1.0                  
## [33] magrittr_1.5                crayon_1.3.4               
## [35] readxl_1.3.1                evaluate_0.14              
## [37] fs_1.3.1                    nlme_3.1-142               
## [39] xml2_1.2.2                  tools_3.6.1                
## [41] hms_0.5.2                   lifecycle_0.1.0            
## [43] matrixStats_0.55.0          munsell_0.5.0              
## [45] reprex_0.3.0                DelayedArray_0.10.0        
## [47] Biostrings_2.52.0           compiler_3.6.1             
## [49] rlang_0.4.2                 grid_3.6.1                 
## [51] RCurl_1.95-4.12             rstudioapi_0.10            
## [53] bitops_1.0-6                labeling_0.3               
## [55] rmarkdown_1.18              gtable_0.3.0               
## [57] DBI_1.0.0                   R6_2.4.1                   
## [59] GenomicAlignments_1.20.1    lubridate_1.7.4            
## [61] knitr_1.26                  zeallot_0.1.0              
## [63] stringi_1.4.3               Rcpp_1.0.3                 
## [65] vctrs_0.2.0                 dbplyr_1.4.2               
## [67] tidyselect_0.2.5            xfun_0.11