Introduction

The following script represents the transcription factor motif-based analysis performed in our recent paper “An OCT2 / OCA-B / MEF2B Ternary Complex Controls the Activity and Architecture of an Essential Locus Control Region for Normal and Malignant Germinal Center B-cells”. This script runs through the following steps:

defintion of BCL6-LCR constituent enhancers (CEs) using Encode DNase-seq data
categorization of CEs as essential enhancers (EE) or non-essential enhancers (NE) using CRISPRi screening data

Please feel free to email me with any questions, comments or suggestions.
info at jchellmuth.com

Load required libraries and set color values

library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
# make values object for manual scales:
color.values <- c("TSS"="#228B22","non-essential enhancer (NE)"="#808080","essential enhancer (EE)"="#0064C8")

Constituent enhancer definition

Here, we use DNase-seq peaks to define constituent enhancers (CE) at the BCL6 LCR. First, we overlap DNase-seq peaks with the genomic ranges covered by our screen. Then, we focus on the top quartile peaks to avoid spurious regulatory elements. DNase-seq peaks are extended to 2kb to reflect all sorrounding TF biding events, histone marks and functional data (CRISPRi). Finally, overlapping CEs are merged.

# import DNase-seq peaks (Encode PCI-LY7 repliacte 1)
gr.peaks.all <- import("ENCFF328RHP.bed",format = "narrowPeak")

# get peaks in regions covered by screen:
gr.regions <- import("screen_targeted_regions.bed")
gr.peaks <- subsetByOverlaps(gr.peaks.all,gr.regions[gr.regions$name!="BCL6-TSS"])
# Note: BCL6-TSS range is exlcuded from overlap as this regulatory element will be defined manually

# top25% DNAse seq peaks:
gr.ce <- as(gr.peaks[gr.peaks$signalValue>quantile(gr.peaks$signalValue,0.75)],"UCSCData")

# limit the ensuing analysis to the LCR sensu stricto:
lcr <- makeGRangesFromDataFrame(data.frame(chr="chr3",start="187887600",end="187995200"))
gr.ce <- subsetByOverlaps(gr.ce,lcr)

# extend to 2kb and reduce overlapping ranges:
gr.ce <- as(reduce(promoters(gr.ce,upstream=(1000-75),downstream=(1000+75))),"UCSCData")

# note that two CE's are <100bp of each other. These are merged to one CE here:
distanceToNearest(gr.ce) %>% data.frame()

##    queryHits subjectHits distance
## 1          1           2    14920
## 2          2           3    14740
## 3          3           4     4900
## 4          4           5       60
## 5          5           4       60
## 6          6           7     2600
## 7          7           6     2600
## 8          8           9     2980
## 9          9          10     1860
## 10        10          11      540
## 11        11          10      540

gr.ce[4] <- range(gr.ce[4:5])
gr.ce <- gr.ce[-5] 

# Add BCL6 TSS (chr3:187745727)
# UCSC: ENST00000406870.6
# Refseq: NM_001706
# make GRanges object for BCL6 TSS - here defined at 2Kb downstream
gr.tss <- GRanges(seqnames = "chr3",ranges = IRanges(start = 187745727-2000,width = 2000))
# merge to CE granges
gr.ce <- c(gr.ce,gr.tss)
# name ranges
gr.ce$re <- c(paste0("CE",1:(length(gr.ce)-1)),"TSS")
names(gr.ce) <- gr.ce$re
## factorize:
gr.ce$re <- factor(gr.ce$re,levels = c("control","TSS",paste0("CE",1:(length(unique(gr.ce$re))-1))))

Categorize constituent enhancers

Load growth score / library data.

# load library file with normalized and cleaned count data (lifted over to hg38):
load("../Script2.1_normalization/hg38_library_counts_norm_clean_centered.Rda")
# save control gRNAs for later usage in statistical testing:
lib.contr <- libm[libm$category=="non-targeting control",]
# remove control gRNAs (which do not have genomic coordinates):
lib <- libm[-grep("control",libm$category),]
# make GRanges object for library (omitting dispensable columns):
gr.lib <- makeGRangesFromDataFrame(lib[,grep("start|end|norm|r1|r2|chromStart|chromEnd|category",colnames(lib),invert = T)],
                                   start.field = "base_after_cut",end.field = "base_after_cut",keep.extra.columns = T)

Overlap gRNA data (lib) with RE/constituent enhancer coordinates (gr.re) to allow analysis of all gRNA growth scores per RE.

# select overlapping gRNAs (incl growth scores)
gr.ov <- subsetByOverlaps(gr.lib,gr.ce)
# assign names of overlapping REs:
hits <- findOverlaps(query = gr.lib,subject = gr.ce)
gr.ov$re <- gr.ce[subjectHits(hits)]$re
gr.ov$category <- gr.ce[subjectHits(hits)]$category
# make df for plots:
df.ov <- data.frame(gr.ov)

Plots

Cutoff to distinguish core-enhancer from non-essential enhancers: mean growth score of enhancer is smaller than the 1% quantile of neg. control gRNAs

# define cutoff:
cutoff <- quantile(lib.contr$gs[lib.contr$category=="non-targeting control"],probs=c(0.01,0.99))

# categorize enhancers:
t.summary <- df.ov %>% group_by(re) %>% summarize(median.gs=median(gs),mean.gs=mean(gs))
essential <- t.summary %>% filter(mean.gs<cutoff[1]) %>% pull(re)

## assign RE categories:
gr.ce$category <- "non-essential enhancer (NE)"
gr.ce$category[gr.ce$re %in% essential] <- "essential enhancer (EE)"
gr.ce$category[gr.ce$re=="TSS"] <- "TSS"

# bind to overlapped growth score data:
df.ov <- left_join(df.ov,data.frame(mcols(gr.ce)),by=c("re"="re"))

# boxplot
ggplot(df.ov,aes(x=re,y=gs,fill=category))+
  scale_x_discrete()+ # note: required in order to add annotate first
  annotate("rect",ymin=cutoff[1],ymax=cutoff[2],xmin=-Inf,xmax=Inf,alpha=0.3,color="grey")+
  geom_hline(yintercept = 0,color="darkgrey")+
  geom_boxplot(outlier.shape = NA,width=0.6)+
  coord_cartesian(ylim = c(-4,0.4))+
  scale_fill_manual(values = color.values)+
  labs(x="",fill="",y="CRISPRi\ngrowth score")+
  theme_linedraw()+
  theme(panel.grid = element_blank(),strip.background = element_rect(fill = "grey85"))+
  theme(axis.text.x = element_text(angle=45,hjust=1))

Export bed files

# add colors to categorized regulatory elements
mcols(gr.ce) <- left_join(data.frame(mcols(gr.ce)),data.frame(itemRgb=color.values,category=names(color.values)))

## Joining, by = "category"

## Warning: Column `category` joining character vector and factor, coercing into
## character vector

# add trackline
gr.ce@trackLine <- as('track type=bed name="LCR constituent enhancers categorized" description="LCR constituent enhancers with 2kb TSS - categorized" visibility=squish itemRgb=on',"BasicTrackLine")
# export bed
export(gr.ce,con="LCR.constituent.enhancers.categorized.bed")
# save Rda
save(gr.ce,file="LCR.constituent.enhancers.categorized.Rda")

bedToBigBeds for UCSC TrackHub

### BASH ##########################################################################################
# for bed to bigBed conversion, remove header and sort
tail -n +2 LCR.constituent.enhancers.categorized.bed | sort -k1,1 -k2,2n > LCR.constituent.enhancers.categorized.sorted.bed
/Applications/kentUtils/bedToBigBed LCR.constituent.enhancers.categorized.sorted.bed ~/UCSC/hg38.chrom.sizes LCR.constituent.enhancers.categorized.bb

UCSC track highlights

# to display categorized CEs as highlights (shaded areas) in UCSC, paste the following output into UCSC link:
paste0("&highlight=",paste0(paste0("hg38.chr3%3A",start(gr.ce),"-",end(gr.ce),"%23",gsub("#","",gr.ce$itemRgb)),collapse = "%7C"))

## [1] "&highlight=hg38.chr3%3A187900071-187903310%230064C8%7Chg38.chr3%3A187918231-187920230%23808080%7Chg38.chr3%3A187934971-187936970%23808080%7Chg38.chr3%3A187941871-187947250%23808080%7Chg38.chr3%3A187957831-187959830%230064C8%7Chg38.chr3%3A187962431-187964430%23808080%7Chg38.chr3%3A187968751-187970750%230064C8%7Chg38.chr3%3A187973731-187975950%23808080%7Chg38.chr3%3A187977811-187980750%230064C8%7Chg38.chr3%3A187981291-187983290%23808080%7Chg38.chr3%3A187743727-187745726%23228B22"

Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rtracklayer_1.44.4   GenomicRanges_1.36.1 GenomeInfoDb_1.20.0 
##  [4] IRanges_2.18.3       S4Vectors_0.22.1     BiocGenerics_0.30.0 
##  [7] forcats_0.4.0        stringr_1.4.0        dplyr_0.8.3         
## [10] purrr_0.3.3          readr_1.3.1          tidyr_1.0.0         
## [13] tibble_2.1.3         ggplot2_3.2.1        tidyverse_1.3.0     
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.44.0              httr_1.4.1                 
##  [3] jsonlite_1.6                modelr_0.1.5               
##  [5] assertthat_0.2.1            GenomeInfoDbData_1.2.1     
##  [7] cellranger_1.1.0            Rsamtools_2.0.3            
##  [9] yaml_2.2.0                  pillar_1.4.2               
## [11] backports_1.1.5             lattice_0.20-38            
## [13] glue_1.3.1                  digest_0.6.23              
## [15] XVector_0.24.0              rvest_0.3.5                
## [17] colorspace_1.4-1            htmltools_0.4.0            
## [19] Matrix_1.2-18               XML_3.98-1.20              
## [21] pkgconfig_2.0.3             broom_0.5.2                
## [23] haven_2.2.0                 zlibbioc_1.30.0            
## [25] scales_1.1.0                BiocParallel_1.18.1        
## [27] generics_0.0.2              farver_2.0.1               
## [29] withr_2.1.2                 SummarizedExperiment_1.14.1
## [31] lazyeval_0.2.2              cli_1.1.0                  
## [33] magrittr_1.5                crayon_1.3.4               
## [35] readxl_1.3.1                evaluate_0.14              
## [37] fs_1.3.1                    nlme_3.1-142               
## [39] xml2_1.2.2                  tools_3.6.1                
## [41] hms_0.5.2                   lifecycle_0.1.0            
## [43] matrixStats_0.55.0          munsell_0.5.0              
## [45] reprex_0.3.0                DelayedArray_0.10.0        
## [47] Biostrings_2.52.0           compiler_3.6.1             
## [49] rlang_0.4.2                 grid_3.6.1                 
## [51] RCurl_1.95-4.12             rstudioapi_0.10            
## [53] bitops_1.0-6                labeling_0.3               
## [55] rmarkdown_1.18              gtable_0.3.0               
## [57] DBI_1.0.0                   R6_2.4.1                   
## [59] GenomicAlignments_1.20.1    lubridate_1.7.4            
## [61] knitr_1.26                  zeallot_0.1.0              
## [63] stringi_1.4.3               Rcpp_1.0.3                 
## [65] vctrs_0.2.0                 dbplyr_1.4.2               
## [67] tidyselect_0.2.5            xfun_0.11

Identification of BCL6-LCR essential enhancers

BCL6 LCR paper script 2.2

Johannes C. Hellmuth

2/6/2020