Get Target sequence
# BCL6-LCR coordinates: chr3:187605559-187854096 (provided by Raj, email Dec 20 2016)
target_region <- "BCL6-LCR"
chr <- "chr3"
start <- 187605559
end <- 187854096
lcr <- Views(BSgenome.Hsapiens.UCSC.hg19[[chr]],start=start,end=end)
# region length:
cat((end-start)/1000," kb")
## 248.537 kb
Identify all potential PAMs and gRNAs
# Find all gRNAs + PAMs in plus sequence
PAM <- DNAString(paste(c(rep("N",21),"GG"),collapse = ""))
gRNA.p <- matchPattern(PAM,lcr,fixed = F)
names(gRNA.p)[1:length(gRNA.p)] <- "+"
# Find all gRNAs + PAMs in minus sequence
PAM.rc <- reverseComplement(PAM)
gRNA.m <- matchPattern(PAM.rc,lcr,fixed = F)
names(gRNA.m)[1:length(gRNA.m)] <- "-"
# combine + and - strand:
gRNA <- c(gRNA.p,gRNA.m)
df.gRNA <- data.frame(id=1:length(gRNA),
target_seq_plus=gRNA,
target_chr=chr,
target_start=start(gRNA),
target_end=end(gRNA),
strand=names(gRNA))
# transform select columns from factors to characters:
df.gRNA$target_chr <- as.character(df.gRNA$target_chr)
# reverseComplement - strand target_seq
df.gRNA$target_seq <- NA
df.gRNA$target_seq[df.gRNA$strand=="+"] <- df.gRNA$target_seq_plus[df.gRNA$strand=="+"]
df.gRNA$target_seq[df.gRNA$strand=="-"] <- as.character(reverseComplement(DNAStringSet(df.gRNA$target_seq_plus[df.gRNA$strand=="-"])))
# add genome browser input:
df.gRNA$genome_browser <- paste0(df.gRNA$target_chr,":",df.gRNA$target_start,"-",df.gRNA$target_end)
# add base position after cut:
# note that Cas9 cuts between -3 and -4 from PAM
# thus, for + strand gRNAs, the position AFTER cut will be -5 from target_end
# for - strand gRNAs, the position AFTER cut will be +6 from target_start
df.gRNA$base_after_cut <- NA
df.gRNA$base_after_cut[df.gRNA$strand=="+"] <- df.gRNA$target_end[df.gRNA$strand=="+"]-5
df.gRNA$base_after_cut[df.gRNA$strand=="-"] <- df.gRNA$target_start[df.gRNA$strand=="-"]+6
df.gRNA$base_after_cut <- as.integer(df.gRNA$base_after_cut)
### isolate gRNA sequence (i.e. target sequence without the PAM)
df.gRNA$gRNA <- NA
df.gRNA$gRNA <- as.character(subseq(DNAStringSet(df.gRNA$target_seq),start = 1,width = 20))
# # write initial / all gRNAs to BED:
# bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$target_start,chromEnd=df.gRNA$target_end,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
# filename <- paste0(target_region,"_initial_gRNAs.bed")
# write('track name="initial gRNAs (pre filter)" description="inital gRNAs (pre filter)" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
# write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
#
# # save as Rda file:
# filename <- paste0(target_region,"_initial_gRNAs.Rda")
# save(df.gRNA,file = filename)
Initial library stats
# total number of potential gRNAs (inital)
dim(df.gRNA)[1]
## [1] 26728
# calculate median distance between cut sites etc.:
l <- dim(df.gRNA)[1]
df.dist <- data.frame(cut=sort(df.gRNA$base_after_cut)[1:(l-1)])
df.dist$next_cut <- sort(df.gRNA$base_after_cut)[2:l]
df.dist$dist <- df.dist$next_cut-df.dist$cut
summary(df.dist$dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 5.000 9.298 13.000 219.000
Filter gRNAs
remove duplicates
# find all duplicates:
dup.f <- which(duplicated(df.gRNA$gRNA,fromLast=F))
dup.r <- which(duplicated(df.gRNA$gRNA,fromLast=T))
dup <- union(dup.r,dup.f)
if (length(dup)>=1){
print("Found >=1 duplicate gRNAs")
# # write duplicated gRNAs to BED:
# bed <- data.frame(chrom=df.gRNA$target_chr[dup],chromStart=df.gRNA$target_start[dup],chromEnd=df.gRNA$target_end[dup],
# ame=df.gRNA$id[dup],score=1,strand=df.gRNA$strand[dup])
# filename <- paste0(target_region,"_duplicated_gRNAs.bed")
# write('track name="duplicated gRNAs" description="duplicated gRNAs" visibility=1 colorByStrand="255,0,0
# 0,0,255"',file=filename,append = F)
# write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
# delete all duplicates:
df.gRNA <- df.gRNA[-dup,]
# number of duplicates (removed):
length(dup)
# number of remaining gRNAs:
dim(df.gRNA)[1]
}else{print("Found no duplicate gRNAs")}
## [1] "Found >=1 duplicate gRNAs"
## [1] 25880
remove gRNAs containing potential transcriptional terminators
term <- grep("TTTT",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-term,]
# number of polyT-gRNAs (removed):
length(term)
## [1] 2502
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 23378
remove gRNAs containing polyA/C/G stretches
polyA <- grep("AAAAA",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyA,]
polyC <- grep("CCCCC",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyC,]
polyG <- grep("GGGGG",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyG,]
length(c(polyA,polyC,polyG))
## [1] 1386
# number of polyA/C/G-gRNAs (removed):
length(c(polyA,polyC,polyG))
## [1] 1386
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 21992
remove all gRNAs containing RE-sites
# this library will be cloned into pLKO5-gRNA vectors using BsmBI
RE.sense <- grep("CGTCTC",df.gRNA$gRNA,fixed = F)
if (length(RE.sense)>=1){
df.gRNA <- df.gRNA[-RE.sense,]}
RE.antisense <- grep("GAGACG",df.gRNA$gRNA,fixed = F)
if (length(RE.antisense)>=1){
df.gRNA <- df.gRNA[-RE.antisense,]}
# number of RE-containing gRNAs (removed):
length(c(RE.sense,RE.antisense))
## [1] 35
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 21957
Off-Target analysis and filtering
# Strategy: we take a simplistic approach whereby we identify all gRNAs that have a perfect off-target match (mm0) to any genomic locus other than the target region. The number of off-target hits is not taken into account and will presumably range from very few to abundant. Furthermore, non-perfect off-target hits are not taken into account and may have to be considered later on.
# load genome omitting random/unfinished chromosomes and additional haplotypes (i.e. limiting data to chr 1-24)
genome <- as.list(BSgenome.Hsapiens.UCSC.hg19)[1:24]
seqnames <- names(genome)
# exclude BCL6 LCR from mismatch analysis by masking chr3 (which produces an XString view)
chr3.masked <- mask(genome$chr3,start=start,end=end)
# mask genome by deleting old chr3 and adding masked chr3
genome.masked <- c(as.list(genome)[-3],as.list(chr3.masked))
genome.masked.set <- DNAStringSet(as.list(genome.masked))
### Pattern matching
# Note on strategy: searching for gRNA sequence + NGG is more precise than searching for the entire target sequence as PAM sequence can vary
# There are two options to enable inexact matching (i.e. using a wildcard / IUPAC ambiguity codes for NGG),
# 1.) the pattern library can be left un-processed (i.e. not using the pre-processing with pdict). However, vwichPDict seems to be incredibly slow with a non-processed dictionary.
# 2.) define a trusted band in the pattern library. The head and tail can then include IUPAC ambiguity codes.
# See Biostrings Reference manual matchPdict example 'D. USING A NON-PREPROCESSED DICTIONARY' and 'matchPDict-inexact' chapter for details
# Note on validity check: in the Addendum, I have manually specified each PAM and checked the vwchiPDict results for chr1 against the results of the code below. The results were identical.
# Note on vwichPDict: vwhichPDict only calcualtes which patterns in the input dictionary have at least one match in the subject / set of reference sequences (without counting number of matches or retrieving positions of matches)
### + strand
pdict.p <- DNAStringSet(paste0(df.gRNA$gRNA,"NGG"))
pdict.prepr.p <- PDict(pdict.p,tb.start = 1,tb.width = 20)
wpd.p <- vwhichPDict(pdict.prepr.p,genome.masked.set,max.mismatch = 0,fixed = "subject")
mm0.p <- unique(unlist(wpd.p))
### - strand
pdict.m <- reverseComplement(DNAStringSet(paste0(df.gRNA$gRNA,"NGG")))
pdict.prepr.m <- PDict(pdict.m,tb.start = 4,tb.width = 20)
wpd.m <- vwhichPDict(pdict.prepr.m,genome.masked.set,max.mismatch = 0,fixed="subject")
mm0.m <- unique(unlist(wpd.m))
length(mm0.m)
## [1] 2286
# make union of + and - strand off-target search results
mm0 <- union(mm0.p,mm0.m)
# # write gRNAs with perfect matches (0 mismatch / mm0) to off-targets (>1) to BED file:
# bed <- data.frame(chrom=df.gRNA$target_chr[mm0],chromStart=df.gRNA$target_start[mm0],chromEnd=df.gRNA$target_end[mm0],name=df.gRNA$id[mm0],score=1,strand=df.gRNA$strand[mm0])
# filename <- paste0(target_region,"_mm0-off-targets_gRNAs.bed")
# write('track name="off-target gRNAs (mm0)" description="off-target gRNAs (mm0)" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
# write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
# delete gRNAs with 0-mismatch off-targets:
df.gRNA.offTarget.save <- df.gRNA
df.gRNA <- df.gRNA[-mm0,]
# number of gRNAs perfectly matching any off-target (removed)
# plus:
length(mm0.p)
## [1] 2274
# minus:
length(mm0.m)
## [1] 2286
# union:
length(mm0)
## [1] 2436
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 19521
Final library stats
# Total number of gRNAs:
dim(df.gRNA)[1]
## [1] 19521
# Total number of targeted cut sites:
length(unique(df.gRNA$base_after_cut))
## [1] 19086
# Number of gRNAs on each strand:
table(df.gRNA$strand)
##
## - +
## 10011 9510
# calculate distance between cut sites:
l <- dim(df.gRNA)[1]
df.dist <- data.frame(cut=sort(df.gRNA$base_after_cut)[1:(l-1)])
df.dist$next_cut <- sort(df.gRNA$base_after_cut)[2:l]
df.dist$dist <- df.dist$next_cut-df.dist$cut
# Summary of distance between cut sites:
summary(df.dist$dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 6.00 12.73 15.00 3420.00
# How often is the distance between cut sites 0,1,2,3,4... bp?
table(df.dist$dist)[1:100]
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 435 5391 496 1332 1056 1004 922 828 549 474 427 380 544 389 385
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 336 279 285 266 223 239 191 189 148 127 152 149 116 102 102
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 91 100 87 78 63 64 60 73 49 56 41 45 44 48 50
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 53 44 28 32 35 35 42 37 32 26 17 20 26 23 22
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 31 16 18 14 16 12 16 11 18 15 13 16 13 8 8
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 11 10 7 7 12 3 13 12 5 9 14 7 4 7 4
## 90 91 92 93 94 95 96 97 98 99
## 4 7 8 3 1 2 5 4 5 4
## Warning: Removed 1913 rows containing non-finite values (stat_bin).
## Warning: Removed 1913 rows containing non-finite values (stat_bin).
final library data files
# write final selection of gRNAs to BED:
bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$target_start,chromEnd=df.gRNA$target_end,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
filename <- paste0(target_region,"_final_gRNAs_gRNAPos.bed")
write('track name="final gRNAs" description="final gRNAs" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
# write final selection of gRNAs to BED - cut positions only:
bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$base_after_cut,chromEnd=df.gRNA$base_after_cut,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
filename <- paste0(target_region,"_final_gRNAs_cutPos.bed")
write('track name="final gRNAs cutPos" description="final gRNAs cutPos" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
# save gRNA dataset:
filename <- paste0(target_region,"_final_gRNAs.Rda")
save(df.gRNA,file = filename)