Get Target sequence

# BCL6-LCR coordinates: chr3:187605559-187854096 (provided by Raj, email Dec 20 2016)
target_region <- "BCL6-LCR"
chr <- "chr3"
start <- 187605559
end <- 187854096
lcr <- Views(BSgenome.Hsapiens.UCSC.hg19[[chr]],start=start,end=end)
# region length:
cat((end-start)/1000," kb")
## 248.537  kb

Identify all potential PAMs and gRNAs

# Find all gRNAs + PAMs in plus sequence
PAM <- DNAString(paste(c(rep("N",21),"GG"),collapse = ""))
gRNA.p <- matchPattern(PAM,lcr,fixed = F)
names(gRNA.p)[1:length(gRNA.p)] <- "+"

# Find all gRNAs + PAMs in minus sequence
PAM.rc <- reverseComplement(PAM)
gRNA.m <- matchPattern(PAM.rc,lcr,fixed = F)
names(gRNA.m)[1:length(gRNA.m)] <- "-"

# combine + and - strand:
gRNA <- c(gRNA.p,gRNA.m)
df.gRNA <- data.frame(id=1:length(gRNA),
                      target_seq_plus=gRNA,
                      target_chr=chr,
                      target_start=start(gRNA),
                      target_end=end(gRNA),
                      strand=names(gRNA))

# transform select columns from factors to characters:
df.gRNA$target_chr <- as.character(df.gRNA$target_chr)

# reverseComplement - strand target_seq
df.gRNA$target_seq <- NA
df.gRNA$target_seq[df.gRNA$strand=="+"] <- df.gRNA$target_seq_plus[df.gRNA$strand=="+"]
df.gRNA$target_seq[df.gRNA$strand=="-"] <- as.character(reverseComplement(DNAStringSet(df.gRNA$target_seq_plus[df.gRNA$strand=="-"])))

# add genome browser input:
df.gRNA$genome_browser <- paste0(df.gRNA$target_chr,":",df.gRNA$target_start,"-",df.gRNA$target_end)

# add base position after cut:
# note that Cas9 cuts between -3 and -4 from PAM
# thus, for + strand gRNAs, the position AFTER cut will be -5 from target_end
# for - strand gRNAs, the position AFTER cut will be +6 from target_start 
df.gRNA$base_after_cut <- NA
df.gRNA$base_after_cut[df.gRNA$strand=="+"] <- df.gRNA$target_end[df.gRNA$strand=="+"]-5
df.gRNA$base_after_cut[df.gRNA$strand=="-"] <- df.gRNA$target_start[df.gRNA$strand=="-"]+6
df.gRNA$base_after_cut <- as.integer(df.gRNA$base_after_cut)

### isolate gRNA sequence (i.e. target sequence without the PAM)
df.gRNA$gRNA <- NA
df.gRNA$gRNA <- as.character(subseq(DNAStringSet(df.gRNA$target_seq),start = 1,width = 20))

# # write initial / all gRNAs to BED:
# bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$target_start,chromEnd=df.gRNA$target_end,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
# filename <- paste0(target_region,"_initial_gRNAs.bed")
# write('track name="initial gRNAs (pre filter)" description="inital gRNAs (pre filter)" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
# write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
# 
# # save as Rda file:
# filename <- paste0(target_region,"_initial_gRNAs.Rda")
# save(df.gRNA,file = filename)

Initial library stats

# total number of potential gRNAs (inital)
dim(df.gRNA)[1]
## [1] 26728
# calculate median distance between cut sites etc.:
l <- dim(df.gRNA)[1]
df.dist <- data.frame(cut=sort(df.gRNA$base_after_cut)[1:(l-1)])
df.dist$next_cut <- sort(df.gRNA$base_after_cut)[2:l]
df.dist$dist <- df.dist$next_cut-df.dist$cut
summary(df.dist$dist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   5.000   9.298  13.000 219.000

Filter gRNAs

remove duplicates

# find all duplicates:
dup.f <- which(duplicated(df.gRNA$gRNA,fromLast=F))
dup.r <- which(duplicated(df.gRNA$gRNA,fromLast=T))
dup <- union(dup.r,dup.f)

if (length(dup)>=1){
  print("Found >=1 duplicate gRNAs")
  # # write duplicated gRNAs to BED:
  # bed <- data.frame(chrom=df.gRNA$target_chr[dup],chromStart=df.gRNA$target_start[dup],chromEnd=df.gRNA$target_end[dup],
  #                   ame=df.gRNA$id[dup],score=1,strand=df.gRNA$strand[dup])
  # filename <- paste0(target_region,"_duplicated_gRNAs.bed")
  # write('track name="duplicated gRNAs" description="duplicated gRNAs" visibility=1 colorByStrand="255,0,0
  #       0,0,255"',file=filename,append = F)
  # write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)
  
  # delete all duplicates:
  df.gRNA <- df.gRNA[-dup,]
  
  # number of duplicates (removed):
  length(dup)
  # number of remaining gRNAs:
  dim(df.gRNA)[1]
}else{print("Found no duplicate gRNAs")}
## [1] "Found >=1 duplicate gRNAs"
## [1] 25880

remove gRNAs containing potential transcriptional terminators

term <- grep("TTTT",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-term,]

# number of polyT-gRNAs (removed):
length(term)
## [1] 2502
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 23378

remove gRNAs containing polyA/C/G stretches

polyA <- grep("AAAAA",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyA,]
polyC <- grep("CCCCC",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyC,]
polyG <- grep("GGGGG",df.gRNA$gRNA,fixed = F)
df.gRNA <- df.gRNA[-polyG,]
length(c(polyA,polyC,polyG))
## [1] 1386
# number of polyA/C/G-gRNAs (removed):
length(c(polyA,polyC,polyG))
## [1] 1386
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 21992

remove all gRNAs containing RE-sites

# this library will be cloned into pLKO5-gRNA vectors using BsmBI
RE.sense <- grep("CGTCTC",df.gRNA$gRNA,fixed = F)
if (length(RE.sense)>=1){
  df.gRNA <- df.gRNA[-RE.sense,]}
RE.antisense <- grep("GAGACG",df.gRNA$gRNA,fixed = F)
if (length(RE.antisense)>=1){
  df.gRNA <- df.gRNA[-RE.antisense,]}

# number of RE-containing gRNAs (removed):
length(c(RE.sense,RE.antisense))
## [1] 35
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 21957

Off-Target analysis and filtering

# Strategy: we take a simplistic approach whereby we identify all gRNAs that have a perfect off-target match (mm0) to any genomic locus other than the target region. The number of off-target hits is not taken into account and will presumably range from very few to abundant. Furthermore, non-perfect off-target hits are not taken into account and may have to be considered later on. 

# load genome omitting random/unfinished chromosomes and additional haplotypes (i.e. limiting data to chr 1-24)
genome <- as.list(BSgenome.Hsapiens.UCSC.hg19)[1:24]
seqnames <- names(genome)
# exclude BCL6 LCR from mismatch analysis by masking chr3 (which produces an XString view)
chr3.masked <- mask(genome$chr3,start=start,end=end) 
# mask genome by deleting old chr3 and adding masked chr3 
genome.masked <- c(as.list(genome)[-3],as.list(chr3.masked))
genome.masked.set <- DNAStringSet(as.list(genome.masked))

### Pattern matching
# Note on strategy: searching for gRNA sequence + NGG is more precise than searching for the entire target sequence as PAM sequence can vary
# There are two options to enable inexact matching (i.e. using a wildcard / IUPAC ambiguity codes for NGG), 
# 1.) the pattern library can be left un-processed (i.e. not using the pre-processing with pdict). However, vwichPDict seems to be incredibly slow with a non-processed dictionary.
# 2.) define a trusted band in the pattern library. The head and tail can then include IUPAC ambiguity codes. 
# See Biostrings Reference manual matchPdict example 'D. USING A NON-PREPROCESSED DICTIONARY' and 'matchPDict-inexact' chapter for details
# Note on validity check: in the Addendum, I have manually specified each PAM and checked the vwchiPDict results for chr1 against the results of the code below. The results were identical.
# Note on vwichPDict: vwhichPDict only calcualtes which patterns in the input dictionary have at least one match in the subject / set of reference sequences (without counting number of matches or retrieving positions of matches)

### + strand
pdict.p <- DNAStringSet(paste0(df.gRNA$gRNA,"NGG"))
pdict.prepr.p <- PDict(pdict.p,tb.start = 1,tb.width = 20) 
wpd.p <- vwhichPDict(pdict.prepr.p,genome.masked.set,max.mismatch = 0,fixed = "subject")
mm0.p <- unique(unlist(wpd.p))

### - strand
pdict.m <- reverseComplement(DNAStringSet(paste0(df.gRNA$gRNA,"NGG")))
pdict.prepr.m <- PDict(pdict.m,tb.start = 4,tb.width = 20) 
wpd.m <- vwhichPDict(pdict.prepr.m,genome.masked.set,max.mismatch = 0,fixed="subject")
mm0.m <- unique(unlist(wpd.m))
length(mm0.m)
## [1] 2286
# make union of + and - strand off-target search results
mm0 <- union(mm0.p,mm0.m)

# # write gRNAs with perfect matches (0 mismatch / mm0) to off-targets (>1) to BED file:
# bed <- data.frame(chrom=df.gRNA$target_chr[mm0],chromStart=df.gRNA$target_start[mm0],chromEnd=df.gRNA$target_end[mm0],name=df.gRNA$id[mm0],score=1,strand=df.gRNA$strand[mm0])
# filename <- paste0(target_region,"_mm0-off-targets_gRNAs.bed")
# write('track name="off-target gRNAs (mm0)" description="off-target gRNAs (mm0)" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
# write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)

# delete gRNAs with 0-mismatch off-targets:
df.gRNA.offTarget.save <- df.gRNA
df.gRNA <- df.gRNA[-mm0,]

# number of gRNAs perfectly matching any off-target (removed)
# plus:
length(mm0.p)
## [1] 2274
# minus:
length(mm0.m)
## [1] 2286
# union:
length(mm0)
## [1] 2436
# number of remaining gRNAs:
dim(df.gRNA)[1]
## [1] 19521

Final library stats

# Total number of gRNAs:
dim(df.gRNA)[1]
## [1] 19521
# Total number of targeted cut sites:
length(unique(df.gRNA$base_after_cut))
## [1] 19086
# Number of gRNAs on each strand:
table(df.gRNA$strand)
## 
##     -     + 
## 10011  9510
# calculate distance between cut sites:
l <- dim(df.gRNA)[1]
df.dist <- data.frame(cut=sort(df.gRNA$base_after_cut)[1:(l-1)])
df.dist$next_cut <- sort(df.gRNA$base_after_cut)[2:l]
df.dist$dist <- df.dist$next_cut-df.dist$cut
# Summary of distance between cut sites:
summary(df.dist$dist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    6.00   12.73   15.00 3420.00
# How often is the distance between cut sites 0,1,2,3,4... bp?
table(df.dist$dist)[1:100]
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  435 5391  496 1332 1056 1004  922  828  549  474  427  380  544  389  385 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  336  279  285  266  223  239  191  189  148  127  152  149  116  102  102 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##   91  100   87   78   63   64   60   73   49   56   41   45   44   48   50 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##   53   44   28   32   35   35   42   37   32   26   17   20   26   23   22 
##   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##   31   16   18   14   16   12   16   11   18   15   13   16   13    8    8 
##   75   76   77   78   79   80   81   82   83   84   85   86   87   88   89 
##   11   10    7    7   12    3   13   12    5    9   14    7    4    7    4 
##   90   91   92   93   94   95   96   97   98   99 
##    4    7    8    3    1    2    5    4    5    4
## Warning: Removed 1913 rows containing non-finite values (stat_bin).

## Warning: Removed 1913 rows containing non-finite values (stat_bin).

final library data files

# write final selection of gRNAs to BED:
bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$target_start,chromEnd=df.gRNA$target_end,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
filename <- paste0(target_region,"_final_gRNAs_gRNAPos.bed")
write('track name="final gRNAs" description="final gRNAs" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)

# write final selection of gRNAs to BED - cut positions only:
bed <- data.frame(chrom=df.gRNA$target_chr,chromStart=df.gRNA$base_after_cut,chromEnd=df.gRNA$base_after_cut,name=df.gRNA$id,score=1,strand=df.gRNA$strand)
filename <- paste0(target_region,"_final_gRNAs_cutPos.bed")
write('track name="final gRNAs cutPos" description="final gRNAs cutPos" visibility=1 colorByStrand="255,0,0 0,0,255"',file=filename,append = F)
write.table(bed,file=filename,quote = F,col.names = F,row.names = F,append = T)

# save gRNA dataset:
filename <- paste0(target_region,"_final_gRNAs.Rda")
save(df.gRNA,file = filename)