Data normalization

# load library file (after liftover, now hg38):
load("../A_liftover/hg38_library_liftover.Rda")

# read space count files:
## note: spacer sequences were counted with count_spacers.py and the following script:
## ~/Sync/Analysis/CRISPR_scan/171230_CRISPRi-scan_count_data/Spacer_count_server.sh
r1s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1S.count.csv",header = F,stringsAsFactors = F)
colnames(r1s)[2] <- "r1s_rc"
r1s$r1s_norm <- (r1s[,2]+1)/sum(r1s[,2])

r1e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1E.count.csv",header = F,stringsAsFactors = F)
colnames(r1e)[2] <- "r1e_rc"
r1e$r1e_norm <- (r1e[,2]+1)/sum(r1e[,2])

r2s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2S.count.csv",header = F,stringsAsFactors = F)
colnames(r2s)[2] <- "r2s_rc"
r2s$r2s_norm <- (r2s[,2]+1)/sum(r2s[,2])

r2e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2E.count.csv",header = F,stringsAsFactors = F)
colnames(r2e)[2] <- "r2e_rc"
r2e$r2e_norm <- (r2e[,2]+1)/sum(r2e[,2])

# merge:
libm <- merge(lib,r1s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r1e,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2e,by.x = "protospacer.sequence",by.y = "V1")

# calculate fold change rep1:
libm$r1_fc <- libm$r1e_norm/libm$r1s_norm
libm$r1_lfc <- log2(libm$r1_fc)
# calculate fold change rep2:
libm$r2_fc <- libm$r2e_norm/libm$r2s_norm
libm$r2_lfc <- log2(libm$r2_fc)


# Caveat:
# there are 5 more sequences in library than in count files
# this is due to the fact, that there are 5 sequences that are duplicated in library:
summary(r1s$V1 %in% lib$protospacer.sequence)
##    Mode    TRUE 
## logical   26993
summary(lib$protospacer.sequence %in% r1s$V1)
##    Mode    TRUE 
## logical   26998
summary(duplicated(lib$protospacer.sequence))
##    Mode   FALSE    TRUE 
## logical   26993       5
# these duplicates are gRNAs that work both as CRISPRn and CRISPRi controls:
lib[c(which(duplicated(lib$protospacer.sequence)),which(duplicated(lib$protospacer.sequence,fromLast = T))),]
##           gRNA.ID strand protospacer.sequence target        category chrom
## 26591   RFC3_ig-1   <NA> GCCGATACTTGTCCACCCAG   RFC3 CRISPRi control  <NA>
## 26592   RFC3_ig-2   <NA> GGGACGGCTGGACTATCACA   RFC3 CRISPRi control  <NA>
## 26621 RPL23A_ig-1   <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRi control  <NA>
## 26932  CDC23_ig-4   <NA> GAGTACCTCCATGGTCCCGG  CDC23 CRISPRi control  <NA>
## 26997  HAUS5_ig-2   <NA> GGAGCTAGCGCAGGAAGCGC  HAUS5 CRISPRi control  <NA>
## 25752  CDC23_ng-1   <NA> GAGTACCTCCATGGTCCCGG  CDC23 CRISPRn control  <NA>
## 25842  HAUS5_ng-1   <NA> GGAGCTAGCGCAGGAAGCGC  HAUS5 CRISPRn control  <NA>
## 26022   RFC3_ng-1   <NA> GGGACGGCTGGACTATCACA   RFC3 CRISPRn control  <NA>
## 26024   RFC3_ng-3   <NA> GCCGATACTTGTCCACCCAG   RFC3 CRISPRn control  <NA>
## 26052 RPL23A_ng-1   <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRn control  <NA>
##       chromStart chromEnd base_after_cut
## 26591         NA       NA             NA
## 26592         NA       NA             NA
## 26621         NA       NA             NA
## 26932         NA       NA             NA
## 26997         NA       NA             NA
## 25752         NA       NA             NA
## 25842         NA       NA             NA
## 26022         NA       NA             NA
## 26024         NA       NA             NA
## 26052         NA       NA             NA
# save library with normalized read counts
save(libm,file = "hg38_library_counts_norm.Rda")

Removal of outlier counts

# plot raw read counts:
cols <- grep("rc",colnames(libm),value = T)
dfm <- gather(libm,cols,key = "library",value="count")
dfm$library <- factor(dfm$library,levels = c("r1s_rc","r1e_rc","r2s_rc","r2e_rc"))
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
  labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
  scale_fill_manual(values = cp.paired[c(7:10)])

ggsave("plot_counts_boxplot_incl_outlier_incl_low-count.pdf",width = 5,height = 4.5)

# from the boxplot above, it is apparent that there is one extreme outlier in r1e:
quantile(libm$r1e_rc,probs = c(0,0.25,0.5,0.75,0.9,0.99,1))
##    0%   25%   50%   75%   90%   99%  100% 
##     0   319   539   748   961  1395 47514
libm %>% filter(r1e_rc>40000)
##   protospacer.sequence     gRNA.ID strand target        category chrom
## 1 GACAGACGGATCGGCAGGGC INTS12_ig-1   <NA> INTS12 CRISPRi control  <NA>
##   chromStart chromEnd base_after_cut r1s_rc     r1s_norm r1e_rc
## 1         NA       NA             NA   2607 0.0001825892  47514
##      r1e_norm r2s_rc     r2s_norm r2e_rc     r2e_norm    r1_fc   r1_lfc
## 1 0.003204523   1375 0.0001088352   1135 8.352333e-05 17.55045 4.133436
##       r2_fc    r2_lfc
## 1 0.7674294 -0.381894
# delete this outlier:
libm <- libm %>% filter(!r1e_rc>40000)

# plots raw read counts after outlier removal:
dfm <- gather(libm,cols,key = "library",value="count")
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
  labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
  scale_fill_manual(values = cp.paired[c(7:10)])

ggsave("plot_counts_boxplot_excl_outlier_incl_low-count.pdf",width = 5,height = 4.5)


# write count summary data to table:
df.summary <- do.call(cbind,lapply(libm[,cols], summary))
pdf("table_count_summaries_excl_outlier_incl_low-count.pdf", height=11, width=8.5)
grid.table(df.summary)
dev.off()
## quartz_off_screen 
##                 2

Overview of low-count gRNAs

# overview of raw count data at start of screen:
quantile(libm$r1s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
##   1%   2%   5%  10%  20%  25%  50%  75% 100% 
##    1    7   61  162  309  356  527  696 3958
quantile(libm$r2s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
##   1%   2%   5%  10%  20%  25%  50%  75% 100% 
##    1    7   56  145  275  317  466  616 3724
mean(c(libm$r1s_rc,libm$r2s_rc))
## [1] 498.7146
# overview of raw count data at end of screen:
quantile(libm$r1e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
##   1%   2%   5%  10%  20%  25%  50%  75% 100% 
##    0    5   38  108  259  319  539  748 5131
quantile(libm$r2e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
##   1%   2%   5%  10%  20%  25%  50%  75% 100% 
##    0    6   35  103  248  302  499  686 5550
# absent gRNAs:
summary(libm$r1s_rc==0 | libm$r1e_rc==0 | libm$r2s_rc==0 | libm$r2e_rct==0) # number of gRNAs that have count = 0 in any library
##    Mode 
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0 & libm$r2s_rc==0 & libm$r2e_rct==0) # number of gRNAs that are not present in any library
##    Mode 
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0) # number of gRNAs not present in replicate 1
##    Mode   FALSE    TRUE 
## logical   26805     192
summary(libm$r2s_rc==0 & libm$r2e_rc==0) # number of gRNAs not present in replicate 2
##    Mode   FALSE    TRUE 
## logical   26811     186
# very few gRNAs go from high counts to 0
summary(libm$r1s_rc>50 & libm$r1e_rc==0)
##    Mode   FALSE    TRUE 
## logical   26995       2
summary(libm$rep2_start>50 & libm$rep2_end==0)
##    Mode 
## logical
# but this depletion seems to be real as it is consistent between replicates:
libm[which((libm$rep1_start>50 & libm$rep1_end==0) | (libm$rep2_start>50 & libm$rep2_end==0)),]
##  [1] protospacer.sequence gRNA.ID              strand              
##  [4] target               category             chrom               
##  [7] chromStart           chromEnd             base_after_cut      
## [10] r1s_rc               r1s_norm             r1e_rc              
## [13] r1e_norm             r2s_rc               r2s_norm            
## [16] r2e_rc               r2e_norm             r1_fc               
## [19] r1_lfc               r2_fc                r2_lfc              
## <0 rows> (or 0-length row.names)
# calculate delta lfc between replicates to see at which raw read count variability starts to increase:
libm <- libm %>% mutate(delta_lfc=abs(r1_lfc-r2_lfc))

# plot delta lfc in relation to raw read count (starting libraries) with proposed cutoff:
dfm <- gather(libm,r1s_rc,r2s_rc,key = "library",value="raw_count")
ggplot(dfm)+geom_point(aes(x=raw_count,y=delta_lfc),alpha=0.5)+
  geom_vline(xintercept = 50,color="red")+
  facet_wrap(~library,ncol = 1)+
  coord_cartesian(xlim = c(0,500))+theme_bw()

ggsave("plot_counts_vs_delta-lfc.pdf",width = 7,height = 5)


# density plots with low counts:
dfm <- gather(libm,cols,key = "library",value="count")
ggplot(dfm)+
  geom_density(aes(x=count,color=library,fill=library),alpha=0.1)+
  labs(x="counts per gRNA",fill="library",color="library")+theme_bw()+
  scale_fill_manual(values = cp.paired[c(7:10)])+scale_color_manual(values = cp.paired[c(7:10)])+
  coord_cartesian(xlim = c(0,1000))

ggsave("plot_counts_density_excl_outlier_incl_low-count.pdf",width = 7,height = 5)

# density plots excluding low count gRNAs:
dfm <- libm %>% filter(r1s_rc>50 & r2s_rc>50) %>% gather(cols,key = "library",value="count")
ggplot(dfm)+
  geom_density(aes(x=count,color=library,fill=library),alpha=0.1)+
  labs(x="counts per gRNA",fill="library",color="library")+theme_bw()+
  scale_fill_manual(values = cp.paired[c(7:10)])+scale_color_manual(values = cp.paired[c(7:10)])+
  coord_cartesian(xlim = c(0,1000))

ggsave("plot_counts_density_excl_outlier_excl_low-count.pdf",width = 7,height = 5)

Removal of low count gRNAs

From the analysis above it appears that raw readCount > 50 is a valid cut-off (gRNAs with lower raw counts in starting library tend to have high delta lfc, i.e. inconsistent results between replicates)

# remove low count gRNAs:
libm <- libm %>% filter(r1s_rc>50 & r2s_rc>50)

# write count summary data to table:
df.summary <- do.call(cbind,lapply(libm[,cols], summary))
pdf("table_count_summaries_excl_outlier_excl_low-count.pdf", height=11, width=8.5)
grid.table(df.summary)
dev.off()
## quartz_off_screen 
##                 2
# save library with normalized and cleaned read counts (outlier and low-count gRNA removed) 
save(libm,file = "hg38_library_counts_norm_clean.Rda")

Center data

As shown below, neutral / negative control gRNAs tend to have positive lfc values (growth scores). This is presumably due to normalization and a substantial number of gRNAs having been depleted in end libraries (consequently, neutral gRNAs will have higher normalized read counts in end library). To eliminate this bias, I will subtract the median lfc value of negative/non-targeting gRNAs from all gRNAs. Effectively, this will center negative control around 0. Note that centering is performed for each replicate individually before calcualting mean lfc. This will allow visualization of each replicate lfc in genome browser centered at 0.

# load library file with normalized and cleaned count data (lifted over to hg38):
load("hg38_library_counts_norm_clean.Rda")

# factorize for plots:
lc <- sort(unique(libm$category))
libm$category <- factor(libm$category,levels = lc[c(11,6,7,5,1,2,3,4,8,9,10)])

# plot lfc per category
libg <- libm %>% gather(r1_lfc,r2_lfc,key = "replicate",value = "lfc")
libg$replicate[grep("r1_lfc",libg$replicate)] <- "replicate 1"
libg$replicate[grep("r2_lfc",libg$replicate)] <- "replicate 2"
p <- ggplot(libg[libg$category=="non-targeting control",],aes(x=lfc,fill=replicate,color=replicate))
p+geom_density(alpha=0.5)+
  theme_bw()+
  labs(x="Growth score",fill="",color="",subtitle="log2 fold change for non-targeting control gRNAs")

ggsave("plot_lfc_density_per_replicate.pdf",height = 4,width = 5)

# plot lfc per category
libg <- libm %>% gather(r1_lfc,r2_lfc,key = "replicate",value = "lfc")
libg$replicate[grep("r1_lfc",libg$replicate)] <- "replicate 1"
libg$replicate[grep("r2_lfc",libg$replicate)] <- "replicate 2"
p <- ggplot(libg,aes(x=category,y=lfc,fill=category))
p+geom_boxplot()+
  theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
  labs(x="",fill="gRNA category",y="log2 fold change")+
  facet_wrap(~replicate)

ggsave("plot_lfc_boxplot_category.pdf",height = 5,width = 10)

# mean and median of gRNA lfc values for each category:
libm.summary <- libm %>% group_by(category) %>% summarize(n=n(),
                                                          mean_rep1=mean(r1_lfc),median_rep1=median(r1_lfc),
                                                          mean_rep2=mean(r2_lfc),median_rep2=median(r2_lfc))
print(libm.summary)
## # A tibble: 11 x 6
##    category                  n mean_rep1 median_rep1 mean_rep2 median_rep2
##    <fct>                 <int>     <dbl>       <dbl>     <dbl>       <dbl>
##  1 non-targeting control   246    0.130       0.141    0.117       0.104  
##  2 CRISPRi control         462   -2.42       -2.56    -2.57       -2.62   
##  3 CRISPRn control         520   -0.430      -0.0394  -0.442      -0.0252 
##  4 BCL6-TSS               1864   -0.275      -0.0784  -0.256      -0.0407 
##  5 BCL6-Inter1             694    0.0796      0.115    0.112       0.143  
##  6 BCL6-Inter2             972    0.0413      0.0844   0.0521      0.107  
##  7 BCL6-Inter3             614    0.0482      0.0668   0.0766      0.109  
##  8 BCL6-LCR              18508   -0.0704      0.0307  -0.0485      0.0404 
##  9 LPP-1                   633    0.0654      0.158    0.0245      0.116  
## 10 LPP-2                   439    0.0126      0.0271   0.0365      0.0530 
## 11 LPP-3                   713   -0.0410     -0.0236  -0.00818     0.00550
pdf("table_lfc_per_category.pdf", height=11, width=8.5)
grid.table(libm.summary,rows = NULL)
dev.off()
## quartz_off_screen 
##                 2
# note that the median lfc for negative control gRNAs is markedly positive:
control.mean.rep1 <- mean(libm$r1_lfc[libm$category=="non-targeting control"])
print(control.mean.rep1)
## [1] 0.129562
control.mean.rep2 <- mean(libm$r2_lfc[libm$category=="non-targeting control"])
print(control.mean.rep2)
## [1] 0.1171217
# subtract median of control gRNAs from all lfc values
libm$r1_gs <- libm$r1_lfc-control.mean.rep1
libm$r2_gs <- libm$r2_lfc-control.mean.rep2


# calculate mean fc, lfc and gs (Growth score = centered lfc) to summarize replicates:
libm$fc <- rowMeans(libm[,c("r1_fc","r2_fc")])
libm$lfc <- log2(libm$fc)
control.mean <- mean(libm$lfc[libm$category=="non-targeting control"])
print(control.mean)
## [1] 0.1353692
libm$gs <- libm$lfc-control.mean
# alternative calculation
# libm$gs <-  log2(rowMeans(libm[,c("r1_fc","r2_fc")]))-mean(unlist(as.list(libm[libm$category=="non-targeting control",c("r1_fc","r2_fc")])))





# plot lfc per category
p <- ggplot(libm,aes(x=category,y=gs,fill=category))
p+geom_boxplot()+
  theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+#scale_fill_manual(values = cp.set3)+
  labs(x="",fill="gRNA category",y="Growth score (centered lfc)")

ggsave("plot_growth-scores_boxplot_catgory_centered.pdf",height = 6,width = 7)

# mean and median of gRNA lfc values for each category:
libm.summary <- libm %>% group_by(category) %>% summarize(n=n(),mean=mean(gs),median=median(gs))
print(libm.summary)
## # A tibble: 11 x 4
##    category                  n      mean   median
##    <fct>                 <int>     <dbl>    <dbl>
##  1 non-targeting control   246  3.52e-18  0.00746
##  2 CRISPRi control         462 -2.54e+ 0 -2.67   
##  3 CRISPRn control         520 -5.55e- 1 -0.132  
##  4 BCL6-TSS               1864 -3.87e- 1 -0.157  
##  5 BCL6-Inter1             694 -2.78e- 2 -0.00533
##  6 BCL6-Inter2             972 -7.69e- 2 -0.0306 
##  7 BCL6-Inter3             614 -6.29e- 2 -0.0354 
##  8 BCL6-LCR              18508 -1.83e- 1 -0.0839 
##  9 LPP-1                   633 -7.98e- 2  0.0165 
## 10 LPP-2                   439 -1.00e- 1 -0.0813 
## 11 LPP-3                   713 -1.47e- 1 -0.133
pdf("table_growth-score_per_category_centered.pdf", height=11, width=8.5)
grid.table(libm.summary,rows = NULL)
dev.off()
## quartz_off_screen 
##                 2
# save library with normalized and cleaned read counts (outlier and low-count gRNA removed) 
save(libm,file = "hg38_library_counts_norm_clean_centered.Rda")