# load library file (after liftover, now hg38):
load("../A_liftover/hg38_library_liftover.Rda")
# read space count files:
## note: spacer sequences were counted with count_spacers.py and the following script:
## ~/Sync/Analysis/CRISPR_scan/171230_CRISPRi-scan_count_data/Spacer_count_server.sh
r1s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1S.count.csv",header = F,stringsAsFactors = F)
colnames(r1s)[2] <- "r1s_rc"
r1s$r1s_norm <- (r1s[,2]+1)/sum(r1s[,2])
r1e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1E.count.csv",header = F,stringsAsFactors = F)
colnames(r1e)[2] <- "r1e_rc"
r1e$r1e_norm <- (r1e[,2]+1)/sum(r1e[,2])
r2s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2S.count.csv",header = F,stringsAsFactors = F)
colnames(r2s)[2] <- "r2s_rc"
r2s$r2s_norm <- (r2s[,2]+1)/sum(r2s[,2])
r2e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2E.count.csv",header = F,stringsAsFactors = F)
colnames(r2e)[2] <- "r2e_rc"
r2e$r2e_norm <- (r2e[,2]+1)/sum(r2e[,2])
# merge:
libm <- merge(lib,r1s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r1e,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2e,by.x = "protospacer.sequence",by.y = "V1")
# calculate fold change rep1:
libm$r1_fc <- libm$r1e_norm/libm$r1s_norm
libm$r1_lfc <- log2(libm$r1_fc)
# calculate fold change rep2:
libm$r2_fc <- libm$r2e_norm/libm$r2s_norm
libm$r2_lfc <- log2(libm$r2_fc)
# Caveat:
# there are 5 more sequences in library than in count files
# this is due to the fact, that there are 5 sequences that are duplicated in library:
summary(r1s$V1 %in% lib$protospacer.sequence)
## Mode TRUE
## logical 26993
summary(lib$protospacer.sequence %in% r1s$V1)
## Mode TRUE
## logical 26998
summary(duplicated(lib$protospacer.sequence))
## Mode FALSE TRUE
## logical 26993 5
# these duplicates are gRNAs that work both as CRISPRn and CRISPRi controls:
lib[c(which(duplicated(lib$protospacer.sequence)),which(duplicated(lib$protospacer.sequence,fromLast = T))),]
## gRNA.ID strand protospacer.sequence target category chrom
## 26591 RFC3_ig-1 <NA> GCCGATACTTGTCCACCCAG RFC3 CRISPRi control <NA>
## 26592 RFC3_ig-2 <NA> GGGACGGCTGGACTATCACA RFC3 CRISPRi control <NA>
## 26621 RPL23A_ig-1 <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRi control <NA>
## 26932 CDC23_ig-4 <NA> GAGTACCTCCATGGTCCCGG CDC23 CRISPRi control <NA>
## 26997 HAUS5_ig-2 <NA> GGAGCTAGCGCAGGAAGCGC HAUS5 CRISPRi control <NA>
## 25752 CDC23_ng-1 <NA> GAGTACCTCCATGGTCCCGG CDC23 CRISPRn control <NA>
## 25842 HAUS5_ng-1 <NA> GGAGCTAGCGCAGGAAGCGC HAUS5 CRISPRn control <NA>
## 26022 RFC3_ng-1 <NA> GGGACGGCTGGACTATCACA RFC3 CRISPRn control <NA>
## 26024 RFC3_ng-3 <NA> GCCGATACTTGTCCACCCAG RFC3 CRISPRn control <NA>
## 26052 RPL23A_ng-1 <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRn control <NA>
## chromStart chromEnd base_after_cut
## 26591 NA NA NA
## 26592 NA NA NA
## 26621 NA NA NA
## 26932 NA NA NA
## 26997 NA NA NA
## 25752 NA NA NA
## 25842 NA NA NA
## 26022 NA NA NA
## 26024 NA NA NA
## 26052 NA NA NA
# save library with normalized read counts
save(libm,file = "hg38_library_counts_norm.Rda")
# plot raw read counts:
cols <- grep("rc",colnames(libm),value = T)
dfm <- gather(libm,cols,key = "library",value="count")
dfm$library <- factor(dfm$library,levels = c("r1s_rc","r1e_rc","r2s_rc","r2e_rc"))
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
scale_fill_manual(values = cp.paired[c(7:10)])
ggsave("plot_counts_boxplot_incl_outlier_incl_low-count.pdf",width = 5,height = 4.5)
# from the boxplot above, it is apparent that there is one extreme outlier in r1e:
quantile(libm$r1e_rc,probs = c(0,0.25,0.5,0.75,0.9,0.99,1))
## 0% 25% 50% 75% 90% 99% 100%
## 0 319 539 748 961 1395 47514
libm %>% filter(r1e_rc>40000)
## protospacer.sequence gRNA.ID strand target category chrom
## 1 GACAGACGGATCGGCAGGGC INTS12_ig-1 <NA> INTS12 CRISPRi control <NA>
## chromStart chromEnd base_after_cut r1s_rc r1s_norm r1e_rc
## 1 NA NA NA 2607 0.0001825892 47514
## r1e_norm r2s_rc r2s_norm r2e_rc r2e_norm r1_fc r1_lfc
## 1 0.003204523 1375 0.0001088352 1135 8.352333e-05 17.55045 4.133436
## r2_fc r2_lfc
## 1 0.7674294 -0.381894
# delete this outlier:
libm <- libm %>% filter(!r1e_rc>40000)
# plots raw read counts after outlier removal:
dfm <- gather(libm,cols,key = "library",value="count")
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
scale_fill_manual(values = cp.paired[c(7:10)])
ggsave("plot_counts_boxplot_excl_outlier_incl_low-count.pdf",width = 5,height = 4.5)
# write count summary data to table:
df.summary <- do.call(cbind,lapply(libm[,cols], summary))
pdf("table_count_summaries_excl_outlier_incl_low-count.pdf", height=11, width=8.5)
grid.table(df.summary)
dev.off()
## quartz_off_screen
## 2
# overview of raw count data at start of screen:
quantile(libm$r1s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 1 7 61 162 309 356 527 696 3958
quantile(libm$r2s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 1 7 56 145 275 317 466 616 3724
mean(c(libm$r1s_rc,libm$r2s_rc))
## [1] 498.7146
# overview of raw count data at end of screen:
quantile(libm$r1e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 0 5 38 108 259 319 539 748 5131
quantile(libm$r2e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 0 6 35 103 248 302 499 686 5550
# absent gRNAs:
summary(libm$r1s_rc==0 | libm$r1e_rc==0 | libm$r2s_rc==0 | libm$r2e_rct==0) # number of gRNAs that have count = 0 in any library
## Mode
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0 & libm$r2s_rc==0 & libm$r2e_rct==0) # number of gRNAs that are not present in any library
## Mode
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0) # number of gRNAs not present in replicate 1
## Mode FALSE TRUE
## logical 26805 192
summary(libm$r2s_rc==0 & libm$r2e_rc==0) # number of gRNAs not present in replicate 2
## Mode FALSE TRUE
## logical 26811 186
# very few gRNAs go from high counts to 0
summary(libm$r1s_rc>50 & libm$r1e_rc==0)
## Mode FALSE TRUE
## logical 26995 2
summary(libm$rep2_start>50 & libm$rep2_end==0)
## Mode
## logical
# but this depletion seems to be real as it is consistent between replicates:
libm[which((libm$rep1_start>50 & libm$rep1_end==0) | (libm$rep2_start>50 & libm$rep2_end==0)),]
## [1] protospacer.sequence gRNA.ID strand
## [4] target category chrom
## [7] chromStart chromEnd base_after_cut
## [10] r1s_rc r1s_norm r1e_rc
## [13] r1e_norm r2s_rc r2s_norm
## [16] r2e_rc r2e_norm r1_fc
## [19] r1_lfc r2_fc r2_lfc
## <0 rows> (or 0-length row.names)
# calculate delta lfc between replicates to see at which raw read count variability starts to increase:
libm <- libm %>% mutate(delta_lfc=abs(r1_lfc-r2_lfc))
# plot delta lfc in relation to raw read count (starting libraries) with proposed cutoff:
dfm <- gather(libm,r1s_rc,r2s_rc,key = "library",value="raw_count")
ggplot(dfm)+geom_point(aes(x=raw_count,y=delta_lfc),alpha=0.5)+
geom_vline(xintercept = 50,color="red")+
facet_wrap(~library,ncol = 1)+
coord_cartesian(xlim = c(0,500))+theme_bw()
ggsave("plot_counts_vs_delta-lfc.pdf",width = 7,height = 5)
# density plots with low counts:
dfm <- gather(libm,cols,key = "library",value="count")
ggplot(dfm)+
geom_density(aes(x=count,color=library,fill=library),alpha=0.1)+
labs(x="counts per gRNA",fill="library",color="library")+theme_bw()+
scale_fill_manual(values = cp.paired[c(7:10)])+scale_color_manual(values = cp.paired[c(7:10)])+
coord_cartesian(xlim = c(0,1000))
ggsave("plot_counts_density_excl_outlier_incl_low-count.pdf",width = 7,height = 5)
# density plots excluding low count gRNAs:
dfm <- libm %>% filter(r1s_rc>50 & r2s_rc>50) %>% gather(cols,key = "library",value="count")
ggplot(dfm)+
geom_density(aes(x=count,color=library,fill=library),alpha=0.1)+
labs(x="counts per gRNA",fill="library",color="library")+theme_bw()+
scale_fill_manual(values = cp.paired[c(7:10)])+scale_color_manual(values = cp.paired[c(7:10)])+
coord_cartesian(xlim = c(0,1000))
ggsave("plot_counts_density_excl_outlier_excl_low-count.pdf",width = 7,height = 5)
From the analysis above it appears that raw readCount > 50 is a valid cut-off (gRNAs with lower raw counts in starting library tend to have high delta lfc, i.e. inconsistent results between replicates)
# remove low count gRNAs:
libm <- libm %>% filter(r1s_rc>50 & r2s_rc>50)
# write count summary data to table:
df.summary <- do.call(cbind,lapply(libm[,cols], summary))
pdf("table_count_summaries_excl_outlier_excl_low-count.pdf", height=11, width=8.5)
grid.table(df.summary)
dev.off()
## quartz_off_screen
## 2
# save library with normalized and cleaned read counts (outlier and low-count gRNA removed)
save(libm,file = "hg38_library_counts_norm_clean.Rda")
As shown below, neutral / negative control gRNAs tend to have positive lfc values (growth scores). This is presumably due to normalization and a substantial number of gRNAs having been depleted in end libraries (consequently, neutral gRNAs will have higher normalized read counts in end library). To eliminate this bias, I will subtract the median lfc value of negative/non-targeting gRNAs from all gRNAs. Effectively, this will center negative control around 0. Note that centering is performed for each replicate individually before calcualting mean lfc. This will allow visualization of each replicate lfc in genome browser centered at 0.
# load library file with normalized and cleaned count data (lifted over to hg38):
load("hg38_library_counts_norm_clean.Rda")
# factorize for plots:
lc <- sort(unique(libm$category))
libm$category <- factor(libm$category,levels = lc[c(11,6,7,5,1,2,3,4,8,9,10)])
# plot lfc per category
libg <- libm %>% gather(r1_lfc,r2_lfc,key = "replicate",value = "lfc")
libg$replicate[grep("r1_lfc",libg$replicate)] <- "replicate 1"
libg$replicate[grep("r2_lfc",libg$replicate)] <- "replicate 2"
p <- ggplot(libg[libg$category=="non-targeting control",],aes(x=lfc,fill=replicate,color=replicate))
p+geom_density(alpha=0.5)+
theme_bw()+
labs(x="Growth score",fill="",color="",subtitle="log2 fold change for non-targeting control gRNAs")
ggsave("plot_lfc_density_per_replicate.pdf",height = 4,width = 5)
# plot lfc per category
libg <- libm %>% gather(r1_lfc,r2_lfc,key = "replicate",value = "lfc")
libg$replicate[grep("r1_lfc",libg$replicate)] <- "replicate 1"
libg$replicate[grep("r2_lfc",libg$replicate)] <- "replicate 2"
p <- ggplot(libg,aes(x=category,y=lfc,fill=category))
p+geom_boxplot()+
theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
labs(x="",fill="gRNA category",y="log2 fold change")+
facet_wrap(~replicate)
ggsave("plot_lfc_boxplot_category.pdf",height = 5,width = 10)
# mean and median of gRNA lfc values for each category:
libm.summary <- libm %>% group_by(category) %>% summarize(n=n(),
mean_rep1=mean(r1_lfc),median_rep1=median(r1_lfc),
mean_rep2=mean(r2_lfc),median_rep2=median(r2_lfc))
print(libm.summary)
## # A tibble: 11 x 6
## category n mean_rep1 median_rep1 mean_rep2 median_rep2
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 non-targeting control 246 0.130 0.141 0.117 0.104
## 2 CRISPRi control 462 -2.42 -2.56 -2.57 -2.62
## 3 CRISPRn control 520 -0.430 -0.0394 -0.442 -0.0252
## 4 BCL6-TSS 1864 -0.275 -0.0784 -0.256 -0.0407
## 5 BCL6-Inter1 694 0.0796 0.115 0.112 0.143
## 6 BCL6-Inter2 972 0.0413 0.0844 0.0521 0.107
## 7 BCL6-Inter3 614 0.0482 0.0668 0.0766 0.109
## 8 BCL6-LCR 18508 -0.0704 0.0307 -0.0485 0.0404
## 9 LPP-1 633 0.0654 0.158 0.0245 0.116
## 10 LPP-2 439 0.0126 0.0271 0.0365 0.0530
## 11 LPP-3 713 -0.0410 -0.0236 -0.00818 0.00550
pdf("table_lfc_per_category.pdf", height=11, width=8.5)
grid.table(libm.summary,rows = NULL)
dev.off()
## quartz_off_screen
## 2
# note that the median lfc for negative control gRNAs is markedly positive:
control.mean.rep1 <- mean(libm$r1_lfc[libm$category=="non-targeting control"])
print(control.mean.rep1)
## [1] 0.129562
control.mean.rep2 <- mean(libm$r2_lfc[libm$category=="non-targeting control"])
print(control.mean.rep2)
## [1] 0.1171217
# subtract median of control gRNAs from all lfc values
libm$r1_gs <- libm$r1_lfc-control.mean.rep1
libm$r2_gs <- libm$r2_lfc-control.mean.rep2
# calculate mean fc, lfc and gs (Growth score = centered lfc) to summarize replicates:
libm$fc <- rowMeans(libm[,c("r1_fc","r2_fc")])
libm$lfc <- log2(libm$fc)
control.mean <- mean(libm$lfc[libm$category=="non-targeting control"])
print(control.mean)
## [1] 0.1353692
libm$gs <- libm$lfc-control.mean
# alternative calculation
# libm$gs <- log2(rowMeans(libm[,c("r1_fc","r2_fc")]))-mean(unlist(as.list(libm[libm$category=="non-targeting control",c("r1_fc","r2_fc")])))
# plot lfc per category
p <- ggplot(libm,aes(x=category,y=gs,fill=category))
p+geom_boxplot()+
theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+#scale_fill_manual(values = cp.set3)+
labs(x="",fill="gRNA category",y="Growth score (centered lfc)")
ggsave("plot_growth-scores_boxplot_catgory_centered.pdf",height = 6,width = 7)
# mean and median of gRNA lfc values for each category:
libm.summary <- libm %>% group_by(category) %>% summarize(n=n(),mean=mean(gs),median=median(gs))
print(libm.summary)
## # A tibble: 11 x 4
## category n mean median
## <fct> <int> <dbl> <dbl>
## 1 non-targeting control 246 3.52e-18 0.00746
## 2 CRISPRi control 462 -2.54e+ 0 -2.67
## 3 CRISPRn control 520 -5.55e- 1 -0.132
## 4 BCL6-TSS 1864 -3.87e- 1 -0.157
## 5 BCL6-Inter1 694 -2.78e- 2 -0.00533
## 6 BCL6-Inter2 972 -7.69e- 2 -0.0306
## 7 BCL6-Inter3 614 -6.29e- 2 -0.0354
## 8 BCL6-LCR 18508 -1.83e- 1 -0.0839
## 9 LPP-1 633 -7.98e- 2 0.0165
## 10 LPP-2 439 -1.00e- 1 -0.0813
## 11 LPP-3 713 -1.47e- 1 -0.133
pdf("table_growth-score_per_category_centered.pdf", height=11, width=8.5)
grid.table(libm.summary,rows = NULL)
dev.off()
## quartz_off_screen
## 2
# save library with normalized and cleaned read counts (outlier and low-count gRNA removed)
save(libm,file = "hg38_library_counts_norm_clean_centered.Rda")