Data normalization
# load library file (after liftover, now hg38):
load("../A_liftover/hg38_library_liftover.Rda")
# read space count files:
## note: spacer sequences were counted with count_spacers.py and the following script:
## ~/Sync/Analysis/CRISPR_scan/171230_CRISPRi-scan_count_data/Spacer_count_server.sh
r1s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1S.count.csv",header = F,stringsAsFactors = F)
colnames(r1s)[2] <- "r1s_rc"
r1s$r1s_norm <- (r1s[,2]+1)/sum(r1s[,2])
r1e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R1E.count.csv",header = F,stringsAsFactors = F)
colnames(r1e)[2] <- "r1e_rc"
r1e$r1e_norm <- (r1e[,2]+1)/sum(r1e[,2])
r2s <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2S.count.csv",header = F,stringsAsFactors = F)
colnames(r2s)[2] <- "r2s_rc"
r2s$r2s_norm <- (r2s[,2]+1)/sum(r2s[,2])
r2e <- read.csv("~/Sync/CRISPRscan_BCL6_analysis/171230_CRISPRi-scan_count_data/R2E.count.csv",header = F,stringsAsFactors = F)
colnames(r2e)[2] <- "r2e_rc"
r2e$r2e_norm <- (r2e[,2]+1)/sum(r2e[,2])
# merge:
libm <- merge(lib,r1s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r1e,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2s,by.x = "protospacer.sequence",by.y = "V1")
libm <- merge(libm,r2e,by.x = "protospacer.sequence",by.y = "V1")
# calculate fold change rep1:
libm$r1_fc <- libm$r1e_norm/libm$r1s_norm
libm$r1_lfc <- log2(libm$r1_fc)
# calculate fold change rep2:
libm$r2_fc <- libm$r2e_norm/libm$r2s_norm
libm$r2_lfc <- log2(libm$r2_fc)
# Caveat:
# there are 5 more sequences in library than in count files
# this is due to the fact, that there are 5 sequences that are duplicated in library:
summary(r1s$V1 %in% lib$protospacer.sequence)
## Mode TRUE
## logical 26993
summary(lib$protospacer.sequence %in% r1s$V1)
## Mode TRUE
## logical 26998
summary(duplicated(lib$protospacer.sequence))
## Mode FALSE TRUE
## logical 26993 5
# these duplicates are gRNAs that work both as CRISPRn and CRISPRi controls:
lib[c(which(duplicated(lib$protospacer.sequence)),which(duplicated(lib$protospacer.sequence,fromLast = T))),]
## gRNA.ID strand protospacer.sequence target category chrom
## 26591 RFC3_ig-1 <NA> GCCGATACTTGTCCACCCAG RFC3 CRISPRi control <NA>
## 26592 RFC3_ig-2 <NA> GGGACGGCTGGACTATCACA RFC3 CRISPRi control <NA>
## 26621 RPL23A_ig-1 <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRi control <NA>
## 26932 CDC23_ig-4 <NA> GAGTACCTCCATGGTCCCGG CDC23 CRISPRi control <NA>
## 26997 HAUS5_ig-2 <NA> GGAGCTAGCGCAGGAAGCGC HAUS5 CRISPRi control <NA>
## 25752 CDC23_ng-1 <NA> GAGTACCTCCATGGTCCCGG CDC23 CRISPRn control <NA>
## 25842 HAUS5_ng-1 <NA> GGAGCTAGCGCAGGAAGCGC HAUS5 CRISPRn control <NA>
## 26022 RFC3_ng-1 <NA> GGGACGGCTGGACTATCACA RFC3 CRISPRn control <NA>
## 26024 RFC3_ng-3 <NA> GCCGATACTTGTCCACCCAG RFC3 CRISPRn control <NA>
## 26052 RPL23A_ng-1 <NA> GCGCCGAAAGCGAAGAAGGA RPL23A CRISPRn control <NA>
## chromStart chromEnd base_after_cut
## 26591 NA NA NA
## 26592 NA NA NA
## 26621 NA NA NA
## 26932 NA NA NA
## 26997 NA NA NA
## 25752 NA NA NA
## 25842 NA NA NA
## 26022 NA NA NA
## 26024 NA NA NA
## 26052 NA NA NA
# save library with normalized read counts
save(libm,file = "hg38_library_counts_norm.Rda")
Removal of outlier counts
# plot raw read counts:
cols <- grep("rc",colnames(libm),value = T)
dfm <- gather(libm,cols,key = "library",value="count")
dfm$library <- factor(dfm$library,levels = c("r1s_rc","r1e_rc","r2s_rc","r2e_rc"))
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
scale_fill_manual(values = cp.paired[c(7:10)])

ggsave("plot_counts_boxplot_incl_outlier_incl_low-count.pdf",width = 5,height = 4.5)
# from the boxplot above, it is apparent that there is one extreme outlier in r1e:
quantile(libm$r1e_rc,probs = c(0,0.25,0.5,0.75,0.9,0.99,1))
## 0% 25% 50% 75% 90% 99% 100%
## 0 319 539 748 961 1395 47514
libm %>% filter(r1e_rc>40000)
## protospacer.sequence gRNA.ID strand target category chrom
## 1 GACAGACGGATCGGCAGGGC INTS12_ig-1 <NA> INTS12 CRISPRi control <NA>
## chromStart chromEnd base_after_cut r1s_rc r1s_norm r1e_rc
## 1 NA NA NA 2607 0.0001825892 47514
## r1e_norm r2s_rc r2s_norm r2e_rc r2e_norm r1_fc r1_lfc
## 1 0.003204523 1375 0.0001088352 1135 8.352333e-05 17.55045 4.133436
## r2_fc r2_lfc
## 1 0.7674294 -0.381894
# delete this outlier:
libm <- libm %>% filter(!r1e_rc>40000)
# plots raw read counts after outlier removal:
dfm <- gather(libm,cols,key = "library",value="count")
p <- ggplot(dfm)
p+geom_boxplot(aes(x=library,y=count,fill=library))+scale_y_sqrt()+
labs(y="counts per gRNA",x="",fill="library")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))+
scale_fill_manual(values = cp.paired[c(7:10)])

ggsave("plot_counts_boxplot_excl_outlier_incl_low-count.pdf",width = 5,height = 4.5)
# write count summary data to table:
df.summary <- do.call(cbind,lapply(libm[,cols], summary))
pdf("table_count_summaries_excl_outlier_incl_low-count.pdf", height=11, width=8.5)
grid.table(df.summary)
dev.off()
## quartz_off_screen
## 2
Overview of low-count gRNAs
# overview of raw count data at start of screen:
quantile(libm$r1s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 1 7 61 162 309 356 527 696 3958
quantile(libm$r2s_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 1 7 56 145 275 317 466 616 3724
mean(c(libm$r1s_rc,libm$r2s_rc))
## [1] 498.7146
# overview of raw count data at end of screen:
quantile(libm$r1e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 0 5 38 108 259 319 539 748 5131
quantile(libm$r2e_rc,probs = c(0.01,0.02,0.05,0.1,0.2,0.25,0.5,0.75,1))
## 1% 2% 5% 10% 20% 25% 50% 75% 100%
## 0 6 35 103 248 302 499 686 5550
# absent gRNAs:
summary(libm$r1s_rc==0 | libm$r1e_rc==0 | libm$r2s_rc==0 | libm$r2e_rct==0) # number of gRNAs that have count = 0 in any library
## Mode
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0 & libm$r2s_rc==0 & libm$r2e_rct==0) # number of gRNAs that are not present in any library
## Mode
## logical
summary(libm$r1s_rc==0 & libm$r1e_rc==0) # number of gRNAs not present in replicate 1
## Mode FALSE TRUE
## logical 26805 192
summary(libm$r2s_rc==0 & libm$r2e_rc==0) # number of gRNAs not present in replicate 2
## Mode FALSE TRUE
## logical 26811 186
# very few gRNAs go from high counts to 0
summary(libm$r1s_rc>50 & libm$r1e_rc==0)
## Mode FALSE TRUE
## logical 26995 2
summary(libm$rep2_start>50 & libm$rep2_end==0)
## Mode
## logical
# but this depletion seems to be real as it is consistent between replicates:
libm[which((libm$rep1_start>50 & libm$rep1_end==0) | (libm$rep2_start>50 & libm$rep2_end==0)),]
## [1] protospacer.sequence gRNA.ID strand
## [4] target category chrom
## [7] chromStart chromEnd base_after_cut
## [10] r1s_rc r1s_norm r1e_rc
## [13] r1e_norm r2s_rc r2s_norm
## [16] r2e_rc r2e_norm r1_fc
## [19] r1_lfc r2_fc r2_lfc
## <0 rows> (or 0-length row.names)
# calculate delta lfc between replicates to see at which raw read count variability starts to increase:
libm <- libm %>% mutate(delta_lfc=abs(r1_lfc-r2_lfc))
# plot delta lfc in relation to raw read count (starting libraries) with proposed cutoff:
dfm <- gather(libm,r1s_rc,r2s_rc,key = "library",value="raw_count")
ggplot(dfm)+geom_point(aes(x=raw_count,y=delta_lfc),alpha=0.5)+
geom_vline(xintercept = 50,color="red")+
facet_wrap(~library,ncol = 1)+
coord_cartesian(xlim = c(0,500))+theme_bw()
