This is the PRO-seq analysis vignette for the manuscript entitled “The androgen receptor does not directly regulate the transcription of DNA damage response genes”, which can be found at doi: https://doi.org/10.1101/2023.05.13.540653. The reader should be able to follow these analysis steps to get the same results that we report.
Bash code chunks will be in grey. Change the directory name to your working directory.
directory=/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO
mkdir -p $directory
cd $directory
fasterq-dump SRX20800653
fasterq-dump SRX20800654
fasterq-dump SRX20800655
fasterq-dump SRX20800656
fasterq-dump SRX20800657
fasterq-dump SRX20800658
fasterq-dump SRX20800659
fasterq-dump SRX20800660
fasterq-dump SRX20800661
fasterq-dump SRX20800662
fasterq-dump SRX20800663
fasterq-dump SRX20800664
fasterq-dump SRX20800665
fasterq-dump SRX20800666
fasterq-dump SRX20800667
fasterq-dump SRX20800668
The later code expects these names.
mv SRX20800653_1.fastq LNCaP_rep1t1_batch1_PE1.fastq
mv SRX20800653_2.fastq LNCaP_rep1t1_batch1_PE2.fastq
mv SRX20800654_1.fastq LNCaP_rep1t2_batch1_PE1.fastq
mv SRX20800654_2.fastq LNCaP_rep1t2_batch1_PE2.fastq
mv SRX20800655_1.fastq LNCaP_rep2t1_batch1_PE1.fastq
mv SRX20800655_2.fastq LNCaP_rep2t1_batch1_PE2.fastq
mv SRX20800656_1.fastq LNCaP_rep2t2_batch1_PE1.fastq
mv SRX20800656_2.fastq LNCaP_rep2t2_batch1_PE2.fastq
mv SRX20800657_1.fastq LNCaP_6GyIR_rep1t1_batch1_PE1.fastq
mv SRX20800657_2.fastq LNCaP_6GyIR_rep1t1_batch1_PE2.fastq
mv SRX20800658_1.fastq LNCaP_6GyIR_rep1t2_batch1_PE1.fastq
mv SRX20800658_2.fastq LNCaP_6GyIR_rep1t2_batch1_PE2.fastq
mv SRX20800659_1.fastq LNCaP_6GyIR_rep2t1_batch1_PE1.fastq
mv SRX20800659_2.fastq LNCaP_6GyIR_rep2t1_batch1_PE2.fastq
mv SRX20800660_1.fastq LNCaP_6GyIR_rep2t2_batch1_PE1.fastq
mv SRX20800660_2.fastq LNCaP_6GyIR_rep2t2_batch1_PE2.fastq
mv SRX20800661_1.fastq LNCaP_rep3_batch2_PE1.fastq
mv SRX20800661_2.fastq LNCaP_rep3_batch2_PE2.fastq
mv SRX20800662_1.fastq LNCaP_rep4_batch2_PE1.fastq
mv SRX20800662_2.fastq LNCaP_rep4_batch2_PE2.fastq
mv SRX20800663_1.fastq LNCaP_6GyIR_rep3_batch2_PE1.fastq
mv SRX20800664_2.fastq LNCaP_6GyIR_rep3_batch2_PE2.fastq
mv SRX20800664_1.fastq LNCaP_6GyIR_rep4_batch2_PE1.fastq
mv SRX20800664_2.fastq LNCaP_6GyIR_rep4_batch2_PE2.fastq
mv SRX20800665_1.fastq LNCaP_10uMEnza_rep3_batch2_PE1.fastq
mv SRX20800665_2.fastq LNCaP_10uMEnza_rep3_batch2_PE2.fastq
mv SRX20800666_1.fastq LNCaP_10uMEnza_rep4_batch2_PE1.fastq
mv SRX20800666_2.fastq LNCaP_10uMEnza_rep4_batch2_PE2.fastq
mv SRX20800667_1.fastq LNCaP_10uMEnza_6GyIR_rep3_batch2_PE1.fastq
mv SRX20800667_2.fastq LNCaP_10uMEnza_6GyIR_rep3_batch2_PE2.fastq
mv SRX20800668_1.fastq LNCaP_10uMEnza_6GyIR_rep4_batch2_PE1.fastq
mv SRX20800668_2.fastq LNCaP_10uMEnza_6GyIR_rep4_batch2_PE2.fastq
For the initial basic processing steps, we used a pipeline we have thoroughly described elsewhere (https://github.com/guertinlab/Nascent_RNA_Methods). Please see the above GitHub repository for detailed explanations of the steps, as well as software dependencies and preprocessing steps (e.g., gene annotation file downloads).
Change the file locations to where they are on your computer.
directory=/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO
annotation_prefix=/Volumes/External/Users/TScott/ER_Antagonists/210620_PRO/Homo_sapiens.GRCh38.104
UMI_length=8
read_size=46
cores=6
genome=~/Library/CloudStorage/Box-Box/GuertinLab/hg38/hg38.fa
genome_index=~/Library/CloudStorage/Box-Box/GuertinLab/hg38/hg38
prealign_rdna=/Volumes/External/Users/TScott/ER_Antagonists/210620_PRO/human_rDNA
You could put everything in this loop into a script and submit it in parallel on a computing cluster to save time.
cd $directory
for filename in LNCaP*_PE1.fastq.gz
do
name=$(echo $filename | awk -F"_PE1.fastq.gz" '{print $1}')
echo $name
echo 'unzipping raw' $name 'files'
gunzip ${name}_PE*.fastq.gz
echo 'removing dual adapter ligations and calculating the fraction of adapter/adapters in' $name
conda activate cutadaptenv
cutadapt --cores=$cores -m $((UMI_length+2)) -O 1 \
-a TGGAATTCTCGGGTGCCAAGG ${name}_PE1.fastq -o ${name}_PE1_noadap.fastq \
--too-short-output ${name}_PE1_short.fastq > ${name}_PE1_cutadapt.txt
cutadapt --cores=$cores -m $((UMI_length+10)) -O 1 \
-a GATCGTCGGACTGTAGAACTCTGAAC ${name}_PE2.fastq -o ${name}_PE2_noadap.fastq \
--too-short-output ${name}_PE2_short.fastq > ${name}_PE2_cutadapt.txt
conda deactivate
PE1_total=$(wc -l ${name}_PE1.fastq | awk '{print $1/4}')
PE1_w_Adapter=$(wc -l ${name}_PE1_short.fastq | awk '{print $1/4}')
AAligation=$(echo "scale=2 ; $PE1_w_Adapter / $PE1_total" | bc)
echo -e "value\texperiment\tthreshold\tmetric" > ${name}_QC_metrics.txt
echo -e "$AAligation\t$name\t0.80\tAdapter/Adapter" >> ${name}_QC_metrics.txt
echo 'removing short RNA insertions and reverse complementing in' $name
seqtk seq -L $((UMI_length+10)) -r ${name}_PE1_noadap.fastq > ${name}_PE1_noadap_trimmed.fastq
echo 'removing PCR duplicates from' $name
fqdedup -i ${name}_PE1_noadap_trimmed.fastq -o ${name}_PE1_dedup.fastq
PE1_noAdapter=$(wc -l ${name}_PE1_noadap.fastq | awk '{print $1/4}')
fastq_pair -t $PE1_noAdapter ${name}_PE1_noadap.fastq ${name}_PE2_noadap.fastq
echo 'calculating and plotting RNA insert sizes from' $name
flash -q --compress-prog=gzip --suffix=gz ${name}_PE1_noadap.fastq.paired.fq \
${name}_PE2_noadap.fastq.paired.fq -o ${name}
insert_size.R ${name}.hist ${UMI_length}
echo 'trimming off the UMI from' $name
seqtk trimfq -e ${UMI_length} ${name}_PE1_dedup.fastq > ${name}_PE1_processed.fastq
seqtk trimfq -e ${UMI_length} ${name}_PE2_noadap.fastq | seqtk seq -r - > ${name}_PE2_processed.fastq
echo 'aligning' $name 'to rDNA and removing aligned reads'
bowtie2 -p $cores -x $prealign_rdna -U ${name}_PE1_processed.fastq 2>${name}_bowtie2_rDNA.log | \
samtools sort -n - | samtools fastq -f 0x4 - > ${name}_PE1.rDNA.fastq
reads=$(wc -l ${name}_PE1.rDNA.fastq | awk '{print $1/4}')
fastq_pair -t $reads ${name}_PE1.rDNA.fastq ${name}_PE2_processed.fastq
echo 'aligning' $name 'to the genome'
bowtie2 -p $cores --maxins 1000 -x $genome_index --rf -1 ${name}_PE1.rDNA.fastq.paired.fq \
-2 ${name}_PE2_processed.fastq.paired.fq 2>${name}_bowtie2.log | \
samtools view -b - | samtools sort - -o ${name}.bam
PE1_prior_rDNA=$(wc -l ${name}_PE1_processed.fastq | awk '{print $1/4}')
PE1_post_rDNA=$(wc -l ${name}_PE1.rDNA.fastq | awk '{print $1/4}')
total_rDNA=$(echo "$(($PE1_prior_rDNA-$PE1_post_rDNA))")
echo 'calculating rDNA and genomic alignment rates for' $name
concordant_pe1=$(samtools view -c -f 0x42 ${name}.bam)
total=$(echo "$(($concordant_pe1+$total_rDNA))")
rDNA_alignment=$(echo "scale=2 ; $total_rDNA / $total" | bc)
echo -e "$rDNA_alignment\t$name\t0.20\trDNA Alignment Rate" >> ${name}_QC_metrics.txt
map_pe1=$(samtools view -c -f 0x42 ${name}.bam)
pre_alignment=$(wc -l ${name}_PE1.rDNA.fastq.paired.fq | awk '{print $1/4}')
alignment_rate=$(echo "scale=2 ; $map_pe1 / $pre_alignment" | bc)
echo -e "$alignment_rate\t$name\t0.80\tAlignment Rate" >> ${name}_QC_metrics.txt
echo 'plotting and calculating complexity for' $name
fqComplexity -i ${name}_PE1_noadap_trimmed.fastq
echo 'calculating and plotting theoretical sequencing depth'
echo 'to achieve a defined number of concordantly aligned reads for' $name
PE1_total=$(wc -l ${name}_PE1.fastq | awk '{print $1/4}')
PE1_noadap_trimmed=$(wc -l ${name}_PE1_noadap_trimmed.fastq | awk '{print $1/4}')
factorX=$(echo "scale=2 ; $PE1_noadap_trimmed / $PE1_total" | bc)
echo fraction of reads that are not adapter/adapter ligation products or below 10 base inserts
echo $factorX
PE1_dedup=$(wc -l ${name}_PE1_dedup.fastq | awk '{print $1/4}')
factorY=$(echo "scale=2 ; $concordant_pe1 / $PE1_dedup" | bc)
fqComplexity -i ${name}_PE1_noadap_trimmed.fastq -x $factorX -y $factorY
echo 'Separating paired end reads and creating genomic BED and bigWig intensity files for' $name
seqOutBias $genome ${name}.bam --no-scale --stranded --bed-stranded-positive \
--bw=$name.bigWig --bed=$name.bed --out-split-pairends --only-paired \
--tail-edge --read-size=$read_size
grep -v "random" ${name}_not_scaled_PE1.bed | grep -v "chrUn" | grep -v "chrEBV" | sort -k1,1 -k2,2n > ${name}_tmp.txt
mv ${name}_tmp.txt ${name}_not_scaled_PE1.bed
echo 'Calculating pause indices for' $name
mapBed -null "0" -s -a $annotation_prefix.pause.bed -b ${name}_not_scaled_PE1.bed | \
awk '$7>0' | sort -k5,5 -k7,7nr | sort -k5,5 -u > ${name}_pause.bed
join -1 5 -2 5 ${name}_pause.bed $annotation_prefix.bed | \
awk '{OFS="\t";} $2==$8 && $6==$12 {print $2, $3, $4, $1, $6, $7, $9, $10}' | \
awk '{OFS="\t";} $5 == "+" {print $1,$2+480,$8,$4,$6,$5} $5 == "-" {print $1,$7,$2 - 380,$4,$6,$5}' | \
awk '{OFS="\t";} $3>$2 {print $1,$2,$3,$4,$5,$6}' | \
sort -k1,1 -k2,2n > ${name}_pause_counts_body_coordinates.bed
mapBed -null "0" -s -a ${name}_pause_counts_body_coordinates.bed \
-b ${name}_not_scaled_PE1.bed | awk '$7>0' | \
awk '{OFS="\t";} {print $1,$2,$3,$4,$5,$6,$7,$5/100,$7/($3 - $2)}' | \
awk '{OFS="\t";} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$8/$9}' > ${name}_pause_body.bed
pause_index.R ${name}_pause_body.bed
echo 'Calculating exon density / intron density as a metric for nascent RNA purity for' $name
mapBed -null "0" -s -a $annotation_prefix.introns.bed \
-b ${name}_not_scaled_PE1.bed | awk '$7>0' | \
awk '{OFS="\t";} {print $1,$2,$3,$5,$5,$6,$7,($3 - $2)}' > ${name}_intron_counts.bed
mapBed -null "0" -s -a $annotation_prefix.no.first.exons.named.bed \
-b ${name}_not_scaled_PE1.bed | awk '$7>0' | \
awk '{OFS="\t";} {print $1,$2,$3,$4,$4,$6,$7,($3 - $2)}' > ${name}_exon_counts.bed
exon_intron_ratio.R ${name}_exon_counts.bed ${name}_intron_counts.bed
echo 'Counting reads in genes' for $name
echo -e "\t${name}" > ${name}_gene_counts.txt
mapBed -null "0" -s -a $annotation_prefix_sorted.bed -b ${name}_not_scaled_PE1.bed | \
awk '{OFS="\t";} {print $4,$7}' >> ${name}_gene_counts.txt
#clean up intermediate files and gzip
rm ${name}_PE1_short.fastq
rm ${name}_PE2_short.fastq
rm ${name}_PE1_noadap.fastq
rm ${name}_PE2_noadap.fastq
rm ${name}_PE1_noadap_trimmed.fastq
rm ${name}_PE1_dedup.fastq
rm ${name}_PE1_processed.fastq
rm ${name}_PE2_processed.fastq
rm ${name}_PE1_noadap.fastq.paired.fq
rm ${name}_PE2_noadap.fastq.paired.fq
rm ${name}_PE1_noadap.fastq.single.fq
rm ${name}_PE2_noadap.fastq.single.fq
rm ${name}_PE1.rDNA.fastq.paired.fq
rm ${name}_PE1.rDNA.fastq.single.fq
rm ${name}_PE2_processed.fastq.paired.fq
rm ${name}_PE2_processed.fastq.single.fq
rm ${name}.extendedFrags.fastq.gz
rm ${name}.notCombined_1.fastq.gz
rm ${name}.notCombined_2.fastq.gz
gzip ${name}_PE1.fastq
gzip ${name}_PE2.fastq
done
cat *_QC_metrics.txt | awk '!x[$0]++' > Radiation_enzalutamide_project_QC_metrics.txt
plot_all_metrics.R Radiation_enzalutamide_project_QC_metrics.txt Radiation_enzalutamide
The later code expects these names. “recounts” is a legacy name based on a previous incorrect counting method.
paste -d'\t' *_batch2_gene_recounts.txt > Radiation_enzalutamide_gene_recounts.txt
paste -d'\t' *_batch1_gene_recounts.txt > Radiation_enzalutamide_batch1_gene_recounts.txt
This is not necessary if you are doing the downstream analysis in the same directory as your initial processing.
cp Radiation_enzalutamide_gene*_recounts.txt /Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R/
cp *.pdf /Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R/
cp /Volumes/External/Users/TScott/ER_Antagonists/210620_PRO/Homo_sapiens.GRCh38.104.bed \
/Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R/
R code chunks will be in light blue, to distinguish from bash code chunks in grey.
setwd("~/Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R")
You will need to install these if you don’t have them already.
library(viridis)
library(lattice)
library(latticeExtra)
library(DESeq2)
library(EnhancedVolcano)
library(pheatmap)
library(viridis)
library(dendsort)
library(RColorBrewer)
library("org.Hs.eg.db", character.only = TRUE)
library(clusterProfiler)
library(enrichplot)
library(DOSE)
library(msigdbr)
library(PPInfer)
library(tidyverse)
library(sjPlot)
library(ggpubr)
library(fgsea)
library(MatchIt)
library(tidyverse)
library(bigWig)
library(grid)
library(zoo)
library(grid)
We will use all of these functions in subsequent steps.
`%notin%` <- Negate(`%in%`)
plotPCA.any <- function(object, intgroup="condition", ntop=500, returnData=FALSE, pcs = c(1,2))
{
stopifnot(length(pcs) == 2) ### added this to check number of PCs ####
# calculate the variance for each gene
rv <- rowVars(assay(object))
# select the ntop genes by variance
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
# perform a PCA on the data in assay(x) for the selected genes
pca <- prcomp(t(assay(object)[select,]))
# the contribution to the total variance for each component
percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup, drop=FALSE])
# add the intgroup factors together to create a new grouping factor
group <- if (length(intgroup) > 1) {
factor(apply( intgroup.df, 1, paste, collapse=" : "))
} else {
colData(object)[[intgroup]]
}
# assembly the data for the plot
########## Here we just use the pcs object passed by the end user ####
d <- data.frame(PC1=pca$x[,pcs[1]], PC2=pca$x[,pcs[2]], group=group, intgroup.df, name=colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[c(pcs[1], pcs[2])]
return(d)
}
col = viridis(length(unique(colData(object)[[intgroup]])))
ggplot(data = d, aes(x = PC1, y = PC2, color = group, shape = group)) +
geom_point(size = 3) + xlab(paste0("PC", pcs[1], ": ", round(percentVar[pcs[1]] * 100), "% variance")) +
ylab(paste0("PC", pcs[2], ": ", round(percentVar[pcs[2]] * 100), "% variance")) +
coord_equal() +
theme_bw() +
scale_color_manual(values = col)
}
categorize.deseq.df.mods <- function(df, fdr = 0.05, log2fold = 0.0, treat = 'Estrogen')
{
df.activated = df[df$padj < fdr & !is.na(df$padj) & df$log2FoldChange > log2fold,]
df.repressed = df[df$padj < fdr & !is.na(df$padj) & df$log2FoldChange < -log2fold,]
df.unchanged = df[df$padj > 0.5 & !is.na(df$padj) & abs(df$log2FoldChange) < 0.25,]
df.dregs = df[!(df$padj < fdr & !is.na(df$padj) & df$log2FoldChange > log2fold) &
!(df$padj < fdr & !is.na(df$padj) & df$log2FoldChange < -log2fold) &
!(df$padj > 0.5 & !is.na(df$padj) & abs(df$log2FoldChange) < 0.25), ]
df.unchanged$treatment = paste(treat, 'Unchanged')
df.activated$treatment = paste(treat, 'Activated')
df.repressed$treatment = paste(treat, 'Repressed')
df.dregs$treatment = paste(treat, 'All Other Genes')
df.effects.lattice =
rbind(df.activated,
df.unchanged,
df.repressed,
df.dregs)
df.effects.lattice$treatment = factor(df.effects.lattice$treatment)
df.effects.lattice$treatment = relevel(df.effects.lattice$treatment, ref = paste(treat, 'Activated'))
df.effects.lattice$treatment = relevel(df.effects.lattice$treatment, ref = paste(treat, 'Repressed'))
df.effects.lattice$treatment = relevel(df.effects.lattice$treatment, ref = paste(treat, 'Unchanged'))
df.effects.lattice$treatment = relevel(df.effects.lattice$treatment, ref = paste(treat, 'All Other Genes'))
return(df.effects.lattice)
}
get.tss <- function(bedfile)
{
if (ncol(bedfile) > 6) {
bedfile = bedfile[,c(1:6)]
}
for (i in 1:nrow(bedfile)) {
if (bedfile[i,6] == '+') {
bedfile[i,3] = bedfile[i,2] + 1
} else {
bedfile[i,2] = bedfile[i,3] - 1
}
}
return(bedfile)
}
filter.deseq.into.bed <- function(deseq.df, gene.file, cat = 'R1881 Activated') {
deseq.df = deseq.df[deseq.df$treatment == cat,]
#print(dim(deseq.df))
#scientific notation was messing this up occasionally
x = gene.file$V4
#print(length(x))
y = gene.file[x %in% rownames(deseq.df),]
#print(dim(y))
z = get.tss(y)
#print(dim(z))
return(z)
}
bedTools.closest <- function(functionstring="/usr/local/anaconda3/bin/closestBed",bed1,bed2,opt.string="") {
options(scipen =99) # not to use scientific notation when writing out
#write bed formatted dataframes to tempfile
write.table(bed1,file= 'a.file.bed', quote=F,sep="\t",col.names=F,row.names=F)
write.table(bed2,file= 'b.file.bed', quote=F,sep="\t",col.names=F,row.names=F)
# create the command string and call the command using system()
command1=paste('sort -k1,1 -k2,2n', 'a.file.bed', '> a.file.sorted.bed')
cat(command1,"\n")
try(system(command1))
command2=paste('sort -k1,1 -k2,2n', 'b.file.bed', '> b.file.sorted.bed')
cat(command2,"\n")
try(system(command2))
command=paste(functionstring, opt.string,"-a",'a.file.sorted.bed',"-b",'b.file.sorted.bed',">",'out.file.bed',sep=" ")
cat(command,"\n")
try(system(command))
res=read.table('out.file.bed',header=F, comment.char='')
command3=paste('rm', 'a.file.bed', 'b.file.bed', 'a.file.sorted.bed', 'b.file.sorted.bed', 'out.file.bed')
cat(command3,"\n")
try(system(command3))
colnames(res) = c(colnames(bed1), colnames(bed2)[1:ncol(bed2)], 'dis' )
return(res)
}
get.raw.counts.full.dREG <- function(df, path.to.bigWig, file.prefix = 'LNCaP_')
{
vec.names = c()
df.full = read.table(df)[,c(1:3)]
inten.df=data.frame(matrix(ncol = 0, nrow = nrow(df.full)))
for (mod.bigWig in Sys.glob(file.path(path.to.bigWig,
paste(file.prefix, "*_plus_PE1.bigWig", sep ='')))) {
factor.name = strsplit(strsplit(mod.bigWig,
"/")[[1]][length(strsplit(mod.bigWig, "/")[[1]])], '_plus')[[1]][1]
print(factor.name)
vec.names = c(vec.names, factor.name)
loaded.bw.plus = load.bigWig(mod.bigWig)
print(mod.bigWig)
print(paste(path.to.bigWig,'/',factor.name, '_minus_PE1.bigWig', sep=''))
loaded.bw.minus = load.bigWig(paste(path.to.bigWig,'/',factor.name,
'_minus_PE1.bigWig', sep=''))
mod.inten.plus = bed.region.bpQuery.bigWig(loaded.bw.plus, df.full)
mod.inten.minus = bed.region.bpQuery.bigWig(loaded.bw.minus, df.full)
inten.df = cbind(inten.df, mod.inten.plus - mod.inten.minus)
}
colnames(inten.df) = vec.names
r.names = paste(df.full[,1], ':', df.full[,2], '-', df.full[,3], sep='')
row.names(inten.df) = r.names
return(inten.df)
}
process.deseq.df <- function(df, filename = 'file.name', fdr = 0.05, log2fold = 0.0)
{
increased = df[df$padj < fdr & !is.na(df$padj) & df$log2FoldChange > log2fold,]
decreased = df[df$padj < fdr & !is.na(df$padj) & df$log2FoldChange < -log2fold,]
unchanged = df[df$padj > 0.5 & !is.na(df$padj) & abs(df$log2FoldChange) < 0.25,]
dregs = df[!(df$padj < fdr & !is.na(df$padj) & df$log2FoldChange > log2fold) &
!(df$padj < fdr & !is.na(df$padj) & df$log2FoldChange < -log2fold) &
!(df$padj > 0.5 & !is.na(df$padj) & abs(df$log2FoldChange) < 0.25), ]
lst = list(increased, decreased, unchanged, dregs)
process.rows <- function(i, str) {
coor = rownames(i)
coor.start = sapply(strsplit(sapply(strsplit(as.character(coor),':'), "[", 2), "-"), "[", 1);
coor.end = sapply(strsplit(as.character(coor),'-'), "[", 2)
coor.chr = sapply(strsplit(as.character(coor),':'), "[", 1)
df.coor = cbind(coor.chr, coor.start, coor.end, as.character(i$baseMean),
as.character(i$log2FoldChange), '+', as.character(i$lfcSE),
as.character(i$pvalue), as.character(i$padj))
write.table(df.coor, file = paste(str,'_', filename, '_', fdr, '_FDR.bed', sep =''),
sep = '\t', quote=F, row.names=F, col.names=F)
}
process.rows(increased, 'increased')
process.rows(decreased, 'decreased')
process.rows(unchanged, 'unchanged')
process.rows(dregs, 'dregs')
}
sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...)))
bedTools.intersect <- function(functionstring="/usr/local/anaconda3/bin/intersectBed",bed1,bed2,opt.string="")
{
command=paste(functionstring, opt.string,"-a", bed1,"-b", bed2,">",'out.file.bed',sep=" ")
cat(command,"\n")
try(system(command))
res=read.table('out.file.bed',header=F, comment.char='')
command3=paste('rm', 'out.file.bed')
cat(command3,"\n")
try(system(command3))
return(res)
}
PeaksWithMotif <- function(motif)
{
#Hard-coded name and bed files
name = sapply(strsplit(motif, "_fimo"), "[", 1)
act = "increased_df.deseq.effects.dREG.radiation.lattice.batch1_0.1_FDR.bed"
unc = "unchanged_df.deseq.effects.dREG.radiation.lattice.batch1_0.1_FDR.bed"
rep = "decreased_df.deseq.effects.dREG.radiation.lattice.batch1_0.1_FDR.bed"
act.intersect = bedTools.intersect(bed1 = act, bed2 = motif, opt.string = "-wao")
ncol = ncol(act.intersect)
act.with = sum(as.numeric(act.intersect[,ncol]) > 0)
act.without = sum(as.numeric(act.intersect[,ncol]) == 0)
unc.intersect = bedTools.intersect(bed1 = unc, bed2 = motif, opt.string = "-wao")
ncol = ncol(unc.intersect)
unc.with = sum(as.numeric(unc.intersect[,ncol]) > 0)
unc.without = sum(as.numeric(unc.intersect[,ncol]) == 0)
rep.intersect = bedTools.intersect(bed1 = rep, bed2 = motif, opt.string = "-wao")
ncol = ncol(rep.intersect)
rep.with = sum(as.numeric(rep.intersect[,ncol]) > 0)
rep.without = sum(as.numeric(rep.intersect[,ncol]) == 0)
result = cbind(c(act.with, act.without),
c(unc.with, unc.without),
c(rep.with, rep.without))
colnames(result) <- c("Activated", "Unchanged", "Repressed")
rownames(result) <- c("With motif", "Without motif")
print(chisq.test(result))
#Can remove this line if you want the actual counts
result = 100*sweep(result, 2, colSums(result), "/")
barplot(result, col = c("red", "darkblue"), main = paste0(name, " motif prevalance in\nputative regulatory elements"), legend.text = TRUE, args.legend = list(x = "topright", bg = "white", cex = 2), cex.axis = 2.5, cex.names = 1.8, cex.main=2)
return(result)
}
cdf.deseq.df <- function(df, genes = gene.file,
chip.peaks = 'Znf143_K562_GM12878_peaks_merged.sorted.bed',
class = 'Enzalutamide Repressed Genes', tf.name = 'ZNF143') {
bed.tss.activated = filter.deseq.into.bed(df, genes, cat = class)
paste(head(bed.tss.activated))
bed.tss.unchanged = filter.deseq.into.bed(df, genes, cat = 'Similarly Expressed Genes')
peaks = read.table(chip.peaks, sep = '\t')
print(head(peaks))
act.distance = bedTools.closest(bed1 = bed.tss.activated, bed2 = peaks[,c(1:3)], opt.string = '-D a')
unreg.distance = bedTools.closest(bed1 = bed.tss.unchanged, bed2 = peaks[,c(1:3)], opt.string = '-D a')
df.up.can = cbind(act.distance[,c(4, 10)], class)
df.un.can = cbind(unreg.distance[,c(4, 10)], "Similarly Expressed Genes")
colnames(df.up.can) = c(colnames(df.up.can)[1:2], 'status')
colnames(df.un.can) = c(colnames(df.up.can)[1:2], 'status')
df.all = rbind(df.up.can, df.un.can)
return(df.all)
}
plot.composites <- function(dat, treatment = "enzalutamide", summit = 'TSS', class= '', num.m = -200, num.p =90, y.low =0, y.high = 0.2, col.lines = c("#0000FF", "grey60")) {
pdf(file = paste0('~/Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R/composite_PolII_density_around_',treatment,'_genes.pdf'), width=6, height=6)
print(xyplot(est ~ x, group = treatment, data = dat,
type = 'l',
scales=list(x=list(cex=1,relation = "free",font=2), y =list(cex=1, relation="free",font=2)),
col = col.lines,
#main=list(label=class, cex=0.6),
auto.key = list(points=F, lines=T, cex=1.2,font=2),
par.settings = list(strip.background=list(col="grey85"),
superpose.symbol = list(pch = c(16),
col=col.lines, cex =0.5),
superpose.line = list(col = col.lines, lwd=c(2),
lty = c(1))),
cex.axis=1.0,
par.strip.text=list(cex=0.9, font=1, col='black'),
aspect=1.0,
between=list(y=0.5, x=0.5),
# index.cond = list(c(1,3,4,2)),
lwd=2,
ylab = list(label = "PolII Density", cex =1.2,font=2),
xlab = list(label = paste("Distance from ", summit, sep=''), cex =1.2,font=2),
xlim = c(-200,1200)
))
dev.off()
}
bwplot.input.pause <- function(bed.input, treatment='enzalutamide', upstream = 350, downstream = 1399) {
#isolate TSSs and define 1000 bp window centered around TSS
tss.df = bed.input[,c(1,7:8,4:6)]
tss.df[,3] = tss.df[,2] + 500
tss.df[,2] = tss.df[,2] - 500
#sum polymerase density for each position in window
combined.plus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_plus_merged.bigWig')
combined.minus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_minus_merged.bigWig')
dat.x = bed6.step.probeQuery.bigWig(combined.plus.bw, combined.minus.bw, tss.df, step = 1, as.matrix = TRUE, op = "sum", follow.strand = FALSE)
dat.x[is.na(dat.x)] <- 0
dat.x.win = t(apply(dat.x, 1, function(x){rollapply(x, width = 50, FUN = mean, by = 1, by.column = FALSE,align = "left")}))
index.max = apply(dat.x.win, 1, which.max)
the.max = apply(dat.x.win, 1, max)
#find highest value in window to use as pause summit and take a 50 bp window around it - this is the 'pause region'
pause.df = tss.df
pause.df[2][pause.df$strand == '-',] = pause.df[2][pause.df$strand == '-',] + index.max[pause.df$strand == '-']
pause.df[2][pause.df$strand == '+',] = pause.df[2][pause.df$strand == '+',] + index.max[pause.df$strand == '+']
pause.df[3] = pause.df[2] + 50
#pause.df[3] = pause.df[2] + 25
#pause.df[2] = pause.df[2] - 25
#define 'body region' as end of pause region to the end of the plotting window (strand specific)
body.df = fiveprime.bed(pause.df, upstreamWindow = upstream, downstreamWindow = downstream)
body.df[body.df$strand == '+',2] = pause.df[pause.df$strand == '+',3]+1
body.df[body.df$strand == '-',3] = pause.df[pause.df$strand == '-',2]-1
#measure polymerase density within pause and body regions (sum for pause, average for body)
output.df=cbind.data.frame(bed.input[,c(1:6)], rep(treatment, nrow(bed.input)))
colnames(output.df)[7] = "treatment"
wiggle.plus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_plus_merged.bigWig'))
wiggle.minus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_minus_merged.bigWig'))
output.df[,8] = bed6.region.bpQuery.bigWig(wiggle.plus, wiggle.minus, pause.df, op = "sum")
output.df[,9] = bed6.region.bpQuery.bigWig(wiggle.plus, wiggle.minus, body.df, op = "avg")
colnames(output.df)[c(8,9)] = c('pause_sum','body_avg')
return(output.df)
}
panel.violin.hack <-
function (x, y, box.ratio = 1, box.width = box.ratio/(1 + box.ratio),
horizontal = TRUE, alpha = plot.polygon$alpha, border =
plot.polygon$border,
lty = plot.polygon$lty, lwd = plot.polygon$lwd, col = plot.polygon
$col,
varwidth = FALSE, bw = NULL, adjust = NULL, kernel = NULL,
window = NULL, width = NULL, n = 50, from = NULL, to = NULL,
cut = NULL, na.rm = TRUE, ...)
{
if (all(is.na(x) | is.na(y)))
return()
x <- as.numeric(x)
y <- as.numeric(y)
plot.polygon <- trellis.par.get("plot.polygon")
darg <- list()
darg$bw <- bw
darg$adjust <- adjust
darg$kernel <- kernel
darg$window <- window
darg$width <- width
darg$n <- n
darg$from <- from
darg$to <- to
darg$cut <- cut
darg$na.rm <- na.rm
my.density <- function(x) {
ans <- try(do.call("density", c(list(x = x), darg)),
silent = TRUE)
if (inherits(ans, "try-error"))
list(x = rep(x[1], 3), y = c(0, 1, 0))
else ans
}
numeric.list <- if (horizontal)
split(x, factor(y))
else split(y, factor(x))
levels.fos <- as.numeric(names(numeric.list))
d.list <- lapply(numeric.list, my.density)
dx.list <- lapply(d.list, "[[", "x")
dy.list <- lapply(d.list, "[[", "y")
max.d <- sapply(dy.list, max)
if (varwidth)
max.d[] <- max(max.d)
xscale <- current.panel.limits()$xlim
yscale <- current.panel.limits()$ylim
height <- box.width
if (horizontal) {
for (i in seq_along(levels.fos)) {
if (is.finite(max.d[i])) {
pushViewport(viewport(y = unit(levels.fos[i],
"native"), height = unit(height, "native"),
yscale = c(max.d[i] * c(-1, 1)), xscale = xscale))
grid.polygon(x = c(dx.list[[i]], rev(dx.list[[i]])),
y = c(dy.list[[i]], -rev(dy.list[[i]])),
default.units = "native",
# this is the point at which the index is added
gp = gpar(fill = col[i], col = border, lty = lty,
lwd = lwd, alpha = alpha))
popViewport()
}
}
}
else {
for (i in seq_along(levels.fos)) {
if (is.finite(max.d[i])) {
pushViewport(viewport(x = unit(levels.fos[i],
"native"), width = unit(height, "native"),
xscale = c(max.d[i] * c(-1, 1)), yscale = yscale))
grid.polygon(y = c(dx.list[[i]], rev(dx.list[[i]])),
x = c(dy.list[[i]], -rev(dy.list[[i]])),
default.units = "native",
# this is the point at which the index is added
gp = gpar(fill = col[i], col = border, lty = lty,
lwd = lwd, alpha = alpha))
popViewport()
}
}
}
invisible()
}
Here we use viridis.
col = viridis(5)
This allows us to convert between Ensembl gene names and gene symbols.
gene.file = read.table("Homo_sapiens.GRCh38.104_sorted.bed", sep = '\t', header = FALSE)
gene.file = gene.file[gene.file$V5 %notin% c("havana", "havana_tagene", "ensembl_havana", "ensembl"),]
gene.symbol = gene.file[,c(4,5)]
colnames(gene.symbol) = c("gene", "symbol")
This first experiment (batch 1) had 4 replicates each of irradiated or mock-treated cells. We identify differentialy expressed genes at a false discovery rate (FDR) of 0.1.
#Read in the counts data
x = read.table("Radiation_enzalutamide_batch1_gene_recounts.txt", sep = '\t', header = TRUE)
#The first column is the genes
rownames(x) = x[,1]
#Remove the odd columns, because they're all the same (the gene names)
x = x[,seq(2,to=ncol(x),by=2)]
#Generate the variable names by parsing the column (sample) names
radiation <- vector(mode="character", length=ncol(x))
radiation[grep("6GyIR", colnames(x))] = "Radiation"
radiation[c(1:length(colnames(x)))[!(c(1:length(colnames(x))) %in% grep("6GyIR", colnames(x)))]] = "Control"
radiation = relevel(factor(radiation), ref = "Control")
rep = factor(sapply(strsplit(sapply(strsplit(colnames(x), 'rep'), '[', 2), '_batch'), '[', 1))
#Run DESeq2
deseq.df = DESeqDataSetFromMatrix(x, cbind.data.frame(radiation, rep), ~ rep + radiation)
deseq.df = DESeq(deseq.df)
#Get differentially expressed genes with shrunken fold changes for ranking gene set enrichment analysis
res.deseq.radiation <- lfcShrink(deseq.df, coef = "radiation_Radiation_vs_Control", type="ashr")
#Only keep genes with gene symbols in the annotation file
res.deseq.radiation = res.deseq.radiation[rownames(res.deseq.radiation) %in% gene.symbol$gene,]
#Also get the non-shrunken fold changes (the other values are the same)
res.deseq.radiation.unshrunk <- results(deseq.df, contrast = c("radiation", "Radiation", "Control"))
res.deseq.radiation.unshrunk = res.deseq.radiation.unshrunk[rownames(res.deseq.radiation.unshrunk) %in% gene.symbol$gene,]
This function uses the DESeq PCA plotting function but allows us to plot any principle components.
rld_LNCaP = rlogTransformation(deseq.df)
#Reverse the order for the legend to match the next experiment
colData(rld_LNCaP)$radiation <- factor(colData(rld_LNCaP)$radiation, levels = rev(levels(colData(rld_LNCaP)$radiation)))
pdf("PCA_batch1_recount.pdf", height = 2, width = 4)
plotPCA.any(rld_LNCaP, intgroup="radiation") +
scale_color_manual(name = "Condition", values = col[c(4,2)]) +
scale_shape_manual(name = "Condition", values = c(16,16))
dev.off()
Here we plot the normalized counts for each differentially expressed gene at an FDR of 0.1, scaled by z-score across samples. Hierarchical clustering is performed both on the samples and on the genes.
#Get the normalized counts for each gene with an adjusted p-value < 0.1
wide_counts = counts(deseq.df[rownames(deseq.df) %in% rownames(res.deseq.radiation)[res.deseq.radiation$padj < .1 & !is.na(res.deseq.radiation$padj)],], normalized = TRUE)
#Generate the variable names by parsing the column (sample) names
radiation <- vector(mode="character", length=ncol(wide_counts))
radiation[grep("6GyIR", colnames(wide_counts))] = "Radiation"
radiation[c(1:length(colnames(wide_counts)))[!(c(1:length(colnames(wide_counts))) %in% grep("6GyIR", colnames(wide_counts)))]] = "Control"
rep = factor(sapply(strsplit(sapply(strsplit(colnames(wide_counts), 'rep'), '[', 2), '_batch'), '[', 1))
#Name each sample based on the radiation variable (drops the replicate number)
colnames(wide_counts) = radiation
#Plot
pdf("Heatmap_batch1_recount.pdf", width = 8, height = 5)
pheatmap(wide_counts, scale = "row", show_rownames = F, angle_col = 0, fontsize = 24,
color = rev(brewer.pal(11,"RdBu")),
cluster_cols = sort_hclust(hclust(dist(scale(t(wide_counts))))),
cluster_rows = sort_hclust(hclust(dist(t(scale(t(wide_counts)))))))
dev.off()
Here we make a volcano plot of the fold changes and the adjusted p-values. The dashed line indicating significance is using an FDR of 0.1. We also write a table of the values for the volcano plot.
#Color and name the points based on significance and direction
keyvals <- ifelse(
res.deseq.radiation.unshrunk$padj < .1 & res.deseq.radiation.unshrunk$log2FoldChange < 0, 'blue',
ifelse(res.deseq.radiation.unshrunk$padj < .1 & res.deseq.radiation.unshrunk$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
#Plot
pdf("Radiation_Volcano_batch1_recount.pdf")
EnhancedVolcano(res.deseq.radiation.unshrunk,
lab = gene.symbol$symbol[gene.symbol$gene == rownames(res.deseq.radiation.unshrunk)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Radiation versus Control',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-4,4),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Radiation_batch1_Volcano_plot_data <- as.data.frame(res.deseq.radiation.unshrunk)
Radiation_batch1_Volcano_plot_data <- merge.data.frame(Radiation_batch1_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Radiation_batch1_Volcano_plot_data <- Radiation_batch1_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Radiation_batch1_Volcano_plot_data, file = "Radiation_batch1_Volcano_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
Here we perform gene set enrichment analysis with the Hallmark gene sets. All genes (not just differentially expressed genes) go into this analysis, a distinguishing feature of GSEA compared to over-representation analysis. Genes are ranked by their shrunken fold changes, which is designed to make the fold changes comparable across a range of gene expression levels. Counts-based data show heteroskedasticity, where the genes with fewer counts have higher variability. Each gene set is assigned a p-value, which is then adjusted for multiple hypothesis testing using the Benjamini-Hochberg (false discovery rate (FDR)) procedure. We plotted the gene sets that were significant at an FDR of 0.05. The Androgen Response gene set had an adjusted p-value of 0.495.
#Rank the genes based on shrunken fold change
original_gene_list <- res.deseq.radiation$log2FoldChange
names(original_gene_list) <- rownames(res.deseq.radiation)
gene_list<-na.omit(original_gene_list)
gene_list = sort(gene_list, decreasing = TRUE)
#Get the Hallmark gene sets
h_gene_sets = msigdbr(species = "Homo sapiens", category = "H")
msigdbr_list = split(x = h_gene_sets$ensembl_gene, f = h_gene_sets$gs_name)
#For reproducibility of random number generation for permutations, set the seed each time
set.seed(0)
#Run GSEA
fgseaRes <- fgsea(msigdbr_list, gene_list, eps = 0)
#Rank the results by the normalized enrichment score
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
#Just take those with an adjusted p-value of < 0.05 for plotting
gse_top = fgseaResTidy[fgseaResTidy$padj < 0.05,]
#Plot
pdf("Hallmark_GSEA_batch1_FDR_0.05_recount.pdf", width = 14)
ggplot(gse_top, aes(reorder(pathway, NES), NES, fill = log(padj, base = 10))) +
geom_col() +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways") +
theme_minimal() +
theme(text = element_text(size = 20)) +
labs(fill = "log(padj)")
dev.off()
Here we use our PRO-seq data to identify putative regulatory elements. We use seqOutBias to generate bigWig files from all our samples, separated by paired end and strand, to upload to the dREG portal (https://dreg.dnasequence.org/). The output includes a LNCaP_batch1.dREG.peak.full.bed file that represents the genomic coordinates of each putative regulatory element. These are identified based on a signature of bidirectional transcription that occurs at both promoters and enhancers.
#Get all of the bam files in batch 1
bams=$(ls *_batch1.bam)
#Generate bigWigs
seqOutBias $genome $bams --no-scale --stranded --bw=LNCaP_batch1_combined_no_scale.bigWig \
--skip-bed --only-paired --out-split-pairends --tail-edge --read-size=$read_size
Now count PRO-seq reads in these peaks, identify regions of differential bidirectional transcription, and write bed files for each class of region (increased, decreased, and unchanged).
#Generate a counts table
counts.dreg = get.raw.counts.full.dREG(df = 'LNCaP_batch1.dREG.peak.full.bed',
path.to.bigWig = "/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/", file.prefix = 'LNCaP_')
#Just focus on batch 1
counts.dreg = counts.dreg[,grep("batch1", colnames(counts.dreg))]
#Save for future use so you don't have to re-count
save(counts.dreg, file = 'batch1.counts.dreg.Rdata')
#Pick up here if you're coming back to this analysis
load("batch1.counts.dreg.RData")
#Generate the variable names by parsing the column (sample) names
radiation <- vector(mode="character", length=ncol(counts.dreg))
radiation[grep("6GyIR", colnames(counts.dreg))] = "Radiation"
radiation[c(1:length(colnames(counts.dreg)))[!(c(1:length(colnames(counts.dreg))) %in% grep("6GyIR", colnames(counts.dreg)))]] = "Control"
rep = factor(sapply(strsplit(sapply(strsplit(colnames(counts.dreg), 'rep'), '[', 2), '_batch'), '[', 1))
#Run DESeq2
deseq.df = DESeqDataSetFromMatrix(counts.dreg, cbind.data.frame(radiation, rep), ~ rep + radiation)
deseq.df = DESeq(deseq.df)
#Get differentially transcribed regions
res.deseq.radiation = results(deseq.df, contrast = c("radiation", "Radiation", "Control"))
#Classify the regions at an FDR of 0.1
df.deseq.effects.dREG.radiation.lattice = categorize.deseq.df(res.deseq.radiation, fdr = 0.1, log2fold = 0.0, treat = 'Radiation')
#Write bed files for increased, decreased, and unchanged regions
process.deseq.df(df = df.deseq.effects.dREG.radiation.lattice,
filename = 'df.deseq.effects.dREG.radiation.lattice.batch1', fdr = 0.1, log2fold = 0)
For increased and decreased regions, convert the bed file to a fasta file (the nucleotide sequence).
for bed in *creased*0.1_FDR.bed
do
direction=$(echo $bed | awk -F"_df.deseq.effects.dREG" '{print $1}')
batch=$(echo $bed | awk -F"_0.1_FDR.bed" '{print $1}' | awk -F"lattice" '{print $2}')
batch=${batch/./_}
fastaFromBed -fi $genome -bed $bed -fo ${direction}${batch}.fasta
done
The script used in the next code chunk (220118_meme_pro_clusters.sh) is below:
#!/bin/bash
#SBATCH -p parallel
#SBATCH -A gioeli_lab
#SBATCH -t 72:00:00
#SBATCH --nodes=5
#SBATCH --ntasks-per-node=20
#SBATCH --mem-per-cpu=9000
module purge
module load gcc/9.2.0 mvapich2/2.3.3 meme/5.1.0
cd /scratch/ts2hx
srun -n100 meme -p 100 -oc ${name}_meme_output -objfun classic -evt 0.01 -nmotifs 1000 -searchsize 0 -minw 8 -maxw 15 -revcomp -dna -markov_order 3 -maxsize 100000000 $i
You can run MEME in parallel on a computing cluster to save time.
#Copy the script
scp ~/Library/CloudStorage/Box-Box/GuertinLab/Code/220118_meme_pro_clusters.sh \
ts2hx@rivanna.hpc.virginia.edu:/scratch/ts2hx
#Copy the files
scp *creased*.fasta ts2hx@rivanna.hpc.virginia.edu:/scratch/ts2hx
#Log in
ssh -Y ts2hx@rivanna.hpc.virginia.edu
#Move to working directory
cd /scratch/ts2hx
#Make the script executable
chmod +x 220118_meme_pro_clusters.sh
#Run MEME on each file
for i in *.fasta
do
name=$(echo $i | awk -F".fasta" '{print $1}')
sbatch --export=ALL,i=$i,name=$name --output=220118_meme_pro_clusters_${name}.out 220118_meme_pro_clusters.sh
done
#Leave the cluster
exit
#Copy the output to your local computer
scp -r ts2hx@rivanna.hpc.virginia.edu:/scratch/ts2hx/*out* .
Match the motifs to our curated database.
#Download the database from our Github repository if you don't have it already
wget https://github.com/Gioeli-Lab/AR_IR_PRO-seq_analysis/raw/main/Motif_databases/tomtom_db/homer_uniprobe_jaspar_edited.txt -P ~/Library/CloudStorage/Box-Box/GuertinLab/Motif_databases/
#Make this a variable to save space
motifs=~/Library/CloudStorage/Box-Box/GuertinLab/Motif_databases/tomtom_db/homer_uniprobe_jaspar_edited.txt
#For each output, match the motifs to the database
for i in *_meme_output/meme.txt
do
name=$(echo $i | awk -F"_meme_output" '{print $1}')
tomtom -no-ssc -oc ${name}_tomtom_output -verbosity 1 -incomplete-scores \
-min-overlap 1 -dist ed -evalue -thresh 0.1 $i $motifs
done
We identified the P53 motif in the regulatory elements with increased bidirectional transcription. This does not necessarily mean that the motif is not present in the other classes of regulatory elements. To explicitly compare the motif abundance, we first use FIMO to identify the top 1,000,000 motif instances in the genome. We also do this for the AR motif, as de novo motif analysis is not sensitive enough to detect all motifs.
#Download the motif files if you don't have them already
wget https://raw.githubusercontent.com/Gioeli-Lab/AR_IR_PRO-seq_analysis/main/Motif_databases/individual_memes_1/p53_homer_uniprobe_jaspar_edited.txt_meme.txt -P ~/Library/CloudStorage/Box-Box/GuertinLab/Motif_databases/individual_memes
wget https://raw.githubusercontent.com/Gioeli-Lab/AR_IR_PRO-seq_analysis/main/Motif_databases/individual_memes_1/ARE_meme.txt -P ~/Library/CloudStorage/Box-Box/GuertinLab/Motif_databases/individual_memes
#Change this to the directory where your motif file is
motif_directory=~/Library/CloudStorage/Box-Box/GuertinLab/Motif_databases/individual_memes
#Find individual motif instances in the genome
fimo --max-stored-scores 2500000 -oc P53_fimo ${motif_directory}/p53_homer_uniprobe_jaspar_edited.txt_meme.txt $genome
fimo --max-stored-scores 2000000 -oc Full_ARE_fimo ${motif_directory}/ARE_meme.txt $genome
#Take the top 1000000 most stringent matches
score=$(tail -n +2 P53_fimo/fimo.gff | sort -nrk6,6 | awk 'FNR == 1000000 {print $6}')
tail -n +2 P53_fimo/fimo.gff | awk -v sco="$score" '{ if ($6 >= sco) { print } }' | \
awk '{OFS="\t";} {print $1,$4,$5}' > P53_fimo/1000000_fimo.gff
score=$(tail -n +2 ARE_fimo/fimo.gff | sort -nrk6,6 | awk 'FNR == 1000000 {print $6}')
tail -n +2 ARE_fimo/fimo.gff | awk -v sco="$score" '{ if ($6 >= sco) { print } }' | \
awk '{OFS="\t";} {print $1,$4,$5}' > ARE_fimo/1000000_fimo.gff
Now we use this R function to find the proportion of each class of regulatory elements that contain one of these motif instances.
pdf("P53_fimo_barchart_batch1.pdf")
P53 = PeaksWithMotif("P53_fimo/1000000_fimo.gff")
dev.off()
pdf("ARE_fimo_barchart_batch1_recount.pdf")
ARE = PeaksWithMotif("ARE_fimo/1000000_fimo.gff")
dev.off()
This second experiment (batch 2) had 2 replicates each of: mock-treated + vehicle, irradiated + vehicle, mock-treated + enzalutamide, irradiated + enzalutamide.
#Read in the counts data
x = read.table("Radiation_enzalutamide_gene_recounts.txt", sep = '\t', header = TRUE)
#The first column is the genes
rownames(x) = x[,1]
#Remove the odd columns, because they're all the same (the gene names)
x = x[,seq(2,to=ncol(x),by=2)]
#Generate size factors for scaling composite plots based on read depth
size_factors <- estimateSizeFactorsForMatrix(x)
write.table(estimateSizeFactorsForMatrix(x),
file = 'norm.bedGraph.sizeFactor_batch2_recount.txt', quote =F, col.names=F)
#Generate the variable names by parsing the column (sample) names
enza <- vector(mode="character", length=ncol(x))
enza[grep("10uMEnza", colnames(x))] = "Enzalutamide"
enza[c(1:length(colnames(x)))[!(c(1:length(colnames(x))) %in% grep("10uMEnza", colnames(x)))]] = "Vehicle"
enza = relevel(factor(enza), ref = "Vehicle")
radiation <- vector(mode="character", length=ncol(x))
radiation[grep("6GyIR", colnames(x))] = "Radiation"
radiation[c(1:length(colnames(x)))[!(c(1:length(colnames(x))) %in% grep("6GyIR", colnames(x)))]] = "Control"
radiation = relevel(factor(radiation), ref = "Control")
rep = factor(sapply(strsplit(sapply(strsplit(colnames(x), 'rep'), '[', 2), '_batch'), '[', 1))
treatment = factor(paste(radiation, enza, sep = "/"), levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
#Run DESeq2
deseq.df = DESeqDataSetFromMatrix(x, cbind.data.frame(enza, radiation, rep, treatment), ~ rep + radiation + enza)
deseq.df = DESeq(deseq.df)
#Get differentially expressed genes for each treatment
res.deseq.enza = results(deseq.df, contrast = c("enza", "Enzalutamide", "Vehicle"))
res.deseq.enza = res.deseq.enza[rownames(res.deseq.enza) %in% gene.symbol$gene,]
res.deseq.radiation = results(deseq.df, contrast = c("radiation", "Radiation", "Control"))
res.deseq.radiation = res.deseq.radiation[rownames(res.deseq.radiation) %in% gene.symbol$gene,]
Here we plot the normalized counts for each differentially expressed gene at an FDR of 0.1, scaled by z-score across samples. Hierarchical clustering is performed both on the samples and on the genes.
#Get the normalized counts for each gene with an adjusted p-value < 0.1
wide_counts = counts(deseq.df[rownames(deseq.df) %in%
c(rownames(res.deseq.enza)[res.deseq.enza$padj < .1 & !is.na(res.deseq.enza$padj)],
rownames(res.deseq.radiation)[res.deseq.radiation$padj < .1 & !is.na(res.deseq.radiation$padj)]),],
normalized = TRUE)
#Average the two replicates for each condition
for(i in seq(1, ncol(wide_counts), by = 2)) wide_counts[,i] <- (wide_counts[,i] + wide_counts[,i+1])/2
wide_counts = wide_counts[,seq(1, ncol(wide_counts), by = 2)]
#Generate the variable names by parsing the column (sample) names
enza <- vector(mode="character", length=ncol(wide_counts))
enza[grep("10uMEnza", colnames(wide_counts))] = "Enzalutamide"
enza[c(1:length(colnames(wide_counts)))[!(c(1:length(colnames(wide_counts))) %in% grep("10uMEnza", colnames(wide_counts)))]] = "Vehicle"
radiation <- vector(mode="character", length=ncol(wide_counts))
radiation[grep("6GyIR", colnames(wide_counts))] = "Radiation"
radiation[c(1:length(colnames(wide_counts)))[!(c(1:length(colnames(wide_counts))) %in% grep("6GyIR", colnames(wide_counts)))]] = "Control"
#Name each sample based on the treatment condition (drops the replicate number)
colnames(wide_counts) = paste(radiation,enza, sep = "/")
#Plot
pdf("Heatmap_recount.pdf", width = 8, height = 5)
pheatmap(wide_counts, scale = "row", show_rownames = F, angle_col = 0, fontsize = 24,
color = rev(brewer.pal(11,"RdBu")),
cluster_cols = sort_hclust(hclust(dist(scale(t(wide_counts))))),
cluster_rows = sort_hclust(hclust(dist(t(scale(t(wide_counts)))))))
dev.off()
Here we make volcano plots and MA plots. We do this for each treatment with all genes, as well as focusing on subsets of genes (DNA Repair Hallmark, Androgen Response Hallmark, and our definition of AR target genes as enzalutamide-repressed genes). We also write a table of the values for the volcano plots.
#Enzalutamide volcano plot
keyvals <- ifelse(
res.deseq.enza$padj < .1 & res.deseq.enza$log2FoldChange < 0, 'blue',
ifelse(res.deseq.enza$padj < .1 & res.deseq.enza$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Enzalutamide_Volcano_recount.pdf")
EnhancedVolcano(res.deseq.enza,
lab = gene.symbol$symbol[gene.symbol$gene == rownames(res.deseq.enza)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Enzalutamide versus Vehicle',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-1,1),
ylim = c(0,70),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Enzalutamide_Volcano_plot_data <- as.data.frame(res.deseq.enza)
Enzalutamide_Volcano_plot_data <- merge.data.frame(Enzalutamide_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Enzalutamide_Volcano_plot_data <- Enzalutamide_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Enzalutamide_Volcano_plot_data, file = "Enzalutamide_Volcano_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot
repressed = as.data.frame(res.deseq.enza[!is.na(res.deseq.enza$padj) & res.deseq.enza$padj < .1 & res.deseq.enza$log2FoldChange < 0,])
activated = as.data.frame(res.deseq.enza[!is.na(res.deseq.enza$padj) & res.deseq.enza$padj < .1 & res.deseq.enza$log2FoldChange > 0,])
unchanged = as.data.frame(res.deseq.enza[rownames(res.deseq.enza) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, activated, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Enzalutamide_MA_plot_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Enzalutamide versus Vehicle") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Enzalutamide volcano plot just with DNA Repair Hallmark gene set
DNA_repair = read.table("DNA_repair_hallmark.txt")$V1
DNA_repair = gene.symbol$gene[gene.symbol$symbol %in% DNA_repair]
DNA_repair = res.deseq.enza[rownames(res.deseq.enza) %in% DNA_repair,]
DNA_repair_gene_symbol = gene.symbol[gene.symbol$gene %in% rownames(DNA_repair),]
keyvals <- ifelse(
DNA_repair$padj < .1 & DNA_repair$log2FoldChange < 0, 'blue',
ifelse(DNA_repair$padj < .1 & DNA_repair$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Enzalutamide_Volcano_DNA_repair_hallmark_recount.pdf")
EnhancedVolcano(DNA_repair,
lab = DNA_repair_gene_symbol$symbol[DNA_repair_gene_symbol$gene == rownames(DNA_repair)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Enzalutamide versus Vehicle',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-1,1),
ylim = c(0, 40),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Enzalutamide_Volcano_plot_data <- as.data.frame(DNA_repair)
Enzalutamide_Volcano_plot_data <- merge.data.frame(Enzalutamide_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Enzalutamide_Volcano_plot_data <- Enzalutamide_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Enzalutamide_Volcano_plot_data, file = "Enzalutamide_Volcano_DNA_repair_hallmark_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot (there are no repressed or activated genes, so we comment out these lines)
# repressed = as.data.frame(DNA_repair[!is.na(DNA_repair$padj) & DNA_repair$padj < .1 & DNA_repair$log2FoldChange < 0,])
# activated = as.data.frame(DNA_repair[!is.na(DNA_repair$padj) & DNA_repair$padj < .1 & DNA_repair$log2FoldChange > 0,])
unchanged = as.data.frame(DNA_repair[rownames(DNA_repair) %notin% c(rownames(repressed), rownames(activated)),])
# repressed$status = "Down"
# activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = unchanged
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Enzalutamide_MA_plot_DNA_repair_hallmark_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Enzalutamide versus Vehicle") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Enzalutamide volcano plot just with Androgen Response Hallmark gene set
Androgen_response = read.table("Androgen_response_hallmark.txt")$V1
Androgen_response = gene.symbol$gene[gene.symbol$symbol %in% Androgen_response]
Androgen_response = res.deseq.enza[rownames(res.deseq.enza) %in% Androgen_response,]
Androgen_response_gene_symbol = gene.symbol[gene.symbol$gene %in% rownames(Androgen_response),]
keyvals <- ifelse(
Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0, 'blue',
ifelse(Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Enzalutamide_Volcano_Androgen_response_hallmark_recount.pdf")
EnhancedVolcano(Androgen_response,
lab = Androgen_response_gene_symbol$symbol[Androgen_response_gene_symbol$gene == rownames(Androgen_response)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Enzalutamide versus Vehicle',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-.75,.75),
ylim = c(0, 25),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Enzalutamide_Volcano_plot_data <- as.data.frame(Androgen_response)
Enzalutamide_Volcano_plot_data <- merge.data.frame(Enzalutamide_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Enzalutamide_Volcano_plot_data <- Enzalutamide_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Enzalutamide_Volcano_plot_data, file = "Enzalutamide_Volcano_Androgen_response_hallmark_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot
repressed = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0,])
# activated = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0,])
unchanged = as.data.frame(Androgen_response[rownames(Androgen_response) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
# activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Enzalutamide_MA_plot_Androgen_response_hallmark_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Enzalutamide versus Vehicle") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Radiation volcano plot
keyvals <- ifelse(
res.deseq.radiation$padj < .1 & res.deseq.radiation$log2FoldChange < 0, 'blue',
ifelse(res.deseq.radiation$padj < .1 & res.deseq.radiation$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Radiation_Volcano_batch2_recount.pdf")
EnhancedVolcano(res.deseq.radiation,
lab = gene.symbol$symbol[gene.symbol$gene == rownames(res.deseq.radiation)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Radiation versus Control',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-3,3),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Radiation_batch2_Volcano_plot_data <- as.data.frame(res.deseq.radiation)
Radiation_batch2_Volcano_plot_data <- merge.data.frame(Radiation_batch2_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Radiation_batch2_Volcano_plot_data <- Radiation_batch2_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Radiation_batch2_Volcano_plot_data, file = "Radiation_batch2_Volcano_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot
repressed = as.data.frame(res.deseq.radiation[!is.na(res.deseq.radiation$padj) & res.deseq.radiation$padj < .1 & res.deseq.radiation$log2FoldChange < 0,])
activated = as.data.frame(res.deseq.radiation[!is.na(res.deseq.radiation$padj) & res.deseq.radiation$padj < .1 & res.deseq.radiation$log2FoldChange > 0,])
unchanged = as.data.frame(res.deseq.radiation[rownames(res.deseq.radiation) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, activated, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Radiation_MA_plot_batch2_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Radiation versus Control") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Radiation volcano plot just with DNA Repair Hallmark gene set
DNA_repair = read.table("DNA_repair_hallmark.txt")$V1
DNA_repair = gene.symbol$gene[gene.symbol$symbol %in% DNA_repair]
DNA_repair = res.deseq.radiation[rownames(res.deseq.radiation) %in% DNA_repair,]
DNA_repair_gene_symbol = gene.symbol[gene.symbol$gene %in% rownames(DNA_repair),]
keyvals <- ifelse(
DNA_repair$padj < .1 & DNA_repair$log2FoldChange < 0, 'blue',
ifelse(DNA_repair$padj < .1 & DNA_repair$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Radiation_Volcano_DNA_repair_hallmark_recount.pdf")
EnhancedVolcano(DNA_repair,
lab = DNA_repair_gene_symbol$symbol[DNA_repair_gene_symbol$gene == rownames(DNA_repair)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Radiation versus Control',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-1,1),
ylim = c(0, 40),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Radiation_batch2_Volcano_plot_data <- as.data.frame(DNA_repair)
Radiation_batch2_Volcano_plot_data <- merge.data.frame(Radiation_batch2_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Radiation_batch2_Volcano_plot_data <- Radiation_batch2_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Radiation_batch2_Volcano_plot_data, file = "Radiation_batch2_Volcano_DNA_repair_hallmark_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot
repressed = as.data.frame(DNA_repair[!is.na(DNA_repair$padj) & DNA_repair$padj < .1 & DNA_repair$log2FoldChange < 0,])
activated = as.data.frame(DNA_repair[!is.na(DNA_repair$padj) & DNA_repair$padj < .1 & DNA_repair$log2FoldChange > 0,])
unchanged = as.data.frame(DNA_repair[rownames(DNA_repair) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, activated, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Radiation_MA_plot_DNA_repair_hallmark_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Radiation versus Control") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Radiation volcano plot just with Androgen Response Hallmark gene set
Androgen_response = read.table("Androgen_response_hallmark.txt")$V1
Androgen_response = gene.symbol$gene[gene.symbol$symbol %in% Androgen_response]
Androgen_response = res.deseq.radiation[rownames(res.deseq.radiation) %in% Androgen_response,]
Androgen_response_gene_symbol = gene.symbol[gene.symbol$gene %in% rownames(Androgen_response),]
keyvals <- ifelse(
Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0, 'blue',
ifelse(Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Radiation_Volcano_Androgen_response_hallmark_recount.pdf")
EnhancedVolcano(Androgen_response,
lab = Androgen_response_gene_symbol$symbol[Androgen_response_gene_symbol$gene == rownames(Androgen_response)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Radiation versus Control',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-.75,.75),
ylim = c(0, 25),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
Radiation_batch2_Volcano_plot_data <- as.data.frame(Androgen_response)
Radiation_batch2_Volcano_plot_data <- merge.data.frame(Radiation_batch2_Volcano_plot_data, gene.symbol,
by.x = "row.names", by.y = "gene")
Radiation_batch2_Volcano_plot_data <- Radiation_batch2_Volcano_plot_data[,c("symbol", "log2FoldChange", "padj")]
write.table(Radiation_batch2_Volcano_plot_data, file = "Radiation_batch2_Volcano_Androgen_response_hallmark_plot_data_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#MA plot
repressed = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0,])
activated = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0,])
unchanged = as.data.frame(Androgen_response[rownames(Androgen_response) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, activated, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Radiation_MA_plot_Androgen_response_hallmark_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1, 5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Radiation versus Control") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
#Radiation volcano plot just with AR target (enzalutamide-repressed) genes
Androgen_response = rownames(res.deseq.enza)[res.deseq.enza$padj < .1 & !is.na(res.deseq.enza$padj) & res.deseq.enza$log2FoldChange < 0]
Androgen_response = res.deseq.radiation[rownames(res.deseq.radiation) %in% Androgen_response,]
Androgen_response_gene_symbol = gene.symbol[gene.symbol$gene %in% rownames(Androgen_response),]
keyvals <- ifelse(
Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0, 'blue',
ifelse(Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Up'
names(keyvals)[keyvals == 'black'] <- 'Not Significant'
names(keyvals)[keyvals == 'blue'] <- 'Down'
pdf("Radiation_Volcano_enza_repressed_genes_recount.pdf")
EnhancedVolcano(Androgen_response,
lab = Androgen_response_gene_symbol$symbol[Androgen_response_gene_symbol$gene == rownames(Androgen_response)],
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~log[2]~ '(fold change)'),
ylab = bquote('-' ~log[10]~ '(q-value)'),
title = 'Radiation versus Control',
subtitle = NULL,
caption = NULL,
pCutoff = 0.1,
FCcutoff = 0.0,
pointSize = 2.0,
labSize = 3.0,
xlim = c(-1,1),
ylim = c(0,70),
colCustom = keyvals,
colAlpha = .25,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0) +
theme(plot.title = element_text(hjust = 0.5))
dev.off()
#MA plot
repressed = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange < 0,])
activated = as.data.frame(Androgen_response[!is.na(Androgen_response$padj) & Androgen_response$padj < .1 & Androgen_response$log2FoldChange > 0,])
unchanged = as.data.frame(Androgen_response[rownames(Androgen_response) %notin% c(rownames(repressed), rownames(activated)),])
repressed$status = "Down"
activated$status = "Up"
unchanged$status = "Not Significant"
MA.df = rbind.data.frame(unchanged, activated, repressed)
MA.df$status = factor(MA.df$status, levels = c("Not Significant", "Up", "Down"))
pdf("Radiation_MA_plot_enza_repressed_genes_recount.pdf", width = 7, height = 3.5)
ggplot(MA.df, aes(x = log(baseMean, 10), y = log2FoldChange, color = status)) +
geom_point() +
geom_hline(yintercept = 0, linetype="dashed") +
scale_color_manual(values = c("grey60", "red", "blue")) +
ylim(-4, 4) +
xlim(-1,5) +
ylab(expression("log"[2]~"(fold change in PRO signal)")) +
xlab(expression("log"[10]~"(mean of normalized intensity)")) +
ggtitle("Radiation versus Control") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.title = NULL) +
labs(color=NULL)
dev.off()
We downloaded publicly available AR ChIP-seq peaks from LNCaP cells (from https://doi.org/10.7554/eLife.27159, GEO accession GSM2716919) as a bed file from CistromeDB (ID 87052).
#We will re-use some of the code below, so we use define the data frame and treatment here
df = res.deseq.enza
treat = "Enzalutamide"
class = paste0(treat, " Repressed Genes")
#Make a set of unchanged genes that are expression-matched to the enzalutamide-repressed genes
df.activated = df[df$padj < 0.1 & !is.na(df$padj) & df$log2FoldChange < 0,]
df.unchanged = df[rownames(df) %notin% rownames(df.activated) & !is.na(df$padj) & df$log2FoldChange > 0,]
df.unchanged$treatment = "Similarly Expressed Genes"
df.activated$treatment = class
df.deseq.effects.lattice = rbind(df.unchanged, df.activated)
df.deseq.effects.lattice$treatment = factor(df.deseq.effects.lattice$treatment)
df.deseq.effects.lattice$treatment = relevel(df.deseq.effects.lattice$treatment, ref = 'Similarly Expressed Genes')
out = matchit(treatment ~ baseMean, data = df.deseq.effects.lattice, method = "optimal", ratio = 1)
#Just keep the two classes of genes we are interested in
df.deseq.effects.lattice =
df.deseq.effects.lattice[df.deseq.effects.lattice$treatment == class |
rownames(df.deseq.effects.lattice) %in% out$match.matrix,]
#Plot expression to confirm the match
ggplot(as.data.frame(df.deseq.effects.lattice), aes(x = log(baseMean, base = 10), color = treatment)) +
stat_ecdf(geom = "step", size = 1) +
ylab("Cumulative distribution function") +
xlab(expression("log"[10]~"(Mean of Normalized Counts)")) +
theme_bw() +
theme(text = element_text(size = 20))
#Write these genes to a file for later use
write.table(rownames(df.deseq.effects.lattice)[df.deseq.effects.lattice$treatment == 'Similarly Expressed Genes'], file = "Enzalutamide_repressed_similarly_expressed_genes_recount.txt")
#Assign the colors
col.lines = c("#0000FF", "grey60")
#Generate the CDF
df.all = cdf.deseq.df(df = df.deseq.effects.lattice, genes = gene.file, chip.peaks ='LNCaP_AR_peaks.bed', class = class, tf.name = 'Androgen Receptor')
#Plot
pdf("CDF_enzalutamide_repressed_AR_peaks_recount.pdf", height = 3.5, width = 4)
ecdfplot(~log(abs(dis), base = 10), groups = status, data = df.all,
auto.key = list(lines=TRUE, points=FALSE),
col = col.lines,
aspect = 1,
scales=list(relation="free",alternating=c(1,1,1,1)),
ylab = 'Cumulative\nDistribution Function',
xlab = expression('log'[10]~'AR Distance from TSS'),
between=list(y=1.0),
type = 'a',
xlim = c(0,7.5),
lwd=2,
par.settings = list(superpose.line = list(col = col.lines, lwd=3),
strip.background=list(col="grey85")),
panel = function(...) {
panel.abline(v= 200, lty =2)
panel.ecdfplot(...)
})
dev.off()
#Perform a Kolmogorov–Smirnov test for significance
ks = ks.test(x = log(abs(df.all$dis)[df.all$status == class]),
y = log(abs(df.all$dis)[df.all$status == 'Similarly Expressed Genes']))$p.value
ks = formatC(ks, format = "e", digits = 1)
ks
We don’t have P53 ChIP-seq data from LNCaP cells, but we used ENCODE data from two other cell lines (A549 and HepG2 cells) and found similar results. Here we download P53 ChIP-seq peaks from A549 cells as a bed file.
wget https://www.encodeproject.org/files/ENCFF699UTZ/@@download/ENCFF699UTZ.bed.gz
gunzip ENCFF699UTZ.bed.gz
mv ENCFF699UTZ.bed A549_P53.bed
Here we repeat the above steps, replacing the data frame with the radiation statistics, focusing on the radiation-activated genes, and using the P53 ChIP-seq peaks.
df = res.deseq.radiation
treat = "Radiation"
class = paste0(treat, " Activated Genes")
df.activated = df[df$padj < 0.1 & !is.na(df$padj) & df$log2FoldChange > 0,]
df.unchanged = df[rownames(df) %notin% rownames(df.activated) & !is.na(df$padj) & df$log2FoldChange < 0,]
df.unchanged$treatment = "Similarly Expressed Genes"
df.activated$treatment = class
df.deseq.effects.lattice = rbind(df.unchanged, df.activated)
df.deseq.effects.lattice$treatment = factor(df.deseq.effects.lattice$treatment)
df.deseq.effects.lattice$treatment = relevel(df.deseq.effects.lattice$treatment, ref = 'Similarly Expressed Genes')
out = matchit(treatment ~ baseMean, data = df.deseq.effects.lattice, method = "optimal", ratio = 1)
df.deseq.effects.lattice =
df.deseq.effects.lattice[df.deseq.effects.lattice$treatment == class |
rownames(df.deseq.effects.lattice) %in% out$match.matrix,]
ggplot(as.data.frame(df.deseq.effects.lattice), aes(x = log(baseMean, base = 10), color = treatment)) +
stat_ecdf(geom = "step", size = 1) +
ylab("Cumulative distribution function") +
xlab(expression("log"[10]~"(Mean of Normalized Counts)")) +
theme_bw() +
theme(text = element_text(size = 20))
write.table(rownames(df.deseq.effects.lattice)[df.deseq.effects.lattice$treatment == 'Similarly Expressed Genes'], file = "Radiation_activated_similarly_expressed_genes_recount.txt")
col.lines = c("#FF0000", "grey60")
df.all = cdf.deseq.df(df = df.deseq.effects.lattice, genes = gene.file, chip.peaks ='A549_P53.bed', class = class, tf.name = 'P53')
pdf("CDF_radiation_activated_P53_peaks_recount.pdf", height = 3.5, width = 4)
ecdfplot(~log(abs(dis), base = 10), groups = status, data = df.all,
auto.key = list(lines=TRUE, points=FALSE),
col = col.lines,
aspect = 1,
scales=list(relation="free",alternating=c(1,1,1,1)),
ylab = 'Cumulative\nDistribution Function',
xlab = expression('log'[10]~'P53 Distance from TSS'),
between=list(y=1.0),
type = 'a',
xlim = c(0,7.5),
lwd=2,
par.settings = list(superpose.line = list(col = col.lines, lwd=3),
strip.background=list(col="grey85")),
panel = function(...) {
panel.abline(v= 200, lty =2)
panel.ecdfplot(...)
})
dev.off()
ks = ks.test(x = log(abs(df.all$dis)[df.all$status == class]),
y = log(abs(df.all$dis)[df.all$status == 'Similarly Expressed Genes']))$p.value
ks = formatC(ks, format = "e", digits = 1)
ks
Here we repeat the above steps, focusing on both the radiation-activated and radiation-repressed genes, and using the AR ChIP-seq peaks.
col.lines = c("#FF0000", "grey60")
df.all = cdf.deseq.df(df = df.deseq.effects.lattice, genes = gene.file, chip.peaks ='LNCaP_AR_peaks.bed', class = class, tf.name = 'Androgen Receptor')
pdf("CDF_radiation_activated_AR_peaks_recount.pdf", height = 3.5, width = 4)
ecdfplot(~log(abs(dis), base = 10), groups = status, data = df.all,
auto.key = list(lines=TRUE, points=FALSE),
col = col.lines,
aspect = 1,
scales=list(relation="free",alternating=c(1,1,1,1)),
ylab = 'Cumulative\nDistribution Function',
xlab = expression('log'[10]~'AR Distance from TSS'),
between=list(y=1.0),
type = 'a',
xlim = c(0,7.5),
lwd=2,
par.settings = list(superpose.line = list(col = col.lines, lwd=3),
strip.background=list(col="grey85")),
panel = function(...) {
panel.abline(v= 200, lty =2)
panel.ecdfplot(...)
})
dev.off()
ks = ks.test(x = log(abs(df.all$dis)[df.all$status == class]),
y = log(abs(df.all$dis)[df.all$status == 'Similarly Expressed Genes']))$p.value
ks = formatC(ks, format = "e", digits = 1)
ks
df = res.deseq.radiation
treat = "Radiation"
class = paste0(treat, " Repressed Genes")
df.activated = df[df$padj < 0.1 & !is.na(df$padj) & df$log2FoldChange < 0,]
df.unchanged = df[rownames(df) %notin% rownames(df.activated) & !is.na(df$padj) & df$log2FoldChange < 0,]
df.unchanged$treatment = "Similarly Expressed Genes"
df.activated$treatment = class
df.deseq.effects.lattice = rbind(df.unchanged, df.activated)
df.deseq.effects.lattice$treatment = factor(df.deseq.effects.lattice$treatment)
df.deseq.effects.lattice$treatment = relevel(df.deseq.effects.lattice$treatment, ref = 'Similarly Expressed Genes')
out = matchit(treatment ~ baseMean, data = df.deseq.effects.lattice, method = "optimal", ratio = 1)
df.deseq.effects.lattice =
df.deseq.effects.lattice[df.deseq.effects.lattice$treatment == class |
rownames(df.deseq.effects.lattice) %in% out$match.matrix,]
ggplot(as.data.frame(df.deseq.effects.lattice), aes(x = log(baseMean, base = 10), color = treatment)) +
stat_ecdf(geom = "step", size = 1) +
ylab("Cumulative distribution function") +
xlab(expression("log"[10]~"(Mean of Normalized Counts)")) +
theme_bw() +
theme(text = element_text(size = 20))
write.table(rownames(df.deseq.effects.lattice)[df.deseq.effects.lattice$treatment == 'Similarly Expressed Genes'], file = "Radiation_repressed_similarly_expressed_genes_recount.txt")
col.lines = c("#0000FF", "grey60")
df.all = cdf.deseq.df(df = df.deseq.effects.lattice, genes = gene.file, chip.peaks ='LNCaP_AR_peaks.bed', class = class, tf.name = 'Androgen Receptor')
pdf("CDF_radiation_repressed_AR_peaks_recount.pdf", height = 3.5, width = 4)
ecdfplot(~log(abs(dis), base = 10), groups = status, data = df.all,
auto.key = list(lines=TRUE, points=FALSE),
col = col.lines,
aspect = 1,
scales=list(relation="free",alternating=c(1,1,1,1)),
ylab = 'Cumulative\nDistribution Function',
xlab = expression('log'[10]~'AR Distance from TSS'),
between=list(y=1.0),
type = 'a',
xlim = c(0,7.5),
lwd=2,
par.settings = list(superpose.line = list(col = col.lines, lwd=3),
strip.background=list(col="grey85")),
panel = function(...) {
panel.abline(v= 200, lty =2)
panel.ecdfplot(...)
})
dev.off()
ks = ks.test(x = log(abs(df.all$dis)[df.all$status == class]),
y = log(abs(df.all$dis)[df.all$status == 'Similarly Expressed Genes']))$p.value
ks = formatC(ks, format = "e", digits = 1)
ks
Here we classify sets of genes based on their response to either treatment, each at an FDR of 0.1. For example, enzalutamide-repressed radiation-activated genes are both significantly repressed by enzalutamide (at an FDR of 0.1) and significantly activated by radiation (at an FDR of 0.1). For each set of genes, we plot the average read depth normalized read counts across the four conditions. We then perform over-representation analysis for each gene set.
#Classify genes based on each treatment independently
effects.lattice.enza = categorize.deseq.df.mods(res.deseq.enza, fdr = .1, log2fold = 0, treat = 'Enzalutamide')
effects.lattice.radiation = categorize.deseq.df.mods(res.deseq.radiation, fdr = .1, log2fold = 0, treat = 'Radiation')
enza.repressed = rownames(effects.lattice.enza)[effects.lattice.enza$treatment == "Enzalutamide Repressed"]
enza.activated = rownames(effects.lattice.enza)[effects.lattice.enza$treatment == "Enzalutamide Activated"]
radiation.repressed = rownames(effects.lattice.radiation)[effects.lattice.radiation$treatment == "Radiation Repressed"]
radiation.activated = rownames(effects.lattice.radiation)[effects.lattice.radiation$treatment == "Radiation Activated"]
#Find the overlapping gene sets
enza.repressed.radiation.activated = enza.repressed[enza.repressed %in% radiation.activated]
enza.repressed.radiation.repressed = enza.repressed[enza.repressed %in% radiation.repressed]
enza.activated.radiation.activated = enza.activated[enza.activated %in% radiation.activated]
enza.activated.radiation.repressed = enza.activated[enza.activated %in% radiation.repressed]
#Just keep genes with gene symbols in our gene annotation file
enza.repressed.radiation.activated = gene.symbol$symbol[gene.symbol$gene %in% enza.repressed.radiation.activated]
enza.repressed.radiation.repressed = gene.symbol$symbol[gene.symbol$gene %in% enza.repressed.radiation.repressed]
enza.activated.radiation.activated = gene.symbol$symbol[gene.symbol$gene %in% enza.activated.radiation.activated]
enza.activated.radiation.repressed = gene.symbol$symbol[gene.symbol$gene %in% enza.activated.radiation.repressed]
#Write text files with each gene set for future use
write.table(enza.repressed.radiation.activated, "enza.repressed.radiation.activated_recount.txt",quote=F, row.names=F, col.names=F)
write.table(enza.repressed.radiation.repressed, "enza.repressed.radiation.repressed_recount.txt",quote=F, row.names=F, col.names=F)
write.table(enza.activated.radiation.activated, "enza.activated.radiation.activated_recount.txt",quote=F, row.names=F, col.names=F)
write.table(enza.activated.radiation.repressed, "enza.activated.radiation.repressed_recount.txt",quote=F, row.names=F, col.names=F)
The steps we use in this step are the same for each set of genes
#Get the read depth normalized counts
normalized_counts <- counts(deseq.df, normalized=TRUE)
normalized_counts <- normalized_counts %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#Just keep the counts for enzalutamide-repressed radiation-activated genes
counts <- normalized_counts %>%
filter(gene %in% gene.symbol$gene[gene.symbol$symbol %in% enza.repressed.radiation.activated])
#Average the two replicates
for(i in seq(2, ncol(counts), 2)) counts[,i] <- (counts[,i] + counts[,i+1])/2
counts = counts[,c(1, seq(2, ncol(counts), 2))]
#Convert our data format from wide to long
gathered_counts <- counts %>%
gather(colnames(counts)[-1], key = "samplename", value = "normalized_counts")
#Generate the variable names by parsing the sample names
Condition = gathered_counts$samplename
Condition = gsub("LNCaP_rep", "LNCaP_Control/Vehicle_rep", Condition)
Condition = gsub("10uMEnza_6GyIR", "Radiation/Enzalutamide", Condition)
Condition = gsub("10uMEnza", "Control/Enzalutamide", Condition)
Condition = gsub("6GyIR", "Radiation/Vehicle", Condition)
Condition = factor(sapply(strsplit(sapply(strsplit(Condition, 'LNCaP_'), '[', 2), '_rep'), '[', 1))
Condition = factor(Condition, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
#Add the condition and the gene symbol information
gathered_counts = cbind(gathered_counts, Condition)
gathered_counts <- merge.data.frame(gathered_counts, gene.symbol, all.y = FALSE)
#Plot
pdf("normalized_counts_enza_repressed_radiation_activated_recount.pdf", width = 8)
ggplot(gathered_counts) +
geom_jitter(height = 0, width = 0.1, aes(x = symbol, y = normalized_counts, color = Condition, shape = Condition)) +
scale_color_manual(name = "Condition", values = col[c(4,4,2,2)]) +
scale_shape_manual(name = "Condition", values = c(17,16,17,16)) +
scale_y_log10() +
#ordering by counts in the Control/Vehicle condition
scale_x_discrete(limits = gathered_counts[gathered_counts$Condition == "Control/Vehicle", "symbol"][order(gathered_counts[gathered_counts$Condition == "Control/Vehicle", "normalized_counts"], decreasing = TRUE)]) +
xlab("Gene") +
ylab("Normalized Counts") +
ggtitle("16 genes repressed by enzalutamide\nand activated by radiation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 20))
dev.off()
#Convert back to wide format and write the data to a .tsv file
counts <- spread(gathered_counts[,c("symbol", "Condition", "normalized_counts")], Condition, normalized_counts)
write.table(counts, file = "normalized_counts_enza_repressed_radiation_activated_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
normalized_counts <- counts(deseq.df, normalized=TRUE)
normalized_counts <- normalized_counts %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
counts <- normalized_counts %>%
filter(gene %in% enza.activated[enza.activated %in% radiation.activated])
for(i in seq(2, ncol(counts), 2)) counts[,i] <- (counts[,i] + counts[,i+1])/2
counts = counts[,c(1, seq(2, ncol(counts), 2))]
gathered_counts <- counts %>%
gather(colnames(counts)[-1], key = "samplename", value = "normalized_counts")
Condition = gathered_counts$samplename
Condition = gsub("LNCaP_rep", "LNCaP_Control/Vehicle_rep", Condition)
Condition = gsub("10uMEnza_6GyIR", "Radiation/Enzalutamide", Condition)
Condition = gsub("10uMEnza", "Control/Enzalutamide", Condition)
Condition = gsub("6GyIR", "Radiation/Vehicle", Condition)
Condition = factor(sapply(strsplit(sapply(strsplit(Condition, 'LNCaP_'), '[', 2), '_rep'), '[', 1))
Condition = factor(Condition, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
gathered_counts = cbind(gathered_counts, Condition)
gathered_counts <- merge.data.frame(gathered_counts, gene.symbol, all.y = FALSE)
#ENSG00000272655 was more lowly expressed than ENSG00000214783, and both mapped to POLR2J4, so I'm just plotting the latter
pdf("normalized_counts_enza_activated_radiation_activated_recount.pdf", width = 14)
ggplot(gathered_counts[gathered_counts$gene != "ENSG00000272655",]) +
geom_jitter(height = 0, width = 0.1, aes(x = symbol, y = normalized_counts, color = Condition, shape = Condition)) +
scale_color_manual(name = "Condition", values = col[c(4,4,2,2)]) +
scale_shape_manual(name = "Condition", values = c(17,16,17,16)) +
scale_y_log10() +
#ordering by counts in the Control/Vehicle condition
scale_x_discrete(limits = gathered_counts[gathered_counts$Condition == "Control/Vehicle" & gathered_counts$gene != "ENSG00000272655", "symbol"][order(gathered_counts[gathered_counts$Condition == "Control/Vehicle" & gathered_counts$gene != "ENSG00000272655", "normalized_counts"], decreasing = TRUE)]) +
xlab("Gene") +
ylab("Normalized Counts") +
ggtitle("41 genes activated by enzalutamide\nand activated by radiation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 20))
dev.off()
counts <- spread(gathered_counts[,c("symbol", "Condition", "normalized_counts")], Condition, normalized_counts)
write.table(counts, file = "normalized_counts_enza_activated_radiation_activated_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
normalized_counts <- counts(deseq.df, normalized=TRUE)
normalized_counts <- normalized_counts %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
counts <- normalized_counts %>%
filter(gene %in% gene.symbol$gene[gene.symbol$symbol %in% enza.repressed.radiation.repressed])
for(i in seq(2, ncol(counts), 2)) counts[,i] <- (counts[,i] + counts[,i+1])/2
counts = counts[,c(1, seq(2, ncol(counts), 2))]
gathered_counts <- counts %>%
gather(colnames(counts)[-1], key = "samplename", value = "normalized_counts")
Condition = gathered_counts$samplename
Condition = gsub("LNCaP_rep", "LNCaP_Control/Vehicle_rep", Condition)
Condition = gsub("10uMEnza_6GyIR", "Radiation/Enzalutamide", Condition)
Condition = gsub("10uMEnza", "Control/Enzalutamide", Condition)
Condition = gsub("6GyIR", "Radiation/Vehicle", Condition)
Condition = factor(sapply(strsplit(sapply(strsplit(Condition, 'LNCaP_'), '[', 2), '_rep'), '[', 1))
Condition = factor(Condition, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
gathered_counts = cbind(gathered_counts, Condition)
gathered_counts <- merge.data.frame(gathered_counts, gene.symbol, all.y = FALSE)
pdf("normalized_counts_enza_repressed_radiation_repressed_recount.pdf", width = 20)
ggplot(gathered_counts) +
geom_jitter(height = 0, width = 0.1, aes(x = symbol, y = normalized_counts, color = Condition, shape = Condition)) +
scale_color_manual(name = "Condition", values = col[c(4,4,2,2)]) +
scale_shape_manual(name = "Condition", values = c(17,16,17,16)) +
scale_y_log10() +
#ordering by counts in the Control/Vehicle condition
scale_x_discrete(limits = gathered_counts[gathered_counts$Condition == "Control/Vehicle", "symbol"][order(gathered_counts[gathered_counts$Condition == "Control/Vehicle", "normalized_counts"], decreasing = TRUE)]) +
xlab("Gene") +
ylab("Normalized Counts") +
ggtitle("67 genes repressed by enzalutamide and repressed by radiation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 20))
dev.off()
counts <- spread(gathered_counts[,c("symbol", "Condition", "normalized_counts")], Condition, normalized_counts)
write.table(counts, file = "normalized_counts_enza_repressed_radiation_repressed_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
normalized_counts <- counts(deseq.df, normalized=TRUE)
normalized_counts <- normalized_counts %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
counts <- normalized_counts %>%
filter(gene %in% enza.activated[enza.activated %in% radiation.repressed])
for(i in seq(2, ncol(counts), 2)) counts[,i] <- (counts[,i] + counts[,i+1])/2
counts = counts[,c(1, seq(2, ncol(counts), 2))]
gathered_counts <- counts %>%
gather(colnames(counts)[-1], key = "samplename", value = "normalized_counts")
Condition = gathered_counts$samplename
Condition = gsub("LNCaP_rep", "LNCaP_Control/Vehicle_rep", Condition)
Condition = gsub("10uMEnza_6GyIR", "Radiation/Enzalutamide", Condition)
Condition = gsub("10uMEnza", "Control/Enzalutamide", Condition)
Condition = gsub("6GyIR", "Radiation/Vehicle", Condition)
Condition = factor(sapply(strsplit(sapply(strsplit(Condition, 'LNCaP_'), '[', 2), '_rep'), '[', 1))
Condition = factor(Condition, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
gathered_counts = cbind(gathered_counts, Condition)
gathered_counts <- merge.data.frame(gathered_counts, gene.symbol, all.y = FALSE)
#ENSG00000274452 was the more lowly expressed U2
pdf("normalized_counts_enza_activated_radiation_repressed_recount.pdf", width = 8)
ggplot(gathered_counts[gathered_counts$gene != "ENSG00000274452",]) +
geom_jitter(height = 0, width = 0.1, aes(x = symbol, y = normalized_counts, color = Condition, shape = Condition)) +
scale_color_manual(name = "Condition", values = col[c(4,4,2,2)]) +
scale_shape_manual(name = "Condition", values = c(17,16,17,16)) +
scale_y_log10() +
#ordering by counts in the Control/Vehicle condition
scale_x_discrete(limits = gathered_counts[gathered_counts$Condition == "Control/Vehicle" & gathered_counts$gene != "ENSG00000274452", "symbol"][order(gathered_counts[gathered_counts$Condition == "Control/Vehicle" & gathered_counts$gene != "ENSG00000274452", "normalized_counts"], decreasing = TRUE)]) +
xlab("Gene") +
ylab("Normalized Counts") +
ggtitle("7 genes activated by enzalutamide\nand repressed by radiation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(text = element_text(size = 20))
dev.off()
counts <- spread(gathered_counts[gathered_counts$gene != "ENSG00000274452",c("symbol", "Condition", "normalized_counts")], Condition, normalized_counts)
write.table(counts, file = "normalized_counts_enza_activated_radiation_repressed_recount.tsv", row.names=FALSE, sep="\t", quote = FALSE)
#Get the Hallmark gene sets
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>%
dplyr::select(gs_name, human_gene_symbol)
#For reproducibility, set the seed here
set.seed(0)
#ORA for enzalutamide-repressed radiation-activated genes
em <- enricher(enza.repressed.radiation.activated, TERM2GENE=m_t2g, pvalueCutoff = 0.1)
#Rank by adjusted p-value
fgseaResTidy <- em %>%
as_tibble() %>%
arrange(desc(p.adjust))
#Rename (an unnecessary legacy line)
gse_top = fgseaResTidy
#Calculate the observed-expected ratio
gse_top$Ratio_ratio =
(as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 2))) /
(as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 2)))
#Add the gene symbols
gse_top$ID = paste0(gse_top$ID, " (", gse_top$geneID, ")")
#Plot
svg("Hallmark_ORA_enza_repressed_radiation_activated_recount.svg")
ggplot(gse_top, aes(reorder(ID, Ratio_ratio), Ratio_ratio, fill = log(p.adjust, base = 10))) +
geom_col(width = .8/2) +
coord_flip() +
scale_fill_gradient(limits = c(-10,0)) +
ylim(0,22) +
labs(y="Observed / Expected", x = NULL) +
scale_x_discrete(labels=c('A')) +
theme_minimal() +
labs(fill = "log(padj)") +
theme(text = element_text(size = 20))
dev.off()
#ORA for enzalutamide-repressed radiation-repressed genes
em <- enricher(enza.repressed.radiation.repressed, TERM2GENE=m_t2g, pvalueCutoff = 0.1)
fgseaResTidy <- em %>%
as_tibble() %>%
arrange(desc(p.adjust))
gse_top = fgseaResTidy
gse_top$Ratio_ratio =
(as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 2))) /
(as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 2)))
gse_top$ID = paste0(gse_top$ID, " (", gse_top$geneID, ")")
svg("Hallmark_ORA_enza_repressed_radiation_repressed_recount.svg")
ggplot(gse_top, aes(reorder(ID, Ratio_ratio), Ratio_ratio, fill = log(p.adjust, base = 10))) +
geom_col() +
coord_flip() +
scale_fill_gradient(limits = c(-10,0)) +
ylim(0,22) +
labs(y="Observed / Expected", x = NULL) +
scale_x_discrete(labels=c('C', 'B')) +
theme_minimal() +
labs(fill = "log(padj)") +
theme(text = element_text(size = 20))
dev.off()
#ORA for enzalutamide-activated radiation-activated genes
em <- enricher(enza.activated.radiation.activated, TERM2GENE=m_t2g, pvalueCutoff = 0.1)
fgseaResTidy <- em %>%
as_tibble() %>%
arrange(desc(p.adjust))
gse_top = fgseaResTidy
gse_top$Ratio_ratio =
(as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$GeneRatio, "/"), "[", 2))) /
(as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 1)) /
as.numeric(sapply(strsplit(gse_top$BgRatio, "/"), "[", 2)))
gse_top$ID = paste0(gse_top$ID, " (", gse_top$geneID, ")")
svg("Hallmark_ORA_enza_activated_radiation_activated_recount.svg")
ggplot(gse_top, aes(reorder(ID, Ratio_ratio), Ratio_ratio, fill = log(p.adjust, base = 10))) +
geom_col(width = .8/2) +
coord_flip() +
scale_fill_gradient(limits = c(-10,0)) +
ylim(0,22) +
labs(y="Observed / Expected", x = NULL) +
scale_x_discrete(labels=c('D')) +
theme_minimal() +
labs(fill = "log(padj)") +
theme(text = element_text(size = 20))
dev.off()
#This produced no enriched terms for enzalutamide-activated radiation-repressed genes
em <- enricher(enza.activated.radiation.repressed, TERM2GENE=m_t2g, pvalueCutoff = 0.1)
Here we scale the bigWigs based on the size factors generated by DESeq2 to account for differences in read depth. We just use paired end 1 because that is the polymerase location.
#Set variables for locations of the files used in the following code
#My bigWigs are on my external hard drive, and the rest of the files are on Box
main=/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/
sizes=~/Library/CloudStorage/Box-Box/GuertinLab/hg38/hg38.chrom.sizes
destination=~/Library/CloudStorage/Box-Box/GioeliLab/Radiation_enzalutamide_R/
factor2=${destination}/norm.bedGraph.sizeFactor_batch2.txt
cd $main
#Download the script to normalize the bigWigs
wget https://raw.githubusercontent.com/guertinlab/Nascent_RNA_Methods/main/normalize_bedGraph.py
chmod +x normalize_bedGraph.py
#Change python location in the file to where yours is
tail -n +2 normalize_bedGraph.py > tmp.txt
echo "#! /usr/local/anaconda3/bin/python" | cat - tmp.txt > normalize_bedGraph.py
rm tmp.txt
head normalize_bedGraph.py
#Move the script somewhere in your $PATH for future use not in this directory
mv normalize_bedGraph.py /usr/local/bin/
#Scale the bigWigs (just doing experiment 2 here because that's all we use for the figure)
for i in LNCaP_*_batch2_*_PE1.bigWig
do
name=$(echo $i | awk -F"_PE1.bigWig" '{print $1}')
strand=$(echo $name | awk -F"_" '{print $NF}')
name=$(echo $name | awk -F"_[p,m]" '{print $1}')
invscale1=$(grep ${name} $factor2 | awk -F" " '{print $2}')
invscale=$(echo $invscale1 | bc)
scaletrue=$(bc <<< "scale=4 ; 1.0 / $invscale")
name=${name}_${strand}_PE1
echo $name
echo $strand
echo $scaletrue
echo file_to_scale
echo ${name}.bigWig
bigWigToBedGraph ${name}.bigWig ${name}.bg
echo normalizing
normalize_bedGraph.py -i ${name}.bg -s $scaletrue -o ${name}_scaled.bg
bedGraphToBigWig ${name}_scaled.bg $sizes ${name}_scaled.bigWig
rm ${name}_scaled.bg
rm ${name}.bg
done
Now we merge the two replicates per condition. We use this for generating composite plots for each of the four conditions.
for i in LNCaP_*rep3_batch2_*_PE1_scaled.bigWig
do
name=$(echo $i | awk -F"_rep3" '{print $1}')
strand=$(echo $i | awk -F"batch2_" '{print $2}' | awk -F"_PE1_scaled.bigWig" '{print $1}')
echo $name
echo $strand
files=$(ls ${name}_rep*_batch2_${strand}_PE1_scaled.bigWig)
echo $files
> temp.txt
if [ "$strand" == "plus" ]
then
echo "track type=bedGraph name=${name}_batch2_PE1_plus color=255,0,0 alwaysZero=on visibility=full" >> temp.txt
fi
if [ "$strand" == "minus" ]
then
echo "track type=bedGraph name=${name}_batch2_PE1_minus color=0,0,255 alwaysZero=on visibility=full" >> temp.txt
fi
count=$(ls ${name}_rep*_batch2_${strand}_PE1_scaled.bigWig | wc -l | bc)
scaleall=$(bc <<< "scale=4 ; 1.0 / $count")
echo $count
echo $scaleall
name=${name}_batch2
bigWigMerge $files tmp.bg
normalize_bedGraph.py -i tmp.bg -s $scaleall -o ${name}_${strand}_PE1_scaled.bg
LC_COLLATE=C sort -k1,1 -k2,2n ${name}_${strand}_PE1_scaled.bg > ${name}_${strand}_PE1_scaled_sorted.bg
cat temp.txt ${name}_${strand}_PE1_scaled_sorted.bg > ${name}_${strand}_PE1_scaled.bedGraph
rm tmp.bg
rm temp.txt
rm ${name}_${strand}_PE1_scaled.bg
rm ${name}_${strand}_PE1_scaled_sorted.bg
head ${name}_${strand}_PE1_scaled.bedGraph
bedGraphToBigWig ${name}_${strand}_PE1_scaled.bedGraph $sizes ${name}_${strand}_PE1_combined_normalized.bigWig
gzip ${name}_${strand}_PE1_scaled.bedGraph
done
Here we merge all the samples for this experiment. This gives us more precision to detect the peak of paused polymerase density used to define the composite plotting windows.
plusfiles=$(ls *_batch2_plus_PE1_scaled.bigWig)
bigWigMerge ${plusfiles} pro_batch2_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_batch2_plus_merged.bedGraph > pro_batch2_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_batch2_plus_merged_sorted.bedGraph $sizes pro_batch2_plus_merged.bigWig
minusfiles=$(ls *_batch2_minus_PE1_scaled.bigWig)
bigWigMerge ${minusfiles} pro_batch2_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_batch2_minus_merged.bedGraph > pro_batch2_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_batch2_minus_merged_sorted.bedGraph $sizes pro_batch2_minus_merged.bigWig
Finally, we merge all samples for each of the two treatment variables. For example, 4 samples were irradiated (2 replicates were also treated with a vehicle control, and 2 replicates were also treated with enzalutamide). We use this for generating composite plots for radiation versus control and for enzalutamide versus vehicle.
plusfiles=$(ls LNCaP*6GyIR*batch2_plus_PE1_scaled.bigWig)
bigWigMerge ${plusfiles} pro_radiation_batch2_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_radiation_batch2_plus_merged.bedGraph > pro_radiation_batch2_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_radiation_batch2_plus_merged_sorted.bedGraph $sizes pro_radiation_batch2_plus_merged.bigWig
minusfiles=$(ls LNCaP*6GyIR*batch2_minus_PE1_scaled.bigWig)
bigWigMerge ${minusfiles} pro_radiation_batch2_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_radiation_batch2_minus_merged.bedGraph > pro_radiation_batch2_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_radiation_batch2_minus_merged_sorted.bedGraph $sizes pro_radiation_batch2_minus_merged.bigWig
plusfiles=$(ls LNCaP*batch2_plus_PE1_scaled.bigWig | grep -v 6GyIR)
bigWigMerge ${plusfiles} pro_control_batch2_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_control_batch2_plus_merged.bedGraph > pro_control_batch2_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_control_batch2_plus_merged_sorted.bedGraph $sizes pro_control_batch2_plus_merged.bigWig
minusfiles=$(ls LNCaP*batch2_minus_PE1_scaled.bigWig | grep -v 6GyIR)
bigWigMerge ${minusfiles} pro_control_batch2_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_control_batch2_minus_merged.bedGraph > pro_control_batch2_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_control_batch2_minus_merged_sorted.bedGraph $sizes pro_control_batch2_minus_merged.bigWig
plusfiles=$(ls LNCaP*10uMEnza*batch2_plus_PE1_scaled.bigWig)
bigWigMerge ${plusfiles} pro_enzalutamide_batch2_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_enzalutamide_batch2_plus_merged.bedGraph > pro_enzalutamide_batch2_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_enzalutamide_batch2_plus_merged_sorted.bedGraph $sizes pro_enzalutamide_batch2_plus_merged.bigWig
minusfiles=$(ls LNCaP*10uMEnza*batch2_minus_PE1_scaled.bigWig)
bigWigMerge ${minusfiles} pro_enzalutamide_batch2_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_enzalutamide_batch2_minus_merged.bedGraph > pro_enzalutamide_batch2_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_enzalutamide_batch2_minus_merged_sorted.bedGraph $sizes pro_enzalutamide_batch2_minus_merged.bigWig
plusfiles=$(ls LNCaP*batch2_plus_PE1_scaled.bigWig | grep -v 10uMEnza)
bigWigMerge ${plusfiles} pro_vehicle_batch2_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_vehicle_batch2_plus_merged.bedGraph > pro_vehicle_batch2_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_vehicle_batch2_plus_merged_sorted.bedGraph $sizes pro_vehicle_batch2_plus_merged.bigWig
minusfiles=$(ls LNCaP*batch2_minus_PE1_scaled.bigWig | grep -v 10uMEnza)
bigWigMerge ${minusfiles} pro_vehicle_batch2_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_vehicle_batch2_minus_merged.bedGraph > pro_vehicle_batch2_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_vehicle_batch2_minus_merged_sorted.bedGraph $sizes pro_vehicle_batch2_minus_merged.bigWig
#Remove the unsorted bedGraphs
rm *_merged.bedGraph
#Compress the sorted bedGraphs
gzip *_merged_sorted.bedGraph
#Move the files to the directory where you will be running the code below
cp *combined*.bigWig $destination
cp *.bedGraph.gz $destination
cp *_merged.bigWig $destination
Here we plot the composite polymerase density for all radiation-activated genes, for each of the radiation and control conditions. For each gene, we first find the peak of paused polymerase density by searching in a window of 500bp around the transcription start site. We center the plot on this summit across genes. We use a much more stringent FDR of 0.0001 to maximize the effect size.
#Isolate the radiation-activated genes
df.deseq.effects.lattice.radiation =
categorize.deseq.df.mods(res.deseq.radiation, fdr = 0.0001, log2fold = 0.0, treat = 'Radiation')
isolated.genes = rownames(df.deseq.effects.lattice.radiation)[df.deseq.effects.lattice.radiation$treatment == "Radiation Activated"]
bed.input = gene.file[gene.file$gene %in% isolated.genes,]
#Make a window around the TSS
bed.input[,8] = bed.input[,7] + 1
colnames(bed.input) = c('chr', 'start', 'end', 'gene', 'symbol', 'strand', 'TSS','TSS+1')
tss.df = bed.input[,c(1,7:8,4:6)]
tss.df[,3] = tss.df[,2] + 500
tss.df[,2] = tss.df[,2] - 500
#Sum polymerase density for each position in window
combined.plus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_plus_merged.bigWig')
combined.minus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_minus_merged.bigWig')
dat.x = bed6.step.probeQuery.bigWig(combined.plus.bw,combined.minus.bw, tss.df,
step = 1, as.matrix = TRUE, op = "sum", follow.strand = FALSE)
dat.x[is.na(dat.x)] <- 0
#Smoothen the data
dat.x.win = t(apply(dat.x, 1, function(x){rollapply(x, width = 50, FUN = mean,
by = 1, by.column = FALSE,align = "left")}))
index.max = apply(dat.x.win, 1, which.max)
the.max = apply(dat.x.win, 1, max)
#Find the highest value in window to use as pause summit
pause.df = tss.df
pause.df[2][pause.df$strand == '-',] = pause.df[2][pause.df$strand == '-',] + index.max[pause.df$strand == '-']
pause.df[2][pause.df$strand == '+',] = pause.df[2][pause.df$strand == '+',] + index.max[pause.df$strand == '+']
pause.df[3] = pause.df[2] + 50
pause.df[,2] = rowMeans(pause.df[,2:3])
pause.df[,3] = pause.df[,2] + 1
#Make a window around this summit for plotting
bedTSSwindow = fiveprime.bed(pause.df, upstreamWindow = upstream,
downstreamWindow = downstream)
#For each set of samples (radiation and control), calculate the polymerase density at each position in the window
composite.df <- data.frame()
for(treatment in c("radiation","control")) {
bwPlus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_plus_merged.bigWig'))
bwMinus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_minus_merged.bigWig'))
tss.matrix = bed6.step.bpQuery.bigWig(bwPlus, bwMinus , bedTSSwindow,
step = step, as.matrix=TRUE, follow.strand=TRUE)
coordin.start = (-upstream - 1) + (step * roll.avg)/2
coordin.end = downstream - (step * roll.avg)/2
composite.lattice = data.frame(seq(coordin.start, coordin.end, by = step),
rollmean(colMeans(tss.matrix), roll.avg),
treatment = treatment,
stringsAsFactors = FALSE)
colnames(composite.lattice) = c('x', 'est', 'treatment')
composite.lattice$x = as.numeric(composite.lattice$x)
composite.lattice$est = 1000* composite.lattice$est / nrow(tss.matrix)
composite.df <- rbind(composite.df,composite.lattice)
}
composite.df$treatment = factor(composite.df$treatment, levels = c("radiation", "control"))
save(composite.df,file='radiation.composites.stringent.Rdata')
load('radiation.composites.stringent.Rdata')
#Plot
plot.composites(composite.df, treatment = "radiation_activated_stringent", summit='Midpoint of Pause Peak', col.lines = c("red", "grey60"))
Similarly, we plot the composite polymerase density for all enzalutamide-repressed gene, for each of the enzalutamide and vehicle conditions.
#Isolate the enzalutamide-repressed genes
df.deseq.effects.lattice.enza =
categorize.deseq.df.mods(res.deseq.enza, fdr = 0.0001, log2fold = 0.0, treat = 'Enzalutamide')
isolated.genes = rownames(df.deseq.effects.lattice.enza)[df.deseq.effects.lattice.enza$treatment == "Enzalutamide Repressed"]
bed.input = gene.file[gene.file$gene %in% isolated.genes,]
#Make a window around the TSS
bed.input[,8] = bed.input[,7] + 1
colnames(bed.input) = c('chr', 'start', 'end', 'gene', 'symbol', 'strand', 'TSS','TSS+1')
tss.df = bed.input[,c(1,7:8,4:6)]
tss.df[,3] = tss.df[,2] + 500
tss.df[,2] = tss.df[,2] - 500
#Sum polymerase density for each position in window
combined.plus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_plus_merged.bigWig')
combined.minus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_minus_merged.bigWig')
dat.x = bed6.step.probeQuery.bigWig(combined.plus.bw,combined.minus.bw, tss.df,
step = 1, as.matrix = TRUE, op = "sum", follow.strand = FALSE)
dat.x[is.na(dat.x)] <- 0
#Smoothen the data
dat.x.win = t(apply(dat.x, 1, function(x){rollapply(x, width = 50, FUN = mean,
by = 1, by.column = FALSE,align = "left")}))
index.max = apply(dat.x.win, 1, which.max)
the.max = apply(dat.x.win, 1, max)
#Find highest value in window to use as pause summit
pause.df = tss.df
pause.df[2][pause.df$strand == '-',] = pause.df[2][pause.df$strand == '-',] + index.max[pause.df$strand == '-']
pause.df[2][pause.df$strand == '+',] = pause.df[2][pause.df$strand == '+',] + index.max[pause.df$strand == '+']
pause.df[3] = pause.df[2] + 50
pause.df[,2] = rowMeans(pause.df[,2:3])
pause.df[,3] = pause.df[,2] + 1
#Make a window around this summit for plotting
bedTSSwindow = fiveprime.bed(pause.df, upstreamWindow = upstream,
downstreamWindow = downstream)
#For each set of samples (enzalutamide and vehicle), calculate the polymerase density at each position in the window
composite.df <- data.frame()
for(treatment in c("enzalutamide","vehicle")) {
bwPlus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_plus_merged.bigWig'))
bwMinus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_', treatment, '_batch2_minus_merged.bigWig'))
tss.matrix = bed6.step.bpQuery.bigWig(bwPlus, bwMinus , bedTSSwindow,
step = step, as.matrix=TRUE, follow.strand=TRUE)
coordin.start = (-upstream - 1) + (step * roll.avg)/2
coordin.end = downstream - (step * roll.avg)/2
composite.lattice = data.frame(seq(coordin.start, coordin.end, by = step),
rollmean(colMeans(tss.matrix), roll.avg),
treatment = treatment,
stringsAsFactors = FALSE)
colnames(composite.lattice) = c('x', 'est', 'treatment')
composite.lattice$x = as.numeric(composite.lattice$x)
composite.lattice$est = 1000* composite.lattice$est / nrow(tss.matrix)
composite.df <- rbind(composite.df,composite.lattice)
}
save(composite.df,file='enzalutamide.composites.stringent.Rdata')
load('enzalutamide.composites.stringent.Rdata')
#Plot
plot.composites(composite.df, treatment = "enzalutamide_repressed_stringent", summit='Midpoint of Pause Peak', col.lines = c("blue", "grey60"))
Now we want to calculate the pause ratio for each gene in the above classes, and the change in pause ratio in response to each treatment.
#Get the enzalutamide-repressed genes
isolated.genes = rownames(df.deseq.effects.lattice.enza)[df.deseq.effects.lattice.enza$treatment == "Enzalutamide Repressed"]
bed.input = gene.file[gene.file$gene %in% isolated.genes,]
bed.input[,8] = bed.input[,7] + 1
colnames(bed.input) = c('chr', 'start', 'end', 'gene', 'symbol', 'strand', 'TSS','TSS+1')
#For each condition, sum the polymerase density in the pause region, average the polymerase density in the gene body, and store in a data frame
pause.region.summation = data.frame()
for(treatment in c("enzalutamide","vehicle")) {
df = bwplot.input.pause(bed.input,treatment)
pause.region.summation = rbind(pause.region.summation,df)
}
#Get the radiation-activated genes
isolated.genes = rownames(df.deseq.effects.lattice.radiation)[df.deseq.effects.lattice.radiation$treatment == "Radiation Activated"]
bed.input = gene.file[gene.file$gene %in% isolated.genes,]
bed.input[,8] = bed.input[,7] + 1
colnames(bed.input) = c('chr', 'start', 'end', 'gene', 'symbol', 'strand', 'TSS','TSS+1')
#Add these pause region sums and gene body averages to the above data frame
for(treatment in c("radiation","control")) {
df = bwplot.input.pause(bed.input,treatment)
pause.region.summation = rbind(pause.region.summation,df)
}
#Calculate the pause index by dividing the sum of polymerase signal in the pause region by the average of polymerase signal in the gene body
pause.region.summation$pause_index = pause.region.summation[,'pause_sum'] / pause.region.summation[,'body_avg']
save(pause.region.summation,file='pause.region.summation.stringent.Rdata')
load('pause.region.summation.stringent.Rdata')
#Perform t-tests to calculate the significance of the difference in pause index for each treatment
t.test(log(pause.region.summation$pause_index[pause.region.summation$treatment == "enzalutamide"] /
pause.region.summation$pause_index[pause.region.summation$treatment == "vehicle"], base = 2)[
is.finite(log(pause.region.summation$pause_index[pause.region.summation$treatment == "enzalutamide"] /
pause.region.summation$pause_index[pause.region.summation$treatment == "vehicle"], base = 2))
], na.action = na.rm) #0.0133
t.test(log(pause.region.summation$pause_index[pause.region.summation$treatment == "radiation"] /
pause.region.summation$pause_index[pause.region.summation$treatment == "control"], base = 2)[
is.finite(log(pause.region.summation$pause_index[pause.region.summation$treatment == "radiation"] /
pause.region.summation$pause_index[pause.region.summation$treatment == "control"], base = 2))
], na.action = na.rm) #< 2.2e-16
#Plot the log fold change in pause index upon enzalutamide treatment
df = pause.region.summation
late = "enzalutamide"
early = "vehicle"
plot.df = data.frame(gene = df[df$treatment == late,'gene'],
ratio = df[df$treatment == late,'pause_index'] / df[df$treatment == early,'pause_index'],
x = 1)
pdf(paste0('pause_index_ratio_',late,'.v.',early,'_stringent_bwplot.pdf'), width=2.5, height=3.4)
trellis.par.set(box.umbrella = list(lty = 1, col='black', lwd=2),
box.rectangle = list(col = 'black', lwd=1.6),
plot.symbol = list(col='black', lwd=1.6, pch ='.'))
print(bwplot(log(ratio, base = 2) ~ x, data = plot.df,
between=list(y=1.0, x = 1.0),
scales=list(x=list(draw=FALSE),relation="free",rot = 45, alternating=c(1,1,1,1),cex=1,font=1),
xlab = '',
main = "Pause Index (PI) Ratio",
ylab= substitute(paste('log'[2]*'',nn), list(nn=paste0('(',late,' PI / ',early,' PI)'))),
horizontal =FALSE, col= 'black',
aspect = 2,
par.settings=list(par.xlab.text=list(cex=1.2,font=1),
par.ylab.text=list(cex=1.2,font=1),
par.main.text=list(cex=1.2, font=1),
plot.symbol = list(col='black', lwd=1.6, pch =19, cex = 0.25)),
strip = function(..., which.panel, bg) {
#bg.col = c("#ce228e" ,"grey60", "#2290cf","grey90")
strip.default(..., which.panel = which.panel,
bg = rep(bg.col, length = which.panel)[which.panel])
},
panel = function(..., box.ratio, col) {
panel.abline(h = 0, col = 'black', lty = 2)
panel.violin(..., col = "blue",
varwidth = FALSE, box.ratio = box.ratio, outer = FALSE)
panel.stripplot(..., col='black', do.out=FALSE, jitter.data=TRUE, amount = 0.2, pch = 16)
panel.bwplot(..., pch = '|', do.out = FALSE)
}))
dev.off()
#Plot the log fold change in pause index upon radiation treatment
late = "radiation"
early = "control"
plot.df = data.frame(gene = df[df$treatment == late,'gene'],
ratio = df[df$treatment == late,'pause_index'] / df[df$treatment == early,'pause_index'],
x = 1)
pdf(paste0('pause_index_ratio_',late,'.v.',early,'_stringent_bwplot.pdf'), width=2.5, height=3.4)
trellis.par.set(box.umbrella = list(lty = 1, col='black', lwd=2),
box.rectangle = list(col = 'black', lwd=1.6),
plot.symbol = list(col='black', lwd=1.6, pch ='.'))
print(bwplot(log(ratio, base = 2) ~ x, data = plot.df,
between=list(y=1.0, x = 1.0),
scales=list(x=list(draw=FALSE),relation="free",rot = 45, alternating=c(1,1,1,1),cex=1,font=1),
xlab = '',
main = "Pause Index (PI) Ratio",
ylab= substitute(paste('log'[2]*'',nn), list(nn=paste0('(',late,' PI / ',early,' PI)'))),
horizontal =FALSE, col= 'black',
aspect = 2,
par.settings=list(par.xlab.text=list(cex=1.2,font=1),
par.ylab.text=list(cex=1.2,font=1),
par.main.text=list(cex=1.2, font=1),
plot.symbol = list(col='black', lwd=1.6, pch =19, cex = 0.25)),
strip = function(..., which.panel, bg) {
#bg.col = c("#ce228e" ,"grey60", "#2290cf","grey90")
strip.default(..., which.panel = which.panel,
bg = rep(bg.col, length = which.panel)[which.panel])
},
panel = function(..., box.ratio, col) {
panel.abline(h = 0, col = 'black', lty = 2)
panel.violin(..., col = "red",
varwidth = FALSE, box.ratio = box.ratio, outer = FALSE)
panel.stripplot(..., col='black', do.out=FALSE, jitter.data=TRUE, amount = 0.2, pch = 16)
panel.bwplot(..., pch = '|', do.out = FALSE)
}))
dev.off()
Similarly to above, we want to calculate the pause ratio for each of these genes for each of the 4 conditions.
#Get the 16 genes
df.deseq.effects.lattice.radiation =
categorize.deseq.df.mods(res.deseq.radiation, fdr = 0.1, log2fold = 0.0, treat = 'Radiation')
df.deseq.effects.lattice.enza =
categorize.deseq.df.mods(res.deseq.enza, fdr = 0.1, log2fold = 0.0, treat = 'Enzalutamide')
isolated.genes = rownames(df.deseq.effects.lattice.enza)[df.deseq.effects.lattice.enza$treatment == "Enzalutamide Repressed"]
isolated.genes = isolated.genes[isolated.genes %in% rownames(df.deseq.effects.lattice.radiation)[df.deseq.effects.lattice.radiation$treatment == "Radiation Activated"]]
bed.input = gene.file[gene.file$gene %in% isolated.genes,]
#Make a window around the TSS
bed.input[,8] = bed.input[,7] + 1
colnames(bed.input) = c('chr', 'start', 'end', 'gene', 'symbol', 'strand', 'TSS','TSS+1')
tss.df = bed.input[,c(1,7:8,4:6)]
tss.df[,3] = tss.df[,2] + 500
tss.df[,2] = tss.df[,2] - 500
#Sum polymerase density for each position in window
combined.plus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_plus_merged.bigWig')
combined.minus.bw = load.bigWig('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/pro_batch2_minus_merged.bigWig')
dat.x = bed6.step.probeQuery.bigWig(combined.plus.bw,combined.minus.bw, tss.df,
step = 1, as.matrix = TRUE, op = "sum", follow.strand = FALSE)
dat.x[is.na(dat.x)] <- 0
#Smoothen the data
dat.x.win = t(apply(dat.x, 1, function(x){rollapply(x, width = 50, FUN = mean,
by = 1, by.column = FALSE,align = "left")}))
index.max = apply(dat.x.win, 1, which.max)
the.max = apply(dat.x.win, 1, max)
#Find highest value in window to use as pause summit and take a 50 bp window around it - this is the 'pause region'
pause.df = tss.df
pause.df[2][pause.df$strand == '-',] = pause.df[2][pause.df$strand == '-',] + index.max[pause.df$strand == '-']
pause.df[2][pause.df$strand == '+',] = pause.df[2][pause.df$strand == '+',] + index.max[pause.df$strand == '+']
pause.df[3] = pause.df[2] + 50
#Define 'body region' as end of pause region to the end of the plotting window (strand specific)
body.df = fiveprime.bed(pause.df, upstreamWindow = upstream, downstreamWindow = downstream)
body.df[body.df$strand == '+',2] = pause.df[pause.df$strand == '+',3]+1
body.df[body.df$strand == '-',3] = pause.df[pause.df$strand == '-',2]-1
#For each condition, measure polymerase density within pause and body regions (sum for pause, average for body)
pause.region.summation = data.frame()
for(treatment in c("","_6GyIR", "_10uMEnza", "_10uMEnza_6GyIR")) {
output.df=cbind.data.frame(bed.input[,c(1:6)], rep(treatment, nrow(bed.input)))
colnames(output.df)[7] = "treatment"
wiggle.plus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/LNCaP', treatment, '_batch2_plus_PE1_combined_normalized.bigWig'))
wiggle.minus = load.bigWig(paste0('/Volumes/External/Users/TScott/Radiation_enzalutamide_PRO/LNCaP', treatment, '_batch2_minus_PE1_combined_normalized.bigWig'))
output.df[,8] = bed6.region.bpQuery.bigWig(wiggle.plus, wiggle.minus, pause.df, op = "sum")
output.df[,9] = bed6.region.bpQuery.bigWig(wiggle.plus, wiggle.minus, body.df, op = "avg")
colnames(output.df)[c(8,9)] = c('pause_sum','body_avg')
df = output.df
pause.region.summation = rbind(pause.region.summation,df)
}
#Calculate the pause index by dividing the sum of polymerase signal in the pause region by the average of polymerase signal in the gene body
pause.region.summation$pause_index = pause.region.summation[,'pause_sum'] / pause.region.summation[,'body_avg']
#Rename the conditions for plotting
pause.region.summation$treatment[pause.region.summation$treatment == ""] = "Control/Vehicle"
pause.region.summation$treatment[pause.region.summation$treatment == "_6GyIR"] = "Radiation/Vehicle"
pause.region.summation$treatment[pause.region.summation$treatment == "_10uMEnza"] = "Control/Enzalutamide"
pause.region.summation$treatment[pause.region.summation$treatment == "_10uMEnza_6GyIR"] = "Radiation/Enzalutamide"
pause.region.summation$treatment = factor(pause.region.summation$treatment, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle"))
save(pause.region.summation,file='pause.region.summation.curated.Rdata')
load('pause.region.summation.curated.Rdata')
#Plot pause index ratios relative to the control/vehicle condition
df = pause.region.summation
col.lines = viridis(5)[c(4,4,2,2)]
early = "Control/Vehicle"
for (i in levels(factor(pause.region.summation$treatment))) {
df$ratio[df$treatment == i] = df$pause_index[df$treatment == i] / df$pause_index[df$treatment == early]
}
df$treatment = factor(df$treatment, levels = c("Radiation/Enzalutamide", "Radiation/Vehicle", "Control/Enzalutamide", "Control/Vehicle")[4:1])
#Plot
pdf('pause_index_ratio_curated_bwplot.pdf', width=5, height=3.4)
trellis.par.set(box.umbrella = list(lty = 1, col='black', lwd=2),
box.rectangle = list(col = 'black', lwd=1.6),
plot.symbol = list(col='black', lwd=1.6, pch ='.'))
print(bwplot(log(ratio, base = 2) ~ treatment, data = df, group = treatment,
between=list(y=1.0, x = 1.0),
scales=list(x=list(draw=FALSE),relation="free",rot = 45, alternating=c(1,1,1,1),cex=1,font=1),
xlab=levels(df$treatment),
main = "Pause Index (PI) Ratio",
ylab= expression("log"[2]~"(pause index ratio)"),
horizontal =FALSE, col= 'black',
aspect = 1,
par.settings=list(par.xlab.text=list(cex=.4,font=1),
par.ylab.text=list(cex=1.2,font=1),
par.main.text=list(cex=1.2, font=1),
plot.symbol = list(col='black', lwd=1.6, pch =19, cex = 0.25)),
strip = function(..., which.panel, bg) {
strip.default(..., which.panel = which.panel,
bg = rep(bg.col, length = which.panel)[which.panel])
},
panel = function(..., box.ratio, col) {
panel.abline(h = 0, col = 'black', lty = 2)
panel.violin.hack(..., col = col.lines,
varwidth = FALSE, box.ratio = box.ratio, outer = FALSE)
panel.stripplot(..., col='black', do.out=FALSE, jitter.data=TRUE, amount = 0.2,
pch = c(16, 17, 16, 17))
panel.bwplot(..., pch = '|', do.out = FALSE)
}))
dev.off()