Skip to content
Snippets Groups Projects
Commit 2e46c9bf authored by Ahmed Abbas's avatar Ahmed Abbas
Browse files

Delete get_CTCF_TADs_flags.R

parent dc1a0999
Branches
No related merge requests found
options(bedtools.path = '/home2/s206442/.conda/envs/my_renv1/bin')
library(bedtoolsr)
fname_ctcf <- '/work/pathology/s206442/dbet_data/inputs_gm12878_rnapol2/GSE63525_IMR90_Arrowhead_domainlist.txt'
#fname_ctcf <- '/work/pathology/s206442/dbet_data/inputs_gm12878_rnapol2/GSE63525_GM12878_primary+replicate_Arrowhead_domainlist.txt'
CTCF_motifs <- read.table(fname_ctcf,sep='\t',header=FALSE)
for(i in 1:1)
{
print(i)
if(i<23)
{
chr <- paste0('chr',i)
chr_number <- i
}else{
chr <- 'chrX'
chr_number <- 'X'
}
#Res_5Kb/chr%d_hic.bedpe
#Hi-C_data/Orig_inputs_',chr_num,'_CPP_HiC.txt
fname <- paste('/project/pathology/Mani_lab/s206442/IMR90_data/Hi-C_data/Orig_inputs_',chr,'_CPP_HiC.txt',sep='')
chia = read.table(fname,header=FALSE,sep='\t')
#fname <- paste('./peaks_bed_files/',chr,'_pk1.bed',sep='')
ind_chr <- which(chia[,1] == chr)
chr_loops <- chia[ind_chr,1:6]
s <- seq(1,dim(chr_loops)[1])
chr_loops <- data.frame(chr_loops,s)
chr_peaks <- data.frame(chr_loops[,1],chr_loops[,2],chr_loops[,6],chr_loops[,7])
#get the CTCF motifs of the current chromosome
ind_ctcf <- which(CTCF_motifs[,1]==chr_number)
ctcf_chr <- CTCF_motifs[ind_ctcf,1:6]
ctcf_chr[,1] <- chr
ctcf_chr[,4] <- chr
ctcf_peaks <- data.frame(ctcf_chr[,1],ctcf_chr[,2],ctcf_chr[,6])
intersect1 <- bedtoolsr::bt.intersect(a=chr_peaks,b=ctcf_peaks,wa = TRUE, wb = TRUE,f=1)
ind <- intersect1[,4]
ind
final <- rep(0,dim(chr_loops)[1])
ind <- intersect1[,4]
final[ind] <- 1
#keep only first peak and second peak and the orientation flag file
fname_final <- paste('/project/pathology/Mani_lab/s206442/Inputs_CNN/HiC_inputs/IMR90/TADs_',chr,'.txt',sep='')
write.table(final,file=fname_final,sep='\t',col.names=FALSE,row.names=FALSE,quote=FALSE)
}
#read the CTCF motif known1 downloaded from [http://compbio.mit.edu/encode-motifs/]
fname_ctcf <- '/work/pathology/s206442/dbet_data/CTCF.txt'
CTCF_motifs <- read.table(fname_ctcf,sep=' ',header=FALSE)
for(i in 1:1)
{
if(i<23)
{
chr <- paste0('chr',i)
}else{
chr <- 'chrX'
}
#fname <- paste('/project/pathology/Mani_lab/s206442/IMR90_data/Hi-C_data/Res_5Kb/',chr,'_hic.bedpe',sep='')
fname <- paste('/project/pathology/Mani_lab/s206442/IMR90_data/Hi-C_data/Orig_inputs_',chr,'_CPP_HiC.txt',sep='')
chia = read.table(fname,header=FALSE,sep='\t')
#fname <- paste('./peaks_bed_files/',chr,'_pk1.bed',sep='')
ind_chr <- which(chia[,1] == chr)
chr_pk1 <- chia[ind_chr,1:3]
#put a sequence number for each peak
seq_num <- seq(1,dim(chr_pk1)[1])
df1 <- data.frame(chr_pk1,seq_num)
#read peaks 2
#fname <- paste('./peaks_bed_files/',chr,'_pk2.bed',sep='')
chr_pk2 <- chia[ind_chr,4:6]
#put a sequence number of peaks 2
seq_num <- seq(1,dim(chr_pk2)[1])
df2 <- data.frame(chr_pk2,seq_num)
colnames(df1) <- c('Chr','Bp1','Bp2','SeqNum')
colnames(df2) <- c('Chr','Bp1','Bp2','SeqNum')
#get the CTCF motifs of the current chromosome
ind_ctcf <- which(CTCF_motifs[,2]==chr)
ctcf_chr <- CTCF_motifs[ind_ctcf,c(2,3,4,5)]
colnames(ctcf_chr) <- c('Chr','Bp1','Bp2','strand')
#intersect peak1 with the motifs
#Perform a “left outer join”. That is, for each feature in A report each overlap with B. If no overlaps are found, #report a NULL feature for B.
#Thus, we are sure that each peak will have an entry even if it does not have a match in the motifs file
intersect_pk1 <- bedtoolsr::bt.intersect(a = df1, b = ctcf_chr, wa = TRUE, wb = TRUE,F=1,loj=TRUE)
#intersect peak2 with the motifs
intersect_pk1 <- intersect_pk1[!duplicated(intersect_pk1[,4]),]
intersect_pk2 <- bedtoolsr::bt.intersect(a = df2, b = ctcf_chr, wa = TRUE, wb = TRUE,F=1,loj=TRUE)
intersect_pk2 <- intersect_pk2[!duplicated(intersect_pk2[,4]),]
#keep only the peak information, seq number, and motif strand
PK1_final <- intersect_pk1[,c(1,2,3,4,8)]
PK2_final <- intersect_pk2[,c(1,2,3,4,8)]
colnames(PK1_final) <- c('chr1','bp1_1','bp2_1','seq','strand1')
colnames(PK2_final) <- c('chr2','bp1_2','bp2_2','seq','strand2')
#Merge Peak1 and Peak 2 with seq number, so that peaks belonging to same interaction are in same row
merged <- merge(PK1_final,PK2_final,by='seq')
print(head(merged))
#make a new column that it '1' only if convergent orientation
col <- rep(0,dim(merged)[1])
ind <- which(merged$strand1=='+' & merged$strand2=='-')
col[ind] = 1
df_merged <- data.frame(merged,col)
print(head(df_merged))
#keep only first peak and second peak and the orientation flag file
final <- df_merged[,c(10)]
print(head(final))
fname_final <- paste('/project/pathology/Mani_lab/s206442/Inputs_CNN/HiC_inputs/IMR90/CTCF_',chr,'_orientation.txt',sep='')
write.table(final,file=fname_final,sep='\t',col.names=FALSE,row.names=FALSE,quote=FALSE)
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment