Hi,
I was wondering I'm going about this in the correct way. I need to test if
there are coding sequences or exons in hg19 which match a string of 100bp
"D" i.e. [A,G or T]. However I'm getting a strange result.
I get a hit on chr7, using the 100bp search however when I search with 60bp
sequence of "D" I don't get any hits.
library("BSgenome")
library("Biostrings")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")
library("GenomicFeatures")
#extract the alignments which match real genes
#(add this onto stefans script … req to buid the gr object using the regions
from the BSgenome alignment of the 100bp seqs.)
txdb <- makeTranscriptDbFromUCSC(genome = "hg19", tablename =
"knownGene")
## do once locally & save
#query.plus <-
DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD")
# 100bp C free sequence
query.plus <-
DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD")
#
60bp C free sequence
query.minus <- reverseComplement(query.plus)
chrList <- c("chr1", "chr2", "chr3",
"chr4", "chr5", "chr6", "chr7",
"chr8",
"chr9", "chr10", "chr11", "chr12",
"chr13", "chr14", "chr15", "chr16",
"chr17", "chr18", "chr19", "chr20",
"chr21", "chr22", "chrX", "chrY")
#access/group the by subset of annotation
annotGr <- exons(txdb)
#annotGr <- cds(txdb)
wholeGenomeMatch <- function()
{
for(i in chrList)
{
#matches on plus strand
matches.plus.strand <- matchPattern(query.plus, Hsapiens[[i]],
fixed=FALSE, max.mismatch=5)
mp <- as.matrix(matches.plus.strand)
#matches on minus strand
matches.minus.strand <- matchPattern(query.minus, Hsapiens[[i]],
fixed=FALSE, max.mismatch=5)
mm <- as.matrix(matches.minus.strand)
OL.p <- NULL
if(nrow(mp) > 0)
{
##### MATCH THE POSITIVE STRAND ######
#need to get start and end positions (end = start + (length-1))
gr <- GRanges(
seqnames = rep(i,nrow(mp)),
ranges = IRanges(start = mp[[1]],
end = mp[[1]]+(mp[,2]-1)),
strand = rep("+", nrow(mp)))
#OL <- findOverlaps(query=gr, subject=annotGr)
OL.p <- annotGr[(!is.na(match(annotGr, gr)))]
#as.data.frame(OL) ## view the results
#tdata.p <- annotGr[unique(queryHits(OL)),]
#tdata.p <- as.data.frame(tdata.p, row.names = NULL, optional = FALSE)
#write.table(tdata.p, paste(i,"plus.txt"))
cat( paste(i,"plus.txt\n"))
}
OL.m <- NULL
if(nrow(mm) > 0)
{
##### MATCH THE MINUS STRAND ########
#need to get start and end positions (end = start + (length-1))
gr <- GRanges(
seqnames = rep(i,nrow(mm)),
ranges = IRanges(start = mm[[1]],
end = mm[[1]]+(mm[,2]-1)),
strand = rep("-", nrow(mm)))
#OL <- findOverlaps(query=gr, subject=annotGr)
OL.m <- annotGr[(!is.na(match(annotGr, gr)))]
#tdata.m <- annotGr[unique(queryHits(OL)),]
#tdata.m <- as.data.frame(tdata.m, row.names = NULL, optional = FALSE)
#write.table(tdata.m, paste(i,"minus.txt"))
cat( paste(i,"minus.txt\n"))
#write.table(rbind(tdata.p, tdata.p), paste(i, ".txt"))
}
##write all matching results into one file
write.table(rbind(as.data.frame(OL.p), as.data.frame(OL.m)),
file="60bp_v_exons_max-mismatch=5.txt", append=TRUE)
}
}
wholeGenomeMatch()
#in the first instance with the 100bp search I get 1 hit in the output file:
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"1" "chr7" 420815 422845 2031 "+" 97277
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
#with the 60 bp query string my output file reads no hits
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
"seqnames" "start" "end" "width"
"strand" "exon_id"
Your help is much appreciated,
Amos
[[alternative HTML version deleted]]