Command line executed on EMBRAPA grid:

time java -Djava.io.tmpdir=.hpexome/ -jar HPexome.jar -R hpexome_validation/genome.chr.fa \
-I hpexome_validation/NA12877_S1.sort.fix.bam \
-known hpexome_validation/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.indels.sites.chr.vcf \
-known hpexome_validation/ALL.wgs.low_coverage_vqsr.20101123.indels.sites.chr.vcf \
-knownSites hpexome_validation/clinvar_20140604.chr.vcf -dbsnp hpexome_validation/clinvar_20140604.chr.vcf \
-stand_call_conf 40 -stand_emit_conf 10 -minPruning 3 -l DEBUG \
-run -nct 8 -nt 8 -scattercount 8 \
-jobRunner GridEngine -jobEnv "make 1" \
-outdir hpexome_validation/validate_NA12877_try1 | qsub -V -cwd -N validate_NA12877_try1

Download VCF reference: ftp://ussd-ftp.illumina.com/hg19/8.0.1/NA12877/

Packages

library(VariantAnnotation)
library(knitr)
library(ggplot2)

Normalize VCF files:

workDir <- "/home/bioinf/hpexome_validation"
genomeFile <- file.path(workDir, "genome.chr.fa")
NA12877.ref <- file.path(workDir, "NA12877.vcf.gz")
NA12877.ref.vt <- file.path(workDir, "NA12877.vt.vcf")
NA12877.vcf <- file.path(workDir, "validate_NA12877_try1/NA12877_S1.sort.fix.HC.raw.vt.vcf.bz")
NA12877.vcf.vt <- file.path(workDir, "validate_NA12877_try1/NA12877_S1.sort.fix.HC.raw.vt.vcf")
vt decompose /home/bioinf/hpexome_validation/NA12877.vcf.gz | vt normalize -r /home/bioinf/hpexome_validation/genome.chr.fa - | vt uniq - -o /home/bioinf/hpexome_validation/NA12877.vt.vcf
vt decompose /home/bioinf/hpexome_validation/validate_NA12877_try1/NA12877_S1.sort.fix.HC.raw.vt.vcf.bz | vt normalize -r /home/bioinf/hpexome_validation/genome.chr.fa - | vt uniq - -o /home/bioinf/hpexome_validation/validate_NA12877_try1/NA12877_S1.sort.fix.HC.raw.vt.vcf
NA12877.ref.vt.bz <- paste0(NA12877.ref.vt, ".bz")
# bgzip(NA12877.ref.vt, NA12877.ref.vt.bz)
NA12877.vcf.vt.bz <- paste0(NA12877.vcf.vt, ".bz")
# bgzip(NA12877.ref.vt, NA12877.vcf.vt.bz)

Load VCF files into R:

genome <- "hg19"
ref <- readVcf(NA12877.ref.vt.bz, genome)
vcf <- readVcf(NA12877.vcf.vt.bz, genome)

Filter VCFs by selected chromosomes

chr.select <- paste0("chr", 1:22)
seqlevels(ref, force=TRUE) <- seqlevels(ref)[seqlevels(ref) %in% chr.select]
seqlevels(vcf, force=TRUE) <- seqlevels(vcf)[seqlevels(vcf) %in% chr.select]

Remove variants from reference that have genotype “0/.”

idx <- which(!grepl("0(\\/|\\|)\\.", geno(ref)$GT))
ref <- ref[idx, ]

Number of variants:

kable(data.frame(reference=nrow(ref), hpexome=nrow(vcf)))
reference hpexome
4425657 4706967

Subset VCF files using common variants between the two VCFs:

vcf.ranges <- rowRanges(vcf)
ref.ranges <- rowRanges(ref)
overlap <- findOverlaps(vcf.ranges, ref.ranges)
vcf.sub <- vcf[queryHits(overlap), ]
ref.sub <- ref[subjectHits(overlap), ]

Number of variants after subset (must be the same value for the two file):

kable(data.frame(reference=nrow(ref.sub), hpexome=nrow(vcf.sub)))
reference hpexome
4356821 4356821

The hpexome tool not found 68836 variants. However this tool found 350146 new variants. Because hpxome found 281310 more variants than reference.

Create genotype count table

vcf.gt <- geno(vcf.sub)$GT
ref.gt <- sub("\\|", "/", geno(ref.sub)$GT)
ref.gt <- sub("1\\/0", "0/1", ref.gt)

genoCount <- table(ref.gt, vcf.gt)
genoCount
##       vcf.gt
## ref.gt       .     0/1     1/.     1/1
##    .     34160    7871   34178    8255
##    0/1    2485 2584861    3347    4205
##    1/.   30938    2958   30945    2539
##    1/1     642    4807     663 1603967

Degree of concordance

sum(diag(genoCount)) / sum(genoCount) * 100
## [1] 97.63846

Create intevals for QG value:

gq <- geno(vcf.sub)$GQ
breaks <- unique(quantile(gq, seq(0, 1, .01), na.rm=TRUE))
intervals <- cut(gq, breaks)

This function count the calculates the number of correct genotypes:

f <- function(interval, vcf.gt, ref.gt, intervals)
{
  idx <- interval == intervals
  sum(vcf.gt[idx] == ref.gt[idx], na.rm=TRUE) / length(ref.gt[idx]) * 100
}

Create table for intervals and correct genotypes:

correct <- sapply(levels(intervals), f, vcf.gt, ref.gt, intervals, USE.NAMES = FALSE)
df <- data.frame(intervals=levels(intervals), correct)
kable(df)
intervals correct
(0,42] 18.59625
(42,64] 21.80658
(64,80] 22.67760
(80,92] 23.52834
(92,99] 96.33104

Plot intervals

ggplot(df, aes(x=intervals, y=correct)) +
    geom_bar(stat="identity") +
    labs(x="QG value intervals", y="% correct variant")
plot of chunk unnamed-chunk-15

plot of chunk unnamed-chunk-15