Title: | Inference of Chromosome-Length Haplotypes Using Genomic Data of Single Gamete Cells |
---|---|
Description: | Inference of chromosome-length haplotypes using a few haploid gametes of an individual. The gamete genotype data may be generated from various platforms including genotyping arrays and sequencing even with low-coverage. Hapi simply takes genotype data of known hetSNPs in single gamete cells as input and reports the high-resolution haplotypes as well as the confidence of each phased hetSNP. The package also includes a module allowing downstream analyses and visualization of identified crossovers in the gametes. |
Authors: | Ruidong Li, Han Qu, Jinfeng Chen, Shibo Wang, Le Zhang, Julong Wei, Sergio Pietro Ferrante, Mikeal L. Roose, Zhenyu Jia |
Maintainer: | Ruidong Li <[email protected]> |
License: | GPL-3 |
Version: | 0.0.3 |
Built: | 2024-11-03 04:05:47 UTC |
Source: | https://github.com/cran/Hapi |
Hapi is a novel easy-to-use package that only requires 3 to 5 gametes to reconstruct accurate and high-resolution haplotypes of an individual. The gamete genotype data may be generated from various platforms including genotyping arrays and next generation sequencing even with low-coverage. Hapi simply takes genotype data of known hetSNPs in single gamete cells as input and reports the high-resolution haplotypes as well as the confidence level of each phased hetSNP. The package also includes a module allowing downstream analyses and visualization of crossovers in the gametes.
Convert base (A/T/C/G) coded genotype to numeric (0/1) coded
base2num(gmt, ref, alt)
base2num(gmt, ref, alt)
gmt |
a dataframe of genotype data of gamete cells |
ref |
a character representing the reference allele |
alt |
a character representing the alternative allele |
a dataframe containing converted genotype
Ruidong Li
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) gmt <- data.frame(chr=rep(1,500), pos=seq_len(500), ref=ref, alt=alt, gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) gmtDa <- base2num(gmt=gmt[5:9], ref=ref, alt=alt)
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) gmt <- data.frame(chr=rep(1,500), pos=seq_len(500), ref=ref, alt=alt, gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) gmtDa <- base2num(gmt=gmt[5:9], ref=ref, alt=alt)
Crossover information across all gamete cells
Haplotypes of a single gamete cell for visualization
Assemble the consensus high-resolution haplotypes
hapiAssemble(gmt, draftHap, keepLowConsistency = TRUE, consistencyThresh = 0.85)
hapiAssemble(gmt, draftHap, keepLowConsistency = TRUE, consistencyThresh = 0.85)
gmt |
a dataframe of genotype data of gamete cells |
draftHap |
a dataframe with draft haplotype information |
keepLowConsistency |
logical, if low-consistent gamete cells should be kept |
consistencyThresh |
a numeric value of the threshold determining low-consistent gamete cells compared with the draft haplotype. Default is 0.85 |
a dataframe containing phased haplotypes
Ruidong Li
finalDraft <- rep(0,500) names(finalDraft) <- seq_len(500) ref <- rep(0,500) alt <- rep(1,500) gmtDa <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtDa[idx1,1] <- NA gmtDa[idx2,2] <- NA gmtDa[idx3,3] <- NA consensusHap <- hapiAssemble(draftHap = finalDraft, gmt = gmtDa)
finalDraft <- rep(0,500) names(finalDraft) <- seq_len(500) ref <- rep(0,500) alt <- rep(1,500) gmtDa <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtDa[idx1,1] <- NA gmtDa[idx2,2] <- NA gmtDa[idx3,3] <- NA consensusHap <- hapiAssemble(draftHap = finalDraft, gmt = gmtDa)
Assembly of haplotypes in regions at the end of a chromosome
hapiAssembleEnd(gmt, draftHap, consensusHap, k = 300)
hapiAssembleEnd(gmt, draftHap, consensusHap, k = 300)
gmt |
a dataframe of genotype data of gamete cells |
draftHap |
a dataframe with draft haplotype information |
consensusHap |
a dataframe of the consensus haplotype information |
k |
a numeric value for the number of hetSNPs that will be combined with markers beyond the framework for assembly. Default is 300 |
a dataframe containing phased haplotypes
Ruidong Li
finalDraft <- rep(0,500) names(finalDraft) <- seq_len(500) ref <- rep(0,500) alt <- rep(1,500) gmtDa <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtDa[idx1,1] <- NA gmtDa[idx2,2] <- NA gmtDa[idx3,3] <- NA consensusHap <- data.frame(hap1=rep(0,500),hap2=rep(1,500), total=rep(5,500),rate=rep(1,500), confidence=rep('F',500), stringsAsFactors = FALSE) rownames(consensusHap) <- seq_len(500) consensusHap <- hapiAssembleEnd(gmt = gmtDa, draftHap = finalDraft, consensusHap = consensusHap, k = 300)
finalDraft <- rep(0,500) names(finalDraft) <- seq_len(500) ref <- rep(0,500) alt <- rep(1,500) gmtDa <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtDa[idx1,1] <- NA gmtDa[idx2,2] <- NA gmtDa[idx3,3] <- NA consensusHap <- data.frame(hap1=rep(0,500),hap2=rep(1,500), total=rep(5,500),rate=rep(1,500), confidence=rep('F',500), stringsAsFactors = FALSE) rownames(consensusHap) <- seq_len(500) consensusHap <- hapiAssembleEnd(gmt = gmtDa, draftHap = finalDraft, consensusHap = consensusHap, k = 300)
Automatic inference of haplotypes
hapiAutoPhase(gmt, code = "atcg")
hapiAutoPhase(gmt, code = "atcg")
gmt |
a dataframe of genotype data of gamete cells |
code |
a character indicating the code style of genotype data.
One of "atcg" and "01". Default is "atcg" |
a dataframe of inferred consensus haplotypes
Ruidong Li
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) gmt <- data.frame(chr=rep(1,500), pos=seq_len(500), ref=ref, alt=alt, gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) hapOutput <- hapiAutoPhase(gmt=gmt, code='atcg')
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) gmt <- data.frame(chr=rep(1,500), pos=seq_len(500), ref=ref, alt=alt, gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) hapOutput <- hapiAutoPhase(gmt=gmt, code='atcg')
Maximum Parsimony of Recombination (MPR) for proofreading of draft haplotypes
hapiBlockMPR(draftHap, gmtFrame, cvlink = 2, smallBlock = 100)
hapiBlockMPR(draftHap, gmtFrame, cvlink = 2, smallBlock = 100)
draftHap |
a dataframe with draft haplotype information |
gmtFrame |
a dataframe of raw genotype data in the framework |
cvlink |
a numeric value of the number of cvlinks. Default is 2 |
smallBlock |
a numeric value determining the size of small blocks that should be excluded from the draft haplotypes |
a dataframe of draft haplotypes after proofreading
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) gmtFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtFrame[idx1,1] <- NA gmtFrame[idx2,2] <- NA gmtFrame[idx3,3] <- NA imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(imputedFrame) finalDraft <- hapiBlockMPR(draftHap, gmtFrame, cvlink=2, smallBlock=100)
ref <- rep(0,500) alt <- rep(1,500) gmtFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtFrame[idx1,1] <- NA gmtFrame[idx2,2] <- NA gmtFrame[idx3,3] <- NA imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(imputedFrame) finalDraft <- hapiBlockMPR(draftHap, gmtFrame, cvlink=2, smallBlock=100)
Filter out hetSNPs in potential complex regions
hapiCVCluster(draftHap, minDistance = 1e+06, cvlink = 2)
hapiCVCluster(draftHap, minDistance = 1e+06, cvlink = 2)
draftHap |
a dataframe with draft haplotype information |
minDistance |
a numeric value of the distance between two
genomic positions with cv-links. Default is 1e+06 |
cvlink |
a numeric value of the number of cvlinks. Default is 2 |
a dataframe of regions to be filtered out
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(imputedFrame) cvCluster <- hapiCVCluster(draftHap = draftHap, cvlink=2)
ref <- rep(0,500) alt <- rep(1,500) imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(imputedFrame) cvCluster <- hapiCVCluster(draftHap = draftHap, cvlink=2)
Histogram of crossover distance
hapiCVDistance(cv)
hapiCVDistance(cv)
cv |
a dataframe of crossover information |
a histogram
Ruidong Li
data(crossover) hapiCVDistance(cv=crossover)
data(crossover) hapiCVDistance(cv=crossover)
Visualization of crossover map
hapiCVMap(cv, chr = hg19, step = 5, gap = gap.hg19, x.limits = 6, y.breaks = NULL, y.labels = NULL)
hapiCVMap(cv, chr = hg19, step = 5, gap = gap.hg19, x.limits = 6, y.breaks = NULL, y.labels = NULL)
cv |
a dataframe of crossover information |
chr |
a dataframe of chromosome information, including length and centromeric regions |
step |
a numeric value of the genomic interval in Mb.
Default is 5 |
gap |
a dataframe of unassembled regions in which the first column is the
chromosome, the second column is the start position, and the third column is the
end position of the gap. Default is the gap for hg19.
If no gap region is provided, use gap = NULL |
x.limits |
a numeric value of limits on x axis |
y.breaks |
a vector of positions to show labels on y axis.
Default is NULL |
y.labels |
a vector of labels on the y axis. Default is NULL |
a plot of crossover map on all the chromosomes
Ruidong Li
data(crossover) hapiCVMap(cv=crossover)
data(crossover) hapiCVMap(cv=crossover)
Histogram of crossover resolution
hapiCVResolution(cv)
hapiCVResolution(cv)
cv |
a dataframe of crossover information |
a histogram
Ruidong Li
data(crossover) hapiCVResolution(cv=crossover)
data(crossover) hapiCVResolution(cv=crossover)
Filter out hetSNPs with potential genotyping errors
hapiFilterError(gmt, hmm = NULL)
hapiFilterError(gmt, hmm = NULL)
gmt |
a dataframe of genotype data of gamete cells |
hmm |
a list containing probabilities of a HMM. Default is NULL |
a dataframe of genotype data of gamete cells
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx <- sort(sample(seq_len(500), 10, replace = FALSE)) gmt[idx,1] <- 1 gmtDa <- hapiFilterError(gmt = gmt)
ref <- rep(0,500) alt <- rep(1,500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx <- sort(sample(seq_len(500), 10, replace = FALSE)) gmt[idx,1] <- 1 gmtDa <- hapiFilterError(gmt = gmt)
Selection of hetSNPs to form a framework
hapiFrameSelection(gmt, n = 3)
hapiFrameSelection(gmt, n = 3)
gmt |
a dataframe of genotype data of gamete cells |
n |
a numeric value of the minimum number of gametes with observed genotypes at a locus |
a dataframe of genotype data of gamete cells
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx <- sort(sample(seq_len(500), 10, replace = FALSE)) gmt[idx,1] <- NA gmt[idx,2] <- NA gmt[idx,3] <- NA gmtFrame <- hapiFrameSelection(gmt = gmt, n = 3)
ref <- rep(0,500) alt <- rep(1,500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx <- sort(sample(seq_len(500), 10, replace = FALSE)) gmt[idx,1] <- NA gmt[idx,2] <- NA gmt[idx,3] <- NA gmtFrame <- hapiFrameSelection(gmt = gmt, n = 3)
Visualization of haplotypes in a single gamete cell
hapiGameteView(hap, chr = hg19, hap.color = c("deepskyblue2", "darkorange2"), centromere.fill = "black", x.breaks = NULL, x.labels = NULL, y.breaks = NULL, y.labels = NULL)
hapiGameteView(hap, chr = hg19, hap.color = c("deepskyblue2", "darkorange2"), centromere.fill = "black", x.breaks = NULL, x.labels = NULL, y.breaks = NULL, y.labels = NULL)
hap |
a dataframe of all the phased hetSNPs in all chromosomes |
chr |
a dataframe of chromosome information, including length and centromeric regions |
hap.color |
a vector of colors for the two haplotypes.
Default is c("deepskyblue2", "darkorange2") |
centromere.fill |
a character of the color for the centromeres.
Default is "black" |
x.breaks |
a vector of positions to show labels on x axis.
Default is NULL |
x.labels |
a vector of labels on the x axis.
Default is NULL |
y.breaks |
a vector of positions to show labels on y axis.
Default is NULL |
y.labels |
a vector of labels on the y axis.
Default is NULL |
a plot of haplotypes in a single gamete cell
Ruidong Li
data(gamete11) hapiGameteView(hap=gamete11)
data(gamete11) hapiGameteView(hap=gamete11)
Identify crossovers in gamete cells
hapiIdentifyCV(hap, gmt, hmm = NULL)
hapiIdentifyCV(hap, gmt, hmm = NULL)
hap |
a dataframe of the two haplotypes |
gmt |
a dataframe of genotype data of gamete cells |
hmm |
a list containing probabilities of a HMM. Default is NULL |
a dataframe containing crossover information in each gamete cell
Ruidong Li
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) hap <- data.frame(hap1=ref, hap2=alt, stringsAsFactors = FALSE) rownames(hap) <- seq_len(500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) cvOutput <- hapiIdentifyCV(hap=hap, gmt=gmt)
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) hap <- data.frame(hap1=ref, hap2=alt, stringsAsFactors = FALSE) rownames(hap) <- seq_len(500) gmt <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) cvOutput <- hapiIdentifyCV(hap=hap, gmt=gmt)
Imputation of missing genotypes in the framework
hapiImupte(gmt, nSPT = 2, allowNA = 0)
hapiImupte(gmt, nSPT = 2, allowNA = 0)
gmt |
a dataframe of genotype data of gamete cells in the framework |
nSPT |
a numeric value of the minimum number of supports for an imputation |
allowNA |
a numeric value of the maximum number of gametes with NA at a locus |
a dataframe of imputed genotypes in the framework
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) gmtFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtFrame[idx1,1] <- NA gmtFrame[idx2,2] <- NA gmtFrame[idx3,3] <- NA imputedFrame <- hapiImupte(gmtFrame, nSPT=2, allowNA=0)
ref <- rep(0,500) alt <- rep(1,500) gmtFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) idx1 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx2 <- sort(sample(seq_len(500), 30, replace = FALSE)) idx3 <- sort(sample(seq_len(500), 30, replace = FALSE)) gmtFrame[idx1,1] <- NA gmtFrame[idx2,2] <- NA gmtFrame[idx3,3] <- NA imputedFrame <- hapiImupte(gmtFrame, nSPT=2, allowNA=0)
Phase draft haplotypes by majority voting
hapiPhase(gmt)
hapiPhase(gmt)
gmt |
a dataframe of imputed genotype data of gamete cells |
a dataframe of inferred draft haplotypes
Ruidong Li
ref <- rep(0,500) alt <- rep(1,500) imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(gmt=imputedFrame)
ref <- rep(0,500) alt <- rep(1,500) imputedFrame <- data.frame(gmt1=ref, gmt2=alt, gmt3=ref, gmt4=ref, gmt5=c(alt[1:250], ref[251:500]), stringsAsFactors = FALSE) draftHap <- hapiPhase(gmt=imputedFrame)
Convert numeric (0/1) coded genotype to base (A/T/C/G) coded
num2base(hap, ref, alt)
num2base(hap, ref, alt)
hap |
a dataframe of consensus haplotypes |
ref |
a character representing the reference allele |
alt |
a character representing the alternative allele |
a dataframe containing converted haplotypes
Ruidong Li
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) consensusHap <- data.frame(hap1=rep(0,500),hap2=rep(1,500), total=rep(5,500),rate=rep(1,500), confidence=rep('F',500), stringsAsFactors = FALSE) rownames(consensusHap) <- seq_len(500) hap <- num2base(hap=consensusHap, ref=ref, alt=alt)
ref <- sample(c('A','T'),500, replace=TRUE) alt <- sample(c('C','G'),500, replace=TRUE) consensusHap <- data.frame(hap1=rep(0,500),hap2=rep(1,500), total=rep(5,500),rate=rep(1,500), confidence=rep('F',500), stringsAsFactors = FALSE) rownames(consensusHap) <- seq_len(500) hap <- num2base(hap=consensusHap, ref=ref, alt=alt)