Generating LD Reference Panel

Generating LD Reference Panel#

Description#

Our LD reference panel is generated from ADSP GCAD non-Hispanic white samples. Missing variants are mean imputed before correlations are calculated between variants. The cyvcf2 package is used to calculate dosage while applying a minor allele frequency threshold of 0.05%, a minor allele count threshold of 5, and a missingness threshold of 5%.

Input#

.vcf.bgz and .vcf.bgz.tbi files. Each file is specific to a chromosome and variant range (ex: gcad.qc.r4.wgs.36361.GATK.2022.08.15.biallelic.genotypes.chr9:10030-36899727.NHWextracted.MAF01.vcf.bgz).

Output#

cor.xz files containing LD pearson product-moment correlation coefficients.
cor.xz.bim file containing the list of variants in each cor.xz file.

Minimal Working Example Steps#

We start off by loading all of the requisite libraries

from cyvcf2 import VCF
import numpy as np
from math import nan
import argparse
import xz
from os import listdir

There are multiple VCFs per chromosome, so we group them into a Python dictionary.

# vcf_files for each chromosome
vcf_files = {}
for chrm in range(1, 22 + 1):
    base = "/restricted/projectnb/xqtl/R4_QC_NHWonly_rm_monomorphic/"
    file_start = "gcad.qc.r4.wgs.36361.GATK.2022.08.15.biallelic.genotypes."
    vcf_files["chr%i" % chrm] = [x for x in listdir(base) if 
                                 (x.endswith(".bgz")) and (x.startswith(file_start + "chr" + str(chrm) + ":") or
                                                           x.startswith(file_start + "chr" + str(chrm) + "."))]

When calculating the correlations between different variants we do mean imputation of the missing variants

# replaces NaN values in matrix with means of their rows
def fill_missing_with_row_means(data):
    # Calculate means of rows ignoring NaNs
    row_means = np.nanmean(data, axis=1)
    # Find indices where NaN values are
    inds = np.where(np.isnan(data))
    # Replace NaNs with the mean of the respective row
    data[inds] = np.take(row_means, inds[0])
    return data

Dosages are then calculated using the cyvcf2 package. We use a minor allele frequency threshold of 0.05%, minor allele count threshold of 5, and a missingness threshold of 5% to filter variants

# gets dosages (which pass filter criteria) from VCF file object
def get_dosages(vcf_obj, maf_min=0.0005, mac_min=5, msng_min=0.05):
    arr = []
    var_names = []
    for var in vcf_obj:
        # do not include multi-allelic variants
        if len(var.ALT) != 1:
            continue
        dosage = [sum(x[0:2]) for x in [[nan if x1 == -1 else x1 for x1 in x0] for x0 in var.genotypes]]
        # ignore if no variation exists
        if np.nanvar(dosage) == 0:
            continue
        # returns allele counts for the reference (first val) and alternative
        # (second val) alleles
        counts = [np.nansum([2 - x for x in dosage]), np.nansum(dosage)]
        nan_count = np.sum(np.isnan(dosage))
        num_samp_non_na = len(dosage) - nan_count
        mac = min(counts)
        maf = mac / num_samp_non_na
        msng_rate = nan_count / (num_samp_non_na + nan_count)
        # remove variants which don't match our criteria
        if (maf < maf_min) or (mac < mac_min) or (msng_rate > msng_min):
            continue
        arr.append(dosage)
        var_names.append(var.CHROM + ":" + "_".join([str(var.POS), var.REF, var.ALT[0]]))

    if len(var_names) != 0:
        return fill_missing_with_row_means(np.array(arr)), var_names
    # return empty (but 2D) array for empty values
    else:
        return np.array([[]]), var_names

We then create a function to get the dosages from a specific range (which is useful for the LD blocks)

# returns output of dosages on a per LD block basis
def get_dosages_range(vcf_obj, chrm, start, end):
    vcf_qry_str = chrm + ":" + str(start) + "-" + str(end)
    return get_dosages(vcf_obj(vcf_qry_str))


def flatten(xss):
    return [x for xs in xss for x in xs]

We then load the LD block locations into memory

!head /restricted/projectnb/casa/oaolayin/ROSMAP_NIA_geno/EUR_LD_blocks.bed 

chr1 	 16103 	 2888443
chr1 	 2888443 	 4320284
chr1 	 4320284 	 5853833
chr1 	 5853833 	 7110219
chr1 	 7110219 	 9473386
chr1 	 9473386 	 11328222
chr1 	 11328222 	 12710318
chr1 	 12710318 	 15244493
chr1 	 15244493 	 17351816
chr1 	 17351816 	 20110062

ld_block_file = "/restricted/projectnb/casa/oaolayin/ROSMAP_NIA_geno/EUR_LD_blocks.bed"
ld_blocks = []
with open(ld_block_file) as f:
    for line in f:
        elems = line.split()
        elems[-1] = elems[-1].strip()
        ld_blocks.append(elems)

We now calculate the correlation matrices for each of the blocks

for block in ld_blocks:
    chrm, start, end = block
    out = "/restricted/projectnb/xqtl/R4_cor_out/%s/%s_%s_%s.cor.xz" % (chrm, chrm, start, end)
    # aggregate dosages from different files
    dosages = [get_dosages_range(VCF(v), chrm, start, end) for v
               in vcf_files[chrm]]
    # get the variants
    variants = flatten([x[1] for x in dosages])
    dosage = np.concatenate([x[0] for x in dosages if len(x[1]) > 0], axis=0)
    dosages = None
    # get the cor
    cor = np.triu(np.corrcoef(dosage))
    dosage = None
    # write output files
    with open(out + ".bim", "w+") as f:
        for var in variants:
            chrm = var.split(":")[0]
            pos, ref, alt = var.replace(chrm + ":", "").split("_")
            elems = [chrm.replace("chr", ""), var, "0", pos, alt.strip(), ref]
            f.write("\t".join(elems))
            f.write("\n")
    with xz.open(out, "w+", preset=9) as f:
        for r in range(cor.shape[0]):
            f.write(" ".join(["{:.6f}".format(x) for x in cor[r, :]]).encode())
            f.write(b"\n")

Finally, the following R function can help generate meta-data table for all the files produced,

library(dplyr)
library(stringr)
generate_ld_meta_file = function(ld_meta_file_path){
    ld_meta_file <- list.files(ld_meta_file_path, pattern = "\\.bim$") %>%
    data.frame(path = .) %>%
    mutate(
      chrom = str_extract(path, "^[^_]+"),
      start_end = str_extract(path, "_[0-9]+_[0-9]+"),
      start = as.numeric(str_extract(start_end, "[0-9]+")),
      end = as.numeric(str_extract(start_end, "[0-9]+$")),
      path = paste0(basename(ld_meta_file_path), "/",chrom, "_", start, "_", end, ".cor.xz", 
                  ",", basename(ld_meta_file_path), "/",chrom, "_", start, "_", end, ".cor.xz.bim")
    )%>%select(chrom,start,end,path)
    return(ld_meta_file)
}

Then we generate ld_meta_file_chr file for each chromosome

ADSP_LD_file_path = "/mnt/vast/hpc/csg/data_public/20240120_ADSP_LD_matrix/"
for (i in 1:22){
    ADSP_LD_chrom_file_path = paste0(ADSP_LD_file_path,"/chr",i,sep="")
    ld_chrom_meta_file = generate_ld_meta_file(ADSP_LD_chrom_file_path)
    write.table(ld_chrom_meta_file,paste0(ADSP_LD_chrom_file_path,"/ld_meta_file_chr",i,".tsv",sep=""),sep="\t",col.names=TRUE,row.names = FALSE,quote=FALSE)
}

Troubleshooting#

Step	Substep	Problem	Possible Reason	Solution