Generating LD Reference Panel#
Description#
Our LD reference panel is generated from ADSP GCAD non-Hispanic white samples. Missing variants are mean imputed before correlations are calculated between variants. The cyvcf2 package is used to calculate dosage while applying a minor allele frequency threshold of 0.05%, a minor allele count threshold of 5, and a missingness threshold of 5%.
Input#
.vcf.bgz
and.vcf.bgz.tbi
files. Each file is specific to a chromosome and variant range (ex:gcad.qc.r4.wgs.36361.GATK.2022.08.15.biallelic.genotypes.chr9:10030-36899727.NHWextracted.MAF01.vcf.bgz
).
Output#
cor.xz
files containing LD pearson product-moment correlation coefficients.cor.xz.bim
file containing the list of variants in eachcor.xz
file.
Minimal Working Example Steps#
We start off by loading all of the requisite libraries
from cyvcf2 import VCF
import numpy as np
from math import nan
import argparse
import xz
from os import listdir
There are multiple VCFs per chromosome, so we group them into a Python dictionary.
# vcf_files for each chromosome
vcf_files = {}
for chrm in range(1, 22 + 1):
base = "/restricted/projectnb/xqtl/R4_QC_NHWonly_rm_monomorphic/"
file_start = "gcad.qc.r4.wgs.36361.GATK.2022.08.15.biallelic.genotypes."
vcf_files["chr%i" % chrm] = [x for x in listdir(base) if
(x.endswith(".bgz")) and (x.startswith(file_start + "chr" + str(chrm) + ":") or
x.startswith(file_start + "chr" + str(chrm) + "."))]
When calculating the correlations between different variants we do mean imputation of the missing variants
# replaces NaN values in matrix with means of their rows
def fill_missing_with_row_means(data):
# Calculate means of rows ignoring NaNs
row_means = np.nanmean(data, axis=1)
# Find indices where NaN values are
inds = np.where(np.isnan(data))
# Replace NaNs with the mean of the respective row
data[inds] = np.take(row_means, inds[0])
return data
Dosages are then calculated using the cyvcf2 package. We use a minor allele frequency threshold of 0.05%, minor allele count threshold of 5, and a missingness threshold of 5% to filter variants
# gets dosages (which pass filter criteria) from VCF file object
def get_dosages(vcf_obj, maf_min=0.0005, mac_min=5, msng_min=0.05):
arr = []
var_names = []
for var in vcf_obj:
# do not include multi-allelic variants
if len(var.ALT) != 1:
continue
dosage = [sum(x[0:2]) for x in [[nan if x1 == -1 else x1 for x1 in x0] for x0 in var.genotypes]]
# ignore if no variation exists
if np.nanvar(dosage) == 0:
continue
# returns allele counts for the reference (first val) and alternative
# (second val) alleles
counts = [np.nansum([2 - x for x in dosage]), np.nansum(dosage)]
nan_count = np.sum(np.isnan(dosage))
num_samp_non_na = len(dosage) - nan_count
mac = min(counts)
maf = mac / num_samp_non_na
msng_rate = nan_count / (num_samp_non_na + nan_count)
# remove variants which don't match our criteria
if (maf < maf_min) or (mac < mac_min) or (msng_rate > msng_min):
continue
arr.append(dosage)
var_names.append(var.CHROM + ":" + "_".join([str(var.POS), var.REF, var.ALT[0]]))
if len(var_names) != 0:
return fill_missing_with_row_means(np.array(arr)), var_names
# return empty (but 2D) array for empty values
else:
return np.array([[]]), var_names
We then create a function to get the dosages from a specific range (which is useful for the LD blocks)
# returns output of dosages on a per LD block basis
def get_dosages_range(vcf_obj, chrm, start, end):
vcf_qry_str = chrm + ":" + str(start) + "-" + str(end)
return get_dosages(vcf_obj(vcf_qry_str))
def flatten(xss):
return [x for xs in xss for x in xs]
We then load the LD block locations into memory
!head /restricted/projectnb/casa/oaolayin/ROSMAP_NIA_geno/EUR_LD_blocks.bed
chr1 16103 2888443
chr1 2888443 4320284
chr1 4320284 5853833
chr1 5853833 7110219
chr1 7110219 9473386
chr1 9473386 11328222
chr1 11328222 12710318
chr1 12710318 15244493
chr1 15244493 17351816
chr1 17351816 20110062
ld_block_file = "/restricted/projectnb/casa/oaolayin/ROSMAP_NIA_geno/EUR_LD_blocks.bed"
ld_blocks = []
with open(ld_block_file) as f:
for line in f:
elems = line.split()
elems[-1] = elems[-1].strip()
ld_blocks.append(elems)
We now calculate the correlation matrices for each of the blocks
for block in ld_blocks:
chrm, start, end = block
out = "/restricted/projectnb/xqtl/R4_cor_out/%s/%s_%s_%s.cor.xz" % (chrm, chrm, start, end)
# aggregate dosages from different files
dosages = [get_dosages_range(VCF(v), chrm, start, end) for v
in vcf_files[chrm]]
# get the variants
variants = flatten([x[1] for x in dosages])
dosage = np.concatenate([x[0] for x in dosages if len(x[1]) > 0], axis=0)
dosages = None
# get the cor
cor = np.triu(np.corrcoef(dosage))
dosage = None
# write output files
with open(out + ".bim", "w+") as f:
for var in variants:
chrm = var.split(":")[0]
pos, ref, alt = var.replace(chrm + ":", "").split("_")
elems = [chrm.replace("chr", ""), var, "0", pos, alt.strip(), ref]
f.write("\t".join(elems))
f.write("\n")
with xz.open(out, "w+", preset=9) as f:
for r in range(cor.shape[0]):
f.write(" ".join(["{:.6f}".format(x) for x in cor[r, :]]).encode())
f.write(b"\n")
Finally, the following R function can help generate meta-data table for all the files produced,
library(dplyr)
library(stringr)
generate_ld_meta_file = function(ld_meta_file_path){
ld_meta_file <- list.files(ld_meta_file_path, pattern = "\\.bim$") %>%
data.frame(path = .) %>%2222
mutate(
chrom = str_extract(path, "^[^_]+"),
start_end = str_extract(path, "_[0-9]+_[0-9]+"),
start = as.numeric(str_extract(start_end, "[0-9]+")),
end = as.numeric(str_extract(start_end, "[0-9]+$")),
path = paste0(basename(ld_meta_file_path), "/",chrom, "_", start, "_", end, ".cor.xz",
",", basename(ld_meta_file_path), "/",chrom, "_", start, "_", end, ".cor.xz.bim")
)%>%select(chrom,start,end,path)
return(ld_meta_file)
}
Then we generate ld_meta_file_chr file for each chromosome
ADSP_LD_file_path = "/mnt/vast/hpc/csg/data_public/20240120_ADSP_LD_matrix/"
for (i in 1:22){
ADSP_LD_chrom_file_path = paste0(ADSP_LD_file_path,"/chr",i,sep="")
ld_chrom_meta_file = generate_ld_meta_file(ADSP_LD_chrom_file_path)
write.table(ld_chrom_meta_file,paste0(ADSP_LD_chrom_file_path,"/ld_meta_file_chr",i,".tsv",sep=""),sep="\t",col.names=TRUE,row.names = FALSE,quote=FALSE)
}
Troubleshooting#
Step |
Substep |
Problem |
Possible Reason |
Solution |
---|---|---|---|---|