import pandas as pd
import numpy as np
import os
import subprocess
from io import StringIO
from .tools import get_reference_panel_path, get_plink19_path
# TODO: accept lists of CHR/POS instead of SNP names for these functions
def query_outcome_proxy(df, ld, snps_to_extract, snps_df=[]):
"""
Extract the best proxies from a dataframe, as well as specific SNPs.
Given a dataframe `df` (originating from GENO.data) and a dataframe of potential proxies
(output from `find_proxies`), this function extracts the best proxies from `df` as well as
the SNPs specified in `snps_to_extract`.
This is suited for querying outcome data.
Args:
df (pd.DataFrame): Dataframe of SNP information with the usual GENO columns
(SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary.
ld (pd.DataFrame): Dataframe of proxies (output from `find_proxies`).
snps_to_extract (list): List of SNPs to extract in addition to the proxies.
snps_df (list, optional): List of SNPs to choose the proxy from. Should be the list of
SNPs in df. Can be provided to avoid recomputing it. Defaults to an empty list.
Returns:
pd.DataFrame: Dataframe with queried SNPs and their proxies.
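    Example:
        Illustrative sketch only (`outcome_df`, `exposure_snps` and `proxies` are
        hypothetical names; `proxies` is assumed to come from `find_proxies`):

            proxies = find_proxies(exposure_snps, reference_panel="eur")
            outcome = query_outcome_proxy(outcome_df, proxies, exposure_snps)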
"""
# If snps_df is empty, populate it with SNPs from df
if not snps_df:
snps_df = df.SNP.values
# Filter proxies that are present in df
ld = ld[ld.SNP_B.isin(snps_df)]
# Remove original SNPs
ld = ld[ld["SNP_A"] != ld["SNP_B"]]
# Sort by r2 and select the best proxy for each SNP
ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index)
ld = ld.groupby("SNP_A").first().reset_index(drop=False)
# Determine SNPs to query
snps_to_query = set(snps_to_extract) | set(ld.SNP_B.values)
df_queried = df[df.SNP.isin(snps_to_query)]
# Merge dataframes and identify proxies
output = df_queried.merge(ld, how="left", left_on="SNP", right_on="SNP_B")
output["proxy"] = output["SNP_B"].notnull()
# Flip BETA if the proxied SNP alleles are switched in the reference panel
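    # Worked case with hypothetical alleles: if the outcome effect allele is EA="A"
    # while the reference panel phases the proxy as B1="G" / B2="A", the outcome effect
    # refers to the allele in phase with A2, so the sign of BETA is flipped.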
conditions = [
(output["EA"] == output["B2"]),
(output["EA"] == output["B1"]),
(~output["proxy"]),
        # proxied but EA matches neither reference-panel allele (elementwise check)
        (output["EA"] != output["B1"])
        & (output["EA"] != output["B2"])
        & output["proxy"],
]
choices = [
-output["BETA"], # if EA == B2, flip the sign of BETA
output["BETA"], # if EA == B1, BETA does not change
output["BETA"], # if the original SNP was not proxied, BETA does not change
np.nan, # if the original SNP was proxied but EA is neither "B1" nor "B2", BETA is NaN
]
output["BETA"] = np.select(conditions, choices)
# Drop rows with NaN BETA values
nrow = output.shape[0]
output = output.dropna(subset=["BETA"])
if output.shape[0] < nrow:
print(
f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data."
)
print(f"Found proxies for {output['proxy'].sum()} SNPs.")
# Replace original SNPs with their proxy (if proxied)
output["SNP"] = np.where(output["proxy"], output["SNP_A"], output["SNP"])
output["POS"] = np.where(output["proxy"], output["BP_A"], output["POS"])
output["CHR"] = np.where(output["proxy"], output["CHR_A"], output["CHR"])
output["EA"] = np.where(output["proxy"], output["A1"], output["EA"])
output["NEA"] = np.where(output["proxy"], output["A2"], output["NEA"])
if "EAF" in output.columns:
output["EAF"] = np.where(output["proxy"], output["MAF_A"], output["EAF"])
# Drop columns related to ld
output = output.drop(columns=ld.columns)
return output
def apply_proxies(df, ld, searchspace=None):
"""
    Replace the SNPs of a dataframe with their best proxies, when they exist.
    Given a dataframe `df` (originating from GENO.data) and a dataframe of proxies
    (output from `find_proxies`), this function replaces the SNPs in `df` with their
    best proxies, when available.
    This is suited for exposure data (for instance, before running a PRS).
Args:
        df (pd.DataFrame): Dataframe of SNP information with the usual GENO columns
            (SNP, BETA, SE, EAF, EA, NEA). EAF is not necessary.
        ld (pd.DataFrame): Dataframe of proxies (output from `find_proxies`).
        searchspace (list, optional): List of SNPs used to restrict the potential proxies.
            By default, all the proxies found are considered. A searchspace can be applied
            either at the `find_proxies` step or at this step, but it is much faster to
            apply it here.
Returns:
DataFrame: A DataFrame with SNPs replaced by their best proxies, if they exist.
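    Example:
        Illustrative sketch only (`exposure_df`, `missing_snps` and `target_snps` are
        hypothetical names):

            ld = find_proxies(missing_snps, reference_panel="eur", r2=0.8)
            exposure = apply_proxies(exposure_df, ld, searchspace=target_snps)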
"""
# Check mandatory columns
mandatory_cols = ["EA", "SNP", "BETA"]
for col in mandatory_cols:
if col not in df.columns:
raise ValueError(f"The column {col} is not found in the data!")
# Filter by searchspace if provided
if searchspace:
print("Filtering the potential proxies with the searchspace provided.")
ld = ld[ld.SNP_B.isin(searchspace)]
# Remove original SNPs and sort by r2
ld = ld[ld["SNP_A"] != ld["SNP_B"]]
ld = ld.reindex(ld["R"].abs().sort_values(ascending=False).index)
# Select the best proxy for each SNP
ld = ld.groupby("SNP_A").first().reset_index(drop=False)
# Merge the dataframes
output = df.merge(ld, how="left", left_on="SNP", right_on="SNP_A")
output["proxy"] = pd.notnull(output["SNP_B"])
# Flip BETA if the original SNP alleles are switched in the reference panel
conditions = [
output["EA"] == output["A2"],
output["EA"] == output["A1"],
~output["proxy"],
~output["EA"].isin([output["A1"], output["A2"]]) & output["proxy"],
]
choices = [
-output["BETA"], # if EA == A2, flip the sign of BETA
output["BETA"], # if EA == A1, BETA does not change
output["BETA"], # if SNP_A is NaN (The original SNP was not proxied), BETA does not change
np.nan, # if the original SNP was proxied but EA is neither "A1" nor "A2", BETA is NaN
]
output["BETA"] = np.select(conditions, choices)
# Delete SNPs with mismatched alleles
nrow = output.shape[0]
output = output.dropna(subset=["BETA"])
if output.shape[0] < nrow:
print(
f"Deleted {nrow-output.shape[0]} base SNPs that did not have matching alleles in reference data."
)
print(f"Found proxies for {output['proxy'].sum()} missing SNPs.")
# Replace the original SNPs with their proxy (if proxied)
output["SNP"] = np.where(output["proxy"], output["SNP_B"], output["SNP"])
output["EA"] = np.where(output["proxy"], output["B1"], output["EA"])
if "POS" in output.columns:
output["POS"] = np.where(output["proxy"], output["BP_B"], output["POS"])
if "CHR" in output.columns:
output["CHR"] = np.where(output["proxy"], output["CHR_B"], output["CHR"])
if "NEA" in output.columns:
output["NEA"] = np.where(output["proxy"], output["B2"], output["NEA"])
if "EAF" in output.columns:
output["EAF"] = np.where(output["proxy"], output["MAF_B"], output["EAF"])
# Drop ld columns
output = output.drop(columns=ld.columns)
return output
def find_proxies(
snp_list,
searchspace=None,
reference_panel="eur",
kb=5000,
r2=0.6,
window_snps=5000,
threads=1,
):
"""
Given a list of SNPs, return a table of proxies.
Args:
snp_list (list): List of rsids.
searchspace (list, optional): List of SNPs to include in the search. By default, includes the whole reference panel.
reference_panel (str, optional): The reference population to get linkage disequilibrium values and find proxies.
Accepts values: "EUR", "SAS", "AFR", "EAS", "AMR".
Alternatively, provide a path leading to a specific bed/bim/fam reference panel.
kb (int, optional): Width of the genomic window to look for proxies. Defaults to 5000.
r2 (float, optional): Minimum linkage disequilibrium value with the main SNP for a proxy to be included. Defaults to 0.6.
        window_snps (int, optional): Compute the LD value only for SNPs that are at most
            this many SNPs away from the main SNP. Defaults to 5000.
threads (int, optional): Number of threads to use. Defaults to 1.
Returns:
DataFrame: A DataFrame containing the proxies. Only biallelic SNPs are returned.
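    Example:
        Illustrative sketch only (the rsids are placeholders):

            ld = find_proxies(["rs12345", "rs67890"], reference_panel="eur", kb=1000, r2=0.8)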
"""
# Ensure tmp_GENAL directory exists
os.makedirs(f"tmp_GENAL/", exist_ok=True)
# Convert snp_list to numpy array
snp_list = np.array(list(snp_list))
# Check if searchspace is provided
if searchspace is None:
extract_arg = ""
else:
print("Searching proxies in the provided searchspace.")
with open("tmp_GENAL/searchspace.txt", "w") as file:
for s in searchspace + snp_list:
file.write(str(s) + "\n")
extract_arg = "--extract tmp_GENAL/searchspace.txt"
# Save snp_list to a file
np.savetxt("tmp_GENAL/snps_to_proxy.txt", snp_list, fmt="%s", delimiter=" ")
# Construct and execute the plink command
command = f"{get_plink19_path()} --bfile {get_reference_panel_path(reference_panel)} {extract_arg} --keep-allele-order --r in-phase with-freqs gz --ld-snp-list tmp_GENAL/snps_to_proxy.txt --ld-window-kb {kb} --ld-window-r2 {r2} --ld-window {window_snps} --out tmp_GENAL/proxy.targets --threads {threads}"
subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
# Read and process the output
cmd = f"gunzip -c tmp_GENAL/proxy.targets.ld.gz"
unzipped_content = subprocess.check_output(cmd, shell=True).decode("utf-8")
ld = pd.read_csv(StringIO(unzipped_content), sep="\s+")
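    # PHASE holds the in-phase allele pairs of SNP_A and SNP_B as reported by plink,
    # e.g. "AG/CT": A1="A" is in phase with B1="G", and A2="C" with B2="T" (example
    # alleles are illustrative). Biallelic pairs have exactly 4 characters once the
    # "/" separator is removed.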
# Filter out multiallelic SNPs
ld["PHASE"] = ld["PHASE"].str.replace("/", "")
ld = ld[ld["PHASE"].apply(len) == 4]
ld = ld.reset_index(drop=True)
# Split the "PHASE" column into separate characters
temp = pd.DataFrame(
ld["PHASE"].apply(list).to_list(), columns=["A1", "B1", "A2", "B2"]
)
ld = pd.concat([ld, temp], axis=1)
# Convert integer columns to Int64 type
for int_col in ["CHR_A", "CHR_B", "BP_A", "BP_B"]:
ld[int_col] = ld[int_col].astype("Int64")
return ld