Source code for genal.tools

import os, subprocess
import pandas as pd
import json
import wget
import shutil
import tarfile

from .constants import REF_PANELS, REF_PANELS_URL

# Per-user genal directory: it holds the persisted configuration file and,
# by default, the folder where reference files are stored.
_genal_home = os.path.expanduser("~/.genal/")

# JSON file storing the user-specific paths (plink 1.9, liftover, genetic data, reference folder).
config_path = os.path.join(_genal_home, "config.json")

# Default folder used to store the reference files.
default_ref_path = os.path.join(_genal_home, "Reference_files")


def default_config():
    """
    Build the default configuration values.

    Returns:
        dict: A dictionary with a single "paths" section. The plink 1.9, liftover
        and genetic data paths are empty strings, and "ref_path" is set to the
        default reference folder (``default_ref_path``).
    """
    return {
        "paths": {
            "plink19_path": "",
            "liftover_path": "",
            "geno_path": "",
            "ref_path": default_ref_path,
        }
    }
def read_config():
    """
    Load the configuration stored in the config file.

    Returns:
        The data decoded from the JSON file located at ``config_path``.
    """
    with open(config_path, "r") as config_file:
        return json.load(config_file)
def write_config(config):
    """
    Save configuration data to the config file.

    Parameters:
        config: The JSON-serializable data to store. It replaces the previous
            content of the file located at ``config_path``.

    Returns:
        None
    """
    with open(config_path, "w") as config_file:
        json.dump(config, config_file, indent=4)
def setup_genetic_path(path):
    """
    Configure the genetic data path based on user input and saved configuration.

    Parameters:
        path (str or None): Path to the bed/bim/fam files, with or without file
            extension. A single '$' sign can be used as a placeholder for the
            chromosome number if the data is split by chromosomes. If None, the
            path saved in the config file is used.

    Raises:
        TypeError: If no path is provided and none is saved in the config file,
            if the path contains more than one '$' sign, or if the path does not
            lead to valid bed/bim/fam files.

    Returns:
        str: The path saved in the config file when `path` is None, otherwise the
        validated path without its file extension (which is also saved to the
        config file).
    """
    config = read_config()

    if path is None:
        # The default config stores an empty string for "geno_path", so an empty
        # value has to be treated the same way as a missing key.
        saved_path = config["paths"].get("geno_path")
        if not saved_path:
            raise TypeError("No path has been saved in the config file. Please provide one.")
        print(f"Using path saved in config file: {saved_path}")
        return saved_path

    # Ensure correct path format
    if path.count("$") > 1:
        raise TypeError(
            "The path should contain at most 1 '$' sign. Use it to indicate the chromosome number if the data is split by chromosomes."
        )

    # Check if the path is valid
    path = os.path.splitext(path)[0]  # Remove file extension
    if "$" in path:
        # Data split by chromosomes: valid as soon as one of chromosomes 1 to 22 is found
        is_valid = any(
            check_bfiles(path.replace("$", str(chrom))) for chrom in range(1, 23)
        )
    else:
        is_valid = check_bfiles(path)
    if not is_valid:
        raise TypeError("The path does not lead to valid bed/bim/fam files.")

    # Save path to config.json file
    config["paths"]["geno_path"] = path
    write_config(config)

    return path
def create_tmp():
    """
    Create the 'tmp_GENAL' folder in the current working directory if not present.

    Raises:
        OSError: If the directory cannot be created (for instance because of
            missing permissions, or because a regular file named 'tmp_GENAL'
            already exists).
    """
    try:
        # exist_ok avoids the race between checking for the folder and creating it
        os.makedirs("tmp_GENAL", exist_ok=True)
    except OSError as err:
        raise OSError(
            "Unable to create the 'tmp_GENAL' directory. Check permissions."
        ) from err
def set_reference_folder(path=""):
    """
    Set a folder path to store reference data.

    This function allows users to specify a directory where reference data will be stored.
    If the directory doesn't exist, it will be created. If no path is provided, the default
    reference folder (``default_ref_path``, i.e. 'Reference_files' inside the '~/.genal'
    directory) will be used.

    Parameters:
        path (str, optional): The desired directory path for storing reference data.
            Defaults to the default reference folder.

    Raises:
        OSError: If the directory cannot be created.

    Returns:
        None: The function prints messages to inform the user of the status and any errors.
        If the directory is not readable or not writable, an error message is printed
        and the configuration is left unchanged.
    """
    # If no path is provided, fall back to the default reference folder
    if not path:
        path = default_ref_path
        print(f"No path provided, defaulting to {default_ref_path}.")

    # If the directory doesn't exist, attempt to create it
    if not os.path.isdir(path):
        try:
            # exist_ok tolerates the directory being created concurrently
            os.makedirs(path, exist_ok=True)
        except OSError as err:
            raise OSError(
                f"Unable to create the '{path}' directory. Check permissions."
            ) from err
        print(f"Creating the '{path}' directory.")

    # Check if the directory is readable
    if not os.access(path, os.R_OK):
        print(f"Error: The directory '{path}' is not readable.")
        return

    # Check if the directory is writable
    if not os.access(path, os.W_OK):
        print(f"Error: The directory '{path}' is not writable.")
        return

    # Update the configuration with the new path
    config = read_config()
    config["paths"]["ref_path"] = path
    write_config(config)

    print(f"Reference files will be downloaded and stored in: '{path}'")
def get_reference_panel_path(reference_panel="eur"):
    """
    Retrieve the path of the specified reference panel.

    This function checks if the provided reference panel is a valid path to bed/bim/fam files.
    If not, it checks if the reference panel exists in the reference folder. If it doesn't exist,
    the function attempts to download it.

    Parameters:
        reference_panel (str, optional): The name of the reference panel or a path to
            bed/bim/fam files. Defaults to "eur".

    Raises:
        ValueError: If the provided reference panel is not recognized.
        OSError: If there's an issue creating the reference folder.
        FileNotFoundError: If the reference panel is not found and cannot be downloaded.

    Returns:
        str: The path to the reference panel (without file extension).
    """
    # Remove file extension and check if it's a path to a bed/bim/fam triple
    reference_panel = os.path.splitext(reference_panel)[0]
    if check_bfiles(reference_panel):
        print("Using the provided path as the reference panel.")
        return reference_panel

    # If it's not a valid path, check if the reference panel is recognized
    reference_panel = reference_panel.lower()
    config = read_config()
    if reference_panel not in REF_PANELS:
        raise ValueError(
            f"The reference_panel argument can only take values in {REF_PANELS} or be a valid path to bed/bim/fam files."
        )

    ref_path = config["paths"]["ref_path"]

    # Create the reference path if it doesn't exist
    if not os.path.exists(ref_path):
        try:
            # exist_ok tolerates the directory being created concurrently
            os.makedirs(ref_path, exist_ok=True)
        except OSError as err:
            raise OSError(
                f"Unable to create the '{ref_path}' directory. Check permissions."
            ) from err

    ref_panel_name = reference_panel.upper()
    ref_panel_path = os.path.join(ref_path, ref_panel_name)

    if check_bfiles(ref_panel_path):
        print(f"Using the {ref_panel_name} (build 37) reference panel.")
        return ref_panel_path

    # The reference panel files don't exist: attempt to download them
    print(
        f"The {reference_panel.capitalize()} (build 37) reference panel was not found. Attempting to download it..."
    )
    print(
        "If you have already downloaded it, or wish to use your own reference panel, use genal.set_reference_folder(path) to avoid downloading again."
    )
    archive_path = os.path.join(ref_path, "reference_panels.tgz")
    try:
        wget.download(REF_PANELS_URL, out=archive_path)
    except Exception as e:
        print(f"Download unsuccessful: {e}")
        print(
            "Manually download the reference file and use genal.set_reference_folder(path)."
        )
        raise FileNotFoundError(f"Reference panel {reference_panel} not found.") from e

    print("Download successful. Decompressing...")
    with tarfile.open(archive_path, "r:gz") as tar_ref:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter rejects
            # members that would be written outside of ref_path.
            tar_ref.extractall(ref_path, filter="data")
        else:
            # Python versions without extraction filters
            tar_ref.extractall(ref_path)

    return ref_panel_path
def load_reference_panel(reference_panel="eur"):
    """
    Load the bim file from the reference panel specified.

    Parameters:
        reference_panel (str, optional): Either a path to a .bim file (with or without
            file extension) or the name of a reference panel. Defaults to "eur".

    Raises:
        ValueError: If `reference_panel` is "multi" (not implemented yet).

    Returns:
        pandas.DataFrame: The content of the bim file with columns
        CHR, SNP, F, POS, A1 and A2. If the first CHR value starts with 'chr',
        the prefix is removed and the column is converted to integers.
    """
    # Check if it's a path to a .bim file
    reference_panel = os.path.splitext(reference_panel)[0]
    if os.path.exists(reference_panel + ".bim"):
        ref_panel_path = reference_panel
        print("Using the provided bim file as the reference dataset.")
    # Else, check if it's one of the reference datasets names and get the path
    else:
        reference_panel = reference_panel.lower()
        if reference_panel == "multi":
            raise ValueError("Multi reference dataset not implemented yet.")
        ref_panel_path = get_reference_panel_path(reference_panel)

    # Load it and return it
    reference_panel_df = pd.read_csv(
        ref_panel_path + ".bim", sep="\t", names=["CHR", "SNP", "F", "POS", "A1", "A2"]
    )

    # Only the first row is inspected to detect the 'chr' prefix; the emptiness
    # check and positional access keep this from failing on a table without rows.
    has_chr_prefix = (
        not reference_panel_df.empty
        and str(reference_panel_df["CHR"].iloc[0]).startswith("chr")
    )
    if has_chr_prefix:
        # Remove the 'chr' prefix, then convert to int
        reference_panel_df["CHR"] = (
            reference_panel_df["CHR"]
            .astype(str)
            .str.replace("^chr", "", regex=True)
            .astype(int)
        )

    return reference_panel_df
def get_plink19_path():
    """
    Fetch the plink 1.9 path stored in the config file.

    Raises:
        ValueError: If the plink 1.9 path stored in the config file is empty.

    Returns:
        The value saved under the "plink19_path" key of the "paths" section.
    """
    plink_path = read_config()["paths"]["plink19_path"]
    if plink_path:
        return plink_path
    raise ValueError(
        "The path to plink 1.9 has not been set yet. Use set_plink(path_to_plink) first."
    )
def check_bfiles(filepath):
    """
    Tell whether `filepath` points to a complete bed/bim/fam triple.

    Parameters:
        filepath: The common path of the three files, without file extension.

    Returns:
        bool: True if the .bed, .bim and .fam files all exist, False otherwise.
    """
    return all(
        os.path.exists(f"{filepath}.{extension}")
        for extension in ("bed", "bim", "fam")
    )
def delete_tmp():
    """
    Remove the 'tmp_GENAL' folder located in the current working directory.

    A message is printed to tell whether the folder was deleted or whether
    there was no such folder to delete.
    """
    if not os.path.isdir("tmp_GENAL"):
        print("There is no tmp_GENAL folder to delete in the current directory.")
        return
    shutil.rmtree("tmp_GENAL")
    print("The tmp_GENAL folder has been successfully deleted.")