Source code for querynator.__main__

"""Command line interface querynator."""
import json
import logging
import os
import random
import shutil
from enum import Enum

import click
import vcf
from vcf.parser import _Info as VcfInfo
from vcf.parser import field_counts as vcf_field_counts

import querynator
from querynator.helper_functions import gunzip_compressed_files, gzipped
from querynator.query_api import query_cgi, query_civic, vcf_file
from querynator.report_scripts import (
    add_tiers_and_scores_to_df,
    combine_cgi,
    combine_cgi_civic,
    combine_civic,
    create_report_htmls,
)

# Create logger
logger = logging.getLogger("Querynator")
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)
logger.propagate = False


[docs]class EnumType(click.Choice): """ This is a class for a click.Choice of type EnumType """ def __init__(self, enum, case_sensitive=False): self.__enum = enum super().__init__(choices=[item.value for item in enum], case_sensitive=case_sensitive)
[docs] def convert(self, value, param, ctx): converted_str = super().convert(value, param, ctx) return self.__enum(converted_str)
[docs]def make_enum(values): """ Function to create an EnumType from a dict :param values: json/dict like object with {key: value} pairs :type values: dict :return: enumeration :rtype: Enum """ _k = _v = None class CancerType(Enum): nonlocal _k, _v for _k, _v in values.items(): locals()[_k] = _v return CancerType
[docs]def Cancer(): """ Function to create instance of click.Choice EnumType with cancer types\n source: https://www.cancergenomeinterpreter.org/js/cancertypes.js :return: Enumeration of cancer types :rtype: click.Choice EnumType """ directory_path = os.path.dirname(os.path.abspath(__file__)) new_path = os.path.join(directory_path, "query_api/cancertypes.js") with open(new_path) as dataFile: data = dataFile.read() obj = data[data.find(" {") : data.rfind("};") + 1] jsonObj = json.loads(obj) Cancer_enum = make_enum(jsonObj) return Cancer_enum
[docs]def filter_vcf_by_vep(vcf_path, logger): """ Function to filter given vcf to remove synonymous and low impact variants based on VEP annotation :param vcf_path: Variant Call Format (VCF) file (Version 4.2) :type vcf_path: str :return: list of lists of pyVCF3 records (input file, removed, filtered) :rtype: list """ if not vcf_file(vcf_path): logger.error("Can only filter variants in vcf files.") exit(1) if gzipped(vcf_path): vcf_path = gunzip_compressed_files(vcf_path, logger) # read vcf file in pyVCF in_vcf = vcf.Reader(open(vcf_path)) # creates dictionary with VEP info names as keys and index in list as columns # Name must be VEPs default "CSQ" if "CSQ" in in_vcf.infos: logger.info("Filtering vcf file") vep_dict = {name: pos for pos, name in enumerate(in_vcf.infos["CSQ"].desc.split(":")[1].strip().split("|"))} """ Exemplary for nf-core/sarek (https://nf-co.re/sarek) output {'Allele': 0, 'Consequence': 1, 'IMPACT': 2, 'SYMBOL': 3, 'Gene': 4, 'Feature_type': 5, 'Feature': 6, 'BIOTYPE': 7, 'EXON': 8, 'INTRON': 9, 'HGVSc': 10, 'HGVSp': 11, 'cDNA_position': 12, 'CDS_position': 13, 'Protein_position': 14, 'Amino_acids': 15, 'Codons': 16, 'Existing_variation': 17, 'DISTANCE': 18, 'STRAND': 19, 'FLAGS': 20, 'VARIANT_CLASS': 21, 'SYMBOL_SOURCE': 22, 'HGNC_ID': 23, 'CANONICAL': 24, 'MANE_SELECT': 25, 'MANE_PLUS_CLINICAL': 26, 'TSL': 27, 'APPRIS': 28, 'CCDS': 29, 'ENSP': 30, 'SWISSPROT': 31, 'TREMBL': 32, 'UNIPARC': 33, 'UNIPROT_ISOFORM': 34, 'GENE_PHENO': 35, 'SIFT': 36, 'PolyPhen': 37, 'DOMAINS': 38, 'miRNA': 39, 'AF': 40, 'AFR_AF': 41, 'AMR_AF': 42, 'EAS_AF': 43, 'EUR_AF': 44, 'SAS_AF': 45, 'AA_AF': 46, 'EA_AF': 47, 'gnomAD_AF': 48, 'gnomAD_AFR_AF': 49, 'gnomAD_AMR_AF': 50, 'gnomAD_ASJ_AF': 51, 'gnomAD_EAS_AF': 52, 'gnomAD_FIN_AF': 53, 'gnomAD_NFE_AF': 54, 'gnomAD_OTH_AF': 55, 'gnomAD_SAS_AF': 56, 'MAX_AF': 57, 'MAX_AF_POPS': 58, 'FREQS': 59, 'CLIN_SIG': 60, 'SOMATIC': 61, 'PHENO': 62, 'PUBMED': 63, 'MOTIF_NAME': 64, 'MOTIF_POS': 65, 'HIGH_INF_POS': 66, 'MOTIF_SCORE_CHANGE': 67, 'TRANSCRIPTION_FACTORS': 68} """ to_remove = [] to_keep = [] for record in in_vcf: if len(record.INFO["CSQ"]) > 1: # filter out low impact & synonymous variants from variants with multiple vep annotations if all( info_list[vep_dict["IMPACT"]] == "LOW" and info_list[vep_dict["Consequence"]] == "synonymous_variant" for info_list in [i.split("|") for i in record.INFO["CSQ"]] ): to_remove.append(record) else: to_keep.append(record) else: # filter out low impact & synonymous variants from variants with single vep annotations if ( record.INFO["CSQ"][0].split("|")[vep_dict["IMPACT"]] == "LOW" and record.INFO["CSQ"][0].split("|")[vep_dict["Consequence"]] == "synonymous_variant" ): to_remove.append(record) else: to_keep.append(record) return [in_vcf, to_keep, to_remove] else: logger.error("vcf file does not include required VEP INFO fields (key must be default 'CSQ')") exit(1)
[docs]def write_vcf(vcf_template, vcf_record_list, out_name): """ Function to write a vcf file from list of pyvcf3 records to result directory :param vcf_header: pysam header object from input vcf :type vcf_header: pysam header object :param vcf_record_list: list of pysam records :type vcf_record_list: list :param out_name: name for the created vcf file :type out_name: str :return: None :rtype: None """ # add querynator_id info to header vcf_template.infos["QID"] = VcfInfo("QID", ".", "String", "Querynator ID", ".", ".", ".") writer = vcf.Writer(open(f"{out_name}", "w"), vcf_template, lineterminator="\n") for record in vcf_record_list: # add querynator_id to record record.add_info("QID", random.randint(1000000, 9999999)) writer.write_record(record)
[docs]def get_unique_querynator_dir(querynator_output): """ add index if "querynator_results" already exists in user given out dir :param querynator_output: path to store querynator results :type querynator_output: str :return: unique result directory :rtype: str """ filename, extension = os.path.splitext(querynator_output) count = 1 while os.path.exists(querynator_output): querynator_output = filename + f"_{count}" count += 1 return querynator_output
def run_querynator(): print("\n __ ") print(" ____ ___ _____ _______ ______ ____ _/ /_____ _____") print(" / __ `/ / / / _ \/ ___/ / / / __ \/ __ `/ __/ __ \/ ___/") print("/ /_/ / /_/ / __/ / / /_/ / / / / /_/ / /_/ /_/ / /") print("\__, /\__,_/\___/_/ \__, /_/ /_/\__,_/\__/\____/_/") print(" /_/ /____/\n\n") # Launch the click cli querynator_cli() @click.group() @click.version_option(version=querynator.__version__) def querynator_cli(): """ querynator is a command-line tool to query cancer variant databases. It can query web APIs or local instances of the databases """ logger.info("Start") # querynator cgi_api @querynator_cli.command() @click.option( "-m", "--mutations", help="Please provide the path to the PASS filtered variant file (vcf,tsv,gtf,hgvs):\nSee more info here: https://www.cancergenomeinterpreter.org/faq#q22", type=click.Path(exists=True), ) @click.option( "-a", "--cnas", help="Please provide the path to the copy number alterations file:\n", type=click.Path(exists=True), ) @click.option( "-l", "--translocations", help="Please provide the path to the file containing translocations:\n", type=click.Path(exists=True), ) @click.option( "-o", "--outdir", required=True, type=click.STRING, help="i.e. sample name. Directory in which results will be stored.", ) @click.option( "-c", "--cancer", help="Please enter the cancer type to be searched. You must use quotation marks.", type=EnumType(Cancer()), show_default=False, ) @click.option( "-g", "--genome", type=click.Choice(["hg19", "GRCh37", "hg38", "GRCh38"], case_sensitive=True), help="Please enter the genome version", required=True, default="hg38", ) @click.option( "-t", "--token", help="Please provide your token for CGI database", required=True, type=click.STRING, default=None ) @click.option( "-e", "--email", help="Please provide your user email address for CGI", required=False, type=click.STRING, default=None, ) @click.option( "-f", "--filter_vep", help="If set, filters out synoymous and low impact variants based on VEP annotation", is_flag=True, show_default=True, default=False, ) def query_api_cgi(mutations, cnas, translocations, cancer, genome, token, email, outdir, filter_vep): if mutations is None and cnas is None and translocations is None: raise click.UsageError( "No input file provided. Please provide at least one of [mutations/cnas/translocations] as input." ) try: result_dir = get_unique_querynator_dir(f"{outdir}") dirname, basename = os.path.split(result_dir) original_input = {"mutations": mutations, "translocations": translocations, "cnas": cnas} # filter vcf file if required if mutations is not None and filter_vep: in_vcf_header, candidate_variants, removed_variants = filter_vcf_by_vep(mutations, logger) # create result directories os.makedirs(f"{result_dir}/vcf_files") write_vcf(in_vcf_header, removed_variants, f"{result_dir}/vcf_files/{basename}.removed_variants.vcf") write_vcf(in_vcf_header, candidate_variants, f"{result_dir}/vcf_files/{basename}.filtered_variants.vcf") # create and set new input file for cgi query mutations = f"{result_dir}/vcf_files/{basename}.filtered_variants.vcf" logger.info("Query the cancergenomeinterpreter (CGI)") headers = {"Authorization": email + " " + token} # run analysis query_cgi( mutations, cnas, translocations, genome, cancer, headers, logger, result_dir, original_input, filter_vep ) # move downloaded results to result dir if dirname != "": if not filter_vep: os.makedirs(result_dir) shutil.move(f"{dirname}/{basename}.cgi_results", result_dir) shutil.move(f"{dirname}/{basename}.cgi_results.zip", result_dir) else: if not filter_vep: os.makedirs(result_dir) shutil.move(f"{basename}.cgi_results", result_dir) shutil.move(f"{basename}.cgi_results.zip", result_dir) except FileNotFoundError: print("Cannot find file on disk. Please try another path.") # querynator civic_api @querynator_cli.command() @click.option( "-v", "--vcf", help="Please provide the path to a Variant Call Format (VCF) file (Version 4.2)", required=True, type=click.Path(exists=True), ) @click.option( "-o", "--outdir", required=True, type=click.STRING, help="i.e. sample name. Directory in which results will be stored.", ) @click.option( "-g", "--genome", type=click.Choice(["GRCh37", "GRCh38", "NCBI36"], case_sensitive=True), help="Please enter the reference genome version", required=True, default="GRCh38", ) @click.option( "-f", "--filter_vep", help="if set, filters out synoymous and low impact variants based on VEP annotation", is_flag=True, show_default=True, default=False, ) def query_api_civic(vcf, outdir, genome, filter_vep): try: result_dir = get_unique_querynator_dir(f"{outdir}") dirname, basename = os.path.split(result_dir) if filter_vep: in_vcf_header, candidate_variants, removed_variants = filter_vcf_by_vep(vcf, logger) # create result directories os.makedirs(f"{result_dir}/vcf_files") write_vcf(in_vcf_header, removed_variants, f"{result_dir}/vcf_files/{basename}.removed_variants.vcf") write_vcf(in_vcf_header, candidate_variants, f"{result_dir}/vcf_files/{basename}.filtered_variants.vcf") logger.info("Query the Clinical Interpretations of Variants In Cancer (CIViC)") # run analysis query_civic(candidate_variants, result_dir, logger, vcf, genome, filter_vep) else: logger.info("Query the Clinical Interpretations of Variants In Cancer (CIViC)") query_civic(vcf, result_dir, logger, vcf, genome, filter_vep) except FileNotFoundError: print("The provided file cannot be found. Please try another path.") # querynator create report @querynator_cli.command() @click.option( "-c", "--cgi_path", help="Path to a CGI result folder generated using the querynator", required=True, type=click.Path(exists=True), ) @click.option( "-j", "--civic_path", help="Path to a CIViC result folder generated using the querynator", required=True, type=click.Path(exists=True), ) @click.option( "-o", "--outdir", required=True, type=click.STRING, help="Name of new directory in which reports will be stored.", ) def create_report(cgi_path, civic_path, outdir): # create outdir report_dir = get_unique_querynator_dir(outdir) dirname, basename = os.path.split(report_dir) os.makedirs(f"{report_dir}/combined_files") os.makedirs(f"{report_dir}/report/variant_reports") os.makedirs(f"{report_dir}/report/plots") # combine the results combine_civic(civic_path, report_dir, logger) combine_cgi(cgi_path, report_dir, logger) combine_cgi_civic(report_dir, logger) # add tiers & ranking-score to merged results add_tiers_and_scores_to_df(report_dir, logger) # create report create_report_htmls(report_dir, basename, civic_path, logger) if __name__ == "__main__": run_querynator()