Source code for genrisk.utils

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from qmplot import qqplot
from tqdm import tqdm
from pybiomart import Dataset
from textwrap import wrap


def gene_length_normalize(*, genes_info, genes_col='HGNC symbol', length_col='gene_length', scores_df, samples_col):
    """
    Normalize dataset by gene length. if gene lengths file is not provided, info will be retrieved from ensembl.

    Parameters
    ----------
    genes_info : str
        file containing gene lengths. If file is not provided, info will be retrieved from ensembl
    genes_col : str
        column containing gene names.
    length_col : str
        column containing the length of each gene.
    scores_df : pd.DataFrame
        dataframe containing data to normalize.
    samples_col : str
        column containing samples IDs.

    Returns
    -------
    pd.Dataframe
        dataframe containing normalized dataframe.

    """
    unnormalized = []
    if not genes_info:
        dataset = Dataset(name='hsapiens_gene_ensembl',
                          host='http://www.ensembl.org')
        genes_df = dataset.query(attributes=['hgnc_symbol', 'start_position', 'end_position'])
        genes_df['gene_length'] = genes_df['Gene end (bp)'] - genes_df['Gene start (bp)']
    else:
        genes_df = pd.read_csv(genes_info, sep='\t')
    genes_lengths = genes_df.set_index(genes_col).to_dict()[length_col]
    for (name, data) in tqdm(scores_df.drop(columns=[samples_col]).iteritems(), desc="Normalizing genes scores"):
        if name not in genes_lengths.keys():
            unnormalized.append(name)
            continue
        # normalize genes by length
        scores_df[name] = round(scores_df[name] / genes_lengths[name], 5)
    scores_df = scores_df.drop(unnormalized, axis=1)
    return scores_df



[docs]
def draw_manhattan(*, data, chr_col, pos_col, pvals_col, genes_col, manhattan_output):
    """
    Generate  manhattan plot from a given dataset.

    Parameters
    ----------
    data : pd.DataFrame
        a dataframe with pvalues and gene information.
    chr_col : str
        the column with the chromosomes.
    pos_col : str
        the column containing the position/start.
    pvals_col : str
        the column containing the position/start.
    genes_col : str
        the column containing gene names.
    manhattan_output : str
        the path to output the manhattan plot image.

    Returns
    -------
    Manhattan plot

    """
    data.drop_duplicates(subset=[genes_col], inplace=True)
    data.dropna(subset=[pvals_col], inplace=True)
    data['-logp'] = - np.log10(data[pvals_col])
    data = data.dropna(subset=[genes_col, chr_col, pos_col], axis=0)
    data[chr_col].replace('X', 23, inplace=True)
    data[chr_col] = data[chr_col].astype('int64')
    data = data.sort_values([chr_col, pos_col])
    data.reset_index(inplace=True, drop=True)
    data['ind'] = data.index
    df_grouped = data.groupby((chr_col))
    fig = plt.figure(figsize=(14, 8))  # Set the figure size
    ax = fig.add_subplot(111)
    colors = ['black', 'grey']
    for num, (name, group) in enumerate(df_grouped):
        group.plot(kind='scatter', x='ind', y='-logp', color=colors[num % len(colors)], ax=ax)
    ax.set_ylim(0.0, max(data["-logp"]) + 1)

    for i in data[data['-logp'] > 5.6].index:
        ax.annotate(xy=(data.ind[i], data['-logp'][i]),
                    xytext=(data.ind[i] + 0.2, data['-logp'][i] + 0.2),
                    text=data[genes_col][i],
                    arrowprops=dict(arrowstyle="->", color='b', lw=0.5),
                    horizontalalignment='left', size='medium', rotation=20, color='black')
    # adjust_text(anno, arrowprops=dict(arrowstyle="->", color='b', lw=0.5))
    chrom_df = data.groupby(chr_col)['ind'].median()
    data['color group'] = data[chr_col].apply(lambda x: 'A' if x % 2 == 0 else 'B')
    fig.suptitle(manhattan_output.split('.')[0])
    ax.set_ylim(0.0, max(data["-logp"]) + 1)
    ax.set_xlabel(chr_col)
    ax.set_xticks(chrom_df)
    ax.set_xticklabels(chrom_df.index)
    ax.axhline(-np.log10(1.00e-05), c='blue', ls='--')
    ax.axhline(-np.log10(5.00e-08), c='red', ls='--')
    fig.savefig(manhattan_output)
    return ax




[docs]
def draw_qqplot(*, pvals, qq_output):
    """
    Generate QQ-plot for given data.

    Parameters
    ----------
    pvals : pd.Series
        the list of p_values.
    qq_output : str
        the path to output the QQplot image.

    Returns
    -------
    QQPlot

    """
    pvals.dropna(inplace=True)
    f, ax = plt.subplots(figsize=(6, 6), facecolor="w", edgecolor="k")
    title = "\n".join(wrap(qq_output.split('.')[0], 50))
    qqplot(data=pvals,
           title=title,
           marker="o",
           xlabel=r"Expected $-log_{10}{(P)}$",
           ylabel=r"Observed $-log_{10}{(P)}$",
           ax=ax)
    plt.savefig(qq_output)
    return ax



def merge_files(*, files_lst, sep, by, cols=None):
    """
    Merge a list of files with the same format into one dataframe.

    Parameters
    ----------
    files_lst
        a list with files to merge.
    sep
        the column seperator.
    by
        the common column between all files.
    cols
        selected columns to remain in dataframe.

    Returns
    -------
    merged dataframe

    """
    df = pd.read_csv(files_lst[0], sep=sep)
    for file in files_lst[1:]:
        df2 = pd.read_csv(file, sep=sep)
        df = pd.merge(df, df2, on=by)
    if cols:
        df = df[[cols]]
    return df