Enrichment Analysis of Phenotypic Classes for Wnt and Ras signatures

In this Notebook, I extract the Ras and Wnt transcriptomic signatures and search for them in each phenotypic class, performing an enrichment analysis to figure out if they are present.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

import txtome as tx

rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})

%matplotlib inline

# This enables SVG graphics inline. 
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")

mpl.rcParams['xtick.labelsize'] = 16 
mpl.rcParams['ytick.labelsize'] = 16 
mpl.rcParams['legend.fontsize'] = 14
In [2]:
q = 0.1
quant  = pd.read_csv('../output/SI1_dpy_22_results.csv')
quant = tx.fc_transcriptome(df=quant)

filtered = pd.read_csv('../input/filtered.csv')
Dropped 0 rows with NaNs in the b column
In [3]:
def hypergeometric_phenotype_test(genotype, q=0.1):
    """
    Perform a hypergeometric enrichment test to ascertain whether a given
    phenotypic class is enriched in transcripts that are differentially
    expressed in another `genotype`, given in fancy form.

    Params:
    -------
    genotype: str. An entry in the `fancy` column of the quant.df DataFrame
    q: float. Significance threshold for differential expression

    Output:
    -------
    d: DataFrame containing the log10(p-value) for each comparison 
    """
    cond = (quant.df.qval < q)
    
    # number of genes in the genome:
    lenGen = quant.df.ens_gene.nunique()
    # number of genes DE in the genotype of interest
    lenBar = quant.df[cond &
                      (quant.df.fancy == genotype)].ens_gene.nunique()
    
    d = {}
    # go through each phenotypic class:
    for phenotype, group in quant.df[cond].groupby('phenotypic class'):
        # get the number of genes in the current class
        lenPhen = group.ens_gene.nunique()

        # find only the genes in the current class and DE in the genotype of
        # interest
        tmp = group[(group.fancy == genotype)]
        lenIntersect = tmp.ens_gene.nunique()
        
        # M is the total number of balls in the urn,
        # n is the total number of green balls in the urn,
        # N is the number of balls picked
        # X is the number of green balls drawn
        [M, n, N] = [lenGen, lenPhen, lenBar]
        X = lenIntersect
        pval = stats.hypergeom.sf(X, M, n, N)
        d[phenotype] = -np.log10(pval)
    
    # place into dataframe
    d = pd.DataFrame.from_dict(d, orient="index")
    # turn the index (phenotypes) into a column
    d = d.reset_index()
    # rename columns
    d.columns = ['Phenotypic Class', 'logpval']
    d.sort_values('logpval', inplace=True)
    # add a genotype column to enable dataframe concatenation
    d['genotype'] = genotype
    return d

def internalization(genotype, q=0.1):
    """
    Calculate the internalization fraction between a given phenotypic class
    and the differentially expressed genes in a given `genotype`.
    
    Params:
    -------
    genotype: str. An entry in the `fancy` column of the quant.df DataFrame
    q: float. Significance threshold for differential expression

    Output:
    -------
    d: DataFrame with the log10(p-values) of each comparison
    """
    # find the genes that fit both conditions:
    lenGenotype = quant.select_sample(genotype,
                                      col='fancy').ens_gene.nunique()
        
    d = {}
    # go through each phenotypic class:
    for phenotype, group in quant.subset_sig().groupby('phenotypic class'):
        # get the number of genes in the current class
        lenPhen = group.ens_gene.nunique()

        # find the genes in the current class and DE in `genotype`
        tmp = group[(group.fancy == genotype)]
        # calculate the length of the intersection
        lenIntersect = tmp.ens_gene.nunique()
        
        # figure out which one is smaller:
        denominator = np.min([lenPhen, lenGenotype])
        
        # calculate the internalization factor
        d[phenotype] = lenIntersect/denominator
    
    # place into dataframe
    d = pd.DataFrame.from_dict(d, orient="index")
    # turn the index (phenotypes) into a column
    d = d.reset_index()
    # rename columns
    d.columns = ['Phenotypic Class', 'internalization']
    d.sort_values('internalization', inplace=True)
    # add a genotype column to enable dataframe concatenation
    d['genotype'] = genotype
    return d
In [4]:
# call hypergeometric_phenotype_test using list comprehension (pythonic)
# then concatenate the resulting list of dataframes into a single df
pvals = pd.concat([hypergeometric_phenotype_test(genotype)
                   for genotype in
                   ['bar-1(ga80)', 'let-60(n2021)', 'let-60(n1046)']])

# sort the dataframe:
pvals.sort_values(['genotype','logpval'], inplace=True)

# barplot
sns.barplot(y='Phenotypic Class', x='logpval', hue='genotype',
            data=pvals)

# significance line:
plt.axvline(10, ls='--', lw=2, color='k')

# fancify the labels
plt.gca().set_yticklabels(['\emph{dpy-22(bx93)}-specific',
                           '\emph{dpy-22(bx93)}-associated',
                           '\emph{dpy-22(sy622)}-specific',
                           '\emph{trans}-heterozygote-specific',
                           '\emph{dpy-22(sy622)}-associated'])

# set xlabel and suppress ylabel:
plt.xlabel(r'$\log_{10}{p}$')
plt.ylabel('')

# fix legend title size:
leg = plt.legend()
leg.set_title(title='Genotype', prop={"size": 16})

# save:
plt.savefig('../output/stp_pvals.svg', bbox_inches='tight')
In [5]:
# do the same for the internalization:
intern = pd.concat([internalization(genotype) for genotype in
                   ['bar-1(ga80)', 'let-60(n2021)', 'let-60(n1046)']])

# plot:
sns.barplot(y='Phenotypic Class', x='internalization', hue='genotype',
            data=intern)
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10517d9e8>