In this Notebook, I extract the Ras and Wnt transcriptomic signatures and search for them in each phenotypic class, performing an enrichment analysis to figure out if they are present.
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
import txtome as tx
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
%matplotlib inline
# This enables SVG graphics inline.
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['legend.fontsize'] = 14
q = 0.1
quant = pd.read_csv('../output/SI1_dpy_22_results.csv')
quant = tx.fc_transcriptome(df=quant)
filtered = pd.read_csv('../input/filtered.csv')
def hypergeometric_phenotype_test(genotype, q=0.1):
"""
Perform a hypergeometric enrichment test to ascertain whether a given
phenotypic class is enriched in transcripts that are differentially
expressed in another `genotype`, given in fancy form.
Params:
-------
genotype: str. An entry in the `fancy` column of the quant.df DataFrame
q: float. Significance threshold for differential expression
Output:
-------
d: DataFrame containing the log10(p-value) for each comparison
"""
cond = (quant.df.qval < q)
# number of genes in the genome:
lenGen = quant.df.ens_gene.nunique()
# number of genes DE in the genotype of interest
lenBar = quant.df[cond &
(quant.df.fancy == genotype)].ens_gene.nunique()
d = {}
# go through each phenotypic class:
for phenotype, group in quant.df[cond].groupby('phenotypic class'):
# get the number of genes in the current class
lenPhen = group.ens_gene.nunique()
# find only the genes in the current class and DE in the genotype of
# interest
tmp = group[(group.fancy == genotype)]
lenIntersect = tmp.ens_gene.nunique()
# M is the total number of balls in the urn,
# n is the total number of green balls in the urn,
# N is the number of balls picked
# X is the number of green balls drawn
[M, n, N] = [lenGen, lenPhen, lenBar]
X = lenIntersect
pval = stats.hypergeom.sf(X, M, n, N)
d[phenotype] = -np.log10(pval)
# place into dataframe
d = pd.DataFrame.from_dict(d, orient="index")
# turn the index (phenotypes) into a column
d = d.reset_index()
# rename columns
d.columns = ['Phenotypic Class', 'logpval']
d.sort_values('logpval', inplace=True)
# add a genotype column to enable dataframe concatenation
d['genotype'] = genotype
return d
def internalization(genotype, q=0.1):
"""
Calculate the internalization fraction between a given phenotypic class
and the differentially expressed genes in a given `genotype`.
Params:
-------
genotype: str. An entry in the `fancy` column of the quant.df DataFrame
q: float. Significance threshold for differential expression
Output:
-------
d: DataFrame with the log10(p-values) of each comparison
"""
# find the genes that fit both conditions:
lenGenotype = quant.select_sample(genotype,
col='fancy').ens_gene.nunique()
d = {}
# go through each phenotypic class:
for phenotype, group in quant.subset_sig().groupby('phenotypic class'):
# get the number of genes in the current class
lenPhen = group.ens_gene.nunique()
# find the genes in the current class and DE in `genotype`
tmp = group[(group.fancy == genotype)]
# calculate the length of the intersection
lenIntersect = tmp.ens_gene.nunique()
# figure out which one is smaller:
denominator = np.min([lenPhen, lenGenotype])
# calculate the internalization factor
d[phenotype] = lenIntersect/denominator
# place into dataframe
d = pd.DataFrame.from_dict(d, orient="index")
# turn the index (phenotypes) into a column
d = d.reset_index()
# rename columns
d.columns = ['Phenotypic Class', 'internalization']
d.sort_values('internalization', inplace=True)
# add a genotype column to enable dataframe concatenation
d['genotype'] = genotype
return d
# call hypergeometric_phenotype_test using list comprehension (pythonic)
# then concatenate the resulting list of dataframes into a single df
pvals = pd.concat([hypergeometric_phenotype_test(genotype)
for genotype in
['bar-1(ga80)', 'let-60(n2021)', 'let-60(n1046)']])
# sort the dataframe:
pvals.sort_values(['genotype','logpval'], inplace=True)
# barplot
sns.barplot(y='Phenotypic Class', x='logpval', hue='genotype',
data=pvals)
# significance line:
plt.axvline(10, ls='--', lw=2, color='k')
# fancify the labels
plt.gca().set_yticklabels(['\emph{dpy-22(bx93)}-specific',
'\emph{dpy-22(bx93)}-associated',
'\emph{dpy-22(sy622)}-specific',
'\emph{trans}-heterozygote-specific',
'\emph{dpy-22(sy622)}-associated'])
# set xlabel and suppress ylabel:
plt.xlabel(r'$\log_{10}{p}$')
plt.ylabel('')
# fix legend title size:
leg = plt.legend()
leg.set_title(title='Genotype', prop={"size": 16})
# save:
plt.savefig('../output/stp_pvals.svg', bbox_inches='tight')
# do the same for the internalization:
intern = pd.concat([internalization(genotype) for genotype in
['bar-1(ga80)', 'let-60(n2021)', 'let-60(n1046)']])
# plot:
sns.barplot(y='Phenotypic Class', x='internalization', hue='genotype',
data=intern)