Predictions
The Adhesome is a set of proteins whose interactions form a complex network underlying cell adhesion, migration, and cytoskeletal organization [1][2]. We use machine learning to predict whether a gene is part of the Adhesome based on omics features.
The IPython notebook that performs the predictions is below (and on GitHub).
We computed results for two definitions of the positive class: strictly intrinsic adhesome components, and the full adhesome including associated components.
import os, sys, csv, gzip
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy import io
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 ## Embed fonts as Type 42 (TrueType) rather than Type 3 in PDF output
rcParams['font.sans-serif'] = 'Arial'
%matplotlib inline
from plots import COLORS10, enlarge_tick_fontsize
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.cross_validation import (StratifiedKFold, cross_val_score)
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble
from sklearn.pipeline import Pipeline
import xgboost as xgb
1. Prepare the gene/feature attribute matrices and target class for supervised machine learning
Load known Adhesome components from file
df_labels = pd.read_csv('../data/components.csv')
print df_labels.shape
df_labels.head()
The files below were downloaded from Harmonizome and provide the gene features used to predict whether a gene is an adhesome component.
There are two types of datasets: continuous and binary. The continuous datasets were standardized.
filenames = [
'InterPro_gene_attribute_matrix.txt.gz',
'ENCODE-TF_gene_attribute_matrix.txt.gz',
'CCLE-GE_gene_attribute_matrix_standardized.txt.gz',
'Allen-adult-brain_gene_attribute_matrix_standardized.txt.gz',
'Allen-dev-brain_gene_attribute_matrix_standardized.txt.gz'
]
basenames = [f.split('_')[0] for f in filenames]
are_continues = ['_standardized' in f for f in filenames]
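For reference, here is a minimal sketch of what "standardized" means for the continuous datasets, assuming per-attribute z-scoring (the files ship pre-standardized from Harmonizome, so this function is purely illustrative and not used below; X stands for a hypothetical dense gene-by-attribute array):
def zscore_columns(X):
    '''Scale each attribute (column) to zero mean and unit variance.'''
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0 # leave constant columns unscaled instead of dividing by zero
    return (X - mu) / sigma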
def read_gene_attribute_matrix(fn):
'''
Reads a gzipped file downloaded from Harmonizome into a pandas.DataFrame,
with GeneSym as index.
'''
with gzip.open(fn) as f:
reader = csv.reader(f, delimiter='\t')
header = reader.next()
header[0] = 'GeneSym'
        # Skip lines 2 and 3, which hold dataset metadata
        reader.next()
        reader.next()
        # Drop the second and third columns, which are metadata for GeneSym
        header[1:3] = []
i = 0
df = []
        if '_standardized' in fn: # continuous
for row in reader:
row[1:3] = []
row[1:] = map(float, row[1:])
df.append(dict(zip(header, row)))
i += 1
if i % 2000 == 0:
print i
df = pd.DataFrame().from_records(df)[header]
df.set_index('GeneSym', inplace=True, verify_integrity=True)
else: # convert values to int and make sparse df
for row in reader:
row[1:3] = []
row[1:] = map(lambda x: int(float(x)), row[1:])
d = {h: v for h, v in zip(header, row) if v != 0}
df.append(d)
i += 1
if i % 2000 == 0:
print i
df = pd.DataFrame().from_records(df)[header]\
.set_index('GeneSym', verify_integrity=True)\
.to_sparse(fill_value=0)
return df
## Parse data files downloaded from Harmonizome
dfs = []
for fn in filenames:
df = read_gene_attribute_matrix('../data/%s' % fn)
print fn, df.shape
dfs.append(df)
## INNER JOIN all feature dfs
df_joined = reduce(lambda a, b: pd.merge(a, b, left_index=True, right_index=True, how='inner'),
dfs)
print df_joined.shape
## Create y
RNG = 2016
y = np.in1d(df_joined.index, df_labels['Official Symbol']).astype(np.int8)
print y.sum()
## Make CV
cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=RNG)
## Export y
df_labels = pd.DataFrame({'y': y, 'GeneSym': df_joined.index}).set_index('GeneSym')
print df_labels.shape
df_labels.to_csv('../data/Adhesomes.csv')
df_labels.head()
## Keep only the genes shared across all 5 datasets
dfs = [df.ix[df_joined.index] for df in dfs]
## Export processed matrices in dfs
feature_names = {} # To store the original feature names of each dataset
for i, (basename, df) in enumerate(zip(basenames, dfs)):
    feature_names[basename] = df.columns.tolist()
    if are_continues[i]:
        np.save('../data/%s_shared' % basename, df.values)
    else: # sparse matrix
        io.mmwrite('../data/%s_shared.mtx' % basename, sp.csr_matrix(df.values))
del df_joined
del dfs, df
2. Load preprocessed matrices and perform classifications
# Load from matrices files generated above
Xs = []
for i, basename in enumerate(basenames):
if are_continues[i]:
Xs.append(np.load('../data/%s_shared.npy' % basename))
else:
Xs.append(io.mmread('../data/%s_shared.mtx' % basename).tocsr())
df_labels = pd.read_csv('../data/Adhesomes.csv')
y = df_labels['y'].values
ratio = float(np.sum(y == 0)) / np.sum(y == 1) # passed to XGBoost as scale_pos_weight to offset the class imbalance
print 'Number of known adhesome components: %d' % y.sum()
print 'Ratio of negative labels over positive labels: %.4f' % ratio
Perform dimensionality reduction using TruncatedSVD (60 components) on each of the matrices. Unlike PCA, TruncatedSVD does not center the data, so it operates directly on the sparse binary matrices without densifying them.
all_loadings = [] # collect loading matrices from SVD
for i, basename in enumerate(basenames):
svd = TruncatedSVD(n_components=60, random_state=RNG)
Xs[i] = svd.fit_transform(Xs[i])
all_loadings.append(svd.components_)
X_combined = np.hstack(Xs)
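As a quick, hypothetical sanity check (not part of the original run), one can refit an SVD on one of the saved matrices and report how much variance the 60 components retain:
svd_check = TruncatedSVD(n_components=60, random_state=RNG)
svd_check.fit(io.mmread('../data/InterPro_shared.mtx').tocsr())
print 'InterPro: %.1f%% of variance retained' % (100 * svd_check.explained_variance_ratio_.sum())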
## Helper functions for evaluating classifiers
def cross_val_predictions(est, X, y, cv):
'''to get out-of-sample predictions and scores'''
y_preds = np.zeros(y.shape)
y_probas = np.zeros(y.shape)
    for train_idx, valid_idx in cv:
        est.fit(X[train_idx], y[train_idx])
y_preds[valid_idx] = est.predict(X[valid_idx])
y_probas[valid_idx] = est.predict_proba(X[valid_idx])[:,1]
return y_preds, y_probas
def plot_roc(ests, Xs, y, cv, ax, colors=None, labels=None):
    '''Plot out-of-fold ROC curves for one or more (estimator, feature matrix) pairs.'''
    total = len(ests) if isinstance(ests, list) else len(Xs)
    if labels is None:
        labels = ['classifier %d' % i for i in range(total)]
    if colors is None:
        colors = COLORS10
    # Broadcast a single estimator or a single feature matrix across all pairs
    if isinstance(ests, list) and not isinstance(Xs, list):
        Xs = [Xs] * total
    elif not isinstance(ests, list) and isinstance(Xs, list):
        ests = [ests] * total
    for i in range(total):
        X, est, label, color = Xs[i], ests[i], labels[i], colors[i]
y_preds, y_probas = cross_val_predictions(est, X, y, cv)
fpr, tpr, _ = roc_curve(y, y_probas)
score = auc(fpr, tpr)
ax.plot(fpr, tpr, label=label + ' (AUC=%.3f)' % score, color=color, lw=2)
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
enlarge_tick_fontsize(ax, 12)
ax.legend(loc='lower right')
return
Estimate the number of boosting rounds using early stopping on a single feature matrix (InterPro): xgb.cv stops when the cross-validated AUC has not improved for 50 rounds.
dtrain = xgb.DMatrix(Xs[0], label=y)
param = {
'max_depth':10, 'eta':0.05, 'silent':1, 'objective':'binary:logistic',
'subsample': 0.4, 'colsample_bytree': 0.6,
'min_child_weight': 50,
'scale_pos_weight': ratio,
'nthread': 6
}
num_round = 5000
scores = xgb.cv(param, dtrain, num_round,
folds=cv,
early_stopping_rounds=50,
metrics='auc', seed=RNG,
verbose_eval=10)
print scores.tail()
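The best number of rounds can then be read off the returned table: assuming this xgboost version truncates the cv history at the best iteration (as its documentation describes), the row count is where the n_estimators values of the classifiers below come from.
print 'best number of boosting rounds: %d' % len(scores)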
Estimate the number of rounds of boosting using early stopping for the combined feature matrix.
dtrain = xgb.DMatrix(X_combined, label=y)
param = {
'max_depth':12, 'eta':0.05, 'silent':1, 'objective':'binary:logistic',
'subsample': 0.4, 'colsample_bytree': 0.4,
'min_child_weight': 50,
'scale_pos_weight': ratio,
'nthread': 6
}
num_round = 5000
scores = xgb.cv(param, dtrain, num_round,
folds=cv,
early_stopping_rounds=50,
metrics='auc', seed=RNG,
verbose_eval=10)
print scores.tail()
Plot the ROC curves to evaluate the predictive performance of the GBM classifiers
Xs.append(X_combined)
basenames.append('Combined')
# GBM classifiers with n_estimators set to the best rounds found by early stopping above
xgbc = xgb.XGBClassifier(seed=RNG, n_estimators=39, learning_rate=0.05,
max_depth=10, colsample_bytree=0.6, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=6, silent=True, scale_pos_weight=ratio)
xgbc2 = xgb.XGBClassifier(seed=RNG, n_estimators=159, learning_rate=0.05,
max_depth=12, colsample_bytree=0.4, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=6, silent=True, scale_pos_weight=ratio)
clfs = [xgbc] * 5 + [xgbc2]
fig, ax = plt.subplots(figsize=(6,6))
plot_roc(clfs, Xs, y, cv, ax, colors=COLORS10, labels=basenames)
3. Apply the GBM classifier to all the datasets to generate predictions
RNG = 20160628
xgbc2 = xgb.XGBClassifier(seed=RNG, n_estimators=160, learning_rate=0.05,
max_depth=12, colsample_bytree=0.4, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=8, silent=True, scale_pos_weight=ratio)
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=RNG)
## Get out-of-fold predictions (each gene is scored by a model that never saw it during training)
y_preds, y_probas = cross_val_predictions(xgbc2, X_combined, y, cv)
## Get predictions from a model fit on the full training data, for comparison
xgbc2.fit(X_combined, y)
y_probas_on_train = xgbc2.predict_proba(X_combined)[:, 1]
df_labels['OOF_preds'] = y_preds
df_labels['OOF_probas'] = y_probas
df_labels['train_probas'] = y_probas_on_train
df_labels.head()
df_labels.to_csv('../XGB160_on_combined_predictions.csv')
4. Interpret the classifier by feature importance
## Get the feature_importances_ from the fitted GBM
feature_importances = xgbc2.feature_importances_
print feature_importances.shape
## Count the number of original features
n_features = sum(map(len, feature_names.values()))
print 'There are %d features across these datasets used for the prediction' % n_features
Map feature importances on the SVD components back to the original feature space by taking, for each dataset, the dot product between its component-importance vector (1 × 60) and its loading matrix V = svd.components_ (60 × n_features):
all_feature_names = []
all_feature_fis = []
datasets = []
for i, basename in enumerate(basenames[:-1]):
fi = feature_importances[i*60:(i+1)*60]
loadings = all_loadings[i]
original_feature_fis = np.dot(fi, loadings)
original_feature_names = feature_names[basename]
datasets.extend([basename] * len(original_feature_names))
all_feature_fis.extend( original_feature_fis.tolist() )
all_feature_names.extend( original_feature_names )
# Create a DataFrame of feature importances
df_feature_importances = pd.DataFrame({
'dataset': datasets,
'feature': all_feature_names,
'feature_importance': all_feature_fis})
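As a quick (hypothetical) consistency check, the mapped table should contain every original feature exactly once:
assert len(df_feature_importances) == n_features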
Examine the most predictive features for adhesome components:
df_feature_importances.sort('feature_importance', ascending=False).head(20)
Examine the most predictive features in ENCODE-TF dataset for adhesome components:
df_feature_importances.query('dataset == "ENCODE-TF"').sort('feature_importance', ascending=False).head(5)
Write the full feature importance table to a file and provide a link to this file.
df_feature_importances.to_csv('feature_importances.csv')
from IPython.display import FileLink
FileLink('feature_importances.csv')