Predictions
The Adhesome is a set of proteins whose interactions form a complex network underlying cell adhesion, migration, and cytoskeletal organization [1][2]. We use machine learning to predict whether a gene is part of the Adhesome based on omics features.
The IPython notebook that performs the predictions is below (and on GitHub).
We computed results for two definitions of the positive class: strictly intrinsic adhesome components, and the full adhesome including associated components.
import os, sys, csv, gzip
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy import io
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 ## Embed fonts as Type 42 (TrueType) rather than Type 3 in PDF output
rcParams['font.sans-serif'] = 'Arial'
%matplotlib inline
from plots import COLORS10, enlarge_tick_fontsize
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.cross_validation import (StratifiedKFold, cross_val_score)
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble
from sklearn.pipeline import Pipeline
import xgboost as xgb
1. Prepare the gene/feature attribute matrices and target class for supervised machine learning
Load known Adhesome components from file
df_labels = pd.read_csv('../data/components.csv')
print df_labels.shape
df_labels.head()
The files below were downloaded from Harmonizome and provide the gene features used to predict whether a gene is an adhesome component.
There are two types of datasets: continuous and binary. The continuous datasets were standardized.
filenames = [
'InterPro_gene_attribute_matrix.txt.gz',
'ENCODE-TF_gene_attribute_matrix.txt.gz',
'CCLE-GE_gene_attribute_matrix_standardized.txt.gz',
'Allen-adult-brain_gene_attribute_matrix_standardized.txt.gz',
'Allen-dev-brain_gene_attribute_matrix_standardized.txt.gz'
]
basenames = [f.split('_')[0] for f in filenames]
are_continues = ['_standardized' in f for f in filenames]
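For reference, here is a minimal sketch of what "standardized" means for the continuous datasets, assuming per-attribute z-scoring (the files ship pre-standardized from Harmonizome, so this function is purely illustrative and not used below; X stands for a hypothetical dense gene-by-attribute array):
def zscore_columns(X):
    '''Scale each attribute (column) to zero mean and unit variance.'''
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0 # leave constant columns unscaled instead of dividing by zero
    return (X - mu) / sigma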
def read_gene_attribute_matrix(fn):
'''
Reads a gzipped file downloaded from Harmonizome into a pandas.DataFrame,
with GeneSym as index.
'''
with gzip.open(fn) as f:
reader = csv.reader(f, delimiter='\t')
header = reader.next()
header[0] = 'GeneSym'
        # Skip lines 2 and 3, which hold dataset metadata
        reader.next()
        reader.next()
        # Drop the second and third columns, which are metadata for GeneSym
        header[1:3] = []
i = 0
df = []
        if '_standardized' in fn: # continuous
for row in reader:
row[1:3] = []
row[1:] = map(float, row[1:])
df.append(dict(zip(header, row)))
i += 1
if i % 2000 == 0:
print i
df = pd.DataFrame().from_records(df)[header]
df.set_index('GeneSym', inplace=True, verify_integrity=True)
else: # convert values to int and make sparse df
for row in reader:
row[1:3] = []
row[1:] = map(lambda x: int(float(x)), row[1:])
d = {h: v for h, v in zip(header, row) if v != 0}
df.append(d)
i += 1
if i % 2000 == 0:
print i
df = pd.DataFrame().from_records(df)[header]\
.set_index('GeneSym', verify_integrity=True)\
.to_sparse(fill_value=0)
return df
## Parse data files downloaded from Harmonizome
dfs = []
for fn in filenames:
df = read_gene_attribute_matrix('../data/%s' % fn)
print fn, df.shape
dfs.append(df)
## INNER JOIN all feature dfs
df_joined = reduce(lambda a, b: pd.merge(a, b, left_index=True, right_index=True, how='inner'),
dfs)
print df_joined.shape
## Create y
RNG = 2016
y = np.in1d(df_joined.index, df_labels['Official Symbol']).astype(np.int8)
print y.sum()
## Make CV
cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=RNG)
## Export y
df_labels = pd.DataFrame({'y': y, 'GeneSym': df_joined.index}).set_index('GeneSym')
print df_labels.shape
df_labels.to_csv('../data/Adhesomes.csv')
df_labels.head()
## Keep only the genes shared across all 5 datasets
dfs = [df.ix[df_joined.index] for df in dfs]
## Export processed matrices in dfs
feature_names = {} # To store the original feature names of each dataset
for i, (basename, df) in enumerate(zip(basenames, dfs)):
    feature_names[basename] = df.columns.tolist()
    if are_continues[i]:
        np.save('../data/%s_shared' % basename, df.values)
    else: # sparse matrix
        io.mmwrite('../data/%s_shared.mtx' % basename, sp.csr_matrix(df.values))
del df_joined
del dfs, df
2. Load preprocessed matrices and perform classifications
# Load from matrices files generated above
Xs = []
for i, basename in enumerate(basenames):
if are_continues[i]:
Xs.append(np.load('../data/%s_shared.npy' % basename))
else:
Xs.append(io.mmread('../data/%s_shared.mtx' % basename).tocsr())
df_labels = pd.read_csv('../data/Adhesomes.csv')
y = df_labels['y'].values
ratio = float(np.sum(y == 0)) / np.sum(y == 1) # passed to XGBoost as scale_pos_weight to offset the class imbalance
print 'Number of known adhesome components: %d' % y.sum()
print 'Ratio of negative labels over positive labels: %.4f' % ratio
Perform dimensionality reduction using TruncatedSVD (60 components) on each of the matrices. Unlike PCA, TruncatedSVD does not center the data, so it operates directly on the sparse binary matrices without densifying them.
all_loadings = [] # collect loading matrices from SVD
for i, basename in enumerate(basenames):
svd = TruncatedSVD(n_components=60, random_state=RNG)
Xs[i] = svd.fit_transform(Xs[i])
all_loadings.append(svd.components_)
X_combined = np.hstack(Xs)
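As a quick, hypothetical sanity check (not part of the original run), one can refit an SVD on one of the saved matrices and report how much variance the 60 components retain:
svd_check = TruncatedSVD(n_components=60, random_state=RNG)
svd_check.fit(io.mmread('../data/InterPro_shared.mtx').tocsr())
print 'InterPro: %.1f%% of variance retained' % (100 * svd_check.explained_variance_ratio_.sum())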
## Helper functions for evaluating classifiers
def cross_val_predictions(est, X, y, cv):
'''to get out-of-sample predictions and scores'''
y_preds = np.zeros(y.shape)
y_probas = np.zeros(y.shape)
    for train_idx, valid_idx in cv:
        est.fit(X[train_idx], y[train_idx])
y_preds[valid_idx] = est.predict(X[valid_idx])
y_probas[valid_idx] = est.predict_proba(X[valid_idx])[:,1]
return y_preds, y_probas
def plot_roc(ests, Xs, y, cv, ax, colors=None, labels=None):
    '''Plot out-of-fold ROC curves for one or more (estimator, feature matrix) pairs.'''
    total = len(ests) if isinstance(ests, list) else len(Xs)
    if labels is None:
        labels = ['classifier %d' % i for i in range(total)]
    if colors is None:
        colors = COLORS10
    # Broadcast a single estimator or a single feature matrix across all pairs
    if isinstance(ests, list) and not isinstance(Xs, list):
        Xs = [Xs] * total
    elif not isinstance(ests, list) and isinstance(Xs, list):
        ests = [ests] * total
    for i in range(total):
        X, est, label, color = Xs[i], ests[i], labels[i], colors[i]
y_preds, y_probas = cross_val_predictions(est, X, y, cv)
fpr, tpr, _ = roc_curve(y, y_probas)
score = auc(fpr, tpr)
ax.plot(fpr, tpr, label=label + ' (AUC=%.3f)' % score, color=color, lw=2)
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
enlarge_tick_fontsize(ax, 12)
ax.legend(loc='lower right')
return
Estimate the number of boosting rounds using early stopping on a single feature matrix (InterPro): xgb.cv stops when the cross-validated AUC has not improved for 50 rounds.
dtrain = xgb.DMatrix(Xs[0], label=y)
param = {
'max_depth':10, 'eta':0.05, 'silent':1, 'objective':'binary:logistic',
'subsample': 0.4, 'colsample_bytree': 0.6,
'min_child_weight': 50,
'scale_pos_weight': ratio,
'nthread': 6
}
num_round = 5000
scores = xgb.cv(param, dtrain, num_round,
folds=cv,
early_stopping_rounds=50,
metrics='auc', seed=RNG,
verbose_eval=10)
print scores.tail()
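The best number of rounds can then be read off the returned table: assuming this xgboost version truncates the cv history at the best iteration (as its documentation describes), the row count is where the n_estimators values of the classifiers below come from.
print 'best number of boosting rounds: %d' % len(scores)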
Estimate the number of rounds of boosting using early stopping for the combined feature matrix.
dtrain = xgb.DMatrix(X_combined, label=y)
param = {
'max_depth':12, 'eta':0.05, 'silent':1, 'objective':'binary:logistic',
'subsample': 0.4, 'colsample_bytree': 0.4,
'min_child_weight': 50,
'scale_pos_weight': ratio,
'nthread': 6
}
num_round = 5000
scores = xgb.cv(param, dtrain, num_round,
folds=cv,
early_stopping_rounds=50,
metrics='auc', seed=RNG,
verbose_eval=10)
print scores.tail()
Plot the ROC curves to evaluate the predictive performance of the GBM classifiers
Xs.append(X_combined)
basenames.append('Combined')
# GBM classifiers with n_estimators set to the best rounds found by early stopping above
xgbc = xgb.XGBClassifier(seed=RNG, n_estimators=39, learning_rate=0.05,
max_depth=10, colsample_bytree=0.6, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=6, silent=True, scale_pos_weight=ratio)
xgbc2 = xgb.XGBClassifier(seed=RNG, n_estimators=159, learning_rate=0.05,
max_depth=12, colsample_bytree=0.4, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=6, silent=True, scale_pos_weight=ratio)
clfs = [xgbc] * 5 + [xgbc2]
fig, ax = plt.subplots(figsize=(6,6))
plot_roc(clfs, Xs, y, cv, ax, colors=COLORS10, labels=basenames)
3. Apply the GBM classifier to all the datasets to generate predictions
RNG = 20160628
xgbc2 = xgb.XGBClassifier(seed=RNG, n_estimators=160, learning_rate=0.05,
max_depth=12, colsample_bytree=0.4, subsample=0.4, min_child_weight=50,
gamma=0, max_delta_step=0, nthread=8, silent=True, scale_pos_weight=ratio)
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=RNG)
## Get out-of-fold predictions (each gene is scored by a model that never saw it during training)
y_preds, y_probas = cross_val_predictions(xgbc2, X_combined, y, cv)
## Get predictions from a model fit on the full training data, for comparison
xgbc2.fit(X_combined, y)
y_probas_on_train = xgbc2.predict_proba(X_combined)[:, 1]
df_labels['OOF_preds'] = y_preds
df_labels['OOF_probas'] = y_probas
df_labels['train_probas'] = y_probas_on_train
df_labels.head()
df_labels.to_csv('../XGB160_on_combined_predictions.csv')
4. Interpret the classifier by feature importance
## Get the feature_importances_ from the fitted GBM
feature_importances = xgbc2.feature_importances_
print feature_importances.shape
## Count the number of original features
n_features = sum(map(len, feature_names.values()))
print 'There are %d features across these datasets used for the prediction' % n_features
Map feature importances on the SVD components back to the original feature space by taking, for each dataset, the dot product between its component-importance vector (1 × 60) and its loading matrix V = svd.components_ (60 × n_features):
all_feature_names = []
all_feature_fis = []
datasets = []
for i, basename in enumerate(basenames[:-1]):
fi = feature_importances[i*60:(i+1)*60]
loadings = all_loadings[i]
original_feature_fis = np.dot(fi, loadings)
original_feature_names = feature_names[basename]
datasets.extend([basename] * len(original_feature_names))
all_feature_fis.extend( original_feature_fis.tolist() )
all_feature_names.extend( original_feature_names )
# Create a DataFrame of feature importances
df_feature_importances = pd.DataFrame({
'dataset': datasets,
'feature': all_feature_names,
'feature_importance': all_feature_fis})
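As a quick (hypothetical) consistency check, the mapped table should contain every original feature exactly once:
assert len(df_feature_importances) == n_features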
Examine the most predictive features for adhesome components:
df_feature_importances.sort('feature_importance', ascending=False).head(20)
Examine the most predictive features in ENCODE-TF dataset for adhesome components:
df_feature_importances.query('dataset == "ENCODE-TF"').sort('feature_importance', ascending=False).head(5)
Write the full feature importance table to a file and provide a link to this file.
df_feature_importances.to_csv('feature_importances.csv')
from IPython.display import FileLink
FileLink('feature_importances.csv')