Skip to content

Commit 9cd4d46

Browse files
committed
more readability
1 parent 00a44f5 commit 9cd4d46

File tree

9 files changed

+379
-210
lines changed

9 files changed

+379
-210
lines changed

stratipy/biostat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@ def biostat_analysis(data_folder, result_folder, patient_data,
415415
n_components, n_permutations, lambd, tol_nmf,
416416
linkage_method, p_val_threshold, gene_id_ppi,
417417
idx_ppi, idx_ppi_only):
418+
# import hierarchical_clustering
418419
hierarchical_clustering_file = hierarchical_clustering.hierarchical_file(
419420
result_folder, mut_type, influence_weight, simplification, alpha, tol,
420421
keep_singletons, ngh_max, min_mutation, max_mutation, n_components,

stratipy/biostat_plot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
sys.path.append(os.path.abspath('../../stratipy'))
44
from stratipy import biostat
5+
# import biostat
56
import numpy as np
67
import pandas as pd
78
import re
@@ -177,7 +178,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
177178

178179
# k=20 -> figsize=(5, 9)
179180
fig, ax = plt.subplots(nrows=df_fill.shape[0], ncols=df_fill.shape[1],
180-
sharex=True, sharey=True, figsize=(5, 20))
181+
sharex=True, sharey=True, figsize=(5, 9))
181182
if lambd > 0:
182183
nmf = 'GNMF'
183184
else:
@@ -186,7 +187,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
186187
fig.suptitle(
187188
"Statistical significance between individual clusters\n(mutation:{} // gene:{} // PPI:{} // {} // {})".
188189
format(ssc_mutation_data, gene_data, ppi_data, mut_type, nmf), x=0.5,
189-
y=1.15, fontsize=14, linespacing=2)
190+
y=1.15, fontsize=14, linespacing=2) # y=1.15 for k=20
190191

191192
for col in range(len(p_col)):
192193
for row in range(df_fill.shape[0]):

stratipy/filtering_diffusion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525

2626
# @profile
27-
def propagation(M, adj, alpha=0.7, tol=10e-6): # TODO equation, M, alpha
27+
def propagation(M, adj, alpha, tol=10e-6): # TODO equation, M, alpha
2828
"""Network propagation iterative process
2929
3030
Iterative algorithm that applies propagation using a random walk on a network:

stratipy/formatting_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def check_sparsity(X):
1313
def check_shape_matching(X, L, array_name, list_name):
1414
if X.shape[0] != len(L):
1515
raise Exception("Numbers in {} shape ({}) and in {} ({}) don't match "
16-
.format(array_name, X.shape(), list_name, len(L)))
16+
.format(array_name, X.shape, list_name, len(L)))
1717

1818

1919
#TODO check ID order in list and network

stratipy/hierarchical_clustering.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import sys
22
import os
33
sys.path.append(os.path.abspath('../../stratipy'))
4-
from stratipy import consensus_clustering
4+
# from stratipy import consensus_clustering
55
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
66
from scipy.spatial.distance import pdist
77
import numpy as np
@@ -127,6 +127,67 @@ def linkage_dendrogram(hierarchical_clustering_file, distance_genes,
127127
bbox_inches='tight')
128128

129129

130+
# for no SSC data
131+
def individual_linkage_dendrogram(hierarchical_clustering_file,
132+
distance_patients, ppi_data, mut_type, alpha, ngh_max,
133+
n_components, n_permutations, lambd, linkage_method,
134+
patient_data, data_folder, result_folder, repro):
135+
136+
existance_same_param = os.path.exists(hierarchical_clustering_file)
137+
138+
if existance_same_param:
139+
h = loadmat(hierarchical_clustering_file)
140+
# cluster index for each individual
141+
clust_nb_patients = np.squeeze(h['flat_cluster_number_individuals'])
142+
# individuals' index
143+
idx_patients = np.squeeze(h['dendrogram_index_individuals'])
144+
print(' **** Same parameters file of hierarchical clustering already exists')
145+
else:
146+
# hierarchical clustering on distance matrix (here: distance_patients)
147+
start = time.time()
148+
Z_patients = linkage(distance_patients, method=linkage_method)
149+
end = time.time()
150+
print("---------- Linkage based on Individual distance = {} ---------- {}"
151+
.format(datetime.timedelta(seconds=end-start),
152+
datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
153+
flush=True)
154+
155+
P_patients = dendrogram(
156+
Z_patients, count_sort='ascending', no_labels=True)
157+
158+
idx_patients = np.array(P_patients['leaves'])
159+
160+
# forms flat clusters from Z
161+
# given k -> maxclust
162+
clust_nb_patients = fcluster(
163+
Z_patients, n_components, criterion='maxclust')
164+
165+
# start = time.time()
166+
savemat(hierarchical_clustering_file,
167+
{'Z_linkage_matrix_individuals': Z_patients,
168+
'dendrogram_data_dictionary_individuals': P_patients,
169+
'dendrogram_index_individuals': idx_patients,
170+
'flat_cluster_number_individuals': clust_nb_patients},
171+
do_compression=True)
172+
173+
D_patients = distance_patients[idx_patients, :][:, idx_patients]
174+
175+
fig = plt.figure(figsize=(3, 3))
176+
im = plt.imshow(D_patients, interpolation='nearest', cmap=cm.viridis)
177+
plt.axis('off')
178+
if repro:
179+
directory = result_folder
180+
else:
181+
directory = data_folder
182+
fig_directory = directory + 'figures/similarity/' + patient_data + '_' + ppi_data + '/'
183+
os.makedirs(fig_directory, exist_ok=True)
184+
fig_name = ('{}_{}_k={}_ngh={}_permut={}_lambd={}'.format(
185+
mut_type, alpha, n_components, ngh_max, n_permutations, lambd))
186+
print('saving plot')
187+
plt.savefig('{}{}.png'.format(fig_directory, fig_name),
188+
bbox_inches='tight')
189+
190+
130191
def hierarchical(result_folder, distance_genes, distance_patients, ppi_data,
131192
mut_type, influence_weight, simplification, alpha, tol,
132193
keep_singletons, ngh_max, min_mutation, max_mutation,

stratipy/load_data.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,13 @@ def get_indiv_list(indiv_from_df):
8080
return alist
8181

8282

83-
def mutation_profile_coordinate(df, indiv_list):
83+
def mutation_profile_coordinate(df, indiv_list, indiv_type):
8484
coord_gene = []
8585
coord_indiv = []
8686

8787
for i in trange(df.shape[0], desc="mutation profile coordinates"):
8888
# for each row (each gene), we get the list of individuals' IDs
89-
individuals_per_gene = coordinate(df.iloc[i, 1], indiv_list)
89+
individuals_per_gene = coordinate(df[indiv_type][i], indiv_list)
9090
# for each element of coordinate lists x/y:
9191
for j in individuals_per_gene:
9292
# gene is saved as gene's INDEX (not EntrezGene) in dataframe
@@ -122,12 +122,15 @@ def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
122122

123123
# the individuals' ID list of each row is stored as the string form of a list
124124
# we thus reformat it back to a list
125-
df.individuals = df.individuals.apply(eval)
125+
indiv_type = 'eu_individuals' # only Europeans
126+
df[indiv_type] = df[indiv_type].apply(eval)
126127
# create individual ID list
127-
indiv = get_indiv_list(df.individuals)
128+
indiv = get_indiv_list(df[indiv_type])
129+
if '[' in indiv: indiv.remove('[')
130+
if ']' in indiv: indiv.remove(']')
128131

129132
# calculate coordinates genes x individuals -> sparse matrix
130-
coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv)
133+
coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv, indiv_type)
131134
# mutation weight = 1
132135
weight = np.ones(len(coord_gene))
133136
# coo matrix then to csr matrix
@@ -162,6 +165,8 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
162165
else:
163166
mutation_profile, gene_id, indiv = (
164167
load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data))
168+
print("SSC overall mutation profile matrix\n shape: {}\n stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))
169+
165170

166171
# if SSC 1 or 2
167172
if ssc_subgroups != "SSC":
@@ -210,6 +215,8 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
210215
savemat(mutation_profile_file, {'mutation_profile': mutation_profile,
211216
'gene_id': gene_id,
212217
'indiv': indiv}, do_compression=True)
218+
219+
print("Mutation profile matrix\n shape: {}\n stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))
213220

214221
return mutation_profile, gene_id, indiv
215222

stratipy/nbs.py

Lines changed: 44 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# coding: utf-8
33
import sys
44
import os
5-
sys.path.append(os.path.abspath('../../stratipy_cluster'))
5+
sys.path.append(os.path.abspath('../../stratipy'))
66
from stratipy import (load_data, formatting_data, filtering_diffusion,
77
nmf_bootstrap, consensus_clustering,
88
hierarchical_clustering, biostat, biostat_go,
@@ -21,8 +21,11 @@ def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
2121
alpha = 0
2222

2323
if patient_data == 'SSC':
24+
# result_folder = (
25+
# data_folder + 'result_' + ssc_mutation_data + '_' +
26+
# ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
2427
result_folder = (
25-
data_folder + 'result_' + ssc_mutation_data + '_' +
28+
data_folder + '/Volumes/Abu3/min/201812_MAF50_alpha0.7/result_' + ssc_mutation_data + '_' +
2629
ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
2730
else:
2831
result_folder = (data_folder + 'result_' + patient_data + '_' +
@@ -117,55 +120,55 @@ def post_bootstrap(result_folder, mut_type, influence_weight, simplification,
117120
ppi_data, patient_data, data_folder, ssc_subgroups,
118121
ssc_mutation_data, gene_data, p_val_threshold, compute,
119122
overwrite):
120-
print("------------ consensus_clustering.py ------------ {}"
121-
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
122-
flush=True)
123-
distance_genes, distance_patients = (
124-
consensus_clustering.sub_consensus(
125-
result_folder, mut_type, influence_weight, simplification, alpha,
126-
tol, keep_singletons, ngh_max, min_mutation, max_mutation,
127-
n_components, n_permutations, lambd, tol_nmf,
128-
compute_gene_clustering, run_consensus))
129-
130-
print("------------ hierarchical_clustering.py ------------ {}"
131-
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
132-
flush=True)
133-
hierarchical_clustering.hierarchical(
134-
result_folder, distance_genes, distance_patients, ppi_data, mut_type,
135-
influence_weight, simplification, alpha, tol, keep_singletons, ngh_max,
136-
min_mutation, max_mutation, n_components, n_permutations, lambd,
137-
tol_nmf, linkage_method, patient_data, data_folder, ssc_subgroups,
138-
ssc_mutation_data, gene_data)
123+
# print("------------ consensus_clustering.py ------------ {}"
124+
# .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
125+
# flush=True)
126+
# distance_genes, distance_patients = (
127+
# consensus_clustering.sub_consensus(
128+
# result_folder, mut_type, influence_weight, simplification, alpha,
129+
# tol, keep_singletons, ngh_max, min_mutation, max_mutation,
130+
# n_components, n_permutations, lambd, tol_nmf,
131+
# compute_gene_clustering, run_consensus))
132+
#
133+
# print("------------ hierarchical_clustering.py ------------ {}"
134+
# .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
135+
# flush=True)
136+
# hierarchical_clustering.hierarchical(
137+
# result_folder, distance_genes, distance_patients, ppi_data, mut_type,
138+
# influence_weight, simplification, alpha, tol, keep_singletons, ngh_max,
139+
# min_mutation, max_mutation, n_components, n_permutations, lambd,
140+
# tol_nmf, linkage_method, patient_data, data_folder, ssc_subgroups,
141+
# ssc_mutation_data, gene_data)
139142

140143
print("\n------------ biostat.py ------------ {}"
141144
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
142145
flush=True)
143-
# gene_id_ppi, idx_ppi, idx_ppi_only = preprocessing(
144-
# data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
145-
# ppi_data, result_folder, influence_weight, simplification, compute,
146-
# overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
147-
# max_mutation, mut_type)
148-
#
149-
# biostat.biostat_analysis(
150-
# data_folder, result_folder, patient_data, ssc_mutation_data,
151-
# ssc_subgroups, ppi_data, gene_data, mut_type, influence_weight,
152-
# simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
153-
# max_mutation, n_components, n_permutations, lambd, tol_nmf,
154-
# linkage_method, p_val_threshold, gene_id_ppi, idx_ppi, idx_ppi_only)
146+
gene_id_ppi, idx_ppi, idx_ppi_only = preprocessing(
147+
data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
148+
ppi_data, result_folder, influence_weight, simplification, compute,
149+
overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
150+
max_mutation, mut_type)
151+
152+
biostat.biostat_analysis(
153+
data_folder, result_folder, patient_data, ssc_mutation_data,
154+
ssc_subgroups, ppi_data, gene_data, mut_type, influence_weight,
155+
simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
156+
max_mutation, n_components, n_permutations, lambd, tol_nmf,
157+
linkage_method, p_val_threshold, gene_id_ppi, idx_ppi, idx_ppi_only)
155158

156-
biostat_go.biostat_go_enrichment(
157-
alpha, result_folder, mut_type, patient_data, data_folder, ssc_mutation_data,
158-
ssc_subgroups, gene_data, ppi_data, lambd, n_components, ngh_max, n_permutations)
159+
# biostat_go.biostat_go_enrichment(
160+
# alpha, result_folder, mut_type, patient_data, data_folder, ssc_mutation_data,
161+
# ssc_subgroups, gene_data, ppi_data, lambd, n_components, ngh_max, n_permutations)
159162

160163
print("\n------------ biostat_plot.py ------------ {}"
161164
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
162165
flush=True)
163166
# no need for SSC1/SSC2, no need for k
164-
biostat_plot.load_plot_biostat_individuals(
165-
result_folder, data_folder, ssc_mutation_data,
166-
gene_data, patient_data, ppi_data, mut_type, lambd, influence_weight,
167-
simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
168-
max_mutation, n_components, n_permutations, tol_nmf, linkage_method)
167+
# biostat_plot.load_plot_biostat_individuals(
168+
# result_folder, data_folder, ssc_mutation_data,
169+
# gene_data, patient_data, ppi_data, mut_type, lambd, influence_weight,
170+
# simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
171+
# max_mutation, n_components, n_permutations, tol_nmf, linkage_method)
169172
###############################################################################
170173
###############################################################################
171174
###############################################################################

0 commit comments

Comments
 (0)