Improve readability

0m1n0 · 0m1n0 · commit 9cd4d4694c52 · 2019-04-01T13:32:09.000+02:00
diff --git a/stratipy/biostat.py b/stratipy/biostat.py
@@ -415,6 +415,7 @@ def biostat_analysis(data_folder, result_folder, patient_data,
                      n_components, n_permutations, lambd, tol_nmf,
                      linkage_method, p_val_threshold, gene_id_ppi,
                      idx_ppi, idx_ppi_only):
+#     import hierarchical_clustering
     hierarchical_clustering_file = hierarchical_clustering.hierarchical_file(
         result_folder, mut_type, influence_weight, simplification, alpha, tol,
         keep_singletons, ngh_max, min_mutation, max_mutation, n_components,
diff --git a/stratipy/biostat_plot.py b/stratipy/biostat_plot.py
@@ -2,6 +2,7 @@
 import os
 sys.path.append(os.path.abspath('../../stratipy'))
 from stratipy import biostat
+# import biostat
 import numpy as np
 import pandas as pd
 import re
@@ -177,7 +178,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
 
     # k=20 -> figsize=(5, 9)
     fig, ax = plt.subplots(nrows=df_fill.shape[0], ncols=df_fill.shape[1],
-                           sharex=True, sharey=True, figsize=(5, 20))
+                           sharex=True, sharey=True, figsize=(5, 9))
     if lambd > 0:
         nmf = 'GNMF'
     else:
@@ -186,7 +187,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
     fig.suptitle(
         "Statistical significance between individual clusters\n(mutation:{} // gene:{} // PPI:{} // {} // {})".
         format(ssc_mutation_data, gene_data, ppi_data, mut_type, nmf), x=0.5,
-        y=1.15, fontsize=14, linespacing=2)
+        y=1.15, fontsize=14, linespacing=2) # y=1.15 for k=20
 
     for col in range(len(p_col)):
         for row in range(df_fill.shape[0]):
diff --git a/stratipy/filtering_diffusion.py b/stratipy/filtering_diffusion.py
@@ -24,7 +24,7 @@
 
 
 # @profile
-def propagation(M, adj, alpha=0.7, tol=10e-6):  # TODO equation, M, alpha
+def propagation(M, adj, alpha, tol=10e-6):  # TODO equation, M, alpha
     """Network propagation iterative process
 
     Iterative algorithm for apply propagation using random walk on a network:
diff --git a/stratipy/formatting_data.py b/stratipy/formatting_data.py
@@ -13,7 +13,7 @@ def check_sparsity(X):
 def check_shape_matching(X, L, array_name, list_name):
     if X.shape[0] != len(L):
         raise Exception("Numbers in {} shape ({}) and in {} ({}) don't match "
-                        .format(array_name, X.shape(), list_name, len(L)))
+                        .format(array_name, X.shape, list_name, len(L)))
 
 
 #TODO check ID order in list and network
diff --git a/stratipy/hierarchical_clustering.py b/stratipy/hierarchical_clustering.py
@@ -1,7 +1,7 @@
 import sys
 import os
 sys.path.append(os.path.abspath('../../stratipy'))
-from stratipy import consensus_clustering
+# from stratipy import consensus_clustering
 from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
 from scipy.spatial.distance import pdist
 import numpy as np
@@ -127,6 +127,67 @@ def linkage_dendrogram(hierarchical_clustering_file, distance_genes,
                     bbox_inches='tight')
 
 
+# For non-SSC data: hierarchical clustering of individuals only (result cached in a .mat file).
+def individual_linkage_dendrogram(hierarchical_clustering_file,
+                       distance_patients, ppi_data, mut_type, alpha, ngh_max,
+                       n_components, n_permutations, lambd, linkage_method,
+                       patient_data, data_folder, result_folder, repro):
+    # Reuse the saved clustering if the file exists; otherwise compute, save and plot it.
+    existance_same_param = os.path.exists(hierarchical_clustering_file)
+
+    if existance_same_param:
+        h = loadmat(hierarchical_clustering_file)
+        # flat cluster number assigned to each individual (as stored by savemat below)
+        clust_nb_patients = np.squeeze(h['flat_cluster_number_individuals'])
+        # individuals' indices in dendrogram leaf order
+        idx_patients = np.squeeze(h['dendrogram_index_individuals'])
+        print(' **** Same parameters file of hierarchical clustering already exists')
+    else:
+        # timed linkage; NOTE(review): scipy treats 2-D input as observation vectors, not distances - confirm
+        start = time.time()
+        Z_patients = linkage(distance_patients, method=linkage_method)
+        end = time.time()
+        print("---------- Linkage based on Individual distance = {} ---------- {}"
+              .format(datetime.timedelta(seconds=end-start),
+                      datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
+              flush=True)
+
+        P_patients = dendrogram(
+            Z_patients, count_sort='ascending', no_labels=True)
+        
+        idx_patients = np.array(P_patients['leaves'])
+
+        # form flat clusters from the linkage matrix Z_patients
+        # criterion='maxclust': at most n_components (k) flat clusters
+        clust_nb_patients = fcluster(
+            Z_patients, n_components, criterion='maxclust')
+
+        # cache the results so a later call with the same file path skips the clustering
+        savemat(hierarchical_clustering_file,
+                {'Z_linkage_matrix_individuals': Z_patients,
+                 'dendrogram_data_dictionary_individuals': P_patients,
+                 'dendrogram_index_individuals': idx_patients,
+                 'flat_cluster_number_individuals': clust_nb_patients},
+                do_compression=True)
+        # reorder rows and columns by dendrogram leaf order, then save the matrix as a heatmap
+        D_patients = distance_patients[idx_patients, :][:, idx_patients]
+
+        fig = plt.figure(figsize=(3, 3))
+        im = plt.imshow(D_patients, interpolation='nearest', cmap=cm.viridis)
+        plt.axis('off')
+        if repro:
+            directory = result_folder
+        else:
+            directory = data_folder
+        fig_directory = directory + 'figures/similarity/' + patient_data + '_' + ppi_data + '/'
+        os.makedirs(fig_directory, exist_ok=True)
+        fig_name = ('{}_{}_k={}_ngh={}_permut={}_lambd={}'.format(
+            mut_type, alpha, n_components, ngh_max, n_permutations, lambd))
+        print('saving plot')
+        plt.savefig('{}{}.png'.format(fig_directory, fig_name),
+                    bbox_inches='tight')
+
+
 def hierarchical(result_folder, distance_genes, distance_patients, ppi_data,
                  mut_type, influence_weight, simplification, alpha, tol,
                  keep_singletons, ngh_max, min_mutation, max_mutation,
diff --git a/stratipy/load_data.py b/stratipy/load_data.py
@@ -80,13 +80,13 @@ def get_indiv_list(indiv_from_df):
     return alist
 
 
-def mutation_profile_coordinate(df, indiv_list):
+def mutation_profile_coordinate(df, indiv_list, indiv_type):
     coord_gene = []
     coord_indiv = []
 
     for i in trange(df.shape[0], desc="mutation profile coordinates"):
         # for each row (each gene), we get list of individuals' ID
-        individuals_per_gene = coordinate(df.iloc[i, 1], indiv_list)
+        individuals_per_gene = coordinate(df[indiv_type][i], indiv_list)
         # for each element of coordinate listes x/y:
         for j in individuals_per_gene:
             # gene is saved as gene's INDEX (not EntrezGene) in dataframe
@@ -122,12 +122,15 @@ def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
 
         # individuals' ID list for each row is transformed in string of list
         # we thus reformate to list
-        df.individuals = df.individuals.apply(eval)
+        indiv_type = 'eu_individuals' # only Europeans
+        df[indiv_type] = df[indiv_type].apply(eval)
         # create individual ID list
-        indiv = get_indiv_list(df.individuals)
+        indiv = get_indiv_list(df[indiv_type])
+        if '[' in indiv: indiv.remove('[')
+        if ']' in indiv: indiv.remove(']')
 
         # calculate coordinates genes x individuals -> sparse matrix
-        coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv)
+        coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv, indiv_type)
         # mutation weight = 1
         weight = np.ones(len(coord_gene))
         # coo matrix then to csr matrix
@@ -162,6 +165,8 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
     else:
         mutation_profile, gene_id, indiv = (
             load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data))
+        print("SSC overall mutation profile matrix\n    shape: {}\n    stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))
+    
 
         # if SSC 1 or 2
         if ssc_subgroups != "SSC":
@@ -210,6 +215,8 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
         savemat(mutation_profile_file, {'mutation_profile': mutation_profile,
                                         'gene_id': gene_id,
                                         'indiv': indiv}, do_compression=True)
+    
+    print("Mutation profile matrix\n    shape: {}\n    stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))
 
     return mutation_profile, gene_id, indiv
 
diff --git a/stratipy/nbs.py b/stratipy/nbs.py
@@ -2,7 +2,7 @@
 # coding: utf-8
 import sys
 import os
-sys.path.append(os.path.abspath('../../stratipy_cluster'))
+sys.path.append(os.path.abspath('../../stratipy'))
 from stratipy import (load_data, formatting_data, filtering_diffusion,
                       nmf_bootstrap, consensus_clustering,
                       hierarchical_clustering, biostat, biostat_go,
@@ -21,8 +21,11 @@ def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
         alpha = 0
 
     if patient_data == 'SSC':
+        # result_folder = (
+        #     data_folder + 'result_' + ssc_mutation_data + '_' +
+        #     ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
         result_folder = (
-            data_folder + 'result_' + ssc_mutation_data + '_' +
+            data_folder + '/Volumes/Abu3/min/201812_MAF50_alpha0.7/result_' + ssc_mutation_data + '_' +
             ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
     else:
         result_folder = (data_folder + 'result_' + patient_data + '_' +
@@ -117,55 +120,55 @@ def post_bootstrap(result_folder, mut_type, influence_weight, simplification,
                    ppi_data, patient_data, data_folder, ssc_subgroups,
                    ssc_mutation_data, gene_data, p_val_threshold, compute,
                    overwrite):
-    print("------------ consensus_clustering.py ------------ {}"
-          .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
-          flush=True)
-    distance_genes, distance_patients = (
-        consensus_clustering.sub_consensus(
-            result_folder, mut_type, influence_weight, simplification, alpha,
-            tol, keep_singletons, ngh_max, min_mutation, max_mutation,
-            n_components, n_permutations, lambd, tol_nmf,
-            compute_gene_clustering, run_consensus))
-
-    print("------------ hierarchical_clustering.py ------------ {}"
-          .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
-          flush=True)
-    hierarchical_clustering.hierarchical(
-        result_folder, distance_genes, distance_patients, ppi_data, mut_type,
-        influence_weight, simplification, alpha, tol, keep_singletons, ngh_max,
-        min_mutation, max_mutation, n_components, n_permutations, lambd,
-        tol_nmf, linkage_method, patient_data, data_folder, ssc_subgroups,
-        ssc_mutation_data, gene_data)
+    # print("------------ consensus_clustering.py ------------ {}"
+    #       .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
+    #       flush=True)
+    # distance_genes, distance_patients = (
+    #     consensus_clustering.sub_consensus(
+    #         result_folder, mut_type, influence_weight, simplification, alpha,
+    #         tol, keep_singletons, ngh_max, min_mutation, max_mutation,
+    #         n_components, n_permutations, lambd, tol_nmf,
+    #         compute_gene_clustering, run_consensus))
+    #
+    # print("------------ hierarchical_clustering.py ------------ {}"
+    #       .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
+    #       flush=True)
+    # hierarchical_clustering.hierarchical(
+    #     result_folder, distance_genes, distance_patients, ppi_data, mut_type,
+    #     influence_weight, simplification, alpha, tol, keep_singletons, ngh_max,
+    #     min_mutation, max_mutation, n_components, n_permutations, lambd,
+    #     tol_nmf, linkage_method, patient_data, data_folder, ssc_subgroups,
+    #     ssc_mutation_data, gene_data)
 
     print("\n------------ biostat.py ------------ {}"
           .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
           flush=True)
-    # gene_id_ppi, idx_ppi, idx_ppi_only = preprocessing(
-    #     data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
-    #     ppi_data, result_folder, influence_weight, simplification, compute,
-    #     overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
-    #     max_mutation, mut_type)
-    #
-    # biostat.biostat_analysis(
-    #     data_folder, result_folder, patient_data, ssc_mutation_data,
-    #     ssc_subgroups, ppi_data, gene_data, mut_type, influence_weight,
-    #     simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
-    #     max_mutation, n_components, n_permutations, lambd, tol_nmf,
-    #     linkage_method, p_val_threshold, gene_id_ppi, idx_ppi, idx_ppi_only)
+    gene_id_ppi, idx_ppi, idx_ppi_only = preprocessing(
+        data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
+        ppi_data, result_folder, influence_weight, simplification, compute,
+        overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
+        max_mutation, mut_type)
+
+    biostat.biostat_analysis(
+        data_folder, result_folder, patient_data, ssc_mutation_data,
+        ssc_subgroups, ppi_data, gene_data, mut_type, influence_weight,
+        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
+        max_mutation, n_components, n_permutations, lambd, tol_nmf,
+        linkage_method, p_val_threshold, gene_id_ppi, idx_ppi, idx_ppi_only)
 
-    biostat_go.biostat_go_enrichment(
-        alpha, result_folder, mut_type, patient_data, data_folder, ssc_mutation_data,
-        ssc_subgroups, gene_data, ppi_data, lambd, n_components, ngh_max, n_permutations)
+    # biostat_go.biostat_go_enrichment(
+    #     alpha, result_folder, mut_type, patient_data, data_folder, ssc_mutation_data,
+    #     ssc_subgroups, gene_data, ppi_data, lambd, n_components, ngh_max, n_permutations)
 
     print("\n------------ biostat_plot.py ------------ {}"
           .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
           flush=True)
     # no need SSC1/SSC2, no need k
-    biostat_plot.load_plot_biostat_individuals(
-        result_folder, data_folder, ssc_mutation_data,
-        gene_data, patient_data, ppi_data, mut_type, lambd, influence_weight,
-        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
-        max_mutation, n_components, n_permutations, tol_nmf, linkage_method)
+    # biostat_plot.load_plot_biostat_individuals(
+    #     result_folder, data_folder, ssc_mutation_data,
+    #     gene_data, patient_data, ppi_data, mut_type, lambd, influence_weight,
+    #     simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
+    #     max_mutation, n_components, n_permutations, tol_nmf, linkage_method)
 ###############################################################################
 ###############################################################################
 ###############################################################################
diff --git a/stratipy/nbs_local.py b/stratipy/nbs_local.py
diff --git a/stratipy/parameters.py b/stratipy/parameters.py