|
261 | 261 | "output_type": "stream",
|
262 | 262 | "text": [
|
263 | 263 | "@> 3918 out of 20361 cases ignored with missing features.\n",
|
264 |
| - "@> CV iteration # 1: AUROC = 0.871 AUPRC = 0.932 OOB score = 0.820\n", |
265 |
| - "@> CV iteration # 2: AUROC = 0.874 AUPRC = 0.934 OOB score = 0.818\n", |
266 |
| - "@> CV iteration # 3: AUROC = 0.866 AUPRC = 0.925 OOB score = 0.821\n", |
267 |
| - "@> CV iteration # 4: AUROC = 0.878 AUPRC = 0.934 OOB score = 0.819\n", |
| 264 | + "@> CV iteration # 1: AUROC = 0.871 AUPRC = 0.931 OOB score = 0.820\n", |
| 265 | + "@> CV iteration # 2: AUROC = 0.875 AUPRC = 0.934 OOB score = 0.820\n", |
| 266 | + "@> CV iteration # 3: AUROC = 0.867 AUPRC = 0.926 OOB score = 0.820\n", |
| 267 | + "@> CV iteration # 4: AUROC = 0.878 AUPRC = 0.934 OOB score = 0.818\n", |
268 | 268 | "@> CV iteration # 5: AUROC = 0.872 AUPRC = 0.935 OOB score = 0.821\n",
|
269 |
| - "@> CV iteration # 6: AUROC = 0.877 AUPRC = 0.937 OOB score = 0.820\n", |
270 |
| - "@> CV iteration # 7: AUROC = 0.871 AUPRC = 0.931 OOB score = 0.820\n", |
271 |
| - "@> CV iteration # 8: AUROC = 0.871 AUPRC = 0.933 OOB score = 0.821\n", |
272 |
| - "@> CV iteration # 9: AUROC = 0.862 AUPRC = 0.923 OOB score = 0.820\n", |
273 |
| - "@> CV iteration #10: AUROC = 0.879 AUPRC = 0.934 OOB score = 0.819\n", |
| 269 | + "@> CV iteration # 6: AUROC = 0.875 AUPRC = 0.935 OOB score = 0.820\n", |
| 270 | + "@> CV iteration # 7: AUROC = 0.871 AUPRC = 0.931 OOB score = 0.819\n", |
| 271 | + "@> CV iteration # 8: AUROC = 0.873 AUPRC = 0.934 OOB score = 0.821\n", |
| 272 | + "@> CV iteration # 9: AUROC = 0.861 AUPRC = 0.922 OOB score = 0.820\n", |
| 273 | + "@> CV iteration #10: AUROC = 0.879 AUPRC = 0.935 OOB score = 0.818\n", |
274 | 274 | "@> ------------------------------------------------------------\n",
|
275 | 275 | "@> Cross-validation summary:\n",
|
276 | 276 | "@> training dataset size: 16443\n",
|
277 | 277 | "@> fraction of positives: 0.690\n",
|
278 | 278 | "@> mean AUROC: 0.872 +/- 0.005\n",
|
279 | 279 | "@> mean AUPRC: 0.932 +/- 0.004\n",
|
280 | 280 | "@> mean OOB score: 0.820 +/- 0.001\n",
|
281 |
| - "@> optimal cutoff*: 0.711 +/- 0.034\n", |
| 281 | + "@> optimal cutoff*: 0.709 +/- 0.036\n", |
282 | 282 | "@> (* argmax of Youden's index)\n",
|
283 | 283 | "@> feature importances:\n",
|
284 | 284 | "@> wt_PSIC: 0.143\n",
|
285 |
| - "@> Delta_PSIC: 0.189\n", |
| 285 | + "@> Delta_PSIC: 0.188\n", |
286 | 286 | "@> SASA: 0.068\n",
|
287 | 287 | "@> ANM_MSF-chain: 0.074\n",
|
288 | 288 | "@> ANM_effectiveness-chain: 0.078\n",
|
|
298 | 298 | "@> ROC plot saved to ROC.png\n",
|
299 | 299 | "@> ------------------------------------------------------------\n",
|
300 | 300 | "@> Classifier training summary:\n",
|
301 |
| - "@> mean OOB score: 0.821\n", |
| 301 | + "@> mean OOB score: 0.820\n", |
302 | 302 | "@> feature importances:\n",
|
303 |
| - "@> wt_PSIC: 0.140\n", |
304 |
| - "@> Delta_PSIC: 0.191\n", |
305 |
| - "@> SASA: 0.068\n", |
| 303 | + "@> wt_PSIC: 0.144\n", |
| 304 | + "@> Delta_PSIC: 0.188\n", |
| 305 | + "@> SASA: 0.069\n", |
306 | 306 | "@> ANM_MSF-chain: 0.074\n",
|
307 | 307 | "@> ANM_effectiveness-chain: 0.078\n",
|
308 | 308 | "@> ANM_sensitivity-chain: 0.071\n",
|
309 | 309 | "@> stiffness-chain: 0.080\n",
|
310 | 310 | "@> entropy: 0.098\n",
|
311 | 311 | "@> ranked_MI: 0.068\n",
|
312 |
| - "@> BLOSUM: 0.048\n", |
| 312 | + "@> BLOSUM: 0.047\n", |
313 | 313 | "@> new_feature: 0.084\n",
|
314 | 314 | "@> ------------------------------------------------------------\n",
|
315 | 315 | "@> Feat. importance plot saved to feat_importances.png\n"
|
|
381 | 381 | "cell_type": "markdown",
|
382 | 382 | "metadata": {},
|
383 | 383 | "source": [
|
384 |
| - "We cannot simply use the main interface `rhapsody()`, because the new feature cannot be computed automatically by Rhapsody:" |
385 |
| - ] |
386 |
| - }, |
387 |
| - { |
388 |
| - "cell_type": "raw", |
389 |
| - "metadata": {}, |
390 |
| - "source": [ |
391 |
| - "# This will cause an error\n", |
392 |
| - "rh = rd.rhapsody(test_SAVs, main_classifier='custom_classifier.pkl')" |
| 384 | + "We cannot simply use the main interface `rhapsody()`, because the new feature cannot be computed automatically by Rhapsody:\n", |
| 385 | + "```\n", |
| 386 | + "# This would cause an error\n", |
| 387 | + "rh = rd.rhapsody(test_SAVs, main_classifier='custom_classifier.pkl')\n", |
| 388 | + "```" |
393 | 389 | ]
|
394 | 390 | },
|
395 | 391 | {
|
|
409 | 405 | "output_type": "stream",
|
410 | 406 | "text": [
|
411 | 407 | "@> Submitting query to PolyPhen-2...\n",
|
412 |
| - "@> Query to PolyPhen-2 started in 9.9s.\n", |
| 408 | + "@> Query to PolyPhen-2 started in 1.7s.\n", |
413 | 409 | "@> PolyPhen-2 is running...\n",
|
414 |
| - "@> Query to PolyPhen-2 completed in 9.7s.\n", |
| 410 | + "@> Query to PolyPhen-2 completed in 20.0s.\n", |
415 | 411 | "@> PolyPhen-2's output parsed.\n"
|
416 | 412 | ]
|
417 | 413 | }
|
|
474 | 470 | "text": [
|
475 | 471 | "@> Sequence-conservation features have been retrieved from PolyPhen-2's output.\n",
|
476 | 472 | "@> Mapping SAVs to PDB structures...\n",
|
477 |
| - "Mapping SAV 'O00238 31 R H' to PDB: 0%| | 0/5 [00:00<?]@> WARNING Unable to recover pickle: Pickle UniprotMap-O00238.pkl was too old and was ignored.\n", |
478 |
| - "@> PDB file is found in the local folder (/home/luca/.../3mdy.pdb.gz).\n", |
479 |
| - "@> 858 atoms and 1 coordinate set(s) were parsed in 0.06s.\n", |
480 |
| - "Mapping SAV 'O00294 496 A T' to PDB: 20%|██ | 1/5 [00:01<00:05]@> Pickle 'UniprotMap-O00238.pkl' saved.\n", |
481 |
| - "@> WARNING Unable to recover pickle: Pickle UniprotMap-O00294.pkl was too old and was ignored.\n", |
482 |
| - "@> PDB file is found in the local folder (/home/luca/.../2fim.pdb.gz).\n", |
483 |
| - "@> 456 atoms and 1 coordinate set(s) were parsed in 0.06s.\n", |
484 |
| - "@> PDB file is found in the local folder (/home/luca/.../3c5n.pdb.gz).\n", |
485 |
| - "@> 454 atoms and 1 coordinate set(s) were parsed in 0.17s.\n", |
486 |
| - "@> Chain A in 2FIM was aligned in 0.1s.\n", |
487 |
| - "Mapping SAV 'P01112 58 T R' to PDB: 40%|████ | 2/5 [00:02<00:03] @> Pickle 'UniprotMap-O00294.pkl' saved.\n", |
| 473 | + "Mapping SAV 'O00238 31 R H' to PDB: 0%| | 0/5 [00:00<?]@> Pickle 'UniprotMap-O00238.pkl' recovered.\n", |
| 474 | + "Mapping SAV 'O00294 496 A T' to PDB: 20%|██ | 1/5 [00:00<00:00]@> Pickle 'UniprotMap-O00238.pkl' saved.\n", |
| 475 | + "@> Pickle 'UniprotMap-O00294.pkl' recovered.\n", |
| 476 | + "Mapping SAV 'P01112 58 T R' to PDB: 40%|████ | 2/5 [00:00<00:00] @> Pickle 'UniprotMap-O00294.pkl' saved.\n", |
488 | 477 | "@> Pickle 'UniprotMap-P01112.pkl' recovered.\n",
|
489 | 478 | "Mapping SAV 'P01112 170 K I' to PDB: 100%|██████████| 5/5 [00:02<00:00]\n",
|
490 | 479 | "@> Pickle 'UniprotMap-P01112.pkl' saved.\n",
|
491 |
| - "@> 4 out of 5 SAVs have been mapped to PDB in 2.6s.\n", |
| 480 | + "@> 4 out of 5 SAVs have been mapped to PDB in 2.3s.\n", |
492 | 481 | "@> Computing structural and dynamical features from PDB structures...\n",
|
493 | 482 | "Analizing mutation site 1AA9:A 170: 0%| | 0/5 [00:00<?]@> Pickle 'PDBfeatures-1AA9.pkl' recovered.\n",
|
494 | 483 | "Analizing mutation site 2FIM:A 443: 0%| | 0/5 [00:00<?]@> Pickle 'PDBfeatures-1AA9.pkl' saved.\n",
|
495 |
| - "@> WARNING Unable to recover pickle: Pickle was too old and was ignored.\n", |
496 |
| - "@> PDB file is found in the local folder (/home/luca/.../2fim.pdb.gz).\n", |
497 |
| - "@> 3841 atoms and 1 coordinate set(s) were parsed in 0.06s.\n", |
498 |
| - "@> Running DSSP...\n", |
499 |
| - "@> DSSP finished in 1.4s.\n", |
500 |
| - "@> Kirchhoff was built in 0.01s.\n", |
501 |
| - "@> 223 modes were calculated in 0.61s.\n", |
502 |
| - "@> Hessian was built in 0.18s.\n", |
503 |
| - "@> 666 modes were calculated in 0.11s.\n", |
504 |
| - "@> Calculating covariance matrix\n", |
505 |
| - "@> Covariance matrix calculated in 0.0s.\n", |
506 |
| - "@> Calculating perturbation response\n", |
507 |
| - "@> Perturbation response matrix calculated in 0.0s.\n", |
508 |
| - "@> Perturbation response scanning completed in 0.1s.\n", |
509 |
| - "@> Calculating stiffness matrix.\n", |
510 |
| - "@> Stiffness matrix calculated in 0.24s.\n", |
511 |
| - "@> The range of effective force constant is: 4.581280954242648 to 26.124107763482908.\n", |
512 |
| - "Analizing mutation site 4Q21:A 58: 60%|██████ | 3/5 [00:03<00:02] @> Pickle 'PDBfeatures-2FIM.pkl' saved.\n", |
| 484 | + "@> Pickle 'PDBfeatures-2FIM.pkl' recovered.\n", |
| 485 | + "Analizing mutation site 4Q21:A 58: 0%| | 0/5 [00:00<?] @> Pickle 'PDBfeatures-2FIM.pkl' saved.\n", |
513 | 486 | "@> Pickle 'PDBfeatures-4Q21.pkl' recovered.\n",
|
514 |
| - "Analizing mutation site 4Q21:A 30: 60%|██████ | 3/5 [00:03<00:02]@> Pickle 'PDBfeatures-4Q21.pkl' saved.\n", |
515 |
| - "Analizing mutation site 4Q21:A 30: 100%|██████████| 5/5 [00:03<00:00]\n", |
516 |
| - "@> PDB features have been computed in 3.1s.\n", |
| 487 | + "Analizing mutation site 4Q21:A 30: 0%| | 0/5 [00:00<?]@> Pickle 'PDBfeatures-4Q21.pkl' saved.\n", |
| 488 | + "Analizing mutation site 4Q21:A 30: 100%|██████████| 5/5 [00:00<00:00]\n", |
| 489 | + "@> PDB features have been computed in 0.0s.\n", |
517 | 490 | "@> Computing sequence properties from Pfam domains...\n",
|
518 | 491 | "Mapping SAV 'O00238 31 R H' to Pfam: 0%| | 0/5 [00:00<?]@> Pickle 'UniprotMap-O00238.pkl' recovered.\n",
|
519 |
| - "@> Retrieving Pfam search results: https://pfam.xfam.org/protein/O00238?output=xml\n", |
520 |
| - "@> Pfam search completed in 1.44s.\n", |
521 |
| - "@> Query 'O00238' matched 3 Pfam families.\n", |
522 |
| - "@> Processing PF01064...\n", |
523 |
| - "@> Pfam MSA for PF01064 is written as PF01064_full.sth.\n", |
524 |
| - "@> 1768 sequence(s) with 359 residues were parsed in 0.03s.\n", |
525 |
| - "@> Number of columns in MSA reduced to 77.\n", |
526 |
| - "@> Row occupancy refinement reduced number of rows from 1768 to 1728 in 0.01s.\n", |
527 |
| - "@> Sequence identity refinement reduced number of rows from 1728 to 668 in 0.16s.\n", |
528 |
| - "@> Mutual information matrix was calculated in 0.02s.\n", |
529 |
| - "Mapping SAV 'O00294 496 A T' to Pfam: 20%|██ | 1/5 [00:05<00:23]@> Pickle 'UniprotMap-O00238.pkl' saved.\n", |
| 492 | + "Mapping SAV 'O00294 496 A T' to Pfam: 0%| | 0/5 [00:00<?]@> Pickle 'UniprotMap-O00238.pkl' saved.\n", |
530 | 493 | "@> Pickle 'UniprotMap-O00294.pkl' recovered.\n",
|
531 |
| - "@> Retrieving Pfam search results: https://pfam.xfam.org/protein/O00294?output=xml\n", |
532 |
| - "@> Pfam search completed in 1.07s.\n", |
533 |
| - "@> Query 'O00294' matched 1 Pfam families.\n", |
534 |
| - "@> Processing PF01167...\n", |
535 |
| - "@> Pfam MSA for PF01167 is written as PF01167_full.sth.\n", |
536 |
| - "@> 2789 sequence(s) with 1659 residues were parsed in 0.02s.\n", |
537 |
| - "@> Number of columns in MSA reduced to 241.\n", |
538 |
| - "@> Row occupancy refinement reduced number of rows from 2789 to 2108 in 0.00s.\n", |
539 |
| - "@> Sequence identity refinement reduced number of rows from 2108 to 1102 in 1.07s.\n", |
540 |
| - "@> Mutual information matrix was calculated in 0.24s.\n", |
541 |
| - "Mapping SAV 'P01112 58 T R' to Pfam: 40%|████ | 2/5 [00:14<00:20] @> Pickle 'UniprotMap-O00294.pkl' saved.\n", |
| 494 | + "Mapping SAV 'P01112 58 T R' to Pfam: 0%| | 0/5 [00:00<?] @> Pickle 'UniprotMap-O00294.pkl' saved.\n", |
542 | 495 | "@> Pickle 'UniprotMap-P01112.pkl' recovered.\n",
|
543 |
| - "Mapping SAV 'P01112 170 K I' to Pfam: 40%|████ | 2/5 [00:14<00:20]@> WARNING Unable to compute Pfam features: No Pfam domain for resid 170.\n", |
| 496 | + "Mapping SAV 'P01112 170 K I' to Pfam: 60%|██████ | 3/5 [00:00<00:00]@> WARNING Unable to compute Pfam features: No Pfam domain for resid 170.\n", |
544 | 497 | "@> Pickle 'UniprotMap-P01112.pkl' saved.\n",
|
545 |
| - "Mapping SAV 'P01112 170 K I' to Pfam: 100%|██████████| 5/5 [00:14<00:00]\n", |
546 |
| - "@> SAVs have been mapped on Pfam domains and sequence properties have been computed in 15.0s.\n", |
547 |
| - "@> Random Forest classifier imported in 12.6s.\n", |
548 |
| - "@> 3 predictions computed in 0.8s.\n", |
| 498 | + "Mapping SAV 'P01112 170 K I' to Pfam: 100%|██████████| 5/5 [00:00<00:00]\n", |
| 499 | + "@> SAVs have been mapped on Pfam domains and sequence properties have been computed in 0.4s.\n", |
| 500 | + "@> Random Forest classifier imported in 22.6s.\n", |
| 501 | + "@> 3 predictions computed in 0.5s.\n", |
549 | 502 | "@> Recovering EVmutation data...\n",
|
550 |
| - "@> EVmutation scores recovered in 0.1s.\n" |
| 503 | + "@> EVmutation scores recovered in 0.5s.\n" |
551 | 504 | ]
|
552 | 505 | },
|
553 | 506 | {
|
554 | 507 | "data": {
|
555 | 508 | "text/plain": [
|
556 |
| - "array([('O00294 496 A T', 'known_neu', 0.07533333, 0.02985149, 'neutral', 0.351, 'neutral', -3.1479, 'neutral'),\n", |
| 509 | + "array([('O00294 496 A T', 'known_neu', 0.086 , 0.03065213, 'neutral', 0.351, 'neutral', -3.1479, 'neutral'),\n", |
557 | 510 | " ('O00238 31 R H', 'new', nan, nan, '?', 0.219, 'neutral', -2.4718, 'neutral'),\n",
|
558 |
| - " ('P01112 58 T R', 'new', 0.9486667 , 0.9037717 , 'deleterious', 1. , 'deleterious', -9.7604, 'deleterious'),\n", |
559 |
| - " ('P01112 30 D E', 'new', 0.126 , 0.04657676, 'neutral', 0.001, 'neutral', 0.2196, 'neutral'),\n", |
| 511 | + " ('P01112 58 T R', 'new', 0.956 , 0.913615 , 'deleterious', 1. , 'deleterious', -9.7604, 'deleterious'),\n", |
| 512 | + " ('P01112 30 D E', 'new', 0.12666667, 0.044859 , 'neutral', 0.001, 'neutral', 0.2196, 'neutral'),\n", |
560 | 513 | " ('P01112 170 K I', 'new', nan, nan, '?', 0. , 'neutral', nan, '?')],\n",
|
561 | 514 | " dtype=[('SAV coords', '<U50'), ('training info', '<U12'), ('score', '<f4'), ('path. prob.', '<f4'), ('path. class', '<U12'), ('PolyPhen-2 score', '<f4'), ('PolyPhen-2 path. class', '<U12'), ('EVmutation score', '<f4'), ('EVmutation path. class', '<U12')])"
|
562 | 515 | ]
|
|
587 | 540 | "name": "python",
|
588 | 541 | "nbconvert_exporter": "python",
|
589 | 542 | "pygments_lexer": "ipython3",
|
590 |
| - "version": "3.7.5" |
| 543 | + "version": "3.7.4" |
591 | 544 | }
|
592 | 545 | },
|
593 | 546 | "nbformat": 4,
|
|
0 commit comments