Skip to content
This repository was archived by the owner on Jul 11, 2020. It is now read-only.

Commit 4e14be3

Browse files
committed
Added MultiBLAST.py and .gitignore
1 parent 8ff433c commit 4e14be3

File tree

2 files changed

+351
-0
lines changed

2 files changed

+351
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Pycharm files
2+
.idea/

MultiBLAST.py

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
"""
5+
MultiBLAST
6+
Sequentially blasts every fasta file (.faa or .fas) in a folder against a local database
7+
8+
DEPENDENCIES:
9+
- Python 2.7
10+
- Blast+ executables (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
11+
- LINUX (tested and developed on Ubuntu 14.0.4, but will most likely work on WINDOWS and MAC too)
12+
13+
HOW TO RUN:
14+
- Organize your files as follows on a folder:
15+
- MultiBLAST.py
16+
- database files (obtained via makeblastdb, with 'makeblastdb -in [yourGenes].pfasta -dbtype prot')
17+
- folder with fasta files (.faa or .fas) to blast (must be the only existing folder in the directory)
18+
- Run MultiBLAST.py from the command-line as follows 'python MultiBLAST.py [-t [#]]'
19+
- For more help, type 'python MultiBLAST.py -h'
20+
21+
ENSURES:
22+
- Each fasta file will be blasted against the given database
23+
- Each blast output is created inside the fasta folder
24+
"""
25+
26+
import os
27+
import argparse
28+
from time import time, strftime, localtime
29+
from sys import exit
30+
31+
__author__ = 'Pedro HC David, https://github.com/Kronopt'
32+
__credits__ = ['Pedro HC David']
33+
__version__ = '1.0'
34+
__date__ = '00:17h, 21/10/2016'
35+
__status__ = 'Finished'
36+
37+
38+
def blast(threads):
    """
    Entry point: locates the inputs in the working directory and blasts them

    PARAMETERS:
        threads : int/str
            Number of CPU threads to use

    REQUIRES:
        threads must be a number >= 1

    ENSURES:
        Each fasta file found is blasted against the database found
    """

    # Working directory path and its sorted listing
    working_dir, entries = root_files()

    # Fasta folder name, database name and the list of fasta files
    # (each helper also prints what it found)
    fasta_dir_name = fasta_folder(entries)
    database = db_file(entries)
    queries = fasta_files_folder(working_dir, fasta_dir_name)

    start_blast_text()

    # Reference points for the final report: elapsed-time timer and
    # human-readable start time
    timer_start = time()
    started_at = strftime('%d/%m/%y %H:%M:%S', localtime())

    # Main loop, followed by the closing report
    loop_blast(queries, working_dir, fasta_dir_name, database, threads)
    end_blast_text(timer_start, started_at)
71+
72+
73+
def exit_script():
    """
    Prints the exit banner and terminates the script

    ENSURES:
        The banner is printed and SystemExit is raised with exit status 1.
        Every caller in this script invokes it right after printing an
        'ERROR:' message, so a non-zero status lets shells and pipelines
        detect the failure.
    """

    for banner_line in ('', '*****', 'EXITING SCRIPT', '*****'):
        print(banner_line)

    # sys.exit with no argument would report success (status 0)
    exit(1)
83+
84+
85+
# -----------------------------------------------------
86+
# FILES AND FOLDERS SECTION
87+
# -----------------------------------------------------
88+
89+
90+
def root_files():
    """
    Lists the current working directory (ROOT)

    ENSURES:
        Tuple with the path of the current working directory and a sorted
        list with the name of every file/folder inside it
    """

    cwd = os.getcwd()
    return cwd, sorted(os.listdir(cwd))
100+
101+
102+
def fasta_folder(root_list):
    """
    Name of the folder containing the fasta files (.faa or .fas)
    Prints the folder to the screen

    PARAMETERS:
        root_list : list of str
            Names (or paths) of the entries of the root directory

    REQUIRES:
        Relative names in root_list are resolved against the current
        working directory (os.path.isdir is applied to them as given)

    ENSURES:
        The single entry of root_list that is a directory, exactly as it
        was given.
        If none or more than one directory is found, an error is printed
        and the script exits through exit_script()
    """

    print('')

    # A list (not filter()) so that len() and indexing work on both
    # Python 2 and Python 3
    folders = [entry for entry in root_list if os.path.isdir(entry)]
    if len(folders) != 1:
        print('ERROR: None or more than one folder was found')
        exit_script()

    print('FASTA FOLDER:')
    print('     ' + folders[0])
    print('')

    return folders[0]
130+
131+
132+
def db_file(root_list):
    """
    Database file
    Prints the name of the database to the screen

    PARAMETERS:
        root_list : list of str
            Names of the entries of the root directory

    REQUIRES:
        Exactly one entry of root_list ends with '.pfasta'

    ENSURES:
        The single entry of root_list ending with '.pfasta', exactly as it
        was given.
        If none or more than one such entry is found, an error is printed
        and the script exits through exit_script()
    """

    # A list (not filter()) so that len() and indexing work on both
    # Python 2 and Python 3
    candidates = [entry for entry in root_list if entry.endswith('.pfasta')]
    if len(candidates) != 1:
        print('ERROR: None or more than one database file was found')
        exit_script()

    print('DB FILE:')
    print('     ' + candidates[0])
    print('')

    return candidates[0]
158+
159+
160+
def fasta_files_folder(root_folder, folder_root_dir):
    """
    Fasta files
    Prints every file name to the screen

    PARAMETERS:
        root_folder : str
            Root folder's path
        folder_root_dir : str
            Fasta file folder's name

    REQUIRES:
        root_folder must be a valid directory
        folder_root_dir must be the name of a folder inside root_folder

    ENSURES:
        Sorted list with the names of the regular files inside the fasta
        folder that end with '.faa' or '.fas' (directories with such a
        name are skipped). The list is empty when there are none
    """

    folder_path = os.path.join(root_folder, folder_root_dir)

    # sorted() always returns a real list, on Python 2 and Python 3 alike
    fasta_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith(('.faa', '.fas'))
        and os.path.isfile(os.path.join(folder_path, name)))

    print('FASTA FILES:')
    for faa_file in fasta_files:
        print('     ' + faa_file)
    print('')

    return fasta_files
190+
191+
192+
# -----------------------------------------------------
193+
# blast SECTION
194+
# -----------------------------------------------------
195+
196+
197+
def start_blast_text():
    """
    Prints the banner shown right before the blasts begin
    """

    # One blank line, the banner, then two blank lines
    for banner_line in ('', '*****', 'STARTING BLASTS', '*****', '', ''):
        print(banner_line)
208+
209+
210+
def loop_blast(fasta_files, root_folder, folder_root_dir, db, threads):
    """
    Main code for blasting each fasta file
    Prints information on each successful blast

    PARAMETERS:
        fasta_files : list of str
            List of fasta files
        root_folder : str
            Path to root folder
        folder_root_dir : str
            Fasta folder's name
        db : str
            Database name
        threads : str/int
            Number of CPU threads to use

    REQUIRES:
        fasta_files must have a list of existing fasta files
        root_folder must be a valid directory
        folder_root_dir must be the name of an existing folder
        db must be the name of the existing database
        threads must be a number >= 1

    ENSURES:
        Blast of each fasta file against the local specified database.
        Each output file, named '<fasta name>__<db name>_OUT', is written
        inside the fasta folder.
        If blastp cannot be started or ends with a non-zero status, an
        error is printed and the script exits through exit_script()
    """

    # Function-scope import: the module itself does not import subprocess
    import subprocess

    fasta_dir = os.path.join(root_folder, folder_root_dir)
    db_label = os.path.splitext(db)[0]
    total = len(fasta_files)

    for count, fasta_file in enumerate(fasta_files, 1):

        # Timer, START
        start = time()

        print('------------------')
        print('blast %d out of %d' % (count, total))
        print('BLASTING:')
        print('     ' + fasta_file)

        out_file_name = os.path.splitext(fasta_file)[0] + '__' + db_label + '_OUT'

        # Argument list instead of a shell string: names containing spaces
        # or shell metacharacters reach blastp untouched.
        # The output goes straight into the fasta folder, so nothing has
        # to be moved afterwards
        command = ['blastp',
                   '-out', os.path.join(fasta_dir, out_file_name),
                   '-outfmt', '6',
                   '-query', os.path.join(fasta_dir, fasta_file),
                   '-db', db,
                   '-num_threads', str(threads)]

        try:
            status = subprocess.call(command)
        except OSError:
            # Raised when the blastp executable cannot be started at all
            print('')
            print('ERROR: Could not run NCBI blast+ program blastp')
            exit_script()

        if status != 0:
            print('')
            print('ERROR: blastp exited with status %s' % status)
            exit_script()

        # Timer, END
        end = time()

        days, hours, minutes, seconds = sec_to_hours(end - start)

        print('')
        print('TIME ELAPSED:')
        elapsed = '%sh %sm %ss' % (hours, minutes, seconds)
        if days != 0:
            elapsed = '%sd  %s' % (days, elapsed)
        print(elapsed)
        print('')
278+
279+
280+
def end_blast_text(timer_start, program_start):
    """
    Prints the closing report of the whole run

    PARAMETERS:
        timer_start : float
            Value obtained from time.time() when the run started
        program_start : str
            Start time in text format

    ENSURES:
        Start/end time, total duration and the end banner are printed to
        the screen
    """

    elapsed = sec_to_hours(time() - timer_start)
    finished_at = strftime('%d/%m/%y %H:%M:%S', localtime())

    # Hours, minutes and seconds are always shown; days only when non-zero
    pieces = [str(amount) + unit
              for amount, unit in zip(elapsed[1:], ('h', 'm', 's'))]
    if elapsed[0] != 0:
        pieces.insert(0, str(elapsed[0]) + 'd ')

    report = (
        '------------------',
        'TIME OF START/END:',
        program_start,
        finished_at,
        '',
        'DURATION:',
        ' '.join(pieces),
        '',
        '',
        '*****',
        'END OF BLASTS',
        '*****',
        '',
    )
    for report_line in report:
        print(report_line)
312+
313+
314+
def sec_to_hours(seconds):
    """
    Converts seconds into days, hours, minutes and seconds

    PARAMETERS:
        seconds : int/str/float
            Number of seconds (anything accepted by int(); fractions of a
            second are dropped)

    REQUIRES:
        seconds must be a number >= 0

    ENSURES:
        Tuple of ints (days, hours, minutes, seconds)
    """

    total = int(seconds)

    # divmod keeps every component an int on Python 2 and Python 3 alike
    # ('/' would produce floats on Python 3)
    total_minutes, secs = divmod(total, 60)
    total_hours, minutes = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)

    return days, hours, minutes, secs
337+
338+
339+
if __name__ == '__main__':
    # Command-line entry point: parses the optional thread count and starts
    # the blasts
    parser = argparse.ArgumentParser(description='Blasts fasta files (.faa or .fas) to a local database. '
                                                 'Organize your files so that the root folder has only: [1] this '
                                                 'script (MultiBLAST.py); [2] all the database files (obtained via '
                                                 '"makeblastdb -in [yourGenes].pfasta -dbtype prot"); [3] folder with '
                                                 'fasta files (.faa or .fas) to blast (must be the only folder '
                                                 'available)')
    parser.add_argument('-t', metavar='#', nargs='?', default=1, type=int, const=1, help='Number of CPU threads')
    arguments = parser.parse_args()

    # blast() requires threads >= 1; reject anything else with a usage
    # message before any work starts
    if arguments.t < 1:
        parser.error('-t must be an integer >= 1')

    blast(arguments.t)

0 commit comments

Comments
 (0)