19
19
from __future__ import print_function
20
20
21
21
import collections
22
+ import glob
22
23
import json
24
+ import os
25
+ import re
23
26
24
27
from werkzeug import wrappers
25
28
29
+ from tensorflow .python .framework import tensor_util
26
30
from tensorflow .python .platform import tf_logging as logging
31
+ from tensorflow .tensorboard .backend .event_processing import event_accumulator
32
+ from tensorflow .tensorboard .backend .event_processing import event_file_loader
27
33
from tensorflow .tensorboard .lib .python import http_util
28
34
from tensorflow .tensorboard .plugins import base_plugin
29
35
42
48
# The default run to retrieve health pills for.
43
49
_DEFAULT_RUN = '.'
44
50
51
+ # The POST key of HEALTH_PILLS_ROUTE for the specific step to retrieve health
52
+ # pills for.
53
+ _STEP_POST_KEY = 'step'
54
+
55
+ # A glob pattern for files containing debugger-related events.
56
+ _DEBUGGER_EVENTS_GLOB_PATTERN = 'events.debugger*'
57
+
45
58
46
59
class DebuggerPlugin (base_plugin .TBPlugin ):
47
60
"""TensorFlow Debugger plugin. Receives requests for debugger-related data.
@@ -58,17 +71,18 @@ def __init__(self, event_multiplexer):
58
71
"""
59
72
self ._event_multiplexer = event_multiplexer
60
73
61
def get_plugin_apps(self, unused_run_paths, logdir):
  """Builds the route table for this plugin and remembers the logdir.

  The logdir is stored on the instance as `_logdir`.

  Args:
    unused_run_paths: A mapping between run paths and handlers. Not used.
    logdir: The logdir string - the directory of events files.

  Returns:
    A dict mapping each route to the handler (a function that responds to
    requests) serving it.
  """
  # Record the logdir before handing out any handlers.
  self._logdir = logdir

  route_to_handler = {}
  route_to_handler[_HEALTH_PILLS_ROUTE] = self._serve_health_pills_handler
  return route_to_handler
@@ -77,15 +91,27 @@ def get_plugin_apps(self, unused_run_paths, unused_logdir):
77
91
def _serve_health_pills_handler (self , request ):
78
92
"""A (wrapped) werkzeug handler for serving health pills.
79
93
80
- Accepts POST requests and responds with health pills. Specifically, the
81
- handler expects a required "node_names" and an optional "run" POST data key.
82
- The value of the "node_names" key should be a JSON-ified list of node names
83
- for which the client would like to request health pills. The value of the
84
- "run" key (which defaults to ".") should be the run to retrieve health pills
85
- for. This data is sent via POST (not GET) because URL length is limited.
94
+ Accepts POST requests and responds with health pills. The request accepts
95
+ several POST parameters:
96
+
97
+ node_names: (required string) A JSON-ified list of node names for which
98
+ the client would like to request health pills.
99
+ run: (optional string) The run to retrieve health pills for. Defaults to
100
+ '.'. This data is sent via POST (not GET) since URL length is limited.
101
+ step: (optional integer) The session run step for which to
102
+ retrieve health pills. If provided, the handler reads the health pills
103
+ of that step from disk (which is slow) and produces a response with
104
+ only health pills at that step. If not provided, the handler returns a
105
+ response with health pills at all steps sampled by the event
106
+ multiplexer (the fast path). The motivation here is that, sometimes,
107
+ one desires to examine health pills at a specific step (say, to find
108
+ the first step that causes a model to blow up with NaNs).
109
+ get_plugin_apps must be called before this slower feature is used
110
+ because that method passes the logdir (directory path) to this plugin.
86
111
87
112
This handler responds with a JSON-ified object mapping from node names to a
88
- list of health pill event objects, each of which has these properties.
113
+ list (of size 1) of health pill event objects, each of which has these
114
+ properties.
89
115
90
116
{
91
117
'wall_time': float,
@@ -112,7 +138,7 @@ def _serve_health_pills_handler(self, request):
112
138
113
139
if _NODE_NAMES_POST_KEY not in request .form :
114
140
logging .error (
115
- 'The %s POST key was not found in the request for health pills.' ,
141
+ 'The %r POST key was not found in the request for health pills.' ,
116
142
_NODE_NAMES_POST_KEY )
117
143
return wrappers .Response (status = 400 )
118
144
@@ -123,30 +149,197 @@ def _serve_health_pills_handler(self, request):
123
149
# Different JSON libs raise different exceptions, so we just do a
124
150
# catch-all here. This problem is complicated by how Tensorboard might be
125
151
# run in many different environments, as it is open-source.
126
- logging .error (
127
- 'Could not decode node name JSON string %s: %s' ,
128
- jsonified_node_names , e )
152
+ logging .error ('Could not decode node name JSON string %r: %s' ,
153
+ jsonified_node_names , e )
129
154
return wrappers .Response (status = 400 )
130
155
131
156
if not isinstance (node_names , list ):
132
- logging .error (
133
- '%s is not a JSON list of node names:' , jsonified_node_names )
157
+ logging .error ('%r is not a JSON list of node names:' ,
158
+ jsonified_node_names )
134
159
return wrappers .Response (status = 400 )
135
160
136
- mapping = collections .defaultdict (list )
137
161
run = request .form .get (_RUN_POST_KEY , _DEFAULT_RUN )
162
+ step_string = request .form .get (_STEP_POST_KEY , None )
163
+ if step_string is None :
164
+ # Use all steps sampled by the event multiplexer (Relatively fast).
165
+ mapping = self ._obtain_sampled_health_pills (run , node_names )
166
+ else :
167
+ # Read disk to obtain the health pills for that step (Relatively slow).
168
+ # Make sure that the directory for the run exists.
169
+ # Determine the directory of events file to read.
170
+ events_directory = self ._logdir
171
+ if run != _DEFAULT_RUN :
172
+ # Use the directory for the specific run.
173
+ events_directory = os .path .join (events_directory , run )
174
+
175
+ step = int (step_string )
176
+ try :
177
+ mapping = self ._obtain_health_pills_at_step (
178
+ events_directory , node_names , step )
179
+ except IOError as error :
180
+ logging .error (
181
+ 'Error retrieving health pills for step %d: %s' , step , error )
182
+ return wrappers .Response (status = 404 )
183
+
184
+ # Convert event_accumulator.HealthPillEvents to JSON-able dicts.
185
+ jsonable_mapping = {}
186
+ for node_name , events in mapping .items ():
187
+ jsonable_mapping [node_name ] = [e ._asdict () for e in events ]
188
+ return http_util .Respond (request , jsonable_mapping , 'application/json' )
189
+
190
def _obtain_sampled_health_pills(self, run, node_names):
  """Collects the health pills the event multiplexer holds for a run.

  Each node is looked up through the event multiplexer's HealthPills method.
  Nodes for which that lookup raises KeyError are logged and left out of the
  result.

  Args:
    run: The run to fetch health pills for.
    node_names: A list of node names for which to retrieve health pills.

  Returns:
    A dictionary mapping from node name to whatever the multiplexer returned
    for that node (per the surrounding docs, a list of
    event_accumulator.HealthPillEvents).
  """
  pills_by_node = {}
  for name in node_names:
    try:
      pills = self._event_multiplexer.HealthPills(run, name)
    except KeyError:
      # A missing node is not an error; the caller just gets no entry for it.
      logging.info('No health pills found for node %r.', name)
    else:
      pills_by_node[name] = pills

  return pills_by_node
213
+
214
def _obtain_health_pills_at_step(self, events_directory, node_names, step):
  """Reads disk to obtain the health pills for a run at a specific step.

  Every file in `events_directory` matching the debugger events glob pattern
  is scanned in sorted-name order until one of them reports that the target
  step has been passed.

  Args:
    events_directory: The directory containing events for the desired run.
    node_names: A list of node names for which to retrieve health pills.
    step: The step to obtain health pills for.

  Returns:
    A collections.defaultdict(list) mapping from node name to the list of
    health pills collected for that node at `step`.

  Raises:
    IOError: If no files with health pill events could be found.
  """
  # Obtain all files with debugger-related events.
  pattern = os.path.join(events_directory, _DEBUGGER_EVENTS_GLOB_PATTERN)

  # Sort by name; presumably this is chronological because the file names
  # embed a timestamp (not verifiable from this method alone).
  file_paths = sorted(glob.glob(pattern))

  if not file_paths:
    # Format the message eagerly. Passing `pattern` as a second constructor
    # argument would make IOError treat the pair as (errno, strerror) and
    # leave the '%r' placeholder unformatted.
    raise IOError(
        'No events files found that matches the pattern %r.' % pattern)

  mapping = collections.defaultdict(list)
  node_name_set = frozenset(node_names)

  for file_path in file_paths:
    should_stop = self._process_health_pill_event(
        node_name_set, mapping, step, file_path)
    if should_stop:
      # Events past the target step were seen; later files are irrelevant.
      break

  return mapping
255
+
256
def _process_health_pill_event(self, node_name_set, mapping, target_step,
                               file_path):
  """Scans one events file and gathers the health pills at the target step.

  Events without a summary are logged and skipped, and events before the
  target step are skipped. Each summary value of an event at the target step
  is turned into a health pill (when possible) and appended to `mapping`
  under the pill's node name.

  Args:
    node_name_set: A set of node names that are relevant.
    mapping: The mapping from node name to a list of health pills. It is
      modified in place.
    target_step: The target step at which to obtain health pills.
    file_path: The path to the file with health pill events.

  Returns:
    True as soon as an event past the target step is seen (no further events
    are relevant); False if the whole file was read without passing it.
  """
  loader = event_file_loader.EventFileLoader(file_path)
  for event in loader.Load():
    if not event.HasField('summary'):
      logging.warning('An event in a debugger events file lacks a summary.')
    elif event.step < target_step:
      # Not yet at the relevant step. Most events fall into this bucket.
      pass
    elif event.step > target_step:
      # The relevant step is behind us; tell the caller to stop reading.
      return True
    else:
      for summary_value in event.summary.value:
        # The wall time is passed through unchanged from the event.
        pill = self._process_health_pill_value(
            node_name_set, event.wall_time, event.step, summary_value)
        if pill:
          mapping[pill.node_name].append(pill)

  # The file was exhausted without passing the target step; keep reading.
  return False
301
+
302
def _process_health_pill_value(self, node_name_set, wall_time, step, value):
  """Builds a health pill event from a single summary value, if it is usable.

  A value is rejected (with a warning) when it has no tensor, when its tag is
  not the health pill tag, or when its node name does not have the form
  '<node>:<slot>:DebugNumericSummary'. It is rejected silently when the node
  is not one of the requested nodes.

  Args:
    node_name_set: A set of node names that are relevant.
    wall_time: The wall time in seconds.
    step: The session run step of the event.
    value: The health pill value.

  Returns:
    An event_accumulator.HealthPillEvent. Or None if one could not be created.
  """
  if not value.HasField('tensor'):
    logging.warning(
        'An event in a debugger events file lacks a tensor value.')
    return None

  expected_tag = event_accumulator.HEALTH_PILL_EVENT_TAG
  if value.tag != expected_tag:
    logging.warning(
        'A debugger-related event lacks the %r tag. '
        'It instead has the %r tag.',
        expected_tag, value.tag)
    return None

  # The watch key is '<node name>:<output slot>:DebugNumericSummary'.
  watch = re.match(r'^(.*):(\d+):DebugNumericSummary$', value.node_name)
  if watch is None:
    logging.warning(
        'A event with a health pill has an invalid watch, '
        '(i.e., an unexpected debug op): %r',
        value.node_name)
    return None

  watched_node, slot_text = watch.groups()
  if watched_node not in node_name_set:
    # The client did not ask about this node.
    return None

  return event_accumulator.HealthPillEvent(
      wall_time=wall_time,
      step=step,
      node_name=watched_node,
      output_slot=int(slot_text),
      value=list(tensor_util.MakeNdarray(value.tensor)))
0 commit comments