Add slow path to fetch health pills at individual steps.

tensorflower-gardener · tensorflower-gardener · commit dc17f76fe438 · 2017-03-14T01:45:31.000-07:00
TensorBoard only samples steps, yet users want health pills at specific steps. This change makes the debugger plugin read events directly from disk when the user requests a specific step. This path is much slower (it could take minutes) than the alternative path of querying the multiplexer for sampled health pills.
Change: 150041439
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
@@ -28,6 +28,7 @@
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf.config_pb2 import RunMetadata
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.tensorboard.backend.event_processing import directory_watcher
@@ -116,7 +117,7 @@
 # The tag that values containing health pills have. Health pill data is stored
 # in tensors. In order to distinguish health pill values from scalar values, we
 # rely on how health pill values have this special tag value.
-_HEALTH_PILL_EVENT_TAG = '__health_pill__'
+HEALTH_PILL_EVENT_TAG = '__health_pill__'
 
 
 def IsTensorFlowEventsFile(path):
@@ -318,7 +319,7 @@ def _ProcessEvent(self, event):
       self._tagged_metadata[tag] = event.tagged_run_metadata.run_metadata
     elif event.HasField('summary'):
       for value in event.summary.value:
-        if value.HasField('tensor') and value.tag == _HEALTH_PILL_EVENT_TAG:
+        if value.HasField('tensor') and value.tag == HEALTH_PILL_EVENT_TAG:
           self._ProcessHealthPillSummary(value, event)
         else:
           for summary_type, summary_func in SUMMARY_TYPES.items():
@@ -341,7 +342,7 @@ def _ProcessHealthPillSummary(self, value, event):
       value: A summary_pb2.Summary.Value with a Tensor field.
       event: The event_pb2.Event containing that value.
     """
-    elements = np.fromstring(value.tensor.tensor_content, dtype=np.float64)
+    elements = tensor_util.MakeNdarray(value.tensor)
 
     # The node_name property of the value object is actually a watch key: a
     # combination of node name, output slot, and a suffix. We capture the
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
@@ -25,6 +25,7 @@
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import summary_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import constant_op
@@ -70,15 +71,13 @@ def AddScalar(self, tag, wall_time=0, step=0, value=0):
                 tag=tag, simple_value=value)]))
     self.AddEvent(event)
 
-  def AddHealthPill(self, wall_time, step, node_name, output_slot, elements):
-    event = event_pb2.Event()
-    event.wall_time = wall_time
-    event.step = step
-    value = event.summary.value.add()
-    # The node_name property is actually a watch key.
-    value.node_name = '%s:%d:DebugNumericSummary' % (node_name, output_slot)
-    value.tag = '__health_pill__'
-    value.tensor.tensor_shape.dim.add().size = len(elements)
+  def AddHealthPill(self, wall_time, step, op_name, output_slot, elements):
+    event = event_pb2.Event(step=step, wall_time=wall_time)
+    value = event.summary.value.add(
+        tag='__health_pill__',
+        node_name='%s:%d:DebugNumericSummary' % (op_name, output_slot))
+    value.tensor.tensor_shape.dim.add(size=len(elements))
+    value.tensor.dtype = types_pb2.DT_DOUBLE
     value.tensor.tensor_content = np.array(elements, dtype=np.float64).tobytes()
     self.AddEvent(event)
 
diff --git a/tensorflow/tensorboard/plugins/debugger/BUILD b/tensorflow/tensorboard/plugins/debugger/BUILD
@@ -15,7 +15,10 @@ py_library(
     srcs = ["debugger_plugin.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform",
+        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
+        "//tensorflow/tensorboard/backend/event_processing:event_file_loader",
         "//tensorflow/tensorboard/lib/python:http_util",
         "//tensorflow/tensorboard/plugins:base_plugin",
     ],
diff --git a/tensorflow/tensorboard/plugins/debugger/debugger_plugin.py b/tensorflow/tensorboard/plugins/debugger/debugger_plugin.py
@@ -19,11 +19,17 @@
 from __future__ import print_function
 
 import collections
+import glob
 import json
+import os
+import re
 
 from werkzeug import wrappers
 
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.tensorboard.backend.event_processing import event_accumulator
+from tensorflow.tensorboard.backend.event_processing import event_file_loader
 from tensorflow.tensorboard.lib.python import http_util
 from tensorflow.tensorboard.plugins import base_plugin
 
@@ -42,6 +48,13 @@
 # The default run to retrieve health pills for.
 _DEFAULT_RUN = '.'
 
+# The POST key of HEALTH_PILLS_ROUTE for the specific step to retrieve health
+# pills for.
+_STEP_POST_KEY = 'step'
+
+# A glob pattern for files containing debugger-related events.
+_DEBUGGER_EVENTS_GLOB_PATTERN = 'events.debugger*'
+
 
 class DebuggerPlugin(base_plugin.TBPlugin):
   """TensorFlow Debugger plugin. Receives requests for debugger-related data.
@@ -58,17 +71,18 @@ def __init__(self, event_multiplexer):
     """
     self._event_multiplexer = event_multiplexer
 
-  def get_plugin_apps(self, unused_run_paths, unused_logdir):
-    """Obtains a mapping between routes and handlers.
+  def get_plugin_apps(self, unused_run_paths, logdir):
+    """Obtains a mapping between routes and handlers. Stores the logdir.
 
     Args:
       unused_run_paths: A mapping between run paths and handlers.
-      unused_logdir: The logdir string - the directory of events files.
+      logdir: The logdir string - the directory of events files.
 
     Returns:
       A mapping between routes and handlers (functions that respond to
       requests).
     """
+    self._logdir = logdir
     return {
         _HEALTH_PILLS_ROUTE: self._serve_health_pills_handler,
     }
@@ -77,15 +91,27 @@ def get_plugin_apps(self, unused_run_paths, unused_logdir):
   def _serve_health_pills_handler(self, request):
     """A (wrapped) werkzeug handler for serving health pills.
 
-    Accepts POST requests and responds with health pills. Specifically, the
-    handler expects a required "node_names" and an optional "run" POST data key.
-    The value of the "node_names" key should be a JSON-ified list of node names
-    for which the client would like to request health pills. The value of the
-    "run" key (which defaults to ".") should be the run to retrieve health pills
-    for. This data is sent via POST (not GET) because URL length is limited.
+    Accepts POST requests and responds with health pills. The request accepts
+    several POST parameters:
+
+      node_names: (required string) A JSON-ified list of node names for which
+          the client would like to request health pills.
+      run: (optional string) The run to retrieve health pills for. Defaults to
+          '.'. This data is sent via POST (not GET) since URL length is limited.
+      step: (optional integer): The session run step for which to
+          retrieve health pills. If provided, the handler reads the health pills
+          of that step from disk (which is slow) and produces a response with
+          only health pills at that step. If not provided, the handler returns a
+          response with health pills at all steps sampled by the event
+          multiplexer (the fast path). The motivation here is that, sometimes,
+          one desires to examine health pills at a specific step (to say find
+          the first step that causes a model to blow up with NaNs).
+          get_plugin_apps must be called before this slower feature is used
+          because that method passes the logdir (directory path) to this plugin.
 
     This handler responds with a JSON-ified object mapping from node names to a
-    list of health pill event objects, each of which has these properties.
+    list of health pill event objects (only those at the requested step, if a
+    step was provided), each of which has these properties.
 
     {
         'wall_time': float,
@@ -112,7 +138,7 @@ def _serve_health_pills_handler(self, request):
 
     if _NODE_NAMES_POST_KEY not in request.form:
       logging.error(
-          'The %s POST key was not found in the request for health pills.',
+          'The %r POST key was not found in the request for health pills.',
           _NODE_NAMES_POST_KEY)
       return wrappers.Response(status=400)
 
@@ -123,30 +149,197 @@ def _serve_health_pills_handler(self, request):
       # Different JSON libs raise different exceptions, so we just do a
       # catch-all here. This problem is complicated by how Tensorboard might be
       # run in many different environments, as it is open-source.
-      logging.error(
-          'Could not decode node name JSON string %s: %s',
-          jsonified_node_names, e)
+      logging.error('Could not decode node name JSON string %r: %s',
+                    jsonified_node_names, e)
       return wrappers.Response(status=400)
 
     if not isinstance(node_names, list):
-      logging.error(
-          '%s is not a JSON list of node names:', jsonified_node_names)
+      logging.error('%r is not a JSON list of node names:',
+                    jsonified_node_names)
       return wrappers.Response(status=400)
 
-    mapping = collections.defaultdict(list)
     run = request.form.get(_RUN_POST_KEY, _DEFAULT_RUN)
+    step_string = request.form.get(_STEP_POST_KEY, None)
+    if step_string is None:
+      # Use all steps sampled by the event multiplexer (Relatively fast).
+      mapping = self._obtain_sampled_health_pills(run, node_names)
+    else:
+      # Read disk to obtain the health pills for that step (Relatively slow).
+      # Determine the directory of events files to read: the logdir itself for
+      # the default run, or the run's subdirectory of the logdir otherwise.
+      events_directory = self._logdir
+      if run != _DEFAULT_RUN:
+        # Use the directory for the specific run.
+        events_directory = os.path.join(events_directory, run)
+
+      step = int(step_string)
+      try:
+        mapping = self._obtain_health_pills_at_step(
+            events_directory, node_names, step)
+      except IOError as error:
+        logging.error(
+            'Error retrieving health pills for step %d: %s', step, error)
+        return wrappers.Response(status=404)
+
+    # Convert event_accumulator.HealthPillEvents to JSON-able dicts.
+    jsonable_mapping = {}
+    for node_name, events in mapping.items():
+      jsonable_mapping[node_name] = [e._asdict() for e in events]
+    return http_util.Respond(request, jsonable_mapping, 'application/json')
+
+  def _obtain_sampled_health_pills(self, run, node_names):
+    """Obtains the health pills for a run sampled by the event multiplexer.
+
+    This is much faster than the alternative path of reading health pills from
+    disk.
+
+    Args:
+      run: The run to fetch health pills for.
+      node_names: A list of node names for which to retrieve health pills.
+
+    Returns:
+      A dictionary mapping from node name to a list of
+      event_accumulator.HealthPillEvents.
+    """
+    mapping = {}
     for node_name in node_names:
       try:
-        pill_events = self._event_multiplexer.HealthPills(run, node_name)
-        for pill_event in pill_events:
-          mapping[node_name].append({
-              'wall_time': pill_event[0],
-              'step': pill_event[1],
-              'node_name': pill_event[2],
-              'output_slot': pill_event[3],
-              'value': pill_event[4],
-          })
+        mapping[node_name] = self._event_multiplexer.HealthPills(run, node_name)
       except KeyError:
-        logging.info('No health pills found for node %s.', node_name)
+        logging.info('No health pills found for node %r.', node_name)
+        continue
+
+    return mapping
+
+  def _obtain_health_pills_at_step(self, events_directory, node_names, step):
+    """Reads disk to obtain the health pills for a run at a specific step.
+
+    This could be much slower than the alternative path of just returning all
+    health pills sampled by the event multiplexer. It could take tens of minutes
+    to complete this call for large graphs for big step values (in the
+    thousands).
+
+    Args:
+      events_directory: The directory containing events for the desired run.
+      node_names: A list of node names for which to retrieve health pills.
+      step: The step to obtain health pills for.
+
+    Returns:
+      A dictionary mapping from node name to a list of health pill objects (see
+      docs for _serve_health_pills_handler for properties of those objects).
+
+    Raises:
+      IOError: If no files with health pill events could be found.
+    """
+    # Obtain all files with debugger-related events.
+    pattern = os.path.join(events_directory, _DEBUGGER_EVENTS_GLOB_PATTERN)
+    file_paths = glob.glob(pattern)
+
+    if not file_paths:
+      raise IOError(
+          'No events files found that matches the pattern %r.' % pattern)
+
+    # Sort by name (and thus by timestamp).
+    file_paths.sort()
+
+    mapping = collections.defaultdict(list)
+    node_name_set = frozenset(node_names)
+
+    for file_path in file_paths:
+      should_stop = self._process_health_pill_event(
+          node_name_set, mapping, step, file_path)
+      if should_stop:
+        break
+
+    return mapping
+
+  def _process_health_pill_event(self, node_name_set, mapping, target_step,
+                                 file_path):
+    """Reads one events file and collects health pills at the target step.
+
+    Health pills created from matching events are appended to the mapping.
+
+    Args:
+      node_name_set: A set of node names that are relevant.
+      mapping: The mapping from node name to event_accumulator.HealthPillEvents.
+          This object may be destructively modified.
+      target_step: The target step at which to obtain health pills.
+      file_path: The path to the file with health pill events.
+
+    Returns:
+      True if an event with a step beyond target_step was encountered, so the
+      caller should stop reading further events; False otherwise.
+    """
+    events_loader = event_file_loader.EventFileLoader(file_path)
+    for event in events_loader.Load():
+      if not event.HasField('summary'):
+        logging.warning('An event in a debugger events file lacks a summary.')
+        continue
+
+      if event.step < target_step:
+        # This event precedes the target step. Most events are expected to
+        # be eliminated here, before any per-value processing is done.
+        # (The summary-presence check above still runs first.)
+        continue
+
+      if event.step > target_step:
+        # We have passed the relevant step. No need to read more events.
+        return True
+
+      for value in event.summary.value:
+        # _process_health_pill_value returns a HealthPillEvent for a relevant
+        # value, or None (skipped below) otherwise. The wall time is passed
+        # through from the event unchanged.
+        health_pill = self._process_health_pill_value(
+            node_name_set, event.wall_time, event.step, value)
+        if not health_pill:
+          continue
+        mapping[health_pill.node_name].append(health_pill)
+
+    # No event past the target step was seen in this file; keep reading.
+    return False
+
+  def _process_health_pill_value(self, node_name_set, wall_time, step, value):
+    """Creates a HealthPillEvent from a summary value, if it is relevant.
+
+    Args:
+      node_name_set: A set of node names that are relevant.
+      wall_time: The wall time in seconds.
+      step: The session run step of the event.
+      value: The summary value expected to hold a health pill tensor.
+
+    Returns:
+      An event_accumulator.HealthPillEvent. Or None if one could not be created.
+    """
+    if not value.HasField('tensor'):
+      logging.warning(
+          'An event in a debugger events file lacks a tensor value.')
+      return None
+
+    if value.tag != event_accumulator.HEALTH_PILL_EVENT_TAG:
+      logging.warning(
+          ('A debugger-related event lacks the %r tag. It instead has '
+           'the %r tag.'), event_accumulator.HEALTH_PILL_EVENT_TAG, value.tag)
+      return None
+
+    match = re.match(r'^(.*):(\d+):DebugNumericSummary$', value.node_name)
+    if not match:
+      logging.warning(
+          ('A event with a health pill has an invalid watch, (i.e., an '
+           'unexpected debug op): %r'), value.node_name)
+      return None
+
+    node_name = match.group(1)
+    if node_name not in node_name_set:
+      # This event is not relevant.
+      return None
+
+    # Build the health pill from the parsed watch key: group 1 is the node
+    # name and group 2 is the output slot. The tensor is converted to an
+    # ndarray and then to a list.
+    return event_accumulator.HealthPillEvent(
+        wall_time=wall_time,
+        step=step,
+        node_name=node_name,
+        output_slot=int(match.group(2)),
+        value=list(tensor_util.MakeNdarray(value.tensor)))
diff --git a/tensorflow/tensorboard/plugins/debugger/debugger_plugin_test.py b/tensorflow/tensorboard/plugins/debugger/debugger_plugin_test.py