Temp2 #4

Open · wants to merge 3 commits into main

1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -10,5 +10,6 @@ repos:
args: [--select, I, --fix]
# Run the linter.
- id: ruff
args: [--fix]
# Run the formatter.
- id: ruff-format
154 changes: 117 additions & 37 deletions spot_tools/examples/test_spot.py
@@ -3,6 +3,7 @@
import io
import os # Import the os package
import time
from copy import copy

import cv2
import numpy as np
@@ -21,7 +22,7 @@
stow_arm,
)
from spot_skills.door_utils import execute_open_door
from spot_skills.grasp_utils import object_grasp
from spot_skills.grasp_utils import object_grasp, object_grasp_YOLO, object_place
from spot_skills.navigation_utils import (
navigate_to_relative_pose,
)
@@ -104,18 +105,20 @@ def _run_gaze_test(spot) -> None:
relative_poses = [
# (resting_point, "Looking nowhere"),
# (relative_up, "Looking up"),
(relative_up, "Looking up"),
(math_helpers.Vec3(x=0, y=5, z=0), "Looking left"),
(math_helpers.Vec3(x=0, y=-5, z=0), "Looking right"),
# (relative_up, "Looking up"),
(math_helpers.Vec3(x=1.76, y=1, z=0.05), "Looking where Travis tells me to"),
(math_helpers.Vec3(x=1.29, y=-0.34, z=-0.54), "Looking right"),
# (math_helpers.Vec3(x=0, y=-0.5, ), "Looking -dy"),
# (math_helpers.Vec3(x=0.5, y=0, ), "Looking dx"),
# (math_helpers.Vec3(x=0.5, y=0, z=0), "Looking dx"),
# (math_helpers.Vec3(x=-0.5, y=0, ), "Looking -dx"),
# (math_helpers.Vec3(x=0, y=0, ), "Looking yaw"),
# (math_helpers.Vec3(x=0, y=0, ), "Looking -yaw"),
]
for relative_pose, msg in relative_poses:
print(msg)
spot.gaze_at_relative_pose(relative_pose)
gaze_at_relative_pose(spot, relative_pose)
input("Press enter when ready to move on")
stow_arm(spot)
input("Press enter when ready to move on")
# time.sleep(0.5)

@@ -177,6 +180,32 @@ def _run_grasp_test(spot) -> None:
pass


def _run_YOLO_grasp_test(spot):
open_gripper(spot)
relative_pose = math_helpers.Vec3(x=1, y=0, z=0)
gaze_at_relative_pose(spot, relative_pose)
time.sleep(0.2)

object_grasp_YOLO(
spot,
image_source="hand_color_image",
user_input=False,
semantic_class="wood block",
grasp_constraint=None,
debug=True,
)

open_gripper(spot)
stow_arm(spot)
close_gripper(spot)

pass


def _run_place_test(spot):
object_place(spot)


def _run_segment_test(spot):
open_gripper(spot)
relative_pose = math_helpers.Vec3(x=1, y=0, z=0)
@@ -237,6 +266,80 @@ def _run_open_door_test(spot, model_path, max_tries=2) -> None:
# # cv2.destroyAllWindows()


def _run_YOLOWorld_test(spot):
open_gripper(spot)
hand_image_request, hand_image = spot.get_image_RGB()
results = spot.yolo_model(hand_image)
object_to_pick_up = "wood block"
# Process results

annotated_hand_image = copy(hand_image)
# Show all of the detections
for r in results:
boxes = r.boxes # Bounding boxes
for box in boxes:
x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
class_id = int(box.cls[0])
confidence = float(box.conf[0])

# Draw bounding box and label
cv2.rectangle(annotated_hand_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
label = f"{r.names[class_id]} {confidence:.2f}"
cv2.putText(
annotated_hand_image,
label,
(x1, y1 - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
2,
)
# Display or save the annotated image
cv2.imshow("YOLO Output", annotated_hand_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

most_confident = copy(hand_image)
best_box = None
best_confidence = -1.0

for r in results:
boxes = r.boxes
for box in boxes:
class_id = int(box.cls[0])
class_name = r.names[class_id]
confidence = float(box.conf[0])

if class_name == object_to_pick_up and confidence > best_confidence:
best_confidence = confidence
best_box = box

if best_box:
x1, y1, x2, y2 = map(int, best_box.xyxy[0].tolist())
class_id = int(best_box.cls[0])

# Draw bounding box and label
cv2.rectangle(most_confident, (x1, y1), (x2, y2), (0, 255, 0), 2)
label = f"{r.names[class_id]} {best_confidence:.2f}"
cv2.putText(
most_confident,
label,
(x1, y1 - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
2,
)

# Display or save the annotated image
cv2.imshow("Most Confident Output", most_confident)
cv2.waitKey(0)
cv2.destroyAllWindows()

else:
raise Exception("No objects of semantic class found")


def encode_numpy_image_to_base64(image_np: np.ndarray, format: str = "PNG"):
pil_image = Image.fromarray(image_np)
buffer = io.BytesIO()
@@ -367,54 +470,31 @@ def save_images_constant_rate(spot, object_name, folder_name, rate=1.0):

args = parser.parse_args()

spot = Spot(username=args.username, password=args.password)
spot = Spot(ip=args.ip, username=args.username, password=args.password)
print(spot.id)
# assert False
# spot.set_estop()
# spot.take_lease()
spot.robot.power_on(timeout_sec=20)
spot.robot.time_sync.wait_for_sync()
# spot.stand()
# stow_arm(spot)

yoloworld_model_path = "/home/aaron/spot_tools/data/models/yolov8x-worldv2-door.pt"

# fl_img_response, fl_img = spot.get_image_RGB(view='frontleft_fisheye_image')
# fr_img_response, fr_img = spot.get_image_RGB(view='frontright_fisheye_image')

# spot.get_live_stitched_image()
# # image = spot.get_stitched_image(fl_img_response, fr_img_response)

# image = spot.get_stitched_image(jpeg_quality_percent=100, crop_image=True)
# print(image.shape)
# cv2.imshow("Stitched Image", image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# image = spot.get_stitched_image_RGB(fl_img_response, fr_img_response, crop_image=True)
# image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
# print(image.shape)
# cv2.imshow("Stitched Image", image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
# yoloworld_model_path = "/home/aaron/spot_tools/data/models/yolov8x-worldv2-door.pt"

# _run_open_door_test(spot, yoloworld_model_path)
# _run_walking_test(spot)
# _run_gaze_test(spot)
# _run_YOLO_grasp_test(spot)
_run_place_test(spot)
# _run_traj_test(spot)
# _run_grasp_test(spot)
# _run_segment_test(spot)
# _run_YOLOWorld_test(spot)
# spot.pitch_up()
# print(look_for_object(spot, 'bag'))

save_images_constant_rate(
spot,
"test_object",
"/home/aaron/spot_tools/spot_tools/data/object_images",
rate=1,
)

time.sleep(1)

# spot.stand()
# spot.sit()
spot.sit()
# spot.sit()
# spot.safe_power_off()
20 changes: 20 additions & 0 deletions spot_tools/src/language/test_VLM.py
@@ -105,6 +105,26 @@ def test_image_encoding(client):
print(completion.choices[0].message.content)


def test_enforced_output(client):
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]

completion = client.beta.chat.completions.parse(
model="gpt-4o-2024-08-06",
messages=[
{"role": "system", "content": "Extract the event information."},
{
"role": "user",
"content": "Alice and Bob are going to a science fair on Friday.",
},
],
response_format=CalendarEvent,
)

print(completion.choices[0].message.content)


def main():
client = OpenAI(
api_key=os.getenv("OPENAI_API_KEY")
13 changes: 11 additions & 2 deletions spot_tools/src/spot_executor/spot.py
Expand Up @@ -20,6 +20,7 @@
RobotCommandClient,
blocking_stand,
)
from ultralytics import YOLOWorld

from spot_executor.stitch_front_images import stitch, stitch_live, stitch_RGB

@@ -33,10 +34,18 @@ def __init__(
take_lease=True,
set_estop=False,
verbose=False,
semantic_model_path="../data/models/efficientvit_seg_l2.onnx",
semantic_model_path="/home/rrg/dcist_ws/src/awesome_dcist_t4/spot_tools/spot_tools/data/models/efficientvit_seg_l2.onnx",

Collaborator commented:
Can we handle these paths a little more generally? I think we should either 1) Make the path optional (e.g., default to None), and skip loading the yolo model when it's not set. This might be useful if we want to be able to run Spot's non-pick skills without having to load yolo / set the weights, or 2) Make the path not optional. And then either way the hard-coded RRG path can live higher-up (in the test_spot or something?)

Collaborator (author) replied:
I agree the path should be optional, but I'm not sure how best to handle setting the classes for YOLOWorld. Two options: (1) load a file that sets the classes ahead of time in the Spot init function, or (2) set the classes to the object being picked up each time the grasp function is called. If we know which objects we want to classify ahead of time, (1) makes more sense; (2) would incur redundant computation on every grasp call. (A minimal sketch of option (1) follows this hunk.)

yolo_world_path="/home/rrg/dcist_ws/src/awesome_dcist_t4/spot_tools/spot_tools/data/models/yolov8x-worldv2.pt",
debug=False,
semantic_name_to_id=None,
):
# YOLOv8 model for world detection
print("Initializing YOLOWorld model. ")
self.yolo_model = YOLOWorld(yolo_world_path)
custom_classes = ["", "bag", "wood block", "pipe"]
self.yolo_model.set_classes(custom_classes)
print("Set classes for YOLOWorld model.")

self.is_fake = False
self.verbose = verbose
bosdyn.client.util.setup_logging(verbose)
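
A minimal sketch of option (1), assuming the constructor in this hunk: yolo_world_path defaults to None, the YOLOWorld model is only loaded when a path is given, and the optional classes_file (a hypothetical newline-delimited list of class names, not part of this PR) replaces the hard-coded class list.

# Sketch only: option (1) from the review thread above.
from ultralytics import YOLOWorld

class Spot:
    def __init__(self, yolo_world_path=None, classes_file=None, **kwargs):
        # Skip YOLO entirely so non-pick skills never load the weights.
        self.yolo_model = None
        if yolo_world_path is not None:
            print("Initializing YOLOWorld model.")
            self.yolo_model = YOLOWorld(yolo_world_path)
            if classes_file is not None:
                with open(classes_file) as f:
                    classes = [line.strip() for line in f if line.strip()]
            else:
                # Fallback mirrors the classes hard-coded in this PR.
                classes = ["bag", "wood block", "pipe"]
            self.yolo_model.set_classes(classes)
            print("Set classes for YOLOWorld model.")

The hard-coded RRG paths could then move up into test_spot.py, as suggested.
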
@@ -89,7 +98,7 @@ def get_pose(self):
transforms, VISION_FRAME_NAME, BODY_FRAME_NAME
)

return out_tform_body
return np.array([out_tform_body.x, out_tform_body.y, out_tform_body.angle])

def get_image(self, view="hand_color_image", show=False):
self.robot.logger.info("Getting an image from: %s", view)
16 changes: 9 additions & 7 deletions spot_tools/src/spot_executor/spot_executor.py
@@ -11,7 +11,7 @@
from scipy.spatial.transform import Rotation

from spot_skills.arm_utils import gaze_at_vision_pose
from spot_skills.grasp_utils import object_grasp, object_place
from spot_skills.grasp_utils import object_grasp_YOLO, object_place
from spot_skills.navigation_utils import (
follow_trajectory_continuous,
turn_to_point,
@@ -92,12 +92,14 @@ def execute_gaze(self, command, feedback, pick_next=False):

def execute_pick(self, command, feedback):
feedback.print("INFO", "Executing `pick` command")
success = object_grasp(
self.spot_interface,
semantic_class=command.object_class,
labelspace_map=self.labelspace_map,
debug=self.debug,
)
# success = object_grasp(

Collaborator commented:
If we have at least some faith that object_grasp_YOLO works / will work soon, we should delete this commented out part.

Collaborator (author) replied:
I think the command should carry a field like 'command.model_type' that selects the correct grasp function; ideally there would be a single grasp function. (A sketch of that dispatch follows this hunk.)

# self.spot_interface,
# semantic_class=command.object_class,
# labelspace_map=self.labelspace_map,
# debug=self.debug,
# )

success = object_grasp_YOLO(self.spot_interface, command.object_class)

if self.debug:
success, debug_images = success
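
A minimal sketch of the dispatch suggested in the thread above, assuming a hypothetical command.model_type field; object_grasp would need to stay importable alongside object_grasp_YOLO for the fallback branch.

# Sketch only: select the grasp back-end from an assumed command.model_type
# instead of keeping the old call commented out.
from spot_skills.grasp_utils import object_grasp, object_grasp_YOLO

def execute_pick(self, command, feedback):
    feedback.print("INFO", "Executing `pick` command")
    model_type = getattr(command, "model_type", "yolo")  # assumed field
    if model_type == "yolo":
        success = object_grasp_YOLO(self.spot_interface, command.object_class)
    else:
        success = object_grasp(
            self.spot_interface,
            semantic_class=command.object_class,
            labelspace_map=self.labelspace_map,
            debug=self.debug,
        )
    # ... rest of execute_pick (debug handling, feedback) unchanged
    return success
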