"""
Module V: Box detection and classification for RMK-1 manipulator.

Usage:
    python3 detect_box.py --target 1

Target classes (per competition spec):
    1 - hammer (молоток)
    2 - wrench (гаечный ключ)
    3 - pliers (пассатижи)

This script:
1. Subscribes to camera topic /RMC1/arm95/camera_gripper/image_color
2. Classifies visible boxes using trained YOLOv8 model
3. Finds the target box and computes its position relative to Base_link
4. Publishes coordinates for manipulator control
"""
import argparse
import os
import sys

import cv2
import numpy as np
from ultralytics import YOLO

# Mapping between the classifier's own class indices and the IDs used by
# the competition rules. YOLO assigns indices alphabetically, which gives
# hammer=0, pliers=1, wrench=2 — NOT the competition numbering.
YOLO_TO_COMPETITION = {
    0: 1,  # hammer  -> competition ID 1
    1: 3,  # pliers  -> competition ID 3
    2: 2,  # wrench  -> competition ID 2
}

# Inverse lookup: competition ID -> YOLO class index.
COMPETITION_TO_YOLO = {comp: yolo for yolo, comp in YOLO_TO_COMPETITION.items()}

# Human-readable labels keyed by competition ID (English + Russian).
CLASS_NAMES = {
    1: "hammer (молоток)",
    2: "wrench (гаечный ключ)",
    3: "pliers (пассатижи)",
}

# Raw model labels keyed by YOLO class index (alphabetical order).
YOLO_CLASS_NAMES = {0: "hammer", 1: "pliers", 2: "wrench"}
def load_model(model_path=None):
    """Load the YOLO classifier.

    When *model_path* is None, falls back to ``best.pt`` located in the
    same directory as this script.
    """
    if model_path is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(script_dir, "best.pt")
    return YOLO(model_path)
def classify_image(model, image):
    """
    Classify an image and return (class_id, confidence).

    class_id is the competition ID (1=hammer, 2=wrench, 3=pliers);
    (-1, 0.0) is returned when the model yields no classification probs.
    """
    results = model(image, imgsz=224, verbose=False)
    if not results or results[0].probs is None:
        return -1, 0.0

    probs = results[0].probs
    # Translate the model's top-1 index into the competition numbering;
    # unknown indices map to -1.
    comp_class = YOLO_TO_COMPETITION.get(probs.top1, -1)
    return comp_class, probs.top1conf.item()
def detect_boxes_in_frame(image, target_id=None):
    """
    Segment candidate box regions in a top-down camera frame.

    Note: no classification happens here — the caller classifies each
    returned ROI (see find_target_box).

    Args:
        image: BGR frame (numpy array) from the gripper camera.
        target_id: unused; kept with a default so existing callers that
            pass the competition target ID positionally keep working.

    Returns:
        List of dicts, one per candidate region, with keys:
            "roi":    cropped BGR sub-image of the candidate box,
            "bbox":   (x, y, w, h) bounding rectangle in pixels,
            "center": (cx, cy) pixel center of the bounding rectangle,
            "area":   contour area in pixels.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Adaptive threshold copes with uneven lighting across the shelf.
    thresh = cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 21, 5
    )

    # Morphology: close small gaps inside box blobs, then open to drop speckle.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=2)
    thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    h, w = image.shape[:2]
    min_area = (h * w) * 0.01  # reject blobs smaller than 1% of the frame
    max_area = (h * w) * 0.5   # reject blobs larger than 50% of the frame

    boxes = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            continue

        x, y, bw, bh = cv2.boundingRect(cnt)
        # Extract the box region for downstream classification.
        roi = image[y:y+bh, x:x+bw]
        if roi.size == 0:  # degenerate rectangle — skip
            continue

        boxes.append({
            "roi": roi,
            "bbox": (x, y, bw, bh),
            "center": (x + bw // 2, y + bh // 2),
            "area": area,
        })

    return boxes
def find_target_box(model, image, target_id, min_conf=0.5):
    """
    Find the target box in the camera frame.

    Segments candidate regions, classifies each with the model, and keeps
    the highest-confidence region whose class matches ``target_id``.

    Args:
        model: loaded YOLO classification model.
        image: BGR camera frame.
        target_id: competition class ID (1=hammer, 2=wrench, 3=pliers).
        min_conf: minimum classification confidence required to accept
            the best match (default 0.5, the previous hard-coded value).

    Returns:
        (center_x, center_y, confidence, bbox) in pixel coordinates for
        the best match, or None when nothing exceeds ``min_conf``.
    """
    boxes = detect_boxes_in_frame(image, target_id)

    best_match = None
    best_conf = 0.0

    # Keep the candidate with the highest confidence for the target class.
    for box in boxes:
        comp_class, conf = classify_image(model, box["roi"])
        if comp_class == target_id and conf > best_conf:
            best_conf = conf
            best_match = box

    if best_match and best_conf > min_conf:
        cx, cy = best_match["center"]
        return cx, cy, best_conf, best_match["bbox"]

    return None
def pixel_to_base_link(cx, cy, img_w, img_h,
                       camera_x=0.0, camera_y=0.0, camera_z=0.7,
                       fov_x=0.6, fov_y=0.6):
    """
    Convert pixel coordinates to approximate Base_link coordinates.

    Assumes the camera looks straight down from (camera_x, camera_y,
    camera_z), and that fov_x/fov_y give the field of view in meters at
    camera_z height. These parameters should be calibrated for the
    actual setup.
    """
    # Express the pixel as a fraction of the frame, centered: [-0.5, 0.5].
    frac_x = cx / img_w - 0.5
    frac_y = cy / img_h - 0.5

    # Scale by the metric field of view and offset by the camera position.
    world_x = camera_x + frac_x * fov_x
    world_y = camera_y + frac_y * fov_y

    # Boxes rest on the shelf surface, so z is fixed at zero.
    return world_x, world_y, 0.0
def demo_on_image(image_path, target_id, model):
    """Classify one image from disk and print the prediction vs. the target."""
    frame = cv2.imread(image_path)
    if frame is None:
        print(f"Error: cannot read {image_path}")
        return

    predicted, confidence = classify_image(model, frame)
    verdict = "YES" if predicted == target_id else "NO"

    print(f"Image: {image_path}")
    print(f" Predicted: {CLASS_NAMES.get(predicted, 'unknown')} (ID={predicted})")
    print(f" Confidence: {confidence:.4f}")
    print(f" Target: {CLASS_NAMES.get(target_id, 'unknown')} (ID={target_id})")
    print(f" Match: {verdict}")
def main():
    """CLI entry point: parse arguments, load the model, dispatch a mode."""
    parser = argparse.ArgumentParser(description="Module V: Box Detection")
    parser.add_argument("--target", type=int, required=True, choices=[1, 2, 3],
                        help="Target box ID: 1=hammer, 2=wrench, 3=pliers")
    parser.add_argument("--model", type=str, default=None,
                        help="Path to model weights (default: best.pt)")
    parser.add_argument("--image", type=str, default=None,
                        help="Test on a single image instead of ROS2 camera")
    parser.add_argument("--ros", action="store_true",
                        help="Run in ROS2 mode (subscribe to camera topic)")
    args = parser.parse_args()

    print(f"Target: {CLASS_NAMES[args.target]}")
    model = load_model(args.model)
    print("Model loaded successfully.")

    if args.image:
        # Single-image mode.
        demo_on_image(args.image, args.target, model)
    elif args.ros:
        # Live ROS2 camera mode.
        run_ros2(model, args.target)
    else:
        # Default: demo on the bundled sample images that exist on disk.
        samples_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "images"
        )
        for filename in ("Hammer.jpg", "Wrench.jpg", "Pliers.jpg"):
            sample_path = os.path.join(samples_dir, filename)
            if os.path.exists(sample_path):
                demo_on_image(sample_path, args.target, model)
                print()
def run_ros2(model, target_id):
    """ROS2 node that subscribes to camera and finds target box.

    Spins until interrupted; logs the target's pixel position and
    approximate Base_link coordinates when found.
    """
    # Lazy import so the offline modes (--image / demo) still work on
    # machines without ROS2 installed.
    # NOTE(review): PoseStamped is imported but never published below —
    # coordinates are only logged. Confirm whether publishing is still planned.
    try:
        import rclpy
        from rclpy.node import Node
        from sensor_msgs.msg import Image
        from geometry_msgs.msg import PoseStamped
        from cv_bridge import CvBridge
    except ImportError:
        print("Error: ROS2 packages not found. Install rclpy, cv_bridge, etc.")
        sys.exit(1)

    class BoxDetectorNode(Node):
        # Watches the gripper camera and latches once the target is located.

        def __init__(self):
            super().__init__("box_detector")
            self.bridge = CvBridge()    # converts sensor_msgs/Image <-> cv2 arrays
            self.model = model          # YOLO classifier captured from the closure
            self.target_id = target_id  # competition class ID to search for
            self.detected = False       # latch: skip frames after the first hit

            self.subscription = self.create_subscription(
                Image,
                "/RMC1/arm95/camera_gripper/image_color",
                self.image_callback,
                10
            )
            self.get_logger().info(
                f"Box detector started. Target: {CLASS_NAMES[target_id]}"
            )

        def image_callback(self, msg):
            # Once the target has been found, ignore all further frames.
            if self.detected:
                return

            cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
            h, w = cv_image.shape[:2]

            result = find_target_box(self.model, cv_image, self.target_id)
            if result is not None:
                cx, cy, conf, bbox = result
                # Map the pixel center to approximate Base_link coordinates
                # (uses pixel_to_base_link's default calibration values).
                wx, wy, wz = pixel_to_base_link(cx, cy, w, h)

                self.get_logger().info(
                    f"TARGET FOUND: {CLASS_NAMES[self.target_id]} "
                    f"at pixel ({cx}, {cy}), conf={conf:.2f}"
                )
                self.get_logger().info(
                    f"Base_link coords: x={wx:.3f}, y={wy:.3f}, z={wz:.3f}"
                )

                self.detected = True
            else:
                # Try classifying the whole frame as a diagnostic, so the log
                # still shows what the camera is seeing when no box matched.
                comp_class, conf = classify_image(self.model, cv_image)
                self.get_logger().info(
                    f"Frame classified as: {CLASS_NAMES.get(comp_class, 'unknown')} "
                    f"(conf={conf:.2f})"
                )

    rclpy.init()
    node = BoxDetectorNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release node resources and shut down the ROS2 context.
        node.destroy_node()
        rclpy.shutdown()
# Script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()