// engine/modules/agent_vision/agent_vision.cpp

#include "agent_vision.h"

#include "core/config/engine.h"
#include "core/io/image.h"
#include "core/os/os.h"
#include "scene/3d/camera_3d.h"
#include "scene/3d/visual_instance_3d.h"
#include "scene/main/node.h"
#include "scene/main/scene_tree.h"
#include "scene/main/viewport.h"
#include "scene/main/window.h"
#include "servers/rendering/rendering_server.h"

// Definition of the static instance pointer. It starts out null; the
// constructor and destructor in this file assign to it.
AgentVision *AgentVision::singleton = nullptr;

// Registers the newly constructed object as the current singleton instance.
// Note that this overwrites any previously registered instance.
AgentVision::AgentVision() {
	singleton = this;
}

// Unregisters this object as the singleton instance.
// The pointer is only cleared when it still refers to this object, so
// destroying a secondary instance cannot null out a pointer that belongs to
// another, still-living instance.
AgentVision::~AgentVision() {
	if (singleton == this) {
		singleton = nullptr;
	}
}

// Registers the public query methods with ClassDB so they can be called by
// name. Where DEFVAL() entries are given they provide defaults for the
// trailing arguments of the bound method:
//   - get_annotated_screenshot: width defaults to 0.
//   - get_minimap: world_size defaults to 100.0, resolution to 512.
//   - capture_camera: width and height default to 0; camera_name is required.
// store_previous_screenshot() and _get_segmentation_color() are not bound here.
void AgentVision::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_bounding_boxes"), &AgentVision::get_bounding_boxes);
	ClassDB::bind_method(D_METHOD("get_annotated_screenshot", "width"), &AgentVision::get_annotated_screenshot, DEFVAL(0));
	ClassDB::bind_method(D_METHOD("get_screenshot_diff"), &AgentVision::get_screenshot_diff);
	ClassDB::bind_method(D_METHOD("get_segmentation_map"), &AgentVision::get_segmentation_map);
	ClassDB::bind_method(D_METHOD("get_depth_buffer"), &AgentVision::get_depth_buffer);
	ClassDB::bind_method(D_METHOD("get_minimap", "world_size", "resolution"), &AgentVision::get_minimap, DEFVAL(100.0f), DEFVAL(512));
	ClassDB::bind_method(D_METHOD("capture_camera", "camera_name", "width", "height"), &AgentVision::capture_camera, DEFVAL(0), DEFVAL(0));
}

Array AgentVision::get_bounding_boxes() {
	Array result;

	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return result;
	}

	// Find the active camera.
	Camera3D *camera = tree->get_root()->get_camera_3d();
	if (!camera) {
		return result;
	}

	// Get viewport size for projection.
	Vector2 viewport_size = tree->get_root()->get_visible_rect().size;

	// Traverse scene tree for VisualInstance3D nodes.
	List<Node *> stack;
	stack.push_back(tree->get_root());

	while (!stack.is_empty()) {
		Node *node = stack.front()->get();
		stack.pop_front();

		VisualInstance3D *visual = Object::cast_to<VisualInstance3D>(node);
		if (visual && visual->is_visible_in_tree()) {
			AABB aabb = visual->get_aabb();
			Transform3D global_xform = visual->get_global_transform();

			// Project AABB corners to screen space.
			Vector3 corners[8];
			Vector3 aabb_min = aabb.position;
			Vector3 aabb_max = aabb.position + aabb.size;

			corners[0] = global_xform.xform(Vector3(aabb_min.x, aabb_min.y, aabb_min.z));
			corners[1] = global_xform.xform(Vector3(aabb_max.x, aabb_min.y, aabb_min.z));
			corners[2] = global_xform.xform(Vector3(aabb_min.x, aabb_max.y, aabb_min.z));
			corners[3] = global_xform.xform(Vector3(aabb_max.x, aabb_max.y, aabb_min.z));
			corners[4] = global_xform.xform(Vector3(aabb_min.x, aabb_min.y, aabb_max.z));
			corners[5] = global_xform.xform(Vector3(aabb_max.x, aabb_min.y, aabb_max.z));
			corners[6] = global_xform.xform(Vector3(aabb_min.x, aabb_max.y, aabb_max.z));
			corners[7] = global_xform.xform(Vector3(aabb_max.x, aabb_max.y, aabb_max.z));

			float min_x = 1e10, min_y = 1e10, max_x = -1e10, max_y = -1e10;
			bool any_visible = false;

			for (int i = 0; i < 8; i++) {
				if (!camera->is_position_behind(corners[i])) {
					Vector2 screen_pos = camera->unproject_position(corners[i]);
					min_x = MIN(min_x, screen_pos.x);
					min_y = MIN(min_y, screen_pos.y);
					max_x = MAX(max_x, screen_pos.x);
					max_y = MAX(max_y, screen_pos.y);
					any_visible = true;
				}
			}

			if (any_visible) {
				// Clamp to viewport.
				min_x = CLAMP(min_x, 0, viewport_size.x);
				min_y = CLAMP(min_y, 0, viewport_size.y);
				max_x = CLAMP(max_x, 0, viewport_size.x);
				max_y = CLAMP(max_y, 0, viewport_size.y);

				float distance = camera->get_global_position().distance_to(global_xform.origin);

				Dictionary box;
				box["node"] = String(visual->get_name());
				box["path"] = String(visual->get_path());
				box["class"] = visual->get_class();
				box["rect"] = Array();
				Array rect_arr;
				rect_arr.push_back((int)min_x);
				rect_arr.push_back((int)min_y);
				rect_arr.push_back((int)max_x);
				rect_arr.push_back((int)max_y);
				box["rect"] = rect_arr;
				box["distance"] = distance;
				result.push_back(box);
			}
		}

		for (int i = 0; i < node->get_child_count(); i++) {
			stack.push_back(node->get_child(i));
		}
	}

	return result;
}

// Captures the root viewport, outlines every box reported by
// get_bounding_boxes() in opaque green (1 pixel wide), and returns the result
// encoded as PNG.
//
// p_width: when greater than 0 the annotated image is resized to this width
//          with bilinear interpolation; the height follows the source aspect
//          ratio (never less than 1 pixel). Values <= 0 keep the native size.
//
// Returns an empty buffer when there is no scene tree/root, the viewport has
// no texture, or no (non-empty) image could be read back.
Vector<uint8_t> AgentVision::get_annotated_screenshot(int p_width) {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// Capture base screenshot.
	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	const int img_width = img->get_width();
	const int img_height = img->get_height();
	const Color outline_color(0, 1, 0, 1); // Green boxes.

	const Array boxes = get_bounding_boxes();
	for (int i = 0; i < boxes.size(); i++) {
		Dictionary box = boxes[i];
		Array rect = box["rect"];
		if (rect.size() < 4) {
			continue;
		}

		const int x1 = rect[0];
		const int y1 = rect[1];
		const int x2 = rect[2];
		const int y2 = rect[3];

		// An edge is drawn only when its fixed coordinate lies inside the
		// image; the running coordinate is clipped to the image bounds.
		const bool top_inside = y1 >= 0 && y1 < img_height;
		const bool bottom_inside = y2 >= 0 && y2 < img_height;
		const bool left_inside = x1 >= 0 && x1 < img_width;
		const bool right_inside = x2 >= 0 && x2 < img_width;

		const int x_first = MAX(x1, 0);
		const int x_last = MIN(x2, img_width - 1);
		const int y_first = MAX(y1, 0);
		const int y_last = MIN(y2, img_height - 1);

		// Horizontal edges.
		for (int x = x_first; x <= x_last; x++) {
			if (top_inside) {
				img->set_pixel(x, y1, outline_color);
			}
			if (bottom_inside) {
				img->set_pixel(x, y2, outline_color);
			}
		}
		// Vertical edges.
		for (int y = y_first; y <= y_last; y++) {
			if (left_inside) {
				img->set_pixel(x1, y, outline_color);
			}
			if (right_inside) {
				img->set_pixel(x2, y, outline_color);
			}
		}
	}

	if (p_width > 0) {
		const float aspect = (float)img_height / (float)img_width;
		// Truncation can yield 0 for very wide images and small widths, which
		// is not a valid image height.
		const int target_height = MAX(1, (int)(p_width * aspect));
		img->resize(p_width, target_height, Image::INTERPOLATE_BILINEAR);
	}

	return img->save_png_to_buffer();
}

// Captures the root viewport and returns a PNG visualising what changed since
// the previously stored screenshot.
//
// The current capture (PNG-encoded) always replaces the stored screenshot.
// The return value is:
//   - the current capture itself when there is no stored screenshot, when the
//     stored data cannot be decoded, or when its dimensions differ from the
//     current capture;
//   - otherwise an RGB image of the same size in which pixels whose mean
//     absolute RGB difference exceeds 0.05 are solid red, and all other
//     pixels show the current capture dimmed to 30% brightness.
//
// Returns an empty buffer when there is no scene tree/root, the viewport has
// no texture, or no (non-empty) image could be read back.
Vector<uint8_t> AgentVision::get_screenshot_diff() {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> current = texture->get_image();
	if (current.is_null() || current->is_empty()) {
		return Vector<uint8_t>();
	}

	const Vector<uint8_t> current_data = current->save_png_to_buffer();

	// Fetch the old baseline and install the new one in a single critical
	// section, so the shared buffer is never read outside the lock that
	// guards its writes.
	Vector<uint8_t> previous_data;
	{
		MutexLock lock(vision_mutex);
		previous_data = previous_screenshot;
		previous_screenshot = current_data;
	}

	if (previous_data.is_empty()) {
		// No previous capture to compare against.
		return current_data;
	}

	// Decode the previous capture.
	Ref<Image> prev;
	prev.instantiate();
	const Error load_err = prev->load_png_from_buffer(previous_data);
	if (load_err != OK || prev->is_empty()) {
		return current_data;
	}

	const int width = current->get_width();
	const int height = current->get_height();
	if (prev->get_width() != width || prev->get_height() != height) {
		return current_data;
	}

	// Create diff image, highlighting changed pixels.
	Ref<Image> diff_img = Image::create_empty(width, height, false, Image::FORMAT_RGB8);

	for (int y = 0; y < height; y++) {
		for (int x = 0; x < width; x++) {
			const Color c1 = current->get_pixel(x, y);
			const Color c2 = prev->get_pixel(x, y);
			const float diff_val = (Math::abs(c1.r - c2.r) + Math::abs(c1.g - c2.g) + Math::abs(c1.b - c2.b)) / 3.0f;

			if (diff_val > 0.05f) {
				// Changed pixel. The target format has no alpha channel, so
				// the marker is plain red regardless of the difference size.
				diff_img->set_pixel(x, y, Color(1, 0, 0));
			} else {
				// Unchanged pixel: show the current capture dimmed.
				diff_img->set_pixel(x, y, Color(c1.r * 0.3f, c1.g * 0.3f, c1.b * 0.3f));
			}
		}
	}

	return diff_img->save_png_to_buffer();
}

// Returns a PNG the size of the root viewport capture in which the screen
// rectangle of every box from get_bounding_boxes() is filled with a color
// identifying its node; everything else stays black.
//
// This is a bounding-box approximation, not a per-pixel segmentation (that
// would need a dedicated rendering pass). Boxes are painted in the order
// get_bounding_boxes() returns them, so later boxes overwrite earlier ones
// where they overlap.
//
// Colors are looked up by the node's instance ID (resolved from the box's
// "path"), so a node keeps its color across calls for as long as it exists.
// If the node cannot be resolved, the 1-based box index is used as the key.
//
// Returns an empty buffer when there is no scene tree/root, the viewport has
// no texture, or no (non-empty) image could be read back.
Vector<uint8_t> AgentVision::get_segmentation_map() {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// The capture is only used to determine the output dimensions.
	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	const int width = img->get_width();
	const int height = img->get_height();

	// Create a black image of the same size.
	Ref<Image> seg_img = Image::create_empty(width, height, false, Image::FORMAT_RGB8);

	const Array boxes = get_bounding_boxes();
	for (int i = 0; i < boxes.size(); i++) {
		Dictionary box = boxes[i];
		Array rect = box["rect"];
		if (rect.size() < 4) {
			continue;
		}

		// Prefer the node's instance ID as the color key; fall back to the
		// box index when the node cannot be found from its path.
		uint64_t color_key = (uint64_t)(i + 1);
		if (box.has("path")) {
			const String node_path = box["path"];
			Node *box_node = viewport->get_node_or_null(NodePath(node_path));
			if (box_node) {
				color_key = (uint64_t)box_node->get_instance_id();
			}
		}
		const Color color = _get_segmentation_color(color_key);

		const int x1 = CLAMP((int)rect[0], 0, width - 1);
		const int y1 = CLAMP((int)rect[1], 0, height - 1);
		const int x2 = CLAMP((int)rect[2], 0, width - 1);
		const int y2 = CLAMP((int)rect[3], 0, height - 1);

		for (int y = y1; y <= y2; y++) {
			for (int x = x1; x <= x2; x++) {
				seg_img->set_pixel(x, y, color);
			}
		}
	}

	return seg_img->save_png_to_buffer();
}

// Returns a single-channel (L8) PNG derived from the root viewport capture.
//
// NOTE: this is NOT real scene depth. Reading the actual depth buffer needs
// renderer-specific code, so as a placeholder each pixel holds the luminance
// of the rendered color (0.299 R + 0.587 G + 0.114 B).
//
// Returns an empty buffer when there is no scene tree/root, the viewport has
// no texture, or no (non-empty) image could be read back.
Vector<uint8_t> AgentVision::get_depth_buffer() {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	const int width = img->get_width();
	const int height = img->get_height();

	// Convert to grayscale (luminance) as the depth stand-in.
	Ref<Image> depth = Image::create_empty(width, height, false, Image::FORMAT_L8);
	for (int y = 0; y < height; y++) {
		for (int x = 0; x < width; x++) {
			const Color c = img->get_pixel(x, y);
			const float luminance = c.r * 0.299f + c.g * 0.587f + c.b * 0.114f;
			depth->set_pixel(x, y, Color(luminance, luminance, luminance));
		}
	}

	return depth->save_png_to_buffer();
}

// Placeholder for a top-down minimap capture.
// Not implemented: both parameters are currently ignored and an empty buffer
// is always returned.
Vector<uint8_t> AgentVision::get_minimap(float p_world_size, int p_resolution) {
	// A proper minimap would use a SubViewport with an orthographic camera looking down.
	// For now, return empty and let the game set up a SubViewport.
	return Vector<uint8_t>();
}

// Captures the viewport that contains the node named `p_camera_name` and
// returns it as PNG.
//
// The node is located with a recursive find_child() from the root; the first
// match is used and its type is not checked. The capture shows whatever that
// node's viewport currently renders, so for a camera in the main viewport the
// camera must already be the current one.
//
// p_width / p_height: requested output size. When both are <= 0 the native
// size is kept. When only one is > 0 the other is derived from the source
// aspect ratio (never less than 1 pixel). Resizing uses bilinear
// interpolation.
//
// Returns an empty buffer when there is no scene tree/root, no node with that
// name exists, the node has no viewport, the viewport has no texture, or no
// (non-empty) image could be read back.
Vector<uint8_t> AgentVision::capture_camera(const String &p_camera_name, int p_width, int p_height) {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// Find the named camera.
	Node *camera_node = tree->get_root()->find_child(p_camera_name, true, false);
	if (!camera_node) {
		return Vector<uint8_t>();
	}

	// A camera under a SubViewport resolves to that SubViewport; otherwise
	// this is the main viewport.
	Viewport *viewport = camera_node->get_viewport();
	if (!viewport) {
		return Vector<uint8_t>();
	}

	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	if (p_width > 0 || p_height > 0) {
		const float src_width = (float)img->get_width();
		const float src_height = (float)img->get_height();

		int target_width = p_width;
		int target_height = p_height;
		if (target_width <= 0) {
			// Only the height was requested.
			target_width = (int)(target_height * (src_width / src_height));
		} else if (target_height <= 0) {
			// Only the width was requested.
			target_height = (int)(target_width * (src_height / src_width));
		}

		// Truncation of a derived dimension can reach 0, which is not a valid
		// image size.
		img->resize(MAX(1, target_width), MAX(1, target_height), Image::INTERPOLATE_BILINEAR);
	}

	return img->save_png_to_buffer();
}

// Replaces the stored previous screenshot with `p_data` while holding
// vision_mutex. get_screenshot_diff() decodes this buffer as PNG, so the data
// is expected to be PNG-encoded.
void AgentVision::store_previous_screenshot(const Vector<uint8_t> &p_data) {
	MutexLock lock(vision_mutex);
	previous_screenshot = p_data;
}

// Returns the color associated with `p_instance_id`, creating and caching a
// new one on first use.
//
// New colors are derived from a running counter (next_color_idx), not from
// the ID itself, so the color depends on the order in which IDs are first
// seen. The counter value plus one is multiplied by an odd constant modulo
// 2^24 and the result is split into 8-bit R, G and B. Multiplying by an odd
// number is a bijection on 24-bit values, so the generated RGB triples do not
// repeat for the first 2^24 - 1 colors and are never pure black, which is the
// background color of the segmentation map.
Color AgentVision::_get_segmentation_color(uint64_t p_instance_id) {
	if (segmentation_colors.has(p_instance_id)) {
		return segmentation_colors[p_instance_id];
	}

	const uint32_t idx = next_color_idx++;

	uint32_t packed_rgb = ((idx + 1u) * 0x9E3779u) & 0xFFFFFFu;
	if (packed_rgb == 0) {
		// Only reachable once the counter wraps past 2^24 entries; still avoid
		// handing out the background color.
		packed_rgb = 0xFFFFFFu;
	}

	const float r = ((packed_rgb >> 16) & 0xFFu) / 255.0f;
	const float g = ((packed_rgb >> 8) & 0xFFu) / 255.0f;
	const float b = (packed_rgb & 0xFFu) / 255.0f;
	const Color color(r, g, b);

	segmentation_colors[p_instance_id] = color;
	return color;
}