Files
engine/modules/agent_vision/agent_vision.cpp
ozan d291dcdc74 feat: 9 agentic engine modules for agent-native Godot
agent_api (HTTP server), agent_log (structured logging), agent_events (event bus),
agent_console (GameConsole), agent_replay (snapshots), agent_vision (depth/segmentation),
agent_fbx (bone remapping), agent_auth (multi-agent), agent_analytics (feature flags + tracking)

All modules compile clean with mono. Binary uploaded to S3 v1.0.0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 03:44:28 +01:00

374 lines
11 KiB
C++

#include "agent_vision.h"
#include "core/config/engine.h"
#include "core/io/image.h"
#include "core/os/os.h"
#include "scene/3d/camera_3d.h"
#include "scene/3d/visual_instance_3d.h"
#include "scene/main/node.h"
#include "scene/main/scene_tree.h"
#include "scene/main/viewport.h"
#include "scene/main/window.h"
#include "servers/rendering/rendering_server.h"
// Process-wide instance pointer; stays null until an AgentVision is constructed.
AgentVision *AgentVision::singleton = nullptr;

// Claims the singleton slot for the first instance that is constructed.
// Later instances leave the existing pointer untouched so that they cannot
// hijack the singleton from the instance that already owns it.
AgentVision::AgentVision() {
	if (singleton == nullptr) {
		singleton = this;
	}
}

// Releases the singleton slot only when this instance is the one holding it.
// Destroying any other instance must not null out a pointer that still refers
// to a live object.
AgentVision::~AgentVision() {
	if (singleton == this) {
		singleton = nullptr;
	}
}
// Registers the AgentVision methods with ClassDB, together with the default
// values used when optional arguments are omitted by the caller.
void AgentVision::_bind_methods() {
	// Scene query.
	ClassDB::bind_method(
			D_METHOD("get_bounding_boxes"),
			&AgentVision::get_bounding_boxes);

	// Root viewport captures. "width" defaults to 0, which skips resizing.
	ClassDB::bind_method(
			D_METHOD("get_annotated_screenshot", "width"),
			&AgentVision::get_annotated_screenshot,
			DEFVAL(0));
	ClassDB::bind_method(
			D_METHOD("get_screenshot_diff"),
			&AgentVision::get_screenshot_diff);
	ClassDB::bind_method(
			D_METHOD("get_segmentation_map"),
			&AgentVision::get_segmentation_map);
	ClassDB::bind_method(
			D_METHOD("get_depth_buffer"),
			&AgentVision::get_depth_buffer);

	// Minimap: defaults are a world size of 100.0 and a resolution of 512.
	ClassDB::bind_method(
			D_METHOD("get_minimap", "world_size", "resolution"),
			&AgentVision::get_minimap,
			DEFVAL(100.0f),
			DEFVAL(512));

	// Named-camera capture: "width" and "height" both default to 0.
	ClassDB::bind_method(
			D_METHOD("capture_camera", "camera_name", "width", "height"),
			&AgentVision::capture_camera,
			DEFVAL(0),
			DEFVAL(0));
}
// Builds a screen-space rectangle for every VisualInstance3D below the root
// window that is visible in the tree, using the root viewport's Camera3D.
//
// Each entry of the returned Array is a Dictionary with the keys:
//   "node"     - the node's name.
//   "path"     - the node's path, as a String.
//   "class"    - the node's class name.
//   "rect"     - [min_x, min_y, max_x, max_y], clamped to the root's visible
//                rect and truncated to int.
//   "distance" - distance from the camera's global position to the origin of
//                the node's global transform.
//
// Only AABB corners that the camera does not report as "behind" contribute to
// the rectangle; a node with no such corner is omitted. The Array is empty
// when there is no scene tree, no root, or no camera.
Array AgentVision::get_bounding_boxes() {
	Array boxes;

	SceneTree *scene_tree = SceneTree::get_singleton();
	if (scene_tree == nullptr || scene_tree->get_root() == nullptr) {
		return boxes;
	}

	Camera3D *active_camera = scene_tree->get_root()->get_camera_3d();
	if (active_camera == nullptr) {
		return boxes;
	}

	const Vector2 visible_size = scene_tree->get_root()->get_visible_rect().size;

	// Breadth-first walk: nodes are taken from the front, children appended at the back.
	List<Node *> pending;
	pending.push_back(scene_tree->get_root());
	while (!pending.is_empty()) {
		Node *current = pending.front()->get();
		pending.pop_front();

		// Queue the children up front so the guard clauses below can simply `continue`.
		const int child_total = current->get_child_count();
		for (int child_idx = 0; child_idx < child_total; child_idx++) {
			pending.push_back(current->get_child(child_idx));
		}

		VisualInstance3D *instance = Object::cast_to<VisualInstance3D>(current);
		if (instance == nullptr || !instance->is_visible_in_tree()) {
			continue;
		}

		const AABB local_bounds = instance->get_aabb();
		const Transform3D to_world = instance->get_global_transform();
		const Vector3 lo = local_bounds.position;
		const Vector3 hi = local_bounds.position + local_bounds.size;

		float left = 1e10f;
		float top = 1e10f;
		float right = -1e10f;
		float bottom = -1e10f;
		bool has_front_corner = false;

		// Bits 0, 1 and 2 of the corner index choose min or max on x, y and z.
		for (int corner = 0; corner < 8; corner++) {
			const Vector3 local_corner(
					(corner & 1) ? hi.x : lo.x,
					(corner & 2) ? hi.y : lo.y,
					(corner & 4) ? hi.z : lo.z);
			const Vector3 world_corner = to_world.xform(local_corner);
			if (active_camera->is_position_behind(world_corner)) {
				continue;
			}
			const Vector2 projected = active_camera->unproject_position(world_corner);
			left = MIN(left, projected.x);
			top = MIN(top, projected.y);
			right = MAX(right, projected.x);
			bottom = MAX(bottom, projected.y);
			has_front_corner = true;
		}

		if (!has_front_corner) {
			continue;
		}

		// Keep the rectangle inside the visible rect.
		left = CLAMP(left, 0, visible_size.x);
		top = CLAMP(top, 0, visible_size.y);
		right = CLAMP(right, 0, visible_size.x);
		bottom = CLAMP(bottom, 0, visible_size.y);

		Array rect_values;
		rect_values.push_back(static_cast<int>(left));
		rect_values.push_back(static_cast<int>(top));
		rect_values.push_back(static_cast<int>(right));
		rect_values.push_back(static_cast<int>(bottom));

		const float camera_distance = active_camera->get_global_position().distance_to(to_world.origin);

		Dictionary entry;
		entry["node"] = String(instance->get_name());
		entry["path"] = String(instance->get_path());
		entry["class"] = instance->get_class();
		entry["rect"] = rect_values;
		entry["distance"] = camera_distance;
		boxes.push_back(entry);
	}

	return boxes;
}
// Captures the root viewport and draws a one-pixel green outline around every
// rectangle reported by get_bounding_boxes(), then returns the result as PNG.
//
// p_width: when greater than 0, the annotated image is resized to this width
//          and the height is derived from the source aspect ratio (never less
//          than 1 pixel). 0 keeps the captured size.
//
// Returns an empty buffer when there is no scene tree or root, or when the
// viewport yields no texture or an empty image.
//
// NOTE(review): rectangles are expressed in the root's visible-rect units and
// are drawn directly as image pixel coordinates; confirm the two always match.
Vector<uint8_t> AgentVision::get_annotated_screenshot(int p_width) {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// Capture the base screenshot.
	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	// An empty image has a zero width, which would break the aspect-ratio maths below.
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	const int img_w = img->get_width();
	const int img_h = img->get_height();
	const Color outline(0, 1, 0, 1); // Green boxes.

	Array boxes = get_bounding_boxes();
	for (int i = 0; i < boxes.size(); i++) {
		Dictionary box = boxes[i];
		Array rect = box["rect"];
		if (rect.size() < 4) {
			continue;
		}
		int x1 = rect[0];
		int y1 = rect[1];
		int x2 = rect[2];
		int y2 = rect[3];

		// A rectangle with no area means the object was clamped entirely onto
		// a viewport border, i.e. it is off-screen. Outlining it would only
		// paint a stray line along the image edge.
		if (x2 <= x1 || y2 <= y1) {
			continue;
		}

		// The rectangles are clamped to the viewport *size*, so the far edges
		// can equal the width/height, one past the last valid pixel. Pull all
		// edges into the image so the right and bottom sides are drawn too.
		x1 = CLAMP(x1, 0, img_w - 1);
		x2 = CLAMP(x2, 0, img_w - 1);
		y1 = CLAMP(y1, 0, img_h - 1);
		y2 = CLAMP(y2, 0, img_h - 1);

		// Top and bottom edges.
		for (int x = x1; x <= x2; x++) {
			img->set_pixel(x, y1, outline);
			img->set_pixel(x, y2, outline);
		}
		// Left and right edges.
		for (int y = y1; y <= y2; y++) {
			img->set_pixel(x1, y, outline);
			img->set_pixel(x2, y, outline);
		}
	}

	if (p_width > 0) {
		const float aspect = (float)img_h / (float)img_w;
		// Truncation can give 0 for very wide images or tiny widths; keep at least one row.
		const int scaled_h = MAX(1, (int)(p_width * aspect));
		img->resize(p_width, scaled_h, Image::INTERPOLATE_BILINEAR);
	}
	return img->save_png_to_buffer();
}
// Captures the root viewport and compares it with the previously stored
// screenshot, returning a PNG that highlights what changed.
//
// Pixels whose mean absolute RGB difference exceeds 0.05 are written as pure
// red; all other pixels are written as the current pixel dimmed to 30%. The
// diff image is RGB8, so it carries no alpha channel.
//
// The current capture always replaces the stored baseline. When there is no
// usable baseline (none stored, it fails to decode as PNG, or its size differs
// from the current capture) the plain current screenshot is returned instead.
//
// Returns an empty buffer when there is no scene tree or root, or when the
// viewport yields no texture or an empty image.
Vector<uint8_t> AgentVision::get_screenshot_diff() {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> current = texture->get_image();
	if (current.is_null() || current->is_empty()) {
		return Vector<uint8_t>();
	}
	Vector<uint8_t> current_data = current->save_png_to_buffer();

	// Take the old baseline and install the new one in a single critical
	// section. Every access to previous_screenshot happens under vision_mutex,
	// matching store_previous_screenshot().
	Vector<uint8_t> baseline_data;
	{
		MutexLock lock(vision_mutex);
		baseline_data = previous_screenshot;
		previous_screenshot = current_data;
	}

	if (baseline_data.is_empty()) {
		// No previous screenshot: nothing to compare against.
		return current_data;
	}

	// Decode the baseline; a decode failure is treated like a missing baseline.
	Ref<Image> prev;
	prev.instantiate();
	const Error load_err = prev->load_png_from_buffer(baseline_data);
	const int width = current->get_width();
	const int height = current->get_height();
	if (load_err != OK || prev->get_width() != width || prev->get_height() != height) {
		return current_data;
	}

	// Build the diff image, highlighting changed pixels.
	const float change_threshold = 0.05f;
	Ref<Image> diff_img = Image::create_empty(width, height, false, Image::FORMAT_RGB8);
	for (int y = 0; y < height; y++) {
		for (int x = 0; x < width; x++) {
			const Color c1 = current->get_pixel(x, y);
			const Color c2 = prev->get_pixel(x, y);
			const float diff_val = (Math::abs(c1.r - c2.r) + Math::abs(c1.g - c2.g) + Math::abs(c1.b - c2.b)) / 3.0f;
			if (diff_val > change_threshold) {
				// Changed pixel: solid red (an alpha value could not be stored in RGB8).
				diff_img->set_pixel(x, y, Color(1, 0, 0));
			} else {
				// Unchanged pixel: dimmed copy of the current frame.
				diff_img->set_pixel(x, y, Color(c1.r * 0.3f, c1.g * 0.3f, c1.b * 0.3f));
			}
		}
	}
	return diff_img->save_png_to_buffer();
}
// Produces an approximate segmentation map of the root viewport as PNG.
//
// This is not a real per-pixel segmentation pass: the image starts out black
// and each rectangle from get_bounding_boxes() is filled with the colour
// assigned to that node. Rectangles are painted in the order they are
// reported, so later ones overwrite earlier ones where they overlap.
//
// Colours are looked up by the node's instance ID, so a node keeps its colour
// from one call to the next regardless of which other nodes are visible.
//
// Returns an empty buffer when there is no scene tree or root, or when the
// viewport yields no texture or an empty image.
Vector<uint8_t> AgentVision::get_segmentation_map() {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// The capture is only used to size the output image.
	Viewport *viewport = tree->get_root();
	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	const int width = img->get_width();
	const int height = img->get_height();

	// Black image of the same size.
	Ref<Image> seg_img = Image::create_empty(width, height, false, Image::FORMAT_RGB8);

	Array boxes = get_bounding_boxes();
	for (int i = 0; i < boxes.size(); i++) {
		Dictionary box = boxes[i];
		Array rect = box["rect"];
		if (rect.size() < 4) {
			continue;
		}

		// Resolve the node again from its reported path so the colour can be
		// keyed by instance ID. Keying by list position would recolour nodes
		// whenever the set of visible nodes changes.
		const String node_path = box["path"];
		Node *node = viewport->get_node_or_null(NodePath(node_path));
		if (!node) {
			// The node is no longer reachable at that path; skip it.
			continue;
		}
		const Color color = _get_segmentation_color(static_cast<uint64_t>(node->get_instance_id()));

		const int x1 = CLAMP((int)rect[0], 0, width - 1);
		const int y1 = CLAMP((int)rect[1], 0, height - 1);
		const int x2 = CLAMP((int)rect[2], 0, width - 1);
		const int y2 = CLAMP((int)rect[3], 0, height - 1);
		for (int y = y1; y <= y2; y++) {
			for (int x = x1; x <= x2; x++) {
				seg_img->set_pixel(x, y, color);
			}
		}
	}
	return seg_img->save_png_to_buffer();
}
// Returns a stand-in for the depth buffer as a PNG-encoded L8 image.
//
// No renderer depth data is read here. The root viewport's colour output is
// converted to a single luminance channel (0.299 R + 0.587 G + 0.114 B) and
// that is returned as an approximation.
//
// Returns an empty buffer when there is no scene tree, no root, or the
// viewport texture yields no image.
Vector<uint8_t> AgentVision::get_depth_buffer() {
	SceneTree *scene_tree = SceneTree::get_singleton();
	if (scene_tree == nullptr || scene_tree->get_root() == nullptr) {
		return Vector<uint8_t>();
	}

	Viewport *root_viewport = scene_tree->get_root();
	Ref<Image> frame = root_viewport->get_texture()->get_image();
	if (frame.is_null()) {
		return Vector<uint8_t>();
	}

	const int width = frame->get_width();
	const int height = frame->get_height();
	Ref<Image> gray = Image::create_empty(width, height, false, Image::FORMAT_L8);

	// Per-pixel weighted sum of the colour channels.
	for (int row = 0; row < height; row++) {
		for (int col = 0; col < width; col++) {
			const Color src = frame->get_pixel(col, row);
			const float luma = src.r * 0.299f + src.g * 0.587f + src.b * 0.114f;
			gray->set_pixel(col, row, Color(luma, luma, luma));
		}
	}

	return gray->save_png_to_buffer();
}
// Placeholder: always returns an empty buffer and ignores both arguments.
//
// A real implementation would render through a SubViewport with an
// orthographic camera looking straight down. Until one exists, the game is
// expected to set up such a SubViewport itself.
Vector<uint8_t> AgentVision::get_minimap(float p_world_size, int p_resolution) {
	Vector<uint8_t> no_image;
	return no_image;
}
// Captures the viewport that contains the node named p_camera_name and
// returns it as PNG.
//
// The node is searched recursively below the root by name, and the image is
// taken from whichever viewport that node belongs to. For a camera placed
// under its own SubViewport this is that SubViewport's output; otherwise it
// is the main viewport's output.
//
// p_width / p_height: requested output size. If only one of them is greater
// than 0, the other is derived from the source aspect ratio (never less than
// 1 pixel). If neither is greater than 0, the capture keeps its native size.
//
// Returns an empty buffer when there is no scene tree or root, when the node
// is not found or has no viewport, or when the viewport yields no texture or
// an empty image.
Vector<uint8_t> AgentVision::capture_camera(const String &p_camera_name, int p_width, int p_height) {
	SceneTree *tree = SceneTree::get_singleton();
	if (!tree || !tree->get_root()) {
		return Vector<uint8_t>();
	}

	// Find the named node.
	Node *camera_node = tree->get_root()->find_child(p_camera_name, true, false);
	if (!camera_node) {
		return Vector<uint8_t>();
	}

	Viewport *viewport = camera_node->get_viewport();
	if (!viewport) {
		return Vector<uint8_t>();
	}

	Ref<ViewportTexture> texture = viewport->get_texture();
	if (texture.is_null()) {
		return Vector<uint8_t>();
	}
	Ref<Image> img = texture->get_image();
	// An empty image has zero dimensions, which would break the aspect-ratio maths below.
	if (img.is_null() || img->is_empty()) {
		return Vector<uint8_t>();
	}

	if (p_width > 0 || p_height > 0) {
		const float src_w = (float)img->get_width();
		const float src_h = (float)img->get_height();
		int out_w = p_width;
		int out_h = p_height;
		if (out_w <= 0) {
			// Height-only request: derive the width.
			out_w = MAX(1, (int)(p_height * (src_w / src_h)));
		}
		if (out_h <= 0) {
			// Width-only request: derive the height.
			out_h = MAX(1, (int)(p_width * (src_h / src_w)));
		}
		img->resize(out_w, out_h, Image::INTERPOLATE_BILINEAR);
	}
	return img->save_png_to_buffer();
}
// Replaces the stored baseline screenshot with p_data.
// The assignment is made while holding vision_mutex.
void AgentVision::store_previous_screenshot(const Vector<uint8_t> &p_data) {
	MutexLock guard(vision_mutex);
	previous_screenshot = p_data;
}
// Returns the colour associated with p_instance_id, assigning a new one the
// first time an ID is seen and caching it in segmentation_colors.
//
// New colours are derived from a running counter (next_color_idx). Each
// channel uses a different multiplier modulo 255 and is then shifted by one
// step, so every channel lies in 1/255..1. Without the shift, counter values
// that are multiples of 255 (including 0) would give pure black, which is the
// background colour of the segmentation map.
//
// The generated sequence repeats every 255 assignments, so two IDs can share a
// colour once more than 255 distinct IDs have been seen.
Color AgentVision::_get_segmentation_color(uint64_t p_instance_id) {
	if (segmentation_colors.has(p_instance_id)) {
		return segmentation_colors[p_instance_id];
	}

	const uint32_t idx = next_color_idx++;
	const float r = (((idx * 67) % 255) + 1) / 255.0f;
	const float g = (((idx * 131) % 255) + 1) / 255.0f;
	const float b = (((idx * 199) % 255) + 1) / 255.0f;

	const Color color(r, g, b);
	segmentation_colors[p_instance_id] = color;
	return color;
}