// studio/services/bot/lib/vision-preprocess.ts

/**
 * Vision pre-processing for OpenAI-compat inference requests.
 *
 * DeepSeek's chat models are text-only. To let Cursor users send screenshots
 * through tinqsProxy, we walk each request's `messages[].content` array, find
 * `image_url` parts, run them through Amazon Nova Pro (the same vision model
 * `/api/v1/hub/describe` uses), and replace each image part with a
 * `[image: <description>]` text part before forwarding to DeepSeek.
 *
 * Descriptions are cached in Redis by sha256(image bytes) — same image across
 * turns hits cache, no second Nova call. 30-day TTL matches the reasoning
 * cache.
 *
 * Failure modes never block the request:
 *   - Bytes too big / wrong mime  → replace with "[image: unavailable — <reason>]"
 *   - Nova call fails             → same treatment, logged
 *   - AWS creds missing           → surfaces as a failed Nova call: same treatment, logged per image
 *
 * Anthropic-compat path is not touched — DeepSeek's `/anthropic/messages`
 * doesn't accept images today and we'd want to test that surface separately.
 */
import crypto from "node:crypto";
import {
  BedrockRuntimeClient,
  InvokeModelCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { redis } from "./db";
import { estimateCost } from "./inference";

// Bedrock model id passed to InvokeModelCommand for every description call.
const NOVA_MODEL = "eu.amazon.nova-pro-v1:0";
// Region for the Bedrock client; taken from AWS_REGION when set, else eu-west-1.
const REGION = process.env.AWS_REGION ?? "eu-west-1";
// Redis expiry for cached descriptions: 30 days, expressed in seconds.
const CACHE_TTL_SECONDS = 30 * 86400;
// Largest decoded image accepted by resolveImageBytes.
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
// Upper bound on the number of images processed at the same time per request.
const MAX_CONCURRENCY = 4;
// Abort deadline (milliseconds) for fetching a remote image URL.
const FETCH_TIMEOUT_MS = 8000;

// System prompt sent to Nova with every image; constrains the description
// format so a text-only model can reason about the image downstream.
const SYSTEM_PROMPT = `You produce factual image descriptions for a downstream text-only LLM.

RULES:
- Plain English. No HTML.
- Use relative-position descriptors (top-left, center, right side, bottom half, upper-right corner, left sidebar, bottom toolbar, etc.) to locate elements spatially. Do NOT use pixel coordinates.
- Preserve readable text VERBATIM in quotes — code, error messages, URLs, file names, UI labels, numbers.
- 3–8 sentences for simple screens, up to 15 for dense ones.
- Describe what the image SHOWS, not what the user should do.
- If the image is a screenshot of code, include language, file name (if visible), and the visible code structure (functions, classes, variables).
- If the image is a UI, list visible interactive elements, their labels, and their approximate location (e.g. "top-left corner: hamburger menu icon", "center: login form with email and password fields", "bottom bar: Home / Search / Profile tabs").
- If the image is a photo or diagram, describe subjects, layout, color palette, and any captions.
- End with one sentence summarizing the single most important fact a downstream LLM needs to answer questions about this image.`;

// User-turn text sent alongside the image when the caller supplies no userPromptHint.
const DEFAULT_USER_PROMPT = "Describe this image for a downstream text-only LLM.";

// ──────────────────────────────────────────────────────────────────────────
// Types
// ──────────────────────────────────────────────────────────────────────────

/** A plain-text content part; image parts are rewritten into this shape. */
type TextPart = { type: "text"; text: string };
/** An image content part carrying either a data URL or a remote URL. */
type ImageUrlPart = { type: "image_url"; image_url: { url: string; detail?: string } };
/** Any content part; the open-ended third member lets unrecognized part types pass through. */
type ContentPart = TextPart | ImageUrlPart | { type: string; [k: string]: unknown };

/**
 * Loose shape of one entry in `messages`. Only `content` is inspected by this
 * module; every other field is carried along untouched via the index signature.
 */
interface MessageLike {
  role?: string;
  // Either a plain string (never contains images) or an array of parts.
  content?: string | ContentPart[];
  [k: string]: unknown;
}

/** Per-request counters returned by `preprocessImages`. */
export interface VisionPreprocessStats {
  /** Number of `image_url` parts found in the request. */
  images: number;
  /** Images whose description was served from Redis. */
  cache_hits: number;
  /** Images replaced with an "unavailable" placeholder. */
  failures: number;
  /** Sum of input tokens reported by Nova for live calls. */
  tokens_in: number;
  /** Sum of output tokens reported by Nova for live calls. */
  tokens_out: number;
  /** Sum of `estimateCost` results for live Nova calls. */
  cost_usd: number;
  /** Wall-clock milliseconds spent in `preprocessImages`; stays 0 when no image was processed. */
  latency_ms: number;
}

/**
 * Build a fresh stats record with every counter at zero.
 *
 * @returns A new object on each call, safe for the caller to mutate.
 */
export function emptyStats(): VisionPreprocessStats {
  const zeroed: VisionPreprocessStats = {
    // Image bookkeeping.
    images: 0,
    cache_hits: 0,
    failures: 0,
    // Nova usage and cost.
    tokens_in: 0,
    tokens_out: 0,
    cost_usd: 0,
    // Timing.
    latency_ms: 0,
  };
  return zeroed;
}

// ──────────────────────────────────────────────────────────────────────────
// Pure helpers (testable without Redis or Bedrock)
// ──────────────────────────────────────────────────────────────────────────

/**
 * Report whether at least one message carries an `image_url` content part.
 *
 * @param messages - The request's `messages` value; anything that is not an
 *   array yields `false`.
 * @returns `true` as soon as one array-valued `content` holds an object part
 *   whose `type` is `"image_url"`, otherwise `false`.
 */
export function hasImageParts(messages: unknown): boolean {
  if (!Array.isArray(messages)) return false;

  const isImagePart = (part: unknown): boolean =>
    typeof part === "object" &&
    part !== null &&
    (part as { type?: unknown }).type === "image_url";

  return messages.some((message: unknown) => {
    // Null/undefined entries and string-only content cannot hold images.
    if (!message) return false;
    const content = (message as { content?: unknown }).content;
    return Array.isArray(content) && content.some(isImagePart);
  });
}

/**
 * Decode an `image_url.url` into raw bytes + mime. Supports:
 *   - `data:image/<mime>;base64,<...>`
 *   - `http://...` and `https://...` (fetched under a FETCH_TIMEOUT_MS deadline;
 *     the download is abandoned as soon as it exceeds MAX_IMAGE_BYTES)
 *
 * Never throws. Resolves to `{ bytes, mime }` on success, or to `{ error }`
 * carrying a short human-readable reason when the URL can't be resolved.
 *
 * @param url - The raw `image_url.url` value from the request.
 * @returns The decoded image, or an `{ error }` object describing the failure.
 */
export async function resolveImageBytes(
  url: string,
): Promise<{ bytes: Buffer; mime: string } | { error: string }> {
  if (typeof url !== "string" || url.length === 0) {
    return { error: "empty url" };
  }

  if (url.startsWith("data:")) {
    const match = /^data:(image\/[a-zA-Z0-9.+-]+);base64,(.*)$/s.exec(url);
    if (!match) return { error: "malformed data url" };
    const [, mime, b64] = match;
    // Buffer.from(str, "base64") does not throw on malformed input, so a
    // try/catch cannot detect it. Reject anything outside the alphabets Node
    // accepts (standard + URL-safe, padding, whitespace) instead of sending
    // garbage bytes to the vision model.
    if (/[^A-Za-z0-9+\/_=\s-]/.test(b64)) {
      return { error: "invalid base64" };
    }
    const bytes = Buffer.from(b64, "base64");
    if (bytes.byteLength === 0) return { error: "empty image" };
    if (bytes.byteLength > MAX_IMAGE_BYTES) {
      return { error: `image too large (${bytes.byteLength} bytes, max ${MAX_IMAGE_BYTES})` };
    }
    return { bytes, mime };
  }

  if (url.startsWith("https://") || url.startsWith("http://")) {
    // NOTE(security): this fetches a caller-supplied URL from the server, so
    // it can be pointed at internal/metadata endpoints (SSRF). No allow-list
    // or private-address filtering is applied here — review before exposing
    // this path to untrusted callers.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
    try {
      const res = await fetch(url, { signal: controller.signal });

      // Release the connection before bailing out on a response we won't read.
      const reject = async (error: string): Promise<{ error: string }> => {
        await res.body?.cancel().catch(() => undefined);
        return { error };
      };

      if (!res.ok) return await reject(`fetch ${res.status}`);

      const mime = (res.headers.get("content-type") || "").split(";")[0].trim();
      // Media types are case-insensitive; compare on the lowered form.
      if (!mime.toLowerCase().startsWith("image/")) {
        return await reject(`non-image content-type: ${mime || "unknown"}`);
      }

      // Cheap early exit when the server declares an oversized body.
      const declared = Number(res.headers.get("content-length") ?? "");
      if (Number.isFinite(declared) && declared > MAX_IMAGE_BYTES) {
        return await reject(`image too large (${declared} bytes, max ${MAX_IMAGE_BYTES})`);
      }

      // Stream the body so an undeclared or wrongly declared oversized
      // response is cut off at the cap instead of being buffered whole.
      const chunks: Uint8Array[] = [];
      let total = 0;
      const reader = res.body?.getReader();
      if (reader) {
        for (;;) {
          const { done, value } = await reader.read();
          if (done) break;
          total += value.byteLength;
          if (total > MAX_IMAGE_BYTES) {
            await reader.cancel().catch(() => undefined);
            return {
              error: `image too large (at least ${total} bytes, max ${MAX_IMAGE_BYTES})`,
            };
          }
          chunks.push(value);
        }
      }

      if (total === 0) return { error: "empty image" };
      return { bytes: Buffer.concat(chunks, total), mime };
    } catch (e) {
      // The only abort source is the deadline timer above.
      if (controller.signal.aborted) {
        return { error: `fetch failed: timed out after ${FETCH_TIMEOUT_MS} ms` };
      }
      const reason = e instanceof Error ? e.message : String(e);
      return { error: `fetch failed: ${reason}` };
    } finally {
      clearTimeout(timer);
    }
  }

  return { error: "unsupported url scheme" };
}

/**
 * Translate an image mime type into the `format` value used in the Nova
 * request body.
 *
 * @param mime - Mime type such as `image/png`; matched case-insensitively.
 * @returns The Nova format name, or `null` when the mime type is not one of
 *   png / jpeg (incl. the `image/jpg` alias) / gif / webp.
 */
export function novaFormatForMime(mime: string): "png" | "jpeg" | "gif" | "webp" | null {
  switch (mime.toLowerCase()) {
    case "image/png":
      return "png";
    case "image/jpeg":
    case "image/jpg":
      return "jpeg";
    case "image/gif":
      return "gif";
    case "image/webp":
      return "webp";
    default:
      return null;
  }
}

/**
 * Hash a byte buffer with SHA-256.
 *
 * @param bytes - Raw bytes to hash.
 * @returns The digest as a lowercase hex string.
 */
export function sha256Hex(bytes: Buffer): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(bytes);
  return hasher.digest("hex");
}

/** Redis key under which the description for an image hash is stored. */
function cacheKey(hash: string): string {
  return "inference:vision:".concat(hash);
}

// ──────────────────────────────────────────────────────────────────────────
// Bedrock call
// ──────────────────────────────────────────────────────────────────────────

// Module-wide Bedrock client, created on first use and reused afterwards.
let bedrockClient: BedrockRuntimeClient | null = null;

/** Return the shared Bedrock client, constructing it for REGION on the first call. */
function getBedrock(): BedrockRuntimeClient {
  const existing = bedrockClient;
  if (existing !== null) return existing;

  const created = new BedrockRuntimeClient({ region: REGION });
  bedrockClient = created;
  return created;
}

/**
 * Ask the Nova model (through Bedrock `InvokeModel`) to describe one image.
 *
 * @param bytes - Raw image bytes; sent base64-encoded in the request body.
 * @param format - Image format name placed in the request's `image.format`.
 * @param userPrompt - Text sent after the image in the user turn.
 * @returns The first content item's text (or `"(no description)"` when it is
 *   missing/empty) plus the input/output token counts from `usage`, each
 *   defaulting to 0 when absent.
 * Errors from the client call or from parsing the response propagate to the caller.
 */
async function describeWithNova(
  bytes: Buffer,
  format: "png" | "jpeg" | "gif" | "webp",
  userPrompt: string,
): Promise<{ text: string; tokensIn: number; tokensOut: number }> {
  // Image first, then the instruction text, in a single user turn.
  const imagePart = { image: { format, source: { bytes: bytes.toString("base64") } } };
  const promptPart = { text: userPrompt };
  const payload = JSON.stringify({
    system: [{ text: SYSTEM_PROMPT }],
    messages: [{ role: "user", content: [imagePart, promptPart] }],
    inferenceConfig: { maxTokens: 1500, temperature: 0.2 },
  });

  const client = getBedrock();
  const command = new InvokeModelCommand({
    modelId: NOVA_MODEL,
    contentType: "application/json",
    accept: "application/json",
    body: new TextEncoder().encode(payload),
  });
  const response = await client.send(command);

  const reply = JSON.parse(new TextDecoder().decode(response.body));
  const usage = reply?.usage;
  const text: string = reply?.output?.message?.content?.[0]?.text || "(no description)";
  const tokensIn: number = usage?.inputTokens || 0;
  const tokensOut: number = usage?.outputTokens || 0;
  return { text, tokensIn, tokensOut };
}

// ──────────────────────────────────────────────────────────────────────────
// Main entry point
// ──────────────────────────────────────────────────────────────────────────

/**
 * Mutates `body.messages` in place, replacing every `image_url` part with a
 * `text` part containing the Nova description. Always returns; never throws.
 * Failed images are replaced with an "unavailable" text part so the request
 * still goes through to DeepSeek.
 *
 * @param body - The request body; only `body.messages` is read and modified.
 * @param opts - `userPromptHint` overrides the default user-turn text sent to
 *   Nova alongside each image.
 * @returns Counters for this request. All zero when `messages` is not an
 *   array or contains no image parts.
 */
export async function preprocessImages(
  body: Record<string, unknown>,
  opts: { userPromptHint?: string } = {},
): Promise<VisionPreprocessStats> {
  const stats = emptyStats();
  if (!Array.isArray(body.messages)) return stats;
  if (!hasImageParts(body.messages)) return stats;

  const started = Date.now();
  const messages = body.messages as MessageLike[];
  const userPrompt = opts.userPromptHint || DEFAULT_USER_PROMPT;

  // Collect every image part with a pointer to its location for in-place replacement.
  type Job = {
    msgIdx: number;
    partIdx: number;
    url: string;
  };
  const jobs: Job[] = [];
  for (let i = 0; i < messages.length; i++) {
    const content = messages[i]?.content;
    if (!Array.isArray(content)) continue;
    for (let j = 0; j < content.length; j++) {
      const part = content[j] as ContentPart;
      if (part && part.type === "image_url") {
        // A missing/empty url is kept as "" so the job still yields a placeholder.
        const url = (part as ImageUrlPart).image_url?.url || "";
        jobs.push({ msgIdx: i, partIdx: j, url });
      }
    }
  }

  stats.images = jobs.length;
  if (jobs.length === 0) return stats;

  // Bounded-concurrency worker pool. Each worker pulls the next unclaimed job
  // index; `next++` is safe because there is no await between read and write.
  const results = new Array<TextPart>(jobs.length);
  let next = 0;
  const workers = Array.from({ length: Math.min(MAX_CONCURRENCY, jobs.length) }, async () => {
    while (true) {
      const i = next++;
      if (i >= jobs.length) return;
      try {
        results[i] = await processOne(jobs[i], userPrompt, stats);
      } catch (e) {
        // processOne handles its expected failures itself; this guard upholds
        // the never-throws contract for anything unexpected, so one bad image
        // cannot reject Promise.all and leave the body unmodified.
        stats.failures++;
        const reason = e instanceof Error ? e.message : String(e);
        console.error("[vision-preprocess] unexpected failure:", reason);
        results[i] = unavailable("vision preprocessing failed");
      }
    }
  });
  await Promise.all(workers);

  // Apply replacements (back-to-front per message would also work, but we have
  // exact indices and we never insert — one part in, one part out).
  for (let k = 0; k < jobs.length; k++) {
    const { msgIdx, partIdx } = jobs[k];
    const content = messages[msgIdx].content as ContentPart[];
    content[partIdx] = results[k];
  }

  stats.latency_ms = Date.now() - started;
  return stats;
}

/**
 * Turn one image URL into its replacement text part.
 *
 * Steps: resolve the bytes, map the mime type to a Nova format, look the
 * description up in Redis by content hash, and on a miss call Nova and store
 * the result. Every expected failure is converted into an "unavailable"
 * placeholder and counted in `stats.failures`.
 *
 * @param job - Carries the image URL to resolve.
 * @param userPrompt - User-turn text sent to Nova with the image.
 * @param stats - Shared per-request counters, updated in place.
 * @returns A `text` part of the form `[image: <description>]`, or an
 *   "unavailable" placeholder.
 */
async function processOne(
  job: { url: string },
  userPrompt: string,
  stats: VisionPreprocessStats,
): Promise<TextPart> {
  const resolved = await resolveImageBytes(job.url);
  if ("error" in resolved) {
    stats.failures++;
    return unavailable(resolved.error);
  }

  const format = novaFormatForMime(resolved.mime);
  if (!format) {
    stats.failures++;
    return unavailable(`unsupported mime: ${resolved.mime}`);
  }

  // NOTE(review): the key depends on the image bytes only, so a description
  // produced under one userPrompt is reused for any other prompt.
  const hash = sha256Hex(resolved.bytes);
  const key = cacheKey(hash);

  // Cache lookup. Treat any Redis error as a miss.
  if (redis) {
    try {
      const cached = await redis.get(key);
      if (cached) {
        stats.cache_hits++;
        return { type: "text", text: `[image: ${cached}]` };
      }
    } catch {
      // fall through to live call
    }
  }

  let description: string;
  try {
    const res = await describeWithNova(resolved.bytes, format, userPrompt);
    description = res.text.trim();
    stats.tokens_in += res.tokensIn;
    stats.tokens_out += res.tokensOut;
    stats.cost_usd += estimateCost(NOVA_MODEL, res.tokensIn, res.tokensOut);
  } catch (e) {
    stats.failures++;
    const reason = e instanceof Error ? e.message : String(e);
    console.error("[vision-preprocess] Nova call failed:", reason);
    return unavailable("vision preprocessing failed");
  }

  // Cache only real descriptions. describeWithNova substitutes the literal
  // "(no description)" when the model returns no text; storing that (or an
  // empty string) would pin a useless answer to this image for the full TTL.
  const cacheable = description.length > 0 && description !== "(no description)";
  if (redis && cacheable) {
    try {
      // Fire-and-forget: the response must not wait on the cache write.
      void redis.set(key, description, "EX", CACHE_TTL_SECONDS).catch(() => {
        /* cache write failures must not affect the response */
      });
    } catch {
      /* a synchronous failure from the client must not affect the response either */
    }
  }

  return { type: "text", text: `[image: ${description}]` };
}

/** Build the placeholder text part used when an image could not be described. */
function unavailable(reason: string): TextPart {
  const text = `[image: unavailable — ${reason}]`;
  return { type: "text", text };
}