Files
studio/services/bot/lib/vision-preprocess.ts
ozan a81a450e7e feat: monorepo consolidation — merge CLI, bot, admin, team-tool, website, docs, runner, proxy
Merged into tinqs/studio:
- cmd/tinqs-cli/    — tinqs-cli (Go binary, from bot/cli)
- cmd/tea/          — Gitea CLI tool (from tinqs/cli-tea)
- services/bot/     — Bot service (from tinqs-ltd/bot on git.arikigame.com)
- services/admin/   — Admin panel (from tinqs/admin)
- services/team-tool/ — Team Tool (from tinqs/team-tool)
- services/proxy/   — tinqs-proxy (from bot/proxy)
- web/landing/      — tinqs.com website (from tinqs/website)
- web/docs/         — Platform docs (from tinqs/docs)
- web/blog/         — Blog (placeholder)
- runner/           — Ephemeral CI runner (from tinqs/runner)

All source repos will be deleted after verification.
2026-05-22 04:55:50 +00:00

354 lines
13 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Vision pre-processing for OpenAI-compat inference requests.
*
* DeepSeek's chat models are text-only. To let Cursor users send screenshots
* through tinqsProxy, we walk each request's `messages[].content` array, find
* `image_url` parts, run them through Amazon Nova Pro (the same vision model
* `/api/v1/hub/describe` uses), and replace each image part with a
* `[image: <description>]` text part before forwarding to DeepSeek.
*
* Descriptions are cached in Redis by sha256(image bytes) — same image across
* turns hits cache, no second Nova call. 30-day TTL matches the reasoning
* cache.
*
* Failure modes never block the request:
* - Bytes too big / wrong mime → replace with "[image: unavailable — <reason>]"
* - Nova call fails → same treatment, logged
* - AWS creds missing → surfaces as a failed Nova call; same treatment, logged per image
*
* Anthropic-compat path is not touched — DeepSeek's `/anthropic/messages`
* doesn't accept images today and we'd want to test that surface separately.
*/
import crypto from "node:crypto";
import {
BedrockRuntimeClient,
InvokeModelCommand,
} from "@aws-sdk/client-bedrock-runtime";
import { redis } from "./db";
import { estimateCost } from "./inference";
/** Bedrock model id used for image description and passed to the cost estimator. */
const NOVA_MODEL = "eu.amazon.nova-pro-v1:0";
/** AWS region handed to the Bedrock client; the AWS_REGION env var overrides the default. */
const REGION = process.env.AWS_REGION ?? "eu-west-1";
/** Expiry applied to cached descriptions in Redis: 30 days, in seconds. */
const CACHE_TTL_SECONDS = 30 * 24 * 60 * 60;
/** Largest image accepted, in bytes (10 MiB). */
const MAX_IMAGE_BYTES = 10 * 1024 ** 2;
/** Upper bound on the number of images processed in parallel for one request. */
const MAX_CONCURRENCY = 4;
/** Deadline for downloading a remote image, in milliseconds. */
const FETCH_TIMEOUT_MS = 8_000;
/**
 * System prompt sent to the vision model with every image.
 *
 * The length rule reads "3-8 sentences": the previous text said "38 sentences
 * for simple screens, up to 15 for dense ones", which contradicted itself (the
 * range separator had been lost). A plain ASCII hyphen is used so the range
 * cannot be mangled again.
 */
const SYSTEM_PROMPT = `You produce factual image descriptions for a downstream text-only LLM.
RULES:
- Plain English. No HTML.
- Use relative-position descriptors (top-left, center, right side, bottom half, upper-right corner, left sidebar, bottom toolbar, etc.) to locate elements spatially. Do NOT use pixel coordinates.
- Preserve readable text VERBATIM in quotes — code, error messages, URLs, file names, UI labels, numbers.
- 3-8 sentences for simple screens, up to 15 for dense ones.
- Describe what the image SHOWS, not what the user should do.
- If the image is a screenshot of code, include language, file name (if visible), and the visible code structure (functions, classes, variables).
- If the image is a UI, list visible interactive elements, their labels, and their approximate location (e.g. "top-left corner: hamburger menu icon", "center: login form with email and password fields", "bottom bar: Home / Search / Profile tabs").
- If the image is a photo or diagram, describe subjects, layout, color palette, and any captions.
- End with one sentence summarizing the single most important fact a downstream LLM needs to answer questions about this image.`;
/** User-turn text sent alongside the image when the caller supplies no prompt hint. */
const DEFAULT_USER_PROMPT = "Describe this image for a downstream text-only LLM.";
// ──────────────────────────────────────────────────────────────────────────
// Types
// ──────────────────────────────────────────────────────────────────────────
/** A plain-text entry of a message's `content` array. */
type TextPart = {
  type: "text";
  text: string;
};

/** An image entry of a message's `content` array; `url` is a data URL or a remote URL. */
type ImageUrlPart = {
  type: "image_url";
  image_url: {
    url: string;
    detail?: string;
  };
};

/** Any content entry: the two shapes handled here, or some other tagged part passed through untouched. */
type ContentPart =
  | TextPart
  | ImageUrlPart
  | { type: string; [key: string]: unknown };

/** Loose view of one chat message; only `content` is inspected by this module. */
interface MessageLike {
  role?: string;
  content?: string | ContentPart[];
  [key: string]: unknown;
}
/** Counters describing one `preprocessImages` run. */
export interface VisionPreprocessStats {
  /** Number of `image_url` parts found in the request. */
  images: number;
  /** Images whose description was served from the Redis cache. */
  cache_hits: number;
  /** Images replaced by an "unavailable" placeholder. */
  failures: number;
  /** Sum of input tokens reported by the vision model for live calls. */
  tokens_in: number;
  /** Sum of output tokens reported by the vision model for live calls. */
  tokens_out: number;
  /** Sum of the estimated cost of live calls, as returned by `estimateCost`. */
  cost_usd: number;
  /** Wall-clock duration of the whole preprocessing pass, in milliseconds. */
  latency_ms: number;
}
/**
 * Creates a stats record with every counter at zero.
 *
 * @returns A new object on each call, so callers may mutate it freely.
 */
export function emptyStats(): VisionPreprocessStats {
  const zeroed: VisionPreprocessStats = {
    images: 0,
    cache_hits: 0,
    failures: 0,
    tokens_in: 0,
    tokens_out: 0,
    cost_usd: 0,
    latency_ms: 0,
  };
  return zeroed;
}
// ──────────────────────────────────────────────────────────────────────────
// Pure helpers (testable without Redis or Bedrock)
// ──────────────────────────────────────────────────────────────────────────
/**
 * Reports whether a request's `messages` value contains at least one
 * `image_url` content part.
 *
 * @param messages The raw `messages` field; anything that is not an array yields `false`.
 * @returns `true` as soon as one message with array content holds an object part
 *   whose `type` is `"image_url"`.
 */
export function hasImageParts(messages: unknown): boolean {
  if (!Array.isArray(messages)) {
    return false;
  }
  const isImagePart = (part: unknown): boolean =>
    Boolean(part) && typeof part === "object" && (part as ContentPart).type === "image_url";
  return (messages as MessageLike[]).some(
    (message) =>
      Boolean(message) && Array.isArray(message.content) && message.content.some(isImagePart),
  );
}
/**
 * Resolve an `image_url.url` into raw bytes plus a mime type. Supports:
 * - `data:image/<mime>;base64,<...>` (decoded in-process)
 * - `http://...` and `https://...` (fetched under a FETCH_TIMEOUT_MS deadline;
 *   the download is abandoned as soon as it exceeds MAX_IMAGE_BYTES)
 *
 * Never throws: every failure is reported as `{ error: <reason> }` so the
 * caller can substitute a placeholder.
 *
 * @param url The value of an OpenAI-style `image_url.url` field.
 * @returns `{ bytes, mime }` on success, otherwise `{ error }` with a short reason.
 */
export async function resolveImageBytes(
  url: string,
): Promise<{ bytes: Buffer; mime: string } | { error: string }> {
  if (typeof url !== "string" || url.length === 0) {
    return { error: "empty url" };
  }
  if (url.startsWith("data:")) {
    const match = /^data:(image\/[a-zA-Z0-9.+-]+);base64,(.*)$/s.exec(url);
    if (!match) return { error: "malformed data url" };
    const [, mime, b64] = match;
    let bytes: Buffer;
    try {
      // Node's base64 decoder is lenient with malformed input, so garbage
      // usually surfaces below as "empty image" rather than as a throw here.
      bytes = Buffer.from(b64, "base64");
    } catch {
      return { error: "invalid base64" };
    }
    if (bytes.byteLength === 0) return { error: "empty image" };
    if (bytes.byteLength > MAX_IMAGE_BYTES) {
      return { error: `image too large (${bytes.byteLength} bytes, max ${MAX_IMAGE_BYTES})` };
    }
    return { bytes, mime };
  }
  if (url.startsWith("https://") || url.startsWith("http://")) {
    // NOTE(security): the URL comes from the request body and is fetched from
    // this server as-is. Nothing here blocks private, loopback or link-local
    // targets (SSRF); restrict that at the network layer or with a host check.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
    try {
      const res = await fetch(url, { signal: controller.signal });
      if (!res.ok) return { error: `fetch ${res.status}` };
      const mime = (res.headers.get("content-type") || "").split(";")[0].trim();
      if (!mime.startsWith("image/")) {
        return { error: `non-image content-type: ${mime || "unknown"}` };
      }
      // Cheap rejection first: trust a declared length that is already over the cap.
      const declared = Number(res.headers.get("content-length") ?? "");
      if (Number.isFinite(declared) && declared > MAX_IMAGE_BYTES) {
        return { error: `image too large (${declared} bytes, max ${MAX_IMAGE_BYTES})` };
      }
      let buf: Buffer;
      const reader = res.body?.getReader();
      if (reader) {
        // Stream with a running total so an oversized (or mis-declared) body is
        // never buffered in full before being rejected.
        const chunks: Uint8Array[] = [];
        let total = 0;
        for (;;) {
          const step = await reader.read();
          if (step.done) break;
          total += step.value.byteLength;
          if (total > MAX_IMAGE_BYTES) {
            await reader.cancel().catch(() => {
              /* cancellation is best-effort */
            });
            return { error: `image too large (over ${MAX_IMAGE_BYTES} bytes)` };
          }
          chunks.push(step.value);
        }
        buf = Buffer.concat(chunks, total);
      } else {
        // No readable stream exposed: fall back to buffering the whole body.
        buf = Buffer.from(await res.arrayBuffer());
      }
      if (buf.byteLength === 0) return { error: "empty image" };
      if (buf.byteLength > MAX_IMAGE_BYTES) {
        return { error: `image too large (${buf.byteLength} bytes, max ${MAX_IMAGE_BYTES})` };
      }
      return { bytes: buf, mime };
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      return { error: `fetch failed: ${reason}` };
    } finally {
      clearTimeout(timer);
      // Releases the connection when we returned before consuming the body;
      // a no-op once the response has been read to the end.
      controller.abort();
    }
  }
  return { error: "unsupported url scheme" };
}
/**
 * Translates a mime type into the image `format` value used in the Nova
 * request payload.
 *
 * @param mime Mime type to translate; compared case-insensitively, without trimming.
 * @returns The matching format name, or `null` for anything outside PNG, JPEG, GIF and WebP.
 */
export function novaFormatForMime(mime: string): "png" | "jpeg" | "gif" | "webp" | null {
  switch (mime.toLowerCase()) {
    case "image/png":
      return "png";
    case "image/jpeg":
    case "image/jpg":
      return "jpeg";
    case "image/gif":
      return "gif";
    case "image/webp":
      return "webp";
    default:
      return null;
  }
}
/**
 * Computes the SHA-256 digest of a buffer.
 *
 * @param bytes Data to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function sha256Hex(bytes: Buffer): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(bytes);
  return hasher.digest("hex");
}
/** Builds the Redis key under which the description for an image hash is stored. */
function cacheKey(hash: string): string {
  const prefix = "inference:vision:";
  return prefix + hash;
}
// ──────────────────────────────────────────────────────────────────────────
// Bedrock call
// ──────────────────────────────────────────────────────────────────────────
/** Bedrock runtime client shared by this module; stays `null` until first needed. */
let bedrockClient: BedrockRuntimeClient | null = null;

/** Returns the shared Bedrock runtime client, constructing it for REGION on first use. */
function getBedrock(): BedrockRuntimeClient {
  if (bedrockClient === null) {
    bedrockClient = new BedrockRuntimeClient({ region: REGION });
  }
  return bedrockClient;
}
/**
 * Sends one image plus a user prompt to the vision model through Bedrock
 * `InvokeModel` and pulls the description and token usage out of the reply.
 *
 * @param bytes Raw image bytes; sent base64-encoded.
 * @param format Image format name placed in the request payload.
 * @param userPrompt Text sent alongside the image in the user turn.
 * @returns The first content part's text (or `"(no description)"` when it is
 *   missing or empty) and the reported input/output token counts (0 when absent).
 *   Errors from the Bedrock call or from parsing the reply propagate to the caller.
 */
async function describeWithNova(
  bytes: Buffer,
  format: "png" | "jpeg" | "gif" | "webp",
  userPrompt: string,
): Promise<{ text: string; tokensIn: number; tokensOut: number }> {
  const imagePart = { image: { format, source: { bytes: bytes.toString("base64") } } };
  const request = {
    system: [{ text: SYSTEM_PROMPT }],
    messages: [{ role: "user", content: [imagePart, { text: userPrompt }] }],
    inferenceConfig: { maxTokens: 1500, temperature: 0.2 },
  };
  const command = new InvokeModelCommand({
    modelId: NOVA_MODEL,
    contentType: "application/json",
    accept: "application/json",
    body: new TextEncoder().encode(JSON.stringify(request)),
  });
  const response = await getBedrock().send(command);
  const payload = JSON.parse(new TextDecoder().decode(response.body));
  const usage = payload?.usage;
  const text: string = payload?.output?.message?.content?.[0]?.text || "(no description)";
  const tokensIn: number = usage?.inputTokens || 0;
  const tokensOut: number = usage?.outputTokens || 0;
  return { text, tokensIn, tokensOut };
}
// ──────────────────────────────────────────────────────────────────────────
// Main entry point
// ──────────────────────────────────────────────────────────────────────────
/**
 * Mutates `body.messages` in place, replacing every `image_url` part with a
 * `text` part containing the vision model's description. Always resolves; never
 * rejects. Failed images are replaced with an "unavailable" text part so the
 * request can still be forwarded to the text-only model.
 *
 * @param body The parsed request body; only `body.messages` is read and modified.
 * @param opts Optional `userPromptHint` that replaces the default user prompt
 *   sent with each image.
 * @returns Counters for this pass. All zero when there are no messages or no
 *   images; `latency_ms` is only set when at least one image was processed.
 */
export async function preprocessImages(
  body: Record<string, unknown>,
  opts: { userPromptHint?: string } = {},
): Promise<VisionPreprocessStats> {
  const stats = emptyStats();
  if (!Array.isArray(body.messages)) return stats;
  if (!hasImageParts(body.messages)) return stats;
  const started = Date.now();
  const messages = body.messages as MessageLike[];
  const userPrompt = opts.userPromptHint || DEFAULT_USER_PROMPT;

  // Collect every image part with a pointer to its location for in-place replacement.
  type Job = {
    msgIdx: number;
    partIdx: number;
    url: string;
  };
  const jobs: Job[] = [];
  for (let i = 0; i < messages.length; i++) {
    const content = messages[i]?.content;
    if (!Array.isArray(content)) continue;
    for (let j = 0; j < content.length; j++) {
      const part = content[j] as ContentPart;
      if (part && part.type === "image_url") {
        // A missing or empty url becomes "", which the resolver reports as a failure.
        const url = (part as ImageUrlPart).image_url?.url || "";
        jobs.push({ msgIdx: i, partIdx: j, url });
      }
    }
  }
  stats.images = jobs.length;
  if (jobs.length === 0) return stats;

  // Bounded-concurrency worker pool: each worker claims the next unprocessed
  // job index until none are left.
  const results = new Array<TextPart>(jobs.length);
  let next = 0;
  const workers = Array.from({ length: Math.min(MAX_CONCURRENCY, jobs.length) }, async () => {
    while (true) {
      const i = next++;
      if (i >= jobs.length) return;
      try {
        results[i] = await processOne(jobs[i], userPrompt, stats);
      } catch (e) {
        // processOne is expected to absorb its own failures. This guard keeps
        // the "never rejects" contract if something unexpected escapes, and
        // ensures no image_url part is left behind in the request.
        stats.failures++;
        const reason = e instanceof Error ? e.message : String(e);
        console.error("[vision-preprocess] unexpected failure:", reason);
        results[i] = unavailable("vision preprocessing failed");
      }
    }
  });
  await Promise.all(workers);

  // Apply replacements. Indices stay valid because each image part is swapped
  // for exactly one text part and nothing is inserted or removed.
  for (let k = 0; k < jobs.length; k++) {
    const { msgIdx, partIdx } = jobs[k];
    const content = messages[msgIdx].content as ContentPart[];
    content[partIdx] = results[k];
  }
  stats.latency_ms = Date.now() - started;
  return stats;
}
/**
 * Turns one image job into the text part that replaces it.
 *
 * Steps: resolve the URL to bytes, map the mime type to a model format, look
 * the image up in the Redis cache by content hash, and otherwise call the
 * vision model and cache its answer.
 *
 * @param job The image job; only `url` is read.
 * @param userPrompt Text sent to the vision model alongside the image.
 * @param stats Shared counters, updated in place (failures, cache hits, tokens, cost).
 * @returns A `[image: <description>]` text part, or an "unavailable" placeholder
 *   when the image cannot be resolved, has an unsupported type, or the model call fails.
 */
async function processOne(
  job: { url: string },
  userPrompt: string,
  stats: VisionPreprocessStats,
): Promise<TextPart> {
  // Mirrors the fallback text describeWithNova returns when the model reply
  // carries no usable text.
  const NO_DESCRIPTION_PLACEHOLDER = "(no description)";

  const resolved = await resolveImageBytes(job.url);
  if ("error" in resolved) {
    stats.failures++;
    return unavailable(resolved.error);
  }
  const format = novaFormatForMime(resolved.mime);
  if (!format) {
    stats.failures++;
    return unavailable(`unsupported mime: ${resolved.mime}`);
  }
  const hash = sha256Hex(resolved.bytes);
  const key = cacheKey(hash);
  // NOTE(review): the key covers the image bytes only, so a description
  // produced under one userPrompt is reused for requests with another.

  // Cache lookup. Treat any Redis error as a miss.
  if (redis) {
    try {
      const cached = await redis.get(key);
      if (cached) {
        stats.cache_hits++;
        return { type: "text", text: `[image: ${cached}]` };
      }
    } catch {
      // fall through to live call
    }
  }
  let description: string;
  try {
    const res = await describeWithNova(resolved.bytes, format, userPrompt);
    description = res.text.trim();
    stats.tokens_in += res.tokensIn;
    stats.tokens_out += res.tokensOut;
    stats.cost_usd += estimateCost(NOVA_MODEL, res.tokensIn, res.tokensOut);
  } catch (e) {
    stats.failures++;
    const reason = e instanceof Error ? e.message : String(e);
    console.error("[vision-preprocess] Nova call failed:", reason);
    return unavailable("vision preprocessing failed");
  }
  // Only cache real descriptions. Persisting an empty string or the fallback
  // placeholder would pin a useless answer to this image for the whole TTL.
  const cacheable = description.length > 0 && description !== NO_DESCRIPTION_PLACEHOLDER;
  if (redis && cacheable) {
    redis.set(key, description, "EX", CACHE_TTL_SECONDS).catch(() => {
      /* cache write failures must not affect the response */
    });
  }
  return { type: "text", text: `[image: ${description}]` };
}
/** Builds the placeholder text part that stands in for an image that could not be described. */
function unavailable(reason: string): TextPart {
  const text = "[image: unavailable — " + reason + "]";
  return { type: "text", text };
}